md/raid10: some tidying up in fix_read_error
[pandora-kernel.git] / drivers / md / raid10.c
index 8e94626..8e4f469 100644 (file)
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
                 */
                set_bit(R10BIO_Uptodate, &r10_bio->state);
                raid_end_bio_io(r10_bio);
+               rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
        } else {
                /*
-                * oops, read error:
+                * oops, read error - keep the refcount on the rdev
                 */
                char b[BDEVNAME_SIZE];
                if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
                               bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
                reschedule_retry(r10_bio);
        }
-
-       rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
        const sector_t this_sector = r10_bio->sector;
-       int disk, slot, nslot;
+       int disk, slot;
        const int sectors = r10_bio->sectors;
-       sector_t new_distance, current_distance;
+       sector_t new_distance, best_dist;
        mdk_rdev_t *rdev;
+       int do_balance;
+       int best_slot;
 
        raid10_find_phys(conf, r10_bio);
        rcu_read_lock();
+retry:
+       best_slot = -1;
+       best_dist = MaxSector;
+       do_balance = 1;
        /*
         * Check if we can balance. We can balance on the whole
         * device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
         * above the resync window.
         */
        if (conf->mddev->recovery_cp < MaxSector
-           && (this_sector + sectors >= conf->next_resync)) {
-               /* make sure that disk is operational */
-               slot = 0;
-               disk = r10_bio->devs[slot].devnum;
+           && (this_sector + sectors >= conf->next_resync))
+               do_balance = 0;
 
-               while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-                      r10_bio->devs[slot].bio == IO_BLOCKED ||
-                      !test_bit(In_sync, &rdev->flags)) {
-                       slot++;
-                       if (slot == conf->copies) {
-                               slot = 0;
-                               disk = -1;
-                               break;
-                       }
-                       disk = r10_bio->devs[slot].devnum;
-               }
-               goto rb_out;
-       }
-
-
-       /* make sure the disk is operational */
-       slot = 0;
-       disk = r10_bio->devs[slot].devnum;
-       while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-              r10_bio->devs[slot].bio == IO_BLOCKED ||
-              !test_bit(In_sync, &rdev->flags)) {
-               slot ++;
-               if (slot == conf->copies) {
-                       disk = -1;
-                       goto rb_out;
-               }
+       for (slot = 0; slot < conf->copies ; slot++) {
+               if (r10_bio->devs[slot].bio == IO_BLOCKED)
+                       continue;
                disk = r10_bio->devs[slot].devnum;
-       }
-
-
-       current_distance = abs(r10_bio->devs[slot].addr -
-                              conf->mirrors[disk].head_position);
-
-       /* Find the disk whose head is closest,
-        * or - for far > 1 - find the closest to partition beginning */
-
-       for (nslot = slot; nslot < conf->copies; nslot++) {
-               int ndisk = r10_bio->devs[nslot].devnum;
-
-
-               if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-                   r10_bio->devs[nslot].bio == IO_BLOCKED ||
-                   !test_bit(In_sync, &rdev->flags))
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (rdev == NULL)
+                       continue;
+               if (!test_bit(In_sync, &rdev->flags))
                        continue;
 
+               if (!do_balance)
+                       break;
+
                /* This optimisation is debatable, and completely destroys
                 * sequential read speed for 'far copies' arrays.  So only
                 * keep it for 'near' arrays, and review those later.
                 */
-               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-                       disk = ndisk;
-                       slot = nslot;
+               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
                        break;
-               }
 
                /* for far > 1 always use the lowest address */
                if (conf->far_copies > 1)
-                       new_distance = r10_bio->devs[nslot].addr;
+                       new_distance = r10_bio->devs[slot].addr;
                else
-                       new_distance = abs(r10_bio->devs[nslot].addr -
-                                          conf->mirrors[ndisk].head_position);
-               if (new_distance < current_distance) {
-                       current_distance = new_distance;
-                       disk = ndisk;
-                       slot = nslot;
+                       new_distance = abs(r10_bio->devs[slot].addr -
+                                          conf->mirrors[disk].head_position);
+               if (new_distance < best_dist) {
+                       best_dist = new_distance;
+                       best_slot = slot;
                }
        }
+       if (slot == conf->copies)
+               slot = best_slot;
 
-rb_out:
-       r10_bio->read_slot = slot;
-/*     conf->next_seq_sect = this_sector + sectors;*/
-
-       if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-               atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-       else
+       if (slot >= 0) {
+               disk = r10_bio->devs[slot].devnum;
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (!rdev)
+                       goto retry;
+               atomic_inc(&rdev->nr_pending);
+               if (test_bit(Faulty, &rdev->flags)) {
+                       /* Cannot risk returning a device that failed
+                        * before we inc'ed nr_pending
+                        */
+                       rdev_dec_pending(rdev, conf->mddev);
+                       goto retry;
+               }
+               r10_bio->read_slot = slot;
+       } else
                disk = -1;
        rcu_read_unlock();
 
@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
        int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
        int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-       rcu_read_lock();
-       rdev = rcu_dereference(conf->mirrors[d].rdev);
-       if (rdev) { /* If rdev is not NULL */
-               char b[BDEVNAME_SIZE];
-               int cur_read_error_count = 0;
+       /* still own a reference to this rdev, so it cannot
+        * have been cleared recently.
+        */
+       rdev = conf->mirrors[d].rdev;
 
-               bdevname(rdev->bdev, b);
+       if (test_bit(Faulty, &rdev->flags))
+               /* drive has already been failed, just ignore any
+                  more fix_read_error() attempts */
+               return;
 
-               if (test_bit(Faulty, &rdev->flags)) {
-                       rcu_read_unlock();
-                       /* drive has already been failed, just ignore any
-                          more fix_read_error() attempts */
-                       return;
-               }
+       check_decay_read_errors(mddev, rdev);
+       atomic_inc(&rdev->read_errors);
+       if (atomic_read(&rdev->read_errors) > max_read_errors) {
+               char b[BDEVNAME_SIZE];
+               bdevname(rdev->bdev, b);
 
-               check_decay_read_errors(mddev, rdev);
-               atomic_inc(&rdev->read_errors);
-               cur_read_error_count = atomic_read(&rdev->read_errors);
-               if (cur_read_error_count > max_read_errors) {
-                       rcu_read_unlock();
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Raid device exceeded "
-                              "read_error threshold "
-                              "[cur %d:max %d]\n",
-                              mdname(mddev),
-                              b, cur_read_error_count, max_read_errors);
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Failing raid "
-                              "device\n", mdname(mddev), b);
-                       md_error(mddev, conf->mirrors[d].rdev);
-                       return;
-               }
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Raid device exceeded "
+                      "read_error threshold [cur %d:max %d]\n",
+                      mdname(mddev), b,
+                      atomic_read(&rdev->read_errors), max_read_errors);
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Failing raid device\n",
+                      mdname(mddev), b);
+               md_error(mddev, conf->mirrors[d].rdev);
+               return;
        }
-       rcu_read_unlock();
 
        while(sectors) {
                int s = sectors;
@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "write failed"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                              rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                        printk(KERN_NOTICE "md/raid10:%s: %s: failing "
                                               "drive\n",
@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "corrected sectors"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                        printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
                                               mdname(mddev),
@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "md/raid10:%s: read error corrected"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                }
 
@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
                else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                        recovery_request_write(mddev, r10_bio);
                else {
-                       int mirror;
+                       int slot = r10_bio->read_slot;
+                       int mirror = r10_bio->devs[slot].devnum;
                        /* we got a read error. Maybe the drive is bad.  Maybe just
                         * the block and we can fix it.
                         * We freeze all other IO, and try reading the block from
@@ -1677,6 +1648,7 @@ static void raid10d(mddev_t *mddev)
                                fix_read_error(conf, mddev, r10_bio);
                                unfreeze_array(conf);
                        }
+                       rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
                        bio = r10_bio->devs[r10_bio->read_slot].bio;
                        r10_bio->devs[r10_bio->read_slot].bio =