md/raid10: some tidying up in fix_read_error
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 747d061..8e4f469 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
  *
  * RAID-10 support for md.
  *
- * Base on code in raid1.c.  See raid1.c for futher copyright information.
+ * Base on code in raid1.c.  See raid1.c for further copyright information.
  *
  *
  * This program is free software; you can redistribute it and/or modify
  */
 #define        NR_RAID10_BIOS 256
 
-static void unplug_slaves(mddev_t *mddev);
-
 static void allow_barrier(conf_t *conf);
 static void lower_barrier(conf_t *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
        conf_t *conf = data;
-       r10bio_t *r10_bio;
        int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
        /* allocate a r10bio with room for raid_disks entries in the bios array */
-       r10_bio = kzalloc(size, gfp_flags);
-       if (!r10_bio && conf->mddev)
-               unplug_slaves(conf->mddev);
-
-       return r10_bio;
+       return kzalloc(size, gfp_flags);
 }
 
 static void r10bio_pool_free(void *r10_bio, void *data)
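The allocator above sizes each r10bio with offsetof(struct r10bio_s, devs[conf->copies]), so the trailing devs[] array gets exactly one slot per copy. A minimal sketch of the same sizing idiom, using a hypothetical stand-in struct rather than the kernel's r10bio_s:

```c
#include <stddef.h>
#include <stdlib.h>

/* Illustrative stand-in for a structure with a trailing per-copy array. */
struct demo_r10bio {
	int copies;
	struct {
		void *bio;
		long  addr;
	} devs[];			/* flexible array member */
};

static struct demo_r10bio *demo_alloc(int copies)
{
	/* Same idiom as r10bio_pool_alloc(): size the allocation up to and
	 * including devs[copies], then zero it. */
	size_t size = offsetof(struct demo_r10bio, devs[copies]);
	struct demo_r10bio *r = calloc(1, size);

	if (r)
		r->copies = copies;
	return r;
}
```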
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
        int nalloc;
 
        r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-       if (!r10_bio) {
-               unplug_slaves(conf->mddev);
+       if (!r10_bio)
                return NULL;
-       }
 
        if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
                nalloc = conf->copies; /* resync */
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
                 */
                set_bit(R10BIO_Uptodate, &r10_bio->state);
                raid_end_bio_io(r10_bio);
+               rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
        } else {
                /*
-                * oops, read error:
+                * oops, read error - keep the refcount on the rdev
                 */
                char b[BDEVNAME_SIZE];
                if (printk_ratelimit())
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
                               bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
                reschedule_retry(r10_bio);
        }
-
-       rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
 
 /*
  * RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
  * parameters: near_copies and far_copies.
  * near_copies * far_copies must be <= raid_disks.
  * Normally one of these will be 1.
  * If both are 1, we get raid0.
  * If near_copies == raid_disks, we get raid1.
  *
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
  * first chunk, followed by near_copies copies of the next chunk and
  * so on.
  * If far_copies > 1, then after 1/far_copies of the array has been assigned
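For the 'near' layout described in this comment, the mapping from a logical chunk to a (device, device offset) pair can be written compactly. A simplified sketch, assuming far_copies == 1 and raid_disks a multiple of near_copies; the function and its names are illustrative, not the kernel's raid10_find_phys(), which also handles chunk-size arithmetic and the far/offset layouts:

```c
/* Map copy 'k' (0 <= k < near_copies) of logical chunk 'c' onto a device
 * and a chunk offset within that device, for the near-copies layout only.
 * With raid_disks = 4, near_copies = 2 this yields:
 *   chunk 0 -> disks 0,1 at offset 0;  chunk 1 -> disks 2,3 at offset 0;
 *   chunk 2 -> disks 0,1 at offset 1;  chunk 3 -> disks 2,3 at offset 1; ...
 */
static void near_copy_pos(int raid_disks, int near_copies,
			  long c, int k, int *dev, long *dev_chunk)
{
	long n = c * near_copies + k;

	*dev = n % raid_disks;
	*dev_chunk = (c * near_copies) / raid_disks;
}
```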
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
        const sector_t this_sector = r10_bio->sector;
-       int disk, slot, nslot;
+       int disk, slot;
        const int sectors = r10_bio->sectors;
-       sector_t new_distance, current_distance;
+       sector_t new_distance, best_dist;
        mdk_rdev_t *rdev;
+       int do_balance;
+       int best_slot;
 
        raid10_find_phys(conf, r10_bio);
        rcu_read_lock();
+retry:
+       best_slot = -1;
+       best_dist = MaxSector;
+       do_balance = 1;
        /*
         * Check if we can balance. We can balance on the whole
         * device if no resync is going on (recovery is ok), or below
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
         * above the resync window.
         */
        if (conf->mddev->recovery_cp < MaxSector
-           && (this_sector + sectors >= conf->next_resync)) {
-               /* make sure that disk is operational */
-               slot = 0;
-               disk = r10_bio->devs[slot].devnum;
-
-               while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-                      r10_bio->devs[slot].bio == IO_BLOCKED ||
-                      !test_bit(In_sync, &rdev->flags)) {
-                       slot++;
-                       if (slot == conf->copies) {
-                               slot = 0;
-                               disk = -1;
-                               break;
-                       }
-                       disk = r10_bio->devs[slot].devnum;
-               }
-               goto rb_out;
-       }
+           && (this_sector + sectors >= conf->next_resync))
+               do_balance = 0;
 
-
-       /* make sure the disk is operational */
-       slot = 0;
-       disk = r10_bio->devs[slot].devnum;
-       while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-              r10_bio->devs[slot].bio == IO_BLOCKED ||
-              !test_bit(In_sync, &rdev->flags)) {
-               slot ++;
-               if (slot == conf->copies) {
-                       disk = -1;
-                       goto rb_out;
-               }
+       for (slot = 0; slot < conf->copies ; slot++) {
+               if (r10_bio->devs[slot].bio == IO_BLOCKED)
+                       continue;
                disk = r10_bio->devs[slot].devnum;
-       }
-
-
-       current_distance = abs(r10_bio->devs[slot].addr -
-                              conf->mirrors[disk].head_position);
-
-       /* Find the disk whose head is closest,
-        * or - for far > 1 - find the closest to partition beginning */
-
-       for (nslot = slot; nslot < conf->copies; nslot++) {
-               int ndisk = r10_bio->devs[nslot].devnum;
-
-
-               if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-                   r10_bio->devs[nslot].bio == IO_BLOCKED ||
-                   !test_bit(In_sync, &rdev->flags))
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (rdev == NULL)
                        continue;
+               if (!test_bit(In_sync, &rdev->flags))
+                       continue;
+
+               if (!do_balance)
+                       break;
 
                /* This optimisation is debatable, and completely destroys
                 * sequential read speed for 'far copies' arrays.  So only
                 * keep it for 'near' arrays, and review those later.
                 */
-               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-                       disk = ndisk;
-                       slot = nslot;
+               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
                        break;
-               }
 
                /* for far > 1 always use the lowest address */
                if (conf->far_copies > 1)
-                       new_distance = r10_bio->devs[nslot].addr;
+                       new_distance = r10_bio->devs[slot].addr;
                else
-                       new_distance = abs(r10_bio->devs[nslot].addr -
-                                          conf->mirrors[ndisk].head_position);
-               if (new_distance < current_distance) {
-                       current_distance = new_distance;
-                       disk = ndisk;
-                       slot = nslot;
+                       new_distance = abs(r10_bio->devs[slot].addr -
+                                          conf->mirrors[disk].head_position);
+               if (new_distance < best_dist) {
+                       best_dist = new_distance;
+                       best_slot = slot;
                }
        }
+       if (slot == conf->copies)
+               slot = best_slot;
 
-rb_out:
-       r10_bio->read_slot = slot;
-/*     conf->next_seq_sect = this_sector + sectors;*/
-
-       if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-               atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-       else
+       if (slot >= 0) {
+               disk = r10_bio->devs[slot].devnum;
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (!rdev)
+                       goto retry;
+               atomic_inc(&rdev->nr_pending);
+               if (test_bit(Faulty, &rdev->flags)) {
+                       /* Cannot risk returning a device that failed
+                        * before we inc'ed nr_pending
+                        */
+                       rdev_dec_pending(rdev, conf->mddev);
+                       goto retry;
+               }
+               r10_bio->read_slot = slot;
+       } else
                disk = -1;
        rcu_read_unlock();
 
        return disk;
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-       conf_t *conf = mddev->private;
-       int i;
-
-       rcu_read_lock();
-       for (i=0; i < conf->raid_disks; i++) {
-               mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-               if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-                       struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-                       atomic_inc(&rdev->nr_pending);
-                       rcu_read_unlock();
-
-                       blk_unplug(r_queue);
-
-                       rdev_dec_pending(rdev, mddev);
-                       rcu_read_lock();
-               }
-       }
-       rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-       mddev_t *mddev = q->queuedata;
-
-       unplug_slaves(q->queuedata);
-       md_wakeup_thread(mddev->thread);
-}
-
 static int raid10_congested(void *data, int bits)
 {
        mddev_t *mddev = data;
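The rewritten tail of read_balance() earlier in the hunk above pins the chosen device optimistically and only then re-checks Faulty: if the device failed in the window between rcu_dereference() and atomic_inc(), the reference is dropped and the whole selection retried. Reduced to its essentials, the pattern looks like the sketch below; the wrapper function is hypothetical, the identifiers are the kernel's:

```c
/* Pin-then-recheck: either a concurrent failure sees our nr_pending,
 * or we see Faulty and back out.  Returns NULL to ask the caller to retry. */
static mdk_rdev_t *pin_rdev(conf_t *conf, int disk)
{
	mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[disk].rdev);

	if (rdev == NULL)
		return NULL;			/* raced with removal */
	atomic_inc(&rdev->nr_pending);		/* pin the device first ...  */
	if (test_bit(Faulty, &rdev->flags)) {	/* ... then re-check Faulty  */
		rdev_dec_pending(rdev, conf->mddev);
		return NULL;			/* raced with failure */
	}
	return rdev;				/* safe to hand out */
}
```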
@@ -649,23 +586,16 @@ static int raid10_congested(void *data, int bits)
        return ret;
 }
 
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
 {
        /* Any writes that have been queued but are awaiting
         * bitmap updates get flushed here.
-        * We return 1 if any requests were actually submitted.
         */
-       int rv = 0;
-
        spin_lock_irq(&conf->device_lock);
 
        if (conf->pending_bio_list.head) {
                struct bio *bio;
                bio = bio_list_get(&conf->pending_bio_list);
-               /* Spinlock only taken to quiet a warning */
-               spin_lock(conf->mddev->queue->queue_lock);
-               blk_remove_plug(conf->mddev->queue);
-               spin_unlock(conf->mddev->queue->queue_lock);
                spin_unlock_irq(&conf->device_lock);
                /* flush any pending bitmap writes to disk
                 * before proceeding w/ I/O */
@@ -677,11 +607,10 @@ static int flush_pending_writes(conf_t *conf)
                        generic_make_request(bio);
                        bio = next;
                }
-               rv = 1;
        } else
                spin_unlock_irq(&conf->device_lock);
-       return rv;
 }
+
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
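The barrier scheme this comment introduces is used in pairs: resync/recovery raises the barrier around its work, while regular I/O counts itself in and out with wait_barrier()/allow_barrier(). A rough sketch of that pairing, with hypothetical wrapper functions standing in for the real callers (sync_request and make_request in the driver):

```c
static void resync_one_region(conf_t *conf)
{
	raise_barrier(conf, 0);		/* block new regular I/O */
	/* ... issue resync reads/writes for this region ... */
	lower_barrier(conf);		/* let regular I/O resume */
}

static void one_regular_request(conf_t *conf)
{
	wait_barrier(conf);		/* wait out any active barrier and
					 * register in nr_pending */
	/* ... map and submit the bio; the matching allow_barrier()
	 * runs later, from the request's completion path ... */
}
```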
@@ -711,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
 
        /* Wait until no block IO is waiting (unless 'force') */
        wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-                           conf->resync_lock,
-                           raid10_unplug(conf->mddev->queue));
+                           conf->resync_lock, );
 
        /* block any new IO from starting */
        conf->barrier++;
 
-       /* No wait for all pending IO to complete */
+       /* Now wait for all pending IO to complete */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-                           conf->resync_lock,
-                           raid10_unplug(conf->mddev->queue));
+                           conf->resync_lock, );
 
        spin_unlock_irq(&conf->resync_lock);
 }
@@ -742,7 +669,7 @@ static void wait_barrier(conf_t *conf)
                conf->nr_waiting++;
                wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
                                    conf->resync_lock,
-                                   raid10_unplug(conf->mddev->queue));
+                                   );
                conf->nr_waiting--;
        }
        conf->nr_pending++;
@@ -778,8 +705,8 @@ static void freeze_array(conf_t *conf)
        wait_event_lock_irq(conf->wait_barrier,
                            conf->nr_pending == conf->nr_queued+1,
                            conf->resync_lock,
-                           ({ flush_pending_writes(conf);
-                              raid10_unplug(conf->mddev->queue); }));
+                           flush_pending_writes(conf));
+
        spin_unlock_irq(&conf->resync_lock);
 }
 
@@ -806,6 +733,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
        unsigned long flags;
        mdk_rdev_t *blocked_rdev;
+       int plugged;
 
        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
                md_flush_request(mddev, bio);
@@ -914,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         * inc refcount on their rdev.  Record them by setting
         * bios[x] to bio
         */
+       plugged = mddev_check_plugged(mddev);
+
        raid10_find_phys(conf, r10_bio);
  retry_write:
        blocked_rdev = NULL;
@@ -974,7 +904,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                atomic_inc(&r10_bio->remaining);
                spin_lock_irqsave(&conf->device_lock, flags);
                bio_list_add(&conf->pending_bio_list, mbio);
-               blk_plug_device_unlocked(mddev->queue);
                spin_unlock_irqrestore(&conf->device_lock, flags);
        }
 
@@ -991,9 +920,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        /* In case raid10d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);
 
-       if (do_sync)
+       if (do_sync || !mddev->bitmap || !plugged)
                md_wakeup_thread(mddev->thread);
-
        return 0;
 }
 
@@ -1233,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
                        p->rdev = rdev;
                        goto abort;
                }
-               md_integrity_register(mddev);
+               err = md_integrity_register(mddev);
        }
 abort:
 
@@ -1509,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
        int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
        int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-       rcu_read_lock();
-       rdev = rcu_dereference(conf->mirrors[d].rdev);
-       if (rdev) { /* If rdev is not NULL */
-               char b[BDEVNAME_SIZE];
-               int cur_read_error_count = 0;
+       /* still own a reference to this rdev, so it cannot
+        * have been cleared recently.
+        */
+       rdev = conf->mirrors[d].rdev;
 
-               bdevname(rdev->bdev, b);
+       if (test_bit(Faulty, &rdev->flags))
+               /* drive has already been failed, just ignore any
+                  more fix_read_error() attempts */
+               return;
 
-               if (test_bit(Faulty, &rdev->flags)) {
-                       rcu_read_unlock();
-                       /* drive has already been failed, just ignore any
-                          more fix_read_error() attempts */
-                       return;
-               }
+       check_decay_read_errors(mddev, rdev);
+       atomic_inc(&rdev->read_errors);
+       if (atomic_read(&rdev->read_errors) > max_read_errors) {
+               char b[BDEVNAME_SIZE];
+               bdevname(rdev->bdev, b);
 
-               check_decay_read_errors(mddev, rdev);
-               atomic_inc(&rdev->read_errors);
-               cur_read_error_count = atomic_read(&rdev->read_errors);
-               if (cur_read_error_count > max_read_errors) {
-                       rcu_read_unlock();
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Raid device exceeded "
-                              "read_error threshold "
-                              "[cur %d:max %d]\n",
-                              mdname(mddev),
-                              b, cur_read_error_count, max_read_errors);
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Failing raid "
-                              "device\n", mdname(mddev), b);
-                       md_error(mddev, conf->mirrors[d].rdev);
-                       return;
-               }
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Raid device exceeded "
+                      "read_error threshold [cur %d:max %d]\n",
+                      mdname(mddev), b,
+                      atomic_read(&rdev->read_errors), max_read_errors);
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Failing raid device\n",
+                      mdname(mddev), b);
+               md_error(mddev, conf->mirrors[d].rdev);
+               return;
        }
-       rcu_read_unlock();
 
        while(sectors) {
                int s = sectors;
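The check_decay_read_errors()/read_errors accounting near the top of this hunk ages the per-device error count before the new error is added, so only a burst of recent errors can push it past max_corr_read_errors. A simplified model of that aging, assuming the "halve the count for each elapsed hour" behaviour the helper implements (the kernel keeps the timestamp in the rdev itself; this standalone helper is illustrative only):

```c
/* Illustrative only: halve a read-error count once per elapsed hour. */
static unsigned int decay_read_errors(unsigned int errors, unsigned int hours)
{
	while (hours-- > 0 && errors > 0)
		errors >>= 1;
	return errors;
}

/* e.g. 16 accumulated errors fall to 2 after three idle hours, so a later
 * isolated error no longer threatens the threshold. */
```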
@@ -1611,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "write failed"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                              rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                        printk(KERN_NOTICE "md/raid10:%s: %s: failing "
                                               "drive\n",
@@ -1648,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "corrected sectors"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                        printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
                                               mdname(mddev),
@@ -1661,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "md/raid10:%s: read error corrected"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                }
 
@@ -1684,15 +1605,16 @@ static void raid10d(mddev_t *mddev)
        unsigned long flags;
        conf_t *conf = mddev->private;
        struct list_head *head = &conf->retry_list;
-       int unplug=0;
        mdk_rdev_t *rdev;
+       struct blk_plug plug;
 
        md_check_recovery(mddev);
 
+       blk_start_plug(&plug);
        for (;;) {
                char b[BDEVNAME_SIZE];
 
-               unplug += flush_pending_writes(conf);
+               flush_pending_writes(conf);
 
                spin_lock_irqsave(&conf->device_lock, flags);
                if (list_empty(head)) {
@@ -1706,14 +1628,13 @@ static void raid10d(mddev_t *mddev)
 
                mddev = r10_bio->mddev;
                conf = mddev->private;
-               if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+               if (test_bit(R10BIO_IsSync, &r10_bio->state))
                        sync_request_write(mddev, r10_bio);
-                       unplug = 1;
-               } else  if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+               else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                        recovery_request_write(mddev, r10_bio);
-                       unplug = 1;
-               } else {
-                       int mirror;
+               else {
+                       int slot = r10_bio->read_slot;
+                       int mirror = r10_bio->devs[slot].devnum;
                        /* we got a read error. Maybe the drive is bad.  Maybe just
                         * the block and we can fix it.
                         * We freeze all other IO, and try reading the block from
@@ -1727,6 +1648,7 @@ static void raid10d(mddev_t *mddev)
                                fix_read_error(conf, mddev, r10_bio);
                                unfreeze_array(conf);
                        }
+                       rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
                        bio = r10_bio->devs[r10_bio->read_slot].bio;
                        r10_bio->devs[r10_bio->read_slot].bio =
@@ -1759,14 +1681,12 @@ static void raid10d(mddev_t *mddev)
                                bio->bi_rw = READ | do_sync;
                                bio->bi_private = r10_bio;
                                bio->bi_end_io = raid10_end_read_request;
-                               unplug = 1;
                                generic_make_request(bio);
                        }
                }
                cond_resched();
        }
-       if (unplug)
-               unplug_slaves(mddev);
+       blk_finish_plug(&plug);
 }
 
 
@@ -2377,7 +2297,6 @@ static int run(mddev_t *mddev)
        md_set_array_sectors(mddev, size);
        mddev->resync_max_sectors = size;
 
-       mddev->queue->unplug_fn = raid10_unplug;
        mddev->queue->backing_dev_info.congested_fn = raid10_congested;
        mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -2395,7 +2314,10 @@ static int run(mddev_t *mddev)
 
        if (conf->near_copies < conf->raid_disks)
                blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-       md_integrity_register(mddev);
+
+       if (md_integrity_register(mddev))
+               goto out_free_conf;
+
        return 0;
 
 out_free_conf: