md/raid10: some tidying up in fix_read_error

[pandora-kernel.git] / drivers / md / raid10.c
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c

index 747d061..8e4f469 100644 (file)
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
   *
   * RAID-10 support for md.
   *
- * Base on code in raid1.c.  See raid1.c for futher copyright information.
+ * Based on code in raid1.c.  See raid1.c for further copyright information.
   *
   *
   * This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
   */
  #define        NR_RAID10_BIOS 256
  
-static void unplug_slaves(mddev_t *mddev);
-
  static void allow_barrier(conf_t *conf);
  static void lower_barrier(conf_t *conf);
  
  static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
  {
         conf_t *conf = data;
-       r10bio_t *r10_bio;
         int size = offsetof(struct r10bio_s, devs[conf->copies]);
  
         /* allocate a r10bio with room for raid_disks entries in the bios array */
-       r10_bio = kzalloc(size, gfp_flags);
-       if (!r10_bio && conf->mddev)
-               unplug_slaves(conf->mddev);
-
-       return r10_bio;
+       return kzalloc(size, gfp_flags);
  }
  
  static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
         int nalloc;
  
         r10_bio = r10bio_pool_alloc(gfp_flags, conf);
-       if (!r10_bio) {
-               unplug_slaves(conf->mddev);
+       if (!r10_bio)
                 return NULL;
-       }
  
         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
                 nalloc = conf->copies; /* resync */
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
                  */
                 set_bit(R10BIO_Uptodate, &r10_bio->state);
                 raid_end_bio_io(r10_bio);
+               rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
         } else {
                 /*
-                * oops, read error:
+                * oops, read error - keep the refcount on the rdev
                  */
                 char b[BDEVNAME_SIZE];
                 if (printk_ratelimit())
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
                                bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
                 reschedule_retry(r10_bio);
         }
-
-       rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
  }
  
  static void raid10_end_write_request(struct bio *bio, int error)
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
  
  /*
   * RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
   * parameters: near_copies and far_copies.
   * near_copies * far_copies must be <= raid_disks.
   * Normally one of these will be 1.
   * If both are 1, we get raid0.
   * If near_copies == raid_disks, we get raid1.
   *
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
   * first chunk, followed by near_copies copies of the next chunk and
   * so on.
   * If far_copies > 1, then after 1/far_copies of the array has been assigned
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  static int read_balance(conf_t *conf, r10bio_t *r10_bio)
  {
         const sector_t this_sector = r10_bio->sector;
-       int disk, slot, nslot;
+       int disk, slot;
         const int sectors = r10_bio->sectors;
-       sector_t new_distance, current_distance;
+       sector_t new_distance, best_dist;
         mdk_rdev_t *rdev;
+       int do_balance;
+       int best_slot;
  
         raid10_find_phys(conf, r10_bio);
         rcu_read_lock();
+retry:
+       best_slot = -1;
+       best_dist = MaxSector;
+       do_balance = 1;
         /*
          * Check if we can balance. We can balance on the whole
          * device if no resync is going on (recovery is ok), or below
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
          * above the resync window.
          */
         if (conf->mddev->recovery_cp < MaxSector
-           && (this_sector + sectors >= conf->next_resync)) {
-               /* make sure that disk is operational */
-               slot = 0;
-               disk = r10_bio->devs[slot].devnum;
-
-               while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-                      r10_bio->devs[slot].bio == IO_BLOCKED ||
-                      !test_bit(In_sync, &rdev->flags)) {
-                       slot++;
-                       if (slot == conf->copies) {
-                               slot = 0;
-                               disk = -1;
-                               break;
-                       }
-                       disk = r10_bio->devs[slot].devnum;
-               }
-               goto rb_out;
-       }
+           && (this_sector + sectors >= conf->next_resync))
+               do_balance = 0;
  
-
-       /* make sure the disk is operational */
-       slot = 0;
-       disk = r10_bio->devs[slot].devnum;
-       while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-              r10_bio->devs[slot].bio == IO_BLOCKED ||
-              !test_bit(In_sync, &rdev->flags)) {
-               slot ++;
-               if (slot == conf->copies) {
-                       disk = -1;
-                       goto rb_out;
-               }
+       for (slot = 0; slot < conf->copies ; slot++) {
+               if (r10_bio->devs[slot].bio == IO_BLOCKED)
+                       continue;
                 disk = r10_bio->devs[slot].devnum;
-       }
-
-
-       current_distance = abs(r10_bio->devs[slot].addr -
-                              conf->mirrors[disk].head_position);
-
-       /* Find the disk whose head is closest,
-        * or - for far > 1 - find the closest to partition beginning */
-
-       for (nslot = slot; nslot < conf->copies; nslot++) {
-               int ndisk = r10_bio->devs[nslot].devnum;
-
-
-               if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-                   r10_bio->devs[nslot].bio == IO_BLOCKED ||
-                   !test_bit(In_sync, &rdev->flags))
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (rdev == NULL)
                         continue;
+               if (!test_bit(In_sync, &rdev->flags))
+                       continue;
+
+               if (!do_balance)
+                       break;
  
                 /* This optimisation is debatable, and completely destroys
                  * sequential read speed for 'far copies' arrays.  So only
                  * keep it for 'near' arrays, and review those later.
                  */
-               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-                       disk = ndisk;
-                       slot = nslot;
+               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
                         break;
-               }
  
                 /* for far > 1 always use the lowest address */
                 if (conf->far_copies > 1)
-                       new_distance = r10_bio->devs[nslot].addr;
+                       new_distance = r10_bio->devs[slot].addr;
                 else
-                       new_distance = abs(r10_bio->devs[nslot].addr -
-                                          conf->mirrors[ndisk].head_position);
-               if (new_distance < current_distance) {
-                       current_distance = new_distance;
-                       disk = ndisk;
-                       slot = nslot;
+                       new_distance = abs(r10_bio->devs[slot].addr -
+                                          conf->mirrors[disk].head_position);
+               if (new_distance < best_dist) {
+                       best_dist = new_distance;
+                       best_slot = slot;
                 }
         }
+       if (slot == conf->copies)
+               slot = best_slot;
  
-rb_out:
-       r10_bio->read_slot = slot;
-/*     conf->next_seq_sect = this_sector + sectors;*/
-
-       if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-               atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-       else
+       if (slot >= 0) {
+               disk = r10_bio->devs[slot].devnum;
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (!rdev)
+                       goto retry;
+               atomic_inc(&rdev->nr_pending);
+               if (test_bit(Faulty, &rdev->flags)) {
+                       /* Cannot risk returning a device that failed
+                        * before we inc'ed nr_pending
+                        */
+                       rdev_dec_pending(rdev, conf->mddev);
+                       goto retry;
+               }
+               r10_bio->read_slot = slot;
+       } else
                 disk = -1;
         rcu_read_unlock();
  
         return disk;
  }
  
-static void unplug_slaves(mddev_t *mddev)
-{
-       conf_t *conf = mddev->private;
-       int i;
-
-       rcu_read_lock();
-       for (i=0; i < conf->raid_disks; i++) {
-               mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
-               if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-                       struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-                       atomic_inc(&rdev->nr_pending);
-                       rcu_read_unlock();
-
-                       blk_unplug(r_queue);
-
-                       rdev_dec_pending(rdev, mddev);
-                       rcu_read_lock();
-               }
-       }
-       rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
-       mddev_t *mddev = q->queuedata;
-
-       unplug_slaves(q->queuedata);
-       md_wakeup_thread(mddev->thread);
-}
-
  static int raid10_congested(void *data, int bits)
  {
         mddev_t *mddev = data;
@@ -649,23 +586,16 @@ static int raid10_congested(void *data, int bits)
         return ret;
  }
  
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
  {
         /* Any writes that have been queued but are awaiting
          * bitmap updates get flushed here.
-        * We return 1 if any requests were actually submitted.
          */
-       int rv = 0;
-
         spin_lock_irq(&conf->device_lock);
  
         if (conf->pending_bio_list.head) {
                 struct bio *bio;
                 bio = bio_list_get(&conf->pending_bio_list);
-               /* Spinlock only taken to quiet a warning */
-               spin_lock(conf->mddev->queue->queue_lock);
-               blk_remove_plug(conf->mddev->queue);
-               spin_unlock(conf->mddev->queue->queue_lock);
                 spin_unlock_irq(&conf->device_lock);
                 /* flush any pending bitmap writes to disk
                  * before proceeding w/ I/O */
@@ -677,11 +607,10 @@ static int flush_pending_writes(conf_t *conf)
                         generic_make_request(bio);
                         bio = next;
                 }
-               rv = 1;
         } else
                 spin_unlock_irq(&conf->device_lock);
-       return rv;
  }
+
  /* Barriers....
   * Sometimes we need to suspend IO while we do something else,
   * either some resync/recovery, or reconfigure the array.
@@ -711,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
  
         /* Wait until no block IO is waiting (unless 'force') */
         wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-                           conf->resync_lock,
-                           raid10_unplug(conf->mddev->queue));
+                           conf->resync_lock, );
  
         /* block any new IO from starting */
         conf->barrier++;
  
-       /* No wait for all pending IO to complete */
+       /* Now wait for all pending IO to complete */
         wait_event_lock_irq(conf->wait_barrier,
                             !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
-                           conf->resync_lock,
-                           raid10_unplug(conf->mddev->queue));
+                           conf->resync_lock, );
  
         spin_unlock_irq(&conf->resync_lock);
  }
@@ -742,7 +669,7 @@ static void wait_barrier(conf_t *conf)
                 conf->nr_waiting++;
                 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
                                     conf->resync_lock,
-                                   raid10_unplug(conf->mddev->queue));
+                                   );
                 conf->nr_waiting--;
         }
         conf->nr_pending++;
@@ -778,8 +705,8 @@ static void freeze_array(conf_t *conf)
         wait_event_lock_irq(conf->wait_barrier,
                             conf->nr_pending == conf->nr_queued+1,
                             conf->resync_lock,
-                           ({ flush_pending_writes(conf);
-                              raid10_unplug(conf->mddev->queue); }));
+                           flush_pending_writes(conf));
+
         spin_unlock_irq(&conf->resync_lock);
  }
  
@@ -806,6 +733,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
         unsigned long flags;
         mdk_rdev_t *blocked_rdev;
+       int plugged;
  
         if (unlikely(bio->bi_rw & REQ_FLUSH)) {
                 md_flush_request(mddev, bio);
@@ -914,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
          * inc refcount on their rdev.  Record them by setting
          * bios[x] to bio
          */
+       plugged = mddev_check_plugged(mddev);
+
         raid10_find_phys(conf, r10_bio);
   retry_write:
         blocked_rdev = NULL;
@@ -974,7 +904,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 atomic_inc(&r10_bio->remaining);
                 spin_lock_irqsave(&conf->device_lock, flags);
                 bio_list_add(&conf->pending_bio_list, mbio);
-               blk_plug_device_unlocked(mddev->queue);
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         }
  
@@ -991,9 +920,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         /* In case raid10d snuck in to freeze_array */
         wake_up(&conf->wait_barrier);
  
-       if (do_sync)
+       if (do_sync || !mddev->bitmap || !plugged)
                 md_wakeup_thread(mddev->thread);
-
         return 0;
  }
  
@@ -1233,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
                         p->rdev = rdev;
                         goto abort;
                 }
-               md_integrity_register(mddev);
+               err = md_integrity_register(mddev);
         }
  abort:
  
@@ -1509,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
         int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
         int d = r10_bio->devs[r10_bio->read_slot].devnum;
  
-       rcu_read_lock();
-       rdev = rcu_dereference(conf->mirrors[d].rdev);
-       if (rdev) { /* If rdev is not NULL */
-               char b[BDEVNAME_SIZE];
-               int cur_read_error_count = 0;
+       /* still own a reference to this rdev, so it cannot
+        * have been cleared recently.
+        */
+       rdev = conf->mirrors[d].rdev;
  
-               bdevname(rdev->bdev, b);
+       if (test_bit(Faulty, &rdev->flags))
+               /* drive has already been failed, just ignore any
+                  more fix_read_error() attempts */
+               return;
  
-               if (test_bit(Faulty, &rdev->flags)) {
-                       rcu_read_unlock();
-                       /* drive has already been failed, just ignore any
-                          more fix_read_error() attempts */
-                       return;
-               }
+       check_decay_read_errors(mddev, rdev);
+       atomic_inc(&rdev->read_errors);
+       if (atomic_read(&rdev->read_errors) > max_read_errors) {
+               char b[BDEVNAME_SIZE];
+               bdevname(rdev->bdev, b);
  
-               check_decay_read_errors(mddev, rdev);
-               atomic_inc(&rdev->read_errors);
-               cur_read_error_count = atomic_read(&rdev->read_errors);
-               if (cur_read_error_count > max_read_errors) {
-                       rcu_read_unlock();
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Raid device exceeded "
-                              "read_error threshold "
-                              "[cur %d:max %d]\n",
-                              mdname(mddev),
-                              b, cur_read_error_count, max_read_errors);
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Failing raid "
-                              "device\n", mdname(mddev), b);
-                       md_error(mddev, conf->mirrors[d].rdev);
-                       return;
-               }
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Raid device exceeded "
+                      "read_error threshold [cur %d:max %d]\n",
+                      mdname(mddev), b,
+                      atomic_read(&rdev->read_errors), max_read_errors);
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Failing raid device\n",
+                      mdname(mddev), b);
+               md_error(mddev, conf->mirrors[d].rdev);
+               return;
         }
-       rcu_read_unlock();
  
         while(sectors) {
                 int s = sectors;
@@ -1611,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                                "write failed"
                                                " (%d sectors at %llu on %s)\n",
                                                mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                              rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                                bdevname(rdev->bdev, b));
                                         printk(KERN_NOTICE "md/raid10:%s: %s: failing "
                                                "drive\n",
@@ -1648,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                                "corrected sectors"
                                                " (%d sectors at %llu on %s)\n",
                                                mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                                bdevname(rdev->bdev, b));
                                         printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
                                                mdname(mddev),
@@ -1661,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                                "md/raid10:%s: read error corrected"
                                                " (%d sectors at %llu on %s)\n",
                                                mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                                bdevname(rdev->bdev, b));
                                 }
  
@@ -1684,15 +1605,16 @@ static void raid10d(mddev_t *mddev)
         unsigned long flags;
         conf_t *conf = mddev->private;
         struct list_head *head = &conf->retry_list;
-       int unplug=0;
         mdk_rdev_t *rdev;
+       struct blk_plug plug;
  
         md_check_recovery(mddev);
  
+       blk_start_plug(&plug);
         for (;;) {
                 char b[BDEVNAME_SIZE];
  
-               unplug += flush_pending_writes(conf);
+               flush_pending_writes(conf);
  
                 spin_lock_irqsave(&conf->device_lock, flags);
                 if (list_empty(head)) {
@@ -1706,14 +1628,13 @@ static void raid10d(mddev_t *mddev)
  
                 mddev = r10_bio->mddev;
                 conf = mddev->private;
-               if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+               if (test_bit(R10BIO_IsSync, &r10_bio->state))
                         sync_request_write(mddev, r10_bio);
-                       unplug = 1;
-               } else  if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+               else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                         recovery_request_write(mddev, r10_bio);
-                       unplug = 1;
-               } else {
-                       int mirror;
+               else {
+                       int slot = r10_bio->read_slot;
+                       int mirror = r10_bio->devs[slot].devnum;
                         /* we got a read error. Maybe the drive is bad.  Maybe just
                          * the block and we can fix it.
                          * We freeze all other IO, and try reading the block from
@@ -1727,6 +1648,7 @@ static void raid10d(mddev_t *mddev)
                                 fix_read_error(conf, mddev, r10_bio);
                                 unfreeze_array(conf);
                         }
+                       rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
  
                         bio = r10_bio->devs[r10_bio->read_slot].bio;
                         r10_bio->devs[r10_bio->read_slot].bio =
@@ -1759,14 +1681,12 @@ static void raid10d(mddev_t *mddev)
                                 bio->bi_rw = READ | do_sync;
                                 bio->bi_private = r10_bio;
                                 bio->bi_end_io = raid10_end_read_request;
-                               unplug = 1;
                                 generic_make_request(bio);
                         }
                 }
                 cond_resched();
         }
-       if (unplug)
-               unplug_slaves(mddev);
+       blk_finish_plug(&plug);
  }
  
  
@@ -2377,7 +2297,6 @@ static int run(mddev_t *mddev)
         md_set_array_sectors(mddev, size);
         mddev->resync_max_sectors = size;
  
-       mddev->queue->unplug_fn = raid10_unplug;
         mddev->queue->backing_dev_info.congested_fn = raid10_congested;
         mddev->queue->backing_dev_info.congested_data = mddev;
  
@@ -2395,7 +2314,10 @@ static int run(mddev_t *mddev)
  
         if (conf->near_copies < conf->raid_disks)
                 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
-       md_integrity_register(mddev);
+
+       if (md_integrity_register(mddev))
+               goto out_free_conf;
+
         return 0;
  
  out_free_conf: