md/raid5: don't do chunk aligned read on degraded array.
[pandora-kernel.git] drivers/md/raid5.c
index 78ac7dc..77dfd72 100644
@@ -672,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
-                       if (!conf->inactive_blocked)
+                       if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                                sh = get_free_stripe(conf, hash);
+                               if (!sh && llist_empty(&conf->released_stripes) &&
+                                   !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                                       set_bit(R5_ALLOC_MORE,
+                                               &conf->cache_state);
+                       }
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
-                               conf->inactive_blocked = 1;
+                               set_bit(R5_INACTIVE_BLOCKED,
+                                       &conf->cache_state);
                                wait_event_lock_irq(
                                        conf->wait_for_stripe,
                                        !list_empty(conf->inactive_list + hash) &&
                                        (atomic_read(&conf->active_stripes)
                                         < (conf->max_nr_stripes * 3 / 4)
-                                        || !conf->inactive_blocked),
+                                        || !test_bit(R5_INACTIVE_BLOCKED,
+                                                     &conf->cache_state)),
                                        *(conf->hash_locks + hash));
-                               conf->inactive_blocked = 0;
+                               clear_bit(R5_INACTIVE_BLOCKED,
+                                         &conf->cache_state);
                        } else {
                                init_stripe(sh, sector, previous);
                                atomic_inc(&sh->count);
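
The hunk above replaces the dedicated conf->inactive_blocked field with a bit flag (R5_INACTIVE_BLOCKED) in conf->cache_state, alongside the new R5_ALLOC_MORE and R5_DID_ALLOC bits used further down, so several independent cache-state flags share one word and are manipulated with atomic bit operations. Below is a minimal userspace sketch of that pattern, with GCC/Clang __atomic builtins standing in for the kernel's set_bit()/clear_bit()/test_bit() and purely illustrative bit positions:

/* Userspace model of the cache_state flag word; not kernel code. */
#include <stdio.h>

enum {
	R5_INACTIVE_BLOCKED = 0,  /* stripe cache exhausted, callers may block */
	R5_ALLOC_MORE       = 1,  /* ask raid5d to grow the cache by one stripe */
	R5_DID_ALLOC        = 2,  /* an on-demand allocation already happened */
};

static unsigned long cache_state;

static void set_flag(int nr)
{
	__atomic_fetch_or(&cache_state, 1UL << nr, __ATOMIC_SEQ_CST);
}

static void clear_flag(int nr)
{
	__atomic_fetch_and(&cache_state, ~(1UL << nr), __ATOMIC_SEQ_CST);
}

static int test_flag(int nr)
{
	return (__atomic_load_n(&cache_state, __ATOMIC_SEQ_CST) >> nr) & 1;
}

int main(void)
{
	set_flag(R5_INACTIVE_BLOCKED);
	printf("blocked=%d\n", test_flag(R5_INACTIVE_BLOCKED));  /* 1 */
	clear_flag(R5_INACTIVE_BLOCKED);
	printf("blocked=%d\n", test_flag(R5_INACTIVE_BLOCKED));  /* 0 */
	return 0;
}
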
@@ -4602,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
         * how busy the stripe_cache is
         */
 
-       if (conf->inactive_blocked)
+       if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
                return 1;
        if (conf->quiesce)
                return 1;
@@ -4624,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
        unsigned int chunk_sectors = mddev->chunk_sectors;
        unsigned int bio_sectors = bvm->bi_size >> 9;
 
-       if ((bvm->bi_rw & 1) == WRITE)
-               return biovec->bv_len; /* always allow writes to be mergeable */
+       /*
+        * always allow writes to be mergeable; reads as well if the array
+        * is degraded, as they will go through the stripe cache anyway.
+        */
+       if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+               return biovec->bv_len;
 
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
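
Merging is now allowed unconditionally for writes and for reads on a degraded array, because those requests are serviced through the stripe cache, which splits the I/O per stripe itself. For the remaining case, reads on a healthy array, the rest of raid5_mergeable_bvec limits the bio to its chunk; the sketch below models that boundary calculation in userspace (a hedged approximation, with modulo standing in for the kernel's power-of-two masking and an illustrative helper name):

#include <stdio.h>

/* How many more sectors a bio that is already bio_sectors long, starting at
 * `sector`, may grow before it would cross a chunk boundary. */
static unsigned int sectors_left_in_chunk(unsigned long long sector,
					  unsigned int chunk_sectors,
					  unsigned int bio_sectors)
{
	long long left = (long long)chunk_sectors
			 - (long long)(sector % chunk_sectors) - bio_sectors;

	return left < 0 ? 0 : (unsigned int)left;
}

int main(void)
{
	/* 128-sector (64KiB) chunks */
	printf("%u\n", sectors_left_in_chunk(0, 128, 8));	/* 120 */
	printf("%u\n", sectors_left_in_chunk(120, 128, 8));	/* 0: at the boundary */
	return 0;
}
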
@@ -5102,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
        md_write_start(mddev, bi);
 
-       if (rw == READ &&
+       /*
+        * If the array is degraded, better not do a chunk-aligned read,
+        * because later we might have to read it again in order to
+        * reconstruct data on failed drives.
+        */
+       if (rw == READ && mddev->degraded == 0 &&
             mddev->reshape_position == MaxSector &&
             chunk_aligned_read(mddev,bi))
                return;
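
This hunk is the point of the commit: on a degraded array, a chunk-aligned read that lands on a failed drive cannot be served directly and falls back to the stripe cache, where the surviving data and parity of the stripe are read to reconstruct it, so the same data ends up being read twice. Skipping the fast path up front when mddev->degraded is non-zero avoids that. A trivial userspace model of the gating condition (hypothetical helper name, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

/* Modelled on the check above: the chunk-aligned fast path is only worth
 * trying for reads on a healthy, non-reshaping array. */
static bool use_chunk_aligned_read(bool is_read, int degraded, bool reshaping)
{
	return is_read && degraded == 0 && !reshaping;
}

int main(void)
{
	printf("healthy:  %d\n", use_chunk_aligned_read(true, 0, false));   /* 1 */
	printf("degraded: %d\n", use_chunk_aligned_read(true, 1, false));   /* 0 */
	printf("write:    %d\n", use_chunk_aligned_read(false, 0, false));  /* 0 */
	return 0;
}
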
@@ -5758,6 +5775,8 @@ static void raid5d(struct md_thread *thread)
                int batch_size, released;
 
                released = release_stripe_list(conf, conf->temp_inactive_list);
+               if (released)
+                       clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5796,6 +5815,13 @@ static void raid5d(struct md_thread *thread)
        pr_debug("%d stripes handled\n", handled);
 
        spin_unlock_irq(&conf->device_lock);
+       if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+               grow_one_stripe(conf, __GFP_NOWARN);
+               /* Set the flag even if the allocation failed.  This helps
+                * slow down allocation requests when memory is short.
+                */
+               set_bit(R5_DID_ALLOC, &conf->cache_state);
+       }
 
        async_tx_issue_pending_all();
        blk_finish_plug(&plug);
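
Together with the R5_ALLOC_MORE request set in get_active_stripe() (first hunk), this gives raid5d a self-throttling way to grow the stripe cache: at most one stripe is allocated per daemon pass, and R5_DID_ALLOC suppresses further requests until release_stripe_list() has actually returned stripes. A userspace sketch of that handshake, with illustrative names and plain booleans in place of the atomic flags:

#include <stdbool.h>
#include <stdio.h>

static bool alloc_more, did_alloc;	/* model R5_ALLOC_MORE / R5_DID_ALLOC */
static int nr_stripes = 4;

/* get_active_stripe(): ask for one more stripe only if nothing is about to
 * be released and no earlier on-demand allocation is still outstanding. */
static void request_grow(bool released_pending)
{
	if (!released_pending && !did_alloc)
		alloc_more = true;
}

/* raid5d(): serve at most one allocation per wakeup; a release clears the
 * throttle, mirroring "if (released) clear_bit(R5_DID_ALLOC, ...)". */
static void daemon_pass(int released)
{
	if (released)
		did_alloc = false;
	if (alloc_more) {
		alloc_more = false;
		nr_stripes++;		/* grow_one_stripe() */
		did_alloc = true;	/* set even if the allocation had failed */
	}
}

int main(void)
{
	request_grow(false);
	daemon_pass(0);
	request_grow(false);		/* ignored: still throttled */
	daemon_pass(0);
	printf("stripes: %d\n", nr_stripes);	/* 5, not 6 */
	return 0;
}
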
@@ -5811,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
        spin_lock(&mddev->lock);
        conf = mddev->private;
        if (conf)
-               ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+               ret = sprintf(page, "%d\n", conf->min_nr_stripes);
        spin_unlock(&mddev->lock);
        return ret;
 }
@@ -5825,10 +5851,12 @@ raid5_set_cache_size(struct mddev *mddev, int size)
        if (size <= 16 || size > 32768)
                return -EINVAL;
 
+       conf->min_nr_stripes = size;
        while (size < conf->max_nr_stripes &&
               drop_one_stripe(conf))
                ;
 
+
        err = md_allow_write(mddev);
        if (err)
                return err;
@@ -5944,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
        conf = mddev->private;
        if (!conf)
                err = -ENODEV;
-       else if (new > conf->max_nr_stripes)
+       else if (new > conf->min_nr_stripes)
                err = -EINVAL;
        else
                conf->bypass_threshold = new;
@@ -6225,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       if (conf->shrinker.seeks)
+               unregister_shrinker(&conf->shrinker);
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
@@ -6292,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
        return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                     struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+       int ret = 0;
+       while (ret < sc->nr_to_scan) {
+               if (drop_one_stripe(conf) == 0)
+                       return SHRINK_STOP;
+               ret++;
+       }
+       return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+       if (conf->max_nr_stripes < conf->min_nr_stripes)
+               /* unlikely, but not impossible */
+               return 0;
+       return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
        struct r5conf *conf;
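
raid5_cache_scan() and raid5_cache_count() above are the two halves of the shrinker that setup_conf() registers below: count reports only the surplus above min_nr_stripes, so memory pressure can take back stripes the cache grew on demand without digging below the administrator's configured size, and scan frees one stripe per requested object until drop_one_stripe() fails. A self-contained userspace model of the pair (the floor check here stands in for drop_one_stripe() finding no inactive stripe to free):

#include <stdio.h>

#define SHRINK_STOP (~0UL)		/* same sentinel the kernel shrinker API uses */

static int max_nr_stripes = 300;	/* current cache size (may have grown) */
static int min_nr_stripes = 256;	/* administrator's configured size */

static int drop_one_stripe(void)
{
	if (max_nr_stripes <= min_nr_stripes)
		return 0;
	max_nr_stripes--;
	return 1;
}

static unsigned long cache_count(void)
{
	if (max_nr_stripes < min_nr_stripes)
		return 0;	/* unlikely, but not impossible */
	return max_nr_stripes - min_nr_stripes;
}

static unsigned long cache_scan(unsigned long nr_to_scan)
{
	unsigned long ret = 0;

	while (ret < nr_to_scan) {
		if (!drop_one_stripe())
			return SHRINK_STOP;
		ret++;
	}
	return ret;
}

int main(void)
{
	printf("reclaimable: %lu\n", cache_count());	/* 44 */
	printf("freed:       %lu\n", cache_scan(16));	/* 16 */
	printf("reclaimable: %lu\n", cache_count());	/* 28 */
	return 0;
}
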
@@ -6442,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                conf->prev_algo = mddev->layout;
        }
 
-       memory = NR_STRIPES * (sizeof(struct stripe_head) +
+       conf->min_nr_stripes = NR_STRIPES;
+       memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-       if (grow_stripes(conf, NR_STRIPES)) {
+       if (grow_stripes(conf, conf->min_nr_stripes)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
@@ -6453,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        } else
                printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                       mdname(mddev), memory);
+       /*
+        * Losing a stripe head costs more than the time to refill it;
+        * it reduces the queue depth and so can hurt throughput.
+        * So set it rather large, scaled by the number of devices.
+        */
+       conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+       conf->shrinker.scan_objects = raid5_cache_scan;
+       conf->shrinker.count_objects = raid5_cache_count;
+       conf->shrinker.batch = 128;
+       conf->shrinker.flags = 0;
+       register_shrinker(&conf->shrinker);
 
        sprintf(pers_name, "raid%d", mddev->new_level);
        conf->thread = md_register_thread(raid5d, mddev, pers_name);
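
shrinker.seeks is the shrinker's cost weight: the larger it is, the fewer objects the memory-reclaim code asks this shrinker to scan for a given amount of pressure, so scaling it by raid_disks makes stripe heads on wide arrays correspondingly harder to reclaim. A quick look at the resulting values, assuming DEFAULT_SEEKS is 2 as in mainline include/linux/shrinker.h:

#include <stdio.h>

#define DEFAULT_SEEKS 2		/* assumed; see include/linux/shrinker.h */

int main(void)
{
	int widths[] = { 3, 4, 6, 8, 12 };

	for (unsigned int i = 0; i < sizeof(widths) / sizeof(widths[0]); i++)
		printf("%2d disks -> shrinker.seeks = %d\n",
		       widths[i], DEFAULT_SEEKS * widths[i] * 4);
	return 0;
}
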
@@ -7094,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev)
         */
        struct r5conf *conf = mddev->private;
        if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes ||
+           > conf->min_nr_stripes ||
            ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes) {
+           > conf->min_nr_stripes) {
                printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
                       mdname(mddev),
                       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)