md/raid5: don't do chunk aligned read on degraded array.

[pandora-kernel.git] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index c82ce1f..77dfd72 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -497,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh)
         }
  }
  
-static int grow_buffers(struct stripe_head *sh)
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
  {
         int i;
         int num = sh->raid_conf->pool_size;
@@ -505,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh)
         for (i = 0; i < num; i++) {
                 struct page *page;
  
-               if (!(page = alloc_page(GFP_KERNEL))) {
+               if (!(page = alloc_page(gfp))) {
                         return 1;
                 }
                 sh->dev[i].page = page;
@@ -672,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                     *(conf->hash_locks + hash));
                 sh = __find_stripe(conf, sector, conf->generation - previous);
                 if (!sh) {
-                       if (!conf->inactive_blocked)
+                       if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                                 sh = get_free_stripe(conf, hash);
+                               if (!sh && llist_empty(&conf->released_stripes) &&
+                                   !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                                       set_bit(R5_ALLOC_MORE,
+                                               &conf->cache_state);
+                       }
                         if (noblock && sh == NULL)
                                 break;
                         if (!sh) {
-                               conf->inactive_blocked = 1;
+                               set_bit(R5_INACTIVE_BLOCKED,
+                                       &conf->cache_state);
                                 wait_event_lock_irq(
                                         conf->wait_for_stripe,
                                         !list_empty(conf->inactive_list + hash) &&
                                         (atomic_read(&conf->active_stripes)
                                          < (conf->max_nr_stripes * 3 / 4)
-                                        || !conf->inactive_blocked),
+                                        || !test_bit(R5_INACTIVE_BLOCKED,
+                                                     &conf->cache_state)),
                                         *(conf->hash_locks + hash));
-                               conf->inactive_blocked = 0;
+                               clear_bit(R5_INACTIVE_BLOCKED,
+                                         &conf->cache_state);
                         } else {
                                 init_stripe(sh, sector, previous);
                                 atomic_inc(&sh->count);
@@ -1963,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
         put_cpu();
  }
  
-static int grow_one_stripe(struct r5conf *conf, int hash)
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
  {
         struct stripe_head *sh;
-       sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
+       sh = kmem_cache_zalloc(conf->slab_cache, gfp);
         if (!sh)
                 return 0;
  
@@ -1974,12 +1982,13 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
  
         spin_lock_init(&sh->stripe_lock);
  
-       if (grow_buffers(sh)) {
+       if (grow_buffers(sh, gfp)) {
                 shrink_buffers(sh);
                 kmem_cache_free(conf->slab_cache, sh);
                 return 0;
         }
-       sh->hash_lock_index = hash;
+       sh->hash_lock_index =
+               conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
         /* we just created an active stripe so... */
         atomic_set(&sh->count, 1);
         atomic_inc(&conf->active_stripes);
@@ -1989,6 +1998,7 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
         INIT_LIST_HEAD(&sh->batch_list);
         sh->batch_head = NULL;
         release_stripe(sh);
+       conf->max_nr_stripes++;
         return 1;
  }
  
@@ -1996,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num)
  {
         struct kmem_cache *sc;
         int devs = max(conf->raid_disks, conf->previous_raid_disks);
-       int hash;
  
         if (conf->mddev->gendisk)
                 sprintf(conf->cache_name[0],
@@ -2014,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num)
                 return 1;
         conf->slab_cache = sc;
         conf->pool_size = devs;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (num--) {
-               if (!grow_one_stripe(conf, hash))
+       while (num--)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
                         return 1;
-               conf->max_nr_stripes++;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
         return 0;
  }
  
@@ -2210,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         return err;
  }
  
-static int drop_one_stripe(struct r5conf *conf, int hash)
+static int drop_one_stripe(struct r5conf *conf)
  {
         struct stripe_head *sh;
+       int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
  
         spin_lock_irq(conf->hash_locks + hash);
         sh = get_free_stripe(conf, hash);
@@ -2223,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash)
         shrink_buffers(sh);
         kmem_cache_free(conf->slab_cache, sh);
         atomic_dec(&conf->active_stripes);
+       conf->max_nr_stripes--;
         return 1;
  }
  
  static void shrink_stripes(struct r5conf *conf)
  {
-       int hash;
-       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
-               while (drop_one_stripe(conf, hash))
-                       ;
+       while (conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
  
         if (conf->slab_cache)
                 kmem_cache_destroy(conf->slab_cache);
@@ -4603,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
          * how busy the stripe_cache is
          */
  
-       if (conf->inactive_blocked)
+       if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
                 return 1;
         if (conf->quiesce)
                 return 1;
@@ -4625,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
         unsigned int chunk_sectors = mddev->chunk_sectors;
         unsigned int bio_sectors = bvm->bi_size >> 9;
  
-       if ((bvm->bi_rw & 1) == WRITE)
-               return biovec->bv_len; /* always allow writes to be mergeable */
+       /*
+        * Always allow writes to be mergeable, and reads as well if the
+        * array is degraded, as we'll go through the stripe cache anyway.
+        */
+       if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+               return biovec->bv_len;
  
         if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                 chunk_sectors = mddev->new_chunk_sectors;
@@ -5103,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
  
         md_write_start(mddev, bi);
  
-       if (rw == READ &&
+       /*
+        * If array is degraded, better not do chunk aligned read because
+        * later we might have to read it again in order to reconstruct
+        * data on failed drives.
+        */
+       if (rw == READ && mddev->degraded == 0 &&
              mddev->reshape_position == MaxSector &&
              chunk_aligned_read(mddev,bi))
                 return;
@@ -5759,6 +5775,8 @@ static void raid5d(struct md_thread *thread)
                 int batch_size, released;
  
                 released = release_stripe_list(conf, conf->temp_inactive_list);
+               if (released)
+                       clear_bit(R5_DID_ALLOC, &conf->cache_state);
  
                 if (
                     !list_empty(&conf->bitmap_list)) {
@@ -5797,6 +5815,13 @@ static void raid5d(struct md_thread *thread)
         pr_debug("%d stripes handled\n", handled);
  
         spin_unlock_irq(&conf->device_lock);
+       if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+               grow_one_stripe(conf, __GFP_NOWARN);
+               /* Set flag even if allocation failed.  This helps
+                * slow down allocation requests when mem is short
+                */
+               set_bit(R5_DID_ALLOC, &conf->cache_state);
+       }
  
         async_tx_issue_pending_all();
         blk_finish_plug(&plug);
@@ -5812,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
         spin_lock(&mddev->lock);
         conf = mddev->private;
         if (conf)
-               ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+               ret = sprintf(page, "%d\n", conf->min_nr_stripes);
         spin_unlock(&mddev->lock);
         return ret;
  }
@@ -5822,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size)
  {
         struct r5conf *conf = mddev->private;
         int err;
-       int hash;
  
         if (size <= 16 || size > 32768)
                 return -EINVAL;
-       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
-       while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf, hash))
-                       conf->max_nr_stripes--;
-               else
-                       break;
-               hash--;
-               if (hash < 0)
-                       hash = NR_STRIPE_HASH_LOCKS - 1;
-       }
+
+       conf->min_nr_stripes = size;
+       while (size < conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
+
+
         err = md_allow_write(mddev);
         if (err)
                 return err;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf, hash))
-                       conf->max_nr_stripes++;
-               else break;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
+       while (size > conf->max_nr_stripes)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
+                       break;
+
         return 0;
  }
  EXPORT_SYMBOL(raid5_set_cache_size);
@@ -5879,6 +5898,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                 raid5_show_stripe_cache_size,
                                 raid5_store_stripe_cache_size);
  
+/*
+ * Show handler for the rmw_level attribute: formats conf->rmw_level as
+ * a decimal followed by a newline into @page and returns the number of
+ * bytes written.  Returns 0 (empty output) when mddev->private is NULL.
+ */
+static ssize_t
+raid5_show_rmw_level(struct mddev  *mddev, char *page)
+{
+       struct r5conf *conf = mddev->private;
+
+       /* no private conf attached: nothing to report */
+       if (!conf)
+               return 0;
+
+       return sprintf(page, "%d\n", conf->rmw_level);
+}
+
+/*
+ * Store handler for the rmw_level attribute.
+ *
+ * Parses @page as an unsigned decimal and, if acceptable, stores it in
+ * conf->rmw_level.
+ *
+ * Returns @len on success, -ENODEV when mddev->private is NULL, and
+ * -EINVAL when @len is PAGE_SIZE or more, when the text does not parse,
+ * when the value is not one of PARITY_DISABLE_RMW, PARITY_ENABLE_RMW or
+ * PARITY_PREFER_RMW, or when a value other than PARITY_DISABLE_RMW is
+ * requested while raid6_call.xor_syndrome is not set.
+ */
+static ssize_t
+raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
+{
+       struct r5conf *conf = mddev->private;
+       unsigned long level;
+
+       if (!conf)
+               return -ENODEV;
+
+       /* the length check comes first, so oversized input is never parsed */
+       if (len >= PAGE_SIZE || kstrtoul(page, 10, &level))
+               return -EINVAL;
+
+       if (level != PARITY_DISABLE_RMW) {
+               /* anything but "disable" is refused without xor_syndrome */
+               if (!raid6_call.xor_syndrome)
+                       return -EINVAL;
+
+               if (level != PARITY_ENABLE_RMW &&
+                   level != PARITY_PREFER_RMW)
+                       return -EINVAL;
+       }
+
+       conf->rmw_level = level;
+       return len;
+}
+
+/* "rmw_level" attribute: S_IRUGO | S_IWUSR, wired to the handlers above */
+static struct md_sysfs_entry raid5_rmw_level =
+       __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+              raid5_show_rmw_level, raid5_store_rmw_level);
+
+
  static ssize_t
  raid5_show_preread_threshold(struct mddev *mddev, char *page)
  {
@@ -5910,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
         conf = mddev->private;
         if (!conf)
                 err = -ENODEV;
-       else if (new > conf->max_nr_stripes)
+       else if (new > conf->min_nr_stripes)
                 err = -EINVAL;
         else
                 conf->bypass_threshold = new;
@@ -6065,6 +6127,7 @@ static struct attribute *raid5_attrs[] =  {
         &raid5_preread_bypass_threshold.attr,
         &raid5_group_thread_cnt.attr,
         &raid5_skip_copy.attr,
+       &raid5_rmw_level.attr,
         NULL,
  };
  static struct attribute_group raid5_attrs_group = {
@@ -6190,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf)
  
  static void free_conf(struct r5conf *conf)
  {
+       if (conf->shrinker.seeks)
+               unregister_shrinker(&conf->shrinker);
         free_thread_groups(conf);
         shrink_stripes(conf);
         raid5_free_percpu(conf);
@@ -6257,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
         return err;
  }
  
+/*
+ * Shrinker scan callback for the stripe cache.
+ *
+ * Drops up to sc->nr_to_scan stripes with drop_one_stripe() and returns
+ * the number dropped.  Returns SHRINK_STOP as soon as drop_one_stripe()
+ * fails to drop a stripe.
+ *
+ * The cache is never shrunk below conf->min_nr_stripes: only the excess
+ * over that value is reported as reclaimable by raid5_cache_count(),
+ * and the requested nr_to_scan is not guaranteed to stay within it.
+ */
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                     struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+       /* same type as the return value, so no signed/unsigned mixing */
+       unsigned long ret = 0;
+
+       while (ret < sc->nr_to_scan &&
+              conf->max_nr_stripes > conf->min_nr_stripes) {
+               if (drop_one_stripe(conf) == 0)
+                       return SHRINK_STOP;
+               ret++;
+       }
+       return ret;
+}
+
+/*
+ * Shrinker count callback for the stripe cache: reports how many
+ * stripes exist beyond conf->min_nr_stripes, or 0 when
+ * conf->max_nr_stripes is below that value.
+ */
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+       /* max below min is unlikely, but not impossible: report nothing */
+       return conf->max_nr_stripes < conf->min_nr_stripes ?
+               0 : conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
  static struct r5conf *setup_conf(struct mddev *mddev)
  {
         struct r5conf *conf;
@@ -6407,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                 conf->prev_algo = mddev->layout;
         }
  
-       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+       conf->min_nr_stripes = NR_STRIPES;
+       memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                  max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
         atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-       if (grow_stripes(conf, NR_STRIPES)) {
+       if (grow_stripes(conf, conf->min_nr_stripes)) {
                 printk(KERN_ERR
                        "md/raid:%s: couldn't allocate %dkB for buffers\n",
                        mdname(mddev), memory);
@@ -6418,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
         } else
                 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                        mdname(mddev), memory);
+       /*
+        * Losing a stripe head costs more than the time to refill it:
+        * it also reduces the queue depth and so can hurt throughput.
+        * So set shrinker.seeks rather large, scaled by number of devices.
+        */
+       conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+       conf->shrinker.scan_objects = raid5_cache_scan;
+       conf->shrinker.count_objects = raid5_cache_count;
+       conf->shrinker.batch = 128;
+       conf->shrinker.flags = 0;
+       register_shrinker(&conf->shrinker);
  
         sprintf(pers_name, "raid%d", mddev->new_level);
         conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -7059,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev)
          */
         struct r5conf *conf = mddev->private;
         if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes ||
+           > conf->min_nr_stripes ||
             ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes) {
+           > conf->min_nr_stripes) {
                 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
                        mdname(mddev),
                        ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)