raid5: offload stripe handle to workqueue

author Shaohua Li <shli@kernel.org>

Wed, 28 Aug 2013 06:30:16 +0000 (14:30 +0800)

committer NeilBrown <neilb@suse.de>

Wed, 28 Aug 2013 06:46:38 +0000 (16:46 +1000)
author Shaohua Li <shli@kernel.org>
Wed, 28 Aug 2013 06:30:16 +0000 (14:30 +0800)
committer NeilBrown <neilb@suse.de>
Wed, 28 Aug 2013 06:46:38 +0000 (16:46 +1000)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index d87a2de..32fa113 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,7 @@
  #include <linux/cpu.h>
  #include <linux/slab.h>
  #include <linux/ratelimit.h>
+#include <linux/nodemask.h>
  #include <trace/events/block.h>
  
  #include "md.h"
@@ -60,6 +61,10 @@
  #include "raid0.h"
  #include "bitmap.h"
  
+#define cpu_to_group(cpu) cpu_to_node(cpu)
+#define ANY_GROUP NUMA_NO_NODE
+
+static struct workqueue_struct *raid5_wq;
  /*
   * Stripe cache
   */
@@ -200,6 +205,34 @@ static int stripe_operations_active(struct stripe_head *sh)
                test_bit(STRIPE_COMPUTE_RUN, &sh->state);
  }
  
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+       struct r5conf *conf = sh->raid_conf;
+       struct r5worker_group *group;
+       int i, cpu = sh->cpu;
+
+       if (!cpu_online(cpu)) {
+               cpu = cpumask_any(cpu_online_mask);
+               sh->cpu = cpu;
+       }
+
+       if (list_empty(&sh->lru)) {
+               struct r5worker_group *group;
+               group = conf->worker_groups + cpu_to_group(cpu);
+               list_add_tail(&sh->lru, &group->handle_list);
+       }
+
+       if (conf->worker_cnt_per_group == 0) {
+               md_wakeup_thread(conf->mddev->thread);
+               return;
+       }
+
+       group = conf->worker_groups + cpu_to_group(sh->cpu);
+
+       for (i = 0; i < conf->worker_cnt_per_group; i++)
+               queue_work_on(sh->cpu, raid5_wq, &group->workers[i].work);
+}
+
  static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
  {
         BUG_ON(!list_empty(&sh->lru));
@@ -214,7 +247,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
                 else {
                         clear_bit(STRIPE_DELAYED, &sh->state);
                         clear_bit(STRIPE_BIT_DELAY, &sh->state);
-                       list_add_tail(&sh->lru, &conf->handle_list);
+                       if (conf->worker_cnt_per_group == 0) {
+                               list_add_tail(&sh->lru, &conf->handle_list);
+                       } else {
+                               raid5_wakeup_stripe_thread(sh);
+                               return;
+                       }
                 }
                 md_wakeup_thread(conf->mddev->thread);
         } else {
@@ -409,6 +447,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                 raid5_build_block(sh, i, previous);
         }
         insert_hash(conf, sh);
+       sh->cpu = smp_processor_id();
  }
  
  static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3830,6 +3869,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
                         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                 atomic_inc(&conf->preread_active_stripes);
                         list_add_tail(&sh->lru, &conf->hold_list);
+                       raid5_wakeup_stripe_thread(sh);
                 }
         }
  }
@@ -4109,18 +4149,32 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
   * head of the hold_list has changed, i.e. the head was promoted to the
   * handle_list.
   */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
  {
-       struct stripe_head *sh;
+       struct stripe_head *sh = NULL, *tmp;
+       struct list_head *handle_list = NULL;
+
+       if (conf->worker_cnt_per_group == 0) {
+               handle_list = &conf->handle_list;
+       } else if (group != ANY_GROUP) {
+               handle_list = &conf->worker_groups[group].handle_list;
+       } else {
+               int i;
+               for (i = 0; i < conf->group_cnt; i++) {
+                       handle_list = &conf->worker_groups[i].handle_list;
+                       if (!list_empty(handle_list))
+                               break;
+               }
+       }
  
         pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
                   __func__,
-                 list_empty(&conf->handle_list) ? "empty" : "busy",
+                 list_empty(handle_list) ? "empty" : "busy",
                   list_empty(&conf->hold_list) ? "empty" : "busy",
                   atomic_read(&conf->pending_full_writes), conf->bypass_count);
  
-       if (!list_empty(&conf->handle_list)) {
-               sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+       if (!list_empty(handle_list)) {
+               sh = list_entry(handle_list->next, typeof(*sh), lru);
  
                 if (list_empty(&conf->hold_list))
                         conf->bypass_count = 0;
@@ -4138,12 +4192,25 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
                    ((conf->bypass_threshold &&
                      conf->bypass_count > conf->bypass_threshold) ||
                     atomic_read(&conf->pending_full_writes) == 0)) {
-               sh = list_entry(conf->hold_list.next,
-                               typeof(*sh), lru);
-               conf->bypass_count -= conf->bypass_threshold;
-               if (conf->bypass_count < 0)
-                       conf->bypass_count = 0;
-       } else
+
+               list_for_each_entry(tmp, &conf->hold_list,  lru) {
+                       if (conf->worker_cnt_per_group == 0 ||
+                           group == ANY_GROUP ||
+                           !cpu_online(tmp->cpu) ||
+                           cpu_to_group(tmp->cpu) == group) {
+                               sh = tmp;
+                               break;
+                       }
+               }
+
+               if (sh) {
+                       conf->bypass_count -= conf->bypass_threshold;
+                       if (conf->bypass_count < 0)
+                               conf->bypass_count = 0;
+               }
+       }
+
+       if (!sh)
                 return NULL;
  
         list_del_init(&sh->lru);
@@ -4844,13 +4911,13 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
  }
  
  #define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, int group)
  {
         struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
         int i, batch_size = 0;
  
         while (batch_size < MAX_STRIPE_BATCH &&
-                       (sh = __get_priority_stripe(conf)) != NULL)
+                       (sh = __get_priority_stripe(conf, group)) != NULL)
                 batch[batch_size++] = sh;
  
         if (batch_size == 0)
@@ -4868,6 +4935,38 @@ static int handle_active_stripes(struct r5conf *conf)
         return batch_size;
  }
  
+static void raid5_do_work(struct work_struct *work)
+{
+       struct r5worker *worker = container_of(work, struct r5worker, work);
+       struct r5worker_group *group = worker->group;
+       struct r5conf *conf = group->conf;
+       int group_id = group - conf->worker_groups;
+       int handled;
+       struct blk_plug plug;
+
+       pr_debug("+++ raid5worker active\n");
+
+       blk_start_plug(&plug);
+       handled = 0;
+       spin_lock_irq(&conf->device_lock);
+       while (1) {
+               int batch_size, released;
+
+               released = release_stripe_list(conf);
+
+               batch_size = handle_active_stripes(conf, group_id);
+               if (!batch_size && !released)
+                       break;
+               handled += batch_size;
+       }
+       pr_debug("%d stripes handled\n", handled);
+
+       spin_unlock_irq(&conf->device_lock);
+       blk_finish_plug(&plug);
+
+       pr_debug("--- raid5worker inactive\n");
+}
+
  /*
   * This is our raid5 kernel thread.
   *
@@ -4917,7 +5016,7 @@ static void raid5d(struct md_thread *thread)
                         handled++;
                 }
  
-               batch_size = handle_active_stripes(conf);
+               batch_size = handle_active_stripes(conf, ANY_GROUP);
                 if (!batch_size && !released)
                         break;
                 handled += batch_size;
@@ -5057,6 +5156,54 @@ static struct attribute_group raid5_attrs_group = {
         .attrs = raid5_attrs,
  };
  
+static int alloc_thread_groups(struct r5conf *conf, int cnt)
+{
+       int i, j;
+       ssize_t size;
+       struct r5worker *workers;
+
+       conf->worker_cnt_per_group = cnt;
+       if (cnt == 0) {
+               conf->worker_groups = NULL;
+               return 0;
+       }
+       conf->group_cnt = num_possible_nodes();
+       size = sizeof(struct r5worker) * cnt;
+       workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
+       conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
+                               conf->group_cnt, GFP_NOIO);
+       if (!conf->worker_groups || !workers) {
+               kfree(workers);
+               kfree(conf->worker_groups);
+               conf->worker_groups = NULL;
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < conf->group_cnt; i++) {
+               struct r5worker_group *group;
+
+               group = &conf->worker_groups[i];
+               INIT_LIST_HEAD(&group->handle_list);
+               group->conf = conf;
+               group->workers = workers + i * cnt;
+
+               for (j = 0; j < cnt; j++) {
+                       group->workers[j].group = group;
+                       INIT_WORK(&group->workers[j].work, raid5_do_work);
+               }
+       }
+
+       return 0;
+}
+
+static void free_thread_groups(struct r5conf *conf)
+{
+       if (conf->worker_groups)
+               kfree(conf->worker_groups[0].workers);
+       kfree(conf->worker_groups);
+       conf->worker_groups = NULL;
+}
+
  static sector_t
  raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
  {
@@ -5097,6 +5244,7 @@ static void raid5_free_percpu(struct r5conf *conf)
  
  static void free_conf(struct r5conf *conf)
  {
+       free_thread_groups(conf);
         shrink_stripes(conf);
         raid5_free_percpu(conf);
         kfree(conf->disks);
@@ -5225,6 +5373,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
         conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
         if (conf == NULL)
                 goto abort;
+       /* Don't enable multi-threading by default*/
+       if (alloc_thread_groups(conf, 0))
+               goto abort;
         spin_lock_init(&conf->device_lock);
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
@@ -6530,6 +6681,10 @@ static struct md_personality raid4_personality =
  
  static int __init raid5_init(void)
  {
+       raid5_wq = alloc_workqueue("raid5wq",
+               WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
+       if (!raid5_wq)
+               return -ENOMEM;
         register_md_personality(&raid6_personality);
         register_md_personality(&raid5_personality);
         register_md_personality(&raid4_personality);
@@ -6541,6 +6696,7 @@ static void raid5_exit(void)
         unregister_md_personality(&raid6_personality);
         unregister_md_personality(&raid5_personality);
         unregister_md_personality(&raid4_personality);
+       destroy_workqueue(raid5_wq);
  }
  
  module_init(raid5_init);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h

index a98f99d..1053663 100644 (file)
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -212,6 +212,7 @@ struct stripe_head {
         enum check_states       check_state;
         enum reconstruct_states reconstruct_state;
         spinlock_t              stripe_lock;
+       int                     cpu;
         /**
          * struct stripe_operations
          * @target - STRIPE_OP_COMPUTE_BLK target
@@ -365,6 +366,17 @@ struct disk_info {
         struct md_rdev  *rdev, *replacement;
  };
  
+struct r5worker {
+       struct work_struct work;
+       struct r5worker_group *group;
+};
+
+struct r5worker_group {
+       struct list_head handle_list;
+       struct r5conf *conf;
+       struct r5worker *workers;
+};
+
  struct r5conf {
         struct hlist_head       *stripe_hashtbl;
         struct mddev            *mddev;
@@ -461,6 +473,9 @@ struct r5conf {
          * the new thread here until we fully activate the array.
          */
         struct md_thread        *thread;
+       struct r5worker_group   *worker_groups;
+       int                     group_cnt;
+       int                     worker_cnt_per_group;
  };
  
  /*
author	Shaohua Li <shli@kernel.org>
	Wed, 28 Aug 2013 06:30:16 +0000 (14:30 +0800)
committer	NeilBrown <neilb@suse.de>
	Wed, 28 Aug 2013 06:46:38 +0000 (16:46 +1000)
drivers/md/raid5.c		patch \| blob \| history
drivers/md/raid5.h		patch \| blob \| history