md/raid5: don't do chunk aligned read on degraded array.
[pandora-kernel.git] / drivers / md / raid5.c
index c82ce1f..77dfd72 100644 (file)
@@ -497,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh)
        }
 }
 
-static int grow_buffers(struct stripe_head *sh)
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 {
        int i;
        int num = sh->raid_conf->pool_size;
@@ -505,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh)
        for (i = 0; i < num; i++) {
                struct page *page;
 
-               if (!(page = alloc_page(GFP_KERNEL))) {
+               if (!(page = alloc_page(gfp))) {
                        return 1;
                }
                sh->dev[i].page = page;
@@ -672,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
-                       if (!conf->inactive_blocked)
+                       if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
                                sh = get_free_stripe(conf, hash);
+                               if (!sh && llist_empty(&conf->released_stripes) &&
+                                   !test_bit(R5_DID_ALLOC, &conf->cache_state))
+                                       set_bit(R5_ALLOC_MORE,
+                                               &conf->cache_state);
+                       }
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
-                               conf->inactive_blocked = 1;
+                               set_bit(R5_INACTIVE_BLOCKED,
+                                       &conf->cache_state);
                                wait_event_lock_irq(
                                        conf->wait_for_stripe,
                                        !list_empty(conf->inactive_list + hash) &&
                                        (atomic_read(&conf->active_stripes)
                                         < (conf->max_nr_stripes * 3 / 4)
-                                        || !conf->inactive_blocked),
+                                        || !test_bit(R5_INACTIVE_BLOCKED,
+                                                     &conf->cache_state)),
                                        *(conf->hash_locks + hash));
-                               conf->inactive_blocked = 0;
+                               clear_bit(R5_INACTIVE_BLOCKED,
+                                         &conf->cache_state);
                        } else {
                                init_stripe(sh, sector, previous);
                                atomic_inc(&sh->count);
@@ -1963,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf, int hash)
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
        struct stripe_head *sh;
-       sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
+       sh = kmem_cache_zalloc(conf->slab_cache, gfp);
        if (!sh)
                return 0;
 
@@ -1974,12 +1982,13 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
 
        spin_lock_init(&sh->stripe_lock);
 
-       if (grow_buffers(sh)) {
+       if (grow_buffers(sh, gfp)) {
                shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
-       sh->hash_lock_index = hash;
+       sh->hash_lock_index =
+               conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        /* we just created an active stripe so... */
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
@@ -1989,6 +1998,7 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
        INIT_LIST_HEAD(&sh->batch_list);
        sh->batch_head = NULL;
        release_stripe(sh);
+       conf->max_nr_stripes++;
        return 1;
 }
 
@@ -1996,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
-       int hash;
 
        if (conf->mddev->gendisk)
                sprintf(conf->cache_name[0],
@@ -2014,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num)
                return 1;
        conf->slab_cache = sc;
        conf->pool_size = devs;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (num--) {
-               if (!grow_one_stripe(conf, hash))
+       while (num--)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
                        return 1;
-               conf->max_nr_stripes++;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
        return 0;
 }
 
@@ -2210,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf, int hash)
+static int drop_one_stripe(struct r5conf *conf)
 {
        struct stripe_head *sh;
+       int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
 
        spin_lock_irq(conf->hash_locks + hash);
        sh = get_free_stripe(conf, hash);
@@ -2223,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash)
        shrink_buffers(sh);
        kmem_cache_free(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
+       conf->max_nr_stripes--;
        return 1;
 }
 
 static void shrink_stripes(struct r5conf *conf)
 {
-       int hash;
-       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
-               while (drop_one_stripe(conf, hash))
-                       ;
+       while (conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
 
        if (conf->slab_cache)
                kmem_cache_destroy(conf->slab_cache);
@@ -4603,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
         * how busy the stripe_cache is
         */
 
-       if (conf->inactive_blocked)
+       if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
                return 1;
        if (conf->quiesce)
                return 1;
@@ -4625,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
        unsigned int chunk_sectors = mddev->chunk_sectors;
        unsigned int bio_sectors = bvm->bi_size >> 9;
 
-       if ((bvm->bi_rw & 1) == WRITE)
-               return biovec->bv_len; /* always allow writes to be mergeable */
+       /*
+        * always allow writes to be mergeable, read as well if array
+        * is degraded as we'll go through stripe cache anyway.
+        */
+       if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+               return biovec->bv_len;
 
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@ -5103,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
        md_write_start(mddev, bi);
 
-       if (rw == READ &&
+       /*
+        * If array is degraded, better not do chunk aligned read because
+        * later we might have to read it again in order to reconstruct
+        * data on failed drives.
+        */
+       if (rw == READ && mddev->degraded == 0 &&
             mddev->reshape_position == MaxSector &&
             chunk_aligned_read(mddev,bi))
                return;
@@ -5759,6 +5775,8 @@ static void raid5d(struct md_thread *thread)
                int batch_size, released;
 
                released = release_stripe_list(conf, conf->temp_inactive_list);
+               if (released)
+                       clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5797,6 +5815,13 @@ static void raid5d(struct md_thread *thread)
        pr_debug("%d stripes handled\n", handled);
 
        spin_unlock_irq(&conf->device_lock);
+       if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+               grow_one_stripe(conf, __GFP_NOWARN);
+               /* Set flag even if allocation failed.  This helps
+                * slow down allocation requests when mem is short
+                */
+               set_bit(R5_DID_ALLOC, &conf->cache_state);
+       }
 
        async_tx_issue_pending_all();
        blk_finish_plug(&plug);
@@ -5812,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
        spin_lock(&mddev->lock);
        conf = mddev->private;
        if (conf)
-               ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+               ret = sprintf(page, "%d\n", conf->min_nr_stripes);
        spin_unlock(&mddev->lock);
        return ret;
 }
@@ -5822,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
        struct r5conf *conf = mddev->private;
        int err;
-       int hash;
 
        if (size <= 16 || size > 32768)
                return -EINVAL;
-       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
-       while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf, hash))
-                       conf->max_nr_stripes--;
-               else
-                       break;
-               hash--;
-               if (hash < 0)
-                       hash = NR_STRIPE_HASH_LOCKS - 1;
-       }
+
+       conf->min_nr_stripes = size;
+       while (size < conf->max_nr_stripes &&
+              drop_one_stripe(conf))
+               ;
+
+
        err = md_allow_write(mddev);
        if (err)
                return err;
-       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-       while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf, hash))
-                       conf->max_nr_stripes++;
-               else break;
-               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-       }
+
+       while (size > conf->max_nr_stripes)
+               if (!grow_one_stripe(conf, GFP_KERNEL))
+                       break;
+
        return 0;
 }
 EXPORT_SYMBOL(raid5_set_cache_size);
@@ -5879,6 +5898,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                                raid5_show_stripe_cache_size,
                                raid5_store_stripe_cache_size);
 
+/* sysfs "show" for rmw_level: print conf->rmw_level as a decimal line
+ * into @page; report zero bytes when the array has no private conf. */
+static ssize_t
+raid5_show_rmw_level(struct mddev  *mddev, char *page)
+{
+       struct r5conf *r5 = mddev->private;
+
+       return r5 ? sprintf(page, "%d\n", r5->rmw_level) : 0;
+}
+
+/*
+ * sysfs "store" for rmw_level.  @page is parsed as a base-10 unsigned long.
+ * Returns -ENODEV without a conf, -EINVAL when @len >= PAGE_SIZE, the text
+ * does not parse, the value is not one of the PARITY_*_RMW constants, or a
+ * value other than PARITY_DISABLE_RMW is requested while
+ * raid6_call.xor_syndrome is unset.  On success stores the value and
+ * returns @len.
+ */
+static ssize_t
+raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
+{
+       struct r5conf *conf = mddev->private;
+       unsigned long level;
+
+       if (!conf)
+               return -ENODEV;
+       if (len >= PAGE_SIZE || kstrtoul(page, 10, &level))
+               return -EINVAL;
+       if (level != PARITY_DISABLE_RMW &&
+           (!raid6_call.xor_syndrome ||
+            (level != PARITY_ENABLE_RMW && level != PARITY_PREFER_RMW)))
+               return -EINVAL;
+
+       conf->rmw_level = level;
+       return len;
+}
+
+/* sysfs attribute "rmw_level": readable by all, writable by the owner. */
+static struct md_sysfs_entry raid5_rmw_level =
+       __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+              raid5_show_rmw_level, raid5_store_rmw_level);
+
+
 static ssize_t
 raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
@@ -5910,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
        conf = mddev->private;
        if (!conf)
                err = -ENODEV;
-       else if (new > conf->max_nr_stripes)
+       else if (new > conf->min_nr_stripes)
                err = -EINVAL;
        else
                conf->bypass_threshold = new;
@@ -6065,6 +6127,7 @@ static struct attribute *raid5_attrs[] =  {
        &raid5_preread_bypass_threshold.attr,
        &raid5_group_thread_cnt.attr,
        &raid5_skip_copy.attr,
+       &raid5_rmw_level.attr,
        NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -6190,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+       if (conf->shrinker.seeks)
+               unregister_shrinker(&conf->shrinker);
        free_thread_groups(conf);
        shrink_stripes(conf);
        raid5_free_percpu(conf);
@@ -6257,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
        return err;
 }
 
+/*
+ * Shrinker "scan" callback: drop up to sc->nr_to_scan stripes through
+ * drop_one_stripe().
+ *
+ * Stops once the cache is down to conf->min_nr_stripes so that memory
+ * pressure can only reclaim stripes above the configured minimum, which is
+ * the same surplus raid5_cache_count() reports as reclaimable.
+ *
+ * Returns the number of stripes dropped, or SHRINK_STOP as soon as
+ * drop_one_stripe() fails to free one.
+ */
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+                                     struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+       unsigned long ret = 0;
+
+       while (ret < sc->nr_to_scan &&
+              conf->max_nr_stripes > conf->min_nr_stripes) {
+               if (drop_one_stripe(conf) == 0)
+                       return SHRINK_STOP;
+               ret++;
+       }
+       return ret;
+}
+
+/* Shrinker "count" callback: how many stripes are held above
+ * min_nr_stripes; 0 in the unlikely case the cache is below that minimum. */
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+                                      struct shrink_control *sc)
+{
+       struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+       if (conf->max_nr_stripes >= conf->min_nr_stripes)
+               return conf->max_nr_stripes - conf->min_nr_stripes;
+       return 0;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
        struct r5conf *conf;
@@ -6407,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
                conf->prev_algo = mddev->layout;
        }
 
-       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+       conf->min_nr_stripes = NR_STRIPES;
+       memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-       if (grow_stripes(conf, NR_STRIPES)) {
+       if (grow_stripes(conf, conf->min_nr_stripes)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
@@ -6418,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        } else
                printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
                       mdname(mddev), memory);
+       /*
+        * Losing a stripe head costs more than the time to refill it,
+        * it reduces the queue depth and so can hurt throughput.
+        * So set it rather large, scaled by number of devices.
+        */
+       conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+       conf->shrinker.scan_objects = raid5_cache_scan;
+       conf->shrinker.count_objects = raid5_cache_count;
+       conf->shrinker.batch = 128;
+       conf->shrinker.flags = 0;
+       register_shrinker(&conf->shrinker);
 
        sprintf(pers_name, "raid%d", mddev->new_level);
        conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -7059,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev)
         */
        struct r5conf *conf = mddev->private;
        if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes ||
+           > conf->min_nr_stripes ||
            ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-           > conf->max_nr_stripes) {
+           > conf->min_nr_stripes) {
                printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
                       mdname(mddev),
                       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)