md/raid5 revise rules for when to update metadata during reshape

[pandora-kernel.git] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 81789fa..3bbc6d6 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -45,11 +45,11 @@
  
  #include <linux/blkdev.h>
  #include <linux/kthread.h>
+#include <linux/raid/pq.h>
  #include <linux/async_tx.h>
  #include <linux/seq_file.h>
  #include "md.h"
  #include "raid5.h"
-#include "raid6.h"
  #include "bitmap.h"
  
  /*
@@ -94,11 +94,6 @@
  
  #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
  
-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-#endif
-
  /*
   * We maintain a biased count of active stripes in the bottom 16 bits of
   * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -304,7 +299,7 @@ static int grow_buffers(struct stripe_head *sh, int num)
         return 0;
  }
  
-static void raid5_build_block(struct stripe_head *sh, int i);
+static void raid5_build_block(struct stripe_head *sh, int i, int previous);
  static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
                             struct stripe_head *sh);
  
@@ -323,6 +318,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
  
         remove_hash(sh);
  
+       sh->generation = conf->generation - previous;
         sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
         sh->sector = sector;
         stripe_set_idx(sector, conf, previous, sh);
@@ -341,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                         BUG();
                 }
                 dev->flags = 0;
-               raid5_build_block(sh, i);
+               raid5_build_block(sh, i, previous);
         }
         insert_hash(conf, sh);
  }
  
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
+                                        short generation)
  {
         struct stripe_head *sh;
         struct hlist_node *hn;
@@ -354,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
         CHECK_DEVLOCK();
         pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
         hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
-               if (sh->sector == sector && sh->disks == disks)
+               if (sh->sector == sector && sh->generation == generation)
                         return sh;
         pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
         return NULL;
@@ -368,7 +365,6 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
                   int previous, int noblock)
  {
         struct stripe_head *sh;
-       int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
  
         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
  
@@ -378,7 +374,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
                 wait_event_lock_irq(conf->wait_for_stripe,
                                     conf->quiesce == 0,
                                     conf->device_lock, /* nothing */);
-               sh = __find_stripe(conf, sector, disks);
+               sh = __find_stripe(conf, sector, conf->generation - previous);
                 if (!sh) {
                         if (!conf->inactive_blocked)
                                 sh = get_free_stripe(conf);
@@ -399,7 +395,8 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
                                 init_stripe(sh, sector, previous);
                 } else {
                         if (atomic_read(&sh->count)) {
-                         BUG_ON(!list_empty(&sh->lru));
+                               BUG_ON(!list_empty(&sh->lru)
+                                   && !test_bit(STRIPE_EXPANDING, &sh->state));
                         } else {
                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
                                         atomic_inc(&conf->active_stripes);
@@ -933,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
         struct kmem_cache *sc;
         int devs = conf->raid_disks;
  
-       sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
-       sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
+       sprintf(conf->cache_name[0],
+               "raid%d-%s", conf->level, mdname(conf->mddev));
+       sprintf(conf->cache_name[1],
+               "raid%d-%s-alt", conf->level, mdname(conf->mddev));
         conf->active_name = 0;
         sc = kmem_cache_create(conf->cache_name[conf->active_name],
                                sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -949,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
         return 0;
  }
  
-#ifdef CONFIG_MD_RAID5_RESHAPE
  static int resize_stripes(raid5_conf_t *conf, int newsize)
  {
         /* Make all the stripes able to hold 'newsize' devices.
@@ -1074,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
         conf->pool_size = newsize;
         return err;
  }
-#endif
  
  static int drop_one_stripe(raid5_conf_t *conf)
  {
@@ -1214,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
  }
  
  
-static sector_t compute_blocknr(struct stripe_head *sh, int i);
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
         
-static void raid5_build_block(struct stripe_head *sh, int i)
+static void raid5_build_block(struct stripe_head *sh, int i, int previous)
  {
         struct r5dev *dev = &sh->dev[i];
  
@@ -1232,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
         dev->req.bi_private = sh;
  
         dev->flags = 0;
-       dev->sector = compute_blocknr(sh, i);
+       dev->sector = compute_blocknr(sh, i, previous);
  }
  
  static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1275,7 +1272,10 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
         int pd_idx, qd_idx;
         int ddf_layout = 0;
         sector_t new_sector;
-       int sectors_per_chunk = conf->chunk_size >> 9;
+       int algorithm = previous ? conf->prev_algo
+                                : conf->algorithm;
+       int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+                                        : (conf->chunk_size >> 9);
         int raid_disks = previous ? conf->previous_raid_disks
                                   : conf->raid_disks;
         int data_disks = raid_disks - conf->max_degraded;
@@ -1308,7 +1308,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                 pd_idx = data_disks;
                 break;
         case 5:
-               switch (conf->algorithm) {
+               switch (algorithm) {
                 case ALGORITHM_LEFT_ASYMMETRIC:
                         pd_idx = data_disks - stripe % raid_disks;
                         if (*dd_idx >= pd_idx)
@@ -1336,13 +1336,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                         break;
                 default:
                         printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-                               conf->algorithm);
+                               algorithm);
                         BUG();
                 }
                 break;
         case 6:
  
-               switch (conf->algorithm) {
+               switch (algorithm) {
                 case ALGORITHM_LEFT_ASYMMETRIC:
                         pd_idx = raid_disks - 1 - (stripe % raid_disks);
                         qd_idx = pd_idx + 1;
@@ -1455,7 +1455,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
  
                 default:
                         printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-                              conf->algorithm);
+                              algorithm);
                         BUG();
                 }
                 break;
@@ -1474,13 +1474,16 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
  }
  
  
-static sector_t compute_blocknr(struct stripe_head *sh, int i)
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
  {
         raid5_conf_t *conf = sh->raid_conf;
         int raid_disks = sh->disks;
         int data_disks = raid_disks - conf->max_degraded;
         sector_t new_sector = sh->sector, check;
-       int sectors_per_chunk = conf->chunk_size >> 9;
+       int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+                                        : (conf->chunk_size >> 9);
+       int algorithm = previous ? conf->prev_algo
+                                : conf->algorithm;
         sector_t stripe;
         int chunk_offset;
         int chunk_number, dummy1, dd_idx = i;
@@ -1497,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
         switch(conf->level) {
         case 4: break;
         case 5:
-               switch (conf->algorithm) {
+               switch (algorithm) {
                 case ALGORITHM_LEFT_ASYMMETRIC:
                 case ALGORITHM_RIGHT_ASYMMETRIC:
                         if (i > sh->pd_idx)
@@ -1516,14 +1519,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
                         break;
                 default:
                         printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-                              conf->algorithm);
+                              algorithm);
                         BUG();
                 }
                 break;
         case 6:
                 if (i == sh->qd_idx)
                         return 0; /* It is the Q disk */
-               switch (conf->algorithm) {
+               switch (algorithm) {
                 case ALGORITHM_LEFT_ASYMMETRIC:
                 case ALGORITHM_RIGHT_ASYMMETRIC:
                 case ALGORITHM_ROTATING_ZERO_RESTART:
@@ -1571,7 +1574,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
                         break;
                 default:
                         printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-                              conf->algorithm);
+                              algorithm);
                         BUG();
                 }
                 break;
@@ -1581,8 +1584,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
         r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
  
         check = raid5_compute_sector(conf, r_sector,
-                                    (raid_disks != conf->raid_disks),
-                                    &dummy1, &sh2);
+                                    previous, &dummy1, &sh2);
         if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
                 || sh2.qd_idx != sh->qd_idx) {
                 printk(KERN_ERR "compute_blocknr: map not correct\n");
@@ -1994,7 +1996,9 @@ static int page_is_zero(struct page *p)
  static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
                             struct stripe_head *sh)
  {
-       int sectors_per_chunk = conf->chunk_size >> 9;
+       int sectors_per_chunk =
+               previous ? (conf->prev_chunk >> 9)
+                        : (conf->chunk_size >> 9);
         int dd_idx;
         int chunk_offset = sector_div(stripe, sectors_per_chunk);
         int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
@@ -2374,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
                 struct r6_state *r6s, int disks)
  {
         int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
-       int qd_idx = r6s->qd_idx;
+       int qd_idx = sh->qd_idx;
         for (i = disks; i--; ) {
                 struct r5dev *dev = &sh->dev[i];
                 /* Would I have to read this buffer for reconstruct_write */
@@ -2564,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
         int update_p = 0, update_q = 0;
         struct r5dev *dev;
         int pd_idx = sh->pd_idx;
-       int qd_idx = r6s->qd_idx;
+       int qd_idx = sh->qd_idx;
  
         set_bit(STRIPE_HANDLE, &sh->state);
  
@@ -2660,11 +2664,11 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
         struct dma_async_tx_descriptor *tx = NULL;
         clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
         for (i = 0; i < sh->disks; i++)
-               if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
+               if (i != sh->pd_idx && i != sh->qd_idx) {
                         int dd_idx, j;
                         struct stripe_head *sh2;
  
-                       sector_t bn = compute_blocknr(sh, i);
+                       sector_t bn = compute_blocknr(sh, i, 1);
                         sector_t s = raid5_compute_sector(conf, bn, 0,
                                                           &dd_idx, NULL);
                         sh2 = get_active_stripe(conf, s, 0, 1);
@@ -2939,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
  
         /* Finish reconstruct operations initiated by the expansion process */
         if (sh->reconstruct_state == reconstruct_state_result) {
+               struct stripe_head *sh2
+                       = get_active_stripe(conf, sh->sector, 1, 1);
+               if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+                       /* sh cannot be written until sh2 has been read.
+                        * so arrange for sh to be delayed a little
+                        */
+                       set_bit(STRIPE_DELAYED, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+                                             &sh2->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       release_stripe(sh2);
+                       goto unlock;
+               }
+               if (sh2)
+                       release_stripe(sh2);
+
                 sh->reconstruct_state = reconstruct_state_idle;
                 clear_bit(STRIPE_EXPANDING, &sh->state);
                 for (i = conf->raid_disks; i--; ) {
@@ -2987,17 +3008,16 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks;
         struct bio *return_bi = NULL;
-       int i, pd_idx = sh->pd_idx;
+       int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
         struct stripe_head_state s;
         struct r6_state r6s;
         struct r5dev *dev, *pdev, *qdev;
         mdk_rdev_t *blocked_rdev = NULL;
  
-       r6s.qd_idx = sh->qd_idx;
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
                 "pd_idx=%d, qd_idx=%d\n",
                (unsigned long long)sh->sector, sh->state,
-              atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+              atomic_read(&sh->count), pd_idx, qd_idx);
         memset(&s, 0, sizeof(s));
  
         spin_lock(&sh->lock);
@@ -3108,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         pdev = &sh->dev[pd_idx];
         r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
                 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
-       qdev = &sh->dev[r6s.qd_idx];
-       r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
-               || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+       qdev = &sh->dev[qd_idx];
+       r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
+               || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
  
         if ( s.written &&
              ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -3168,6 +3188,23 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 }
  
         if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+               struct stripe_head *sh2
+                       = get_active_stripe(conf, sh->sector, 1, 1);
+               if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+                       /* sh cannot be written until sh2 has been read.
+                        * so arrange for sh to be delayed a little
+                        */
+                       set_bit(STRIPE_DELAYED, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+                                             &sh2->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       release_stripe(sh2);
+                       goto unlock;
+               }
+               if (sh2)
+                       release_stripe(sh2);
+
                 /* Need to write out all blocks after computing P&Q */
                 sh->disks = conf->raid_disks;
                 stripe_set_idx(sh->sector, conf, 0, sh);
@@ -3321,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
         if ((bvm->bi_rw & 1) == WRITE)
                 return biovec->bv_len; /* always allow writes to be mergeable */
  
+       if (mddev->new_chunk < mddev->chunk_size)
+               chunk_sectors = mddev->new_chunk >> 9;
         max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
         if (max < 0) max = 0;
         if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3336,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
         unsigned int chunk_sectors = mddev->chunk_size >> 9;
         unsigned int bio_sectors = bio->bi_size >> 9;
  
+       if (mddev->new_chunk < mddev->chunk_size)
+               chunk_sectors = mddev->new_chunk >> 9;
         return  chunk_sectors >=
                 ((sector & (chunk_sectors - 1)) + bio_sectors);
  }
@@ -3596,25 +3637,27 @@ static int make_request(struct request_queue *q, struct bio * bi)
  
         retry:
                 previous = 0;
+               disks = conf->raid_disks;
                 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-               if (likely(conf->expand_progress == MaxSector))
-                       disks = conf->raid_disks;
-               else {
-                       /* spinlock is needed as expand_progress may be
+               if (unlikely(conf->reshape_progress != MaxSector)) {
+                       /* spinlock is needed as reshape_progress may be
                          * 64bit on a 32bit platform, and so it might be
                          * possible to see a half-updated value
-                        * Ofcourse expand_progress could change after
+                        * Of course reshape_progress could change after
                          * the lock is dropped, so once we get a reference
                          * to the stripe that we think it is, we will have
                          * to check again.
                          */
                         spin_lock_irq(&conf->device_lock);
-                       disks = conf->raid_disks;
-                       if (logical_sector >= conf->expand_progress) {
+                       if (mddev->delta_disks < 0
+                           ? logical_sector < conf->reshape_progress
+                           : logical_sector >= conf->reshape_progress) {
                                 disks = conf->previous_raid_disks;
                                 previous = 1;
                         } else {
-                               if (logical_sector >= conf->expand_lo) {
+                               if (mddev->delta_disks < 0
+                                   ? logical_sector < conf->reshape_safe
+                                   : logical_sector >= conf->reshape_safe) {
                                         spin_unlock_irq(&conf->device_lock);
                                         schedule();
                                         goto retry;
@@ -3634,7 +3677,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                 sh = get_active_stripe(conf, new_sector, previous,
                                        (bi->bi_rw&RWA_MASK));
                 if (sh) {
-                       if (unlikely(conf->expand_progress != MaxSector)) {
+                       if (unlikely(previous)) {
                                 /* expansion might have moved on while waiting for a
                                  * stripe, so we must do the range check again.
                                  * Expansion could still move past after this
@@ -3645,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
                                  */
                                 int must_retry = 0;
                                 spin_lock_irq(&conf->device_lock);
-                               if (logical_sector <  conf->expand_progress &&
-                                   disks == conf->previous_raid_disks)
+                               if (mddev->delta_disks < 0
+                                   ? logical_sector >= conf->reshape_progress
+                                   : logical_sector < conf->reshape_progress)
                                         /* mismatch, need to try again */
                                         must_retry = 1;
                                 spin_unlock_irq(&conf->device_lock);
@@ -3701,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
         return 0;
  }
  
+static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
+
  static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
  {
         /* reshaping is quite different to recovery/resync so it is
@@ -3720,52 +3766,112 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         int new_data_disks = conf->raid_disks - conf->max_degraded;
         int i;
         int dd_idx;
-       sector_t writepos, safepos, gap;
-
-       if (sector_nr == 0 &&
-           conf->expand_progress != 0) {
-               /* restarting in the middle, skip the initial sectors */
-               sector_nr = conf->expand_progress;
+       sector_t writepos, readpos, safepos;
+       sector_t stripe_addr;
+       int reshape_sectors;
+       struct list_head stripes;
+
+       if (sector_nr == 0) {
+               /* If restarting in the middle, skip the initial sectors */
+               if (mddev->delta_disks < 0 &&
+                   conf->reshape_progress < raid5_size(mddev, 0, 0)) {
+                       sector_nr = raid5_size(mddev, 0, 0)
+                               - conf->reshape_progress;
+               } else if (mddev->delta_disks > 0 &&
+                          conf->reshape_progress > 0)
+                       sector_nr = conf->reshape_progress;
                 sector_div(sector_nr, new_data_disks);
-               *skipped = 1;
-               return sector_nr;
+               if (sector_nr) {
+                       *skipped = 1;
+                       return sector_nr;
+               }
         }
  
+       /* We need to process a full chunk at a time.
+        * If old and new chunk sizes differ, we need to process the
+        * larger of these
+        */
+       if (mddev->new_chunk > mddev->chunk_size)
+               reshape_sectors = mddev->new_chunk / 512;
+       else
+               reshape_sectors = mddev->chunk_size / 512;
+
         /* we update the metadata when there is more than 3Meg
          * in the block range (that is rather arbitrary, should
          * probably be time based) or when the data about to be
          * copied would over-write the source of the data at
          * the front of the range.
-        * i.e. one new_stripe forward from expand_progress new_maps
-        * to after where expand_lo old_maps to
+        * i.e. one new_stripe along from reshape_progress new_maps
+        * to after where reshape_safe old_maps to
          */
-       writepos = conf->expand_progress +
-               conf->chunk_size/512*(new_data_disks);
+       writepos = conf->reshape_progress;
         sector_div(writepos, new_data_disks);
-       safepos = conf->expand_lo;
+       readpos = conf->reshape_progress;
+       sector_div(readpos, data_disks);
+       safepos = conf->reshape_safe;
         sector_div(safepos, data_disks);
-       gap = conf->expand_progress - conf->expand_lo;
+       if (mddev->delta_disks < 0) {
+               writepos -= reshape_sectors;
+               readpos += reshape_sectors;
+               safepos += reshape_sectors;
+       } else {
+               writepos += reshape_sectors;
+               readpos -= reshape_sectors;
+               safepos -= reshape_sectors;
+       }
  
-       if (writepos >= safepos ||
-           gap > (new_data_disks)*3000*2 /*3Meg*/) {
+       /* 'writepos' is the most advanced device address we might write.
+        * 'readpos' is the least advanced device address we might read.
+        * 'safepos' is the least address recorded in the metadata as having
+        *     been reshaped.
+        * If 'readpos' is behind 'writepos', then there is no way that we can
+        * ensure safety in the face of a crash - that must be done by userspace
+        * making a backup of the data.  So in that case there is no particular
+        * rush to update metadata.
+        * Otherwise if 'safepos' is behind 'writepos', then we really need to
+        * update the metadata to advance 'safepos' to match 'readpos' so that
+        * we can be safe in the event of a crash.
+        * So we insist on updating metadata if safepos is behind writepos and
+        * readpos is beyond writepos.
+        * In any case, update the metadata every 10 seconds.
+        * Maybe that number should be configurable, but I'm not sure it is
+        * worth it.... maybe it could be a multiple of safemode_delay???
+        */
+       if ((mddev->delta_disks < 0
+            ? (safepos > writepos && readpos < writepos)
+            : (safepos < writepos && readpos > writepos)) ||
+           time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
                 /* Cannot proceed until we've updated the superblock... */
                 wait_event(conf->wait_for_overlap,
                            atomic_read(&conf->reshape_stripes)==0);
-               mddev->reshape_position = conf->expand_progress;
+               mddev->reshape_position = conf->reshape_progress;
+               conf->reshape_checkpoint = jiffies;
                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
                 md_wakeup_thread(mddev->thread);
                 wait_event(mddev->sb_wait, mddev->flags == 0 ||
                            kthread_should_stop());
                 spin_lock_irq(&conf->device_lock);
-               conf->expand_lo = mddev->reshape_position;
+               conf->reshape_safe = mddev->reshape_position;
                 spin_unlock_irq(&conf->device_lock);
                 wake_up(&conf->wait_for_overlap);
         }
  
-       for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+       if (mddev->delta_disks < 0) {
+               BUG_ON(conf->reshape_progress == 0);
+               stripe_addr = writepos;
+               BUG_ON((mddev->dev_sectors &
+                       ~((sector_t)reshape_sectors - 1))
+                      - reshape_sectors - stripe_addr
+                      != sector_nr);
+       } else {
+               BUG_ON(writepos != sector_nr + reshape_sectors);
+               stripe_addr = sector_nr;
+       }
+       INIT_LIST_HEAD(&stripes);
+       for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
                 int j;
                 int skipped = 0;
-               sh = get_active_stripe(conf, sector_nr+i, 0, 0);
+               sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
                 set_bit(STRIPE_EXPANDING, &sh->state);
                 atomic_inc(&conf->reshape_stripes);
                 /* If any of this stripe is beyond the end of the old
@@ -3778,8 +3884,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                         if (conf->level == 6 &&
                             j == sh->qd_idx)
                                 continue;
-                       s = compute_blocknr(sh, j);
-                       if (s < mddev->array_sectors) {
+                       s = compute_blocknr(sh, j, 0);
+                       if (s < raid5_size(mddev, 0, 0)) {
                                 skipped = 1;
                                 continue;
                         }
@@ -3791,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                         set_bit(STRIPE_EXPAND_READY, &sh->state);
                         set_bit(STRIPE_HANDLE, &sh->state);
                 }
-               release_stripe(sh);
+               list_add(&sh->lru, &stripes);
         }
         spin_lock_irq(&conf->device_lock);
-       conf->expand_progress = (sector_nr + i) * new_data_disks;
+       if (mddev->delta_disks < 0)
+               conf->reshape_progress -= reshape_sectors * new_data_disks;
+       else
+               conf->reshape_progress += reshape_sectors * new_data_disks;
         spin_unlock_irq(&conf->device_lock);
         /* Ok, those stripe are ready. We can start scheduling
          * reads on the source stripes.
@@ -3802,10 +3911,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
          * block on the destination stripes.
          */
         first_sector =
-               raid5_compute_sector(conf, sector_nr*(new_data_disks),
+               raid5_compute_sector(conf, stripe_addr*(new_data_disks),
                                      1, &dd_idx, NULL);
         last_sector =
-               raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512)
+               raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
                                             *(new_data_disks) - 1),
                                      1, &dd_idx, NULL);
         if (last_sector >= mddev->dev_sectors)
@@ -3817,26 +3926,35 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                 release_stripe(sh);
                 first_sector += STRIPE_SECTORS;
         }
+       /* Now that the sources are clearly marked, we can release
+        * the destination stripes
+        */
+       while (!list_empty(&stripes)) {
+               sh = list_entry(stripes.next, struct stripe_head, lru);
+               list_del_init(&sh->lru);
+               release_stripe(sh);
+       }
         /* If this takes us to the resync_max point where we have to pause,
          * then we need to write out the superblock.
          */
-       sector_nr += conf->chunk_size>>9;
+       sector_nr += reshape_sectors;
         if (sector_nr >= mddev->resync_max) {
                 /* Cannot proceed until we've updated the superblock... */
                 wait_event(conf->wait_for_overlap,
                            atomic_read(&conf->reshape_stripes) == 0);
-               mddev->reshape_position = conf->expand_progress;
+               mddev->reshape_position = conf->reshape_progress;
+               conf->reshape_checkpoint = jiffies;
                 set_bit(MD_CHANGE_DEVS, &mddev->flags);
                 md_wakeup_thread(mddev->thread);
                 wait_event(mddev->sb_wait,
                            !test_bit(MD_CHANGE_DEVS, &mddev->flags)
                            || kthread_should_stop());
                 spin_lock_irq(&conf->device_lock);
-               conf->expand_lo = mddev->reshape_position;
+               conf->reshape_safe = mddev->reshape_position;
                 spin_unlock_irq(&conf->device_lock);
                 wake_up(&conf->wait_for_overlap);
         }
-       return conf->chunk_size>>9;
+       return reshape_sectors;
  }
  
  /* FIXME go_faster isn't used */
@@ -3852,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         if (sector_nr >= max_sector) {
                 /* just being told to finish up .. nothing much to do */
                 unplug_slaves(mddev);
+
                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
                         end_reshape(conf);
                         return 0;
@@ -4164,6 +4283,26 @@ static struct attribute_group raid5_attrs_group = {
         .attrs = raid5_attrs,
  };
  
+static sector_t
+raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       if (!sectors)
+               sectors = mddev->dev_sectors;
+       if (!raid_disks) {
+               /* size is defined by the smaller of old and new disk counts */
+               if (conf->raid_disks < conf->previous_raid_disks)
+                       raid_disks = conf->raid_disks;
+               else
+                       raid_disks = conf->previous_raid_disks;
+       }
+
+       sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+       sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+       return sectors * (raid_disks - conf->max_degraded);
+}
+
  static raid5_conf_t *setup_conf(mddev_t *mddev)
  {
         raid5_conf_t *conf;
@@ -4265,7 +4404,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                 conf->max_degraded = 1;
         conf->algorithm = mddev->new_layout;
         conf->max_nr_stripes = NR_STRIPES;
-       conf->expand_progress = mddev->reshape_position;
+       conf->reshape_progress = mddev->reshape_position;
+       if (conf->reshape_progress != MaxSector) {
+               conf->prev_chunk = mddev->chunk_size;
+               conf->prev_algo = mddev->layout;
+       }
  
         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
                  conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
@@ -4313,29 +4456,21 @@ static int run(mddev_t *mddev)
                  */
                 sector_t here_new, here_old;
                 int old_disks;
-               int max_degraded = (mddev->level == 5 ? 1 : 2);
+               int max_degraded = (mddev->level == 6 ? 2 : 1);
  
-               if (mddev->new_level != mddev->level ||
-                   mddev->new_layout != mddev->layout ||
-                   mddev->new_chunk != mddev->chunk_size) {
+               if (mddev->new_level != mddev->level) {
                         printk(KERN_ERR "raid5: %s: unsupported reshape "
                                "required - aborting.\n",
                                mdname(mddev));
                         return -EINVAL;
                 }
-               if (mddev->delta_disks <= 0) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape "
-                              "(reduce disks) required - aborting.\n",
-                              mdname(mddev));
-                       return -EINVAL;
-               }
                 old_disks = mddev->raid_disks - mddev->delta_disks;
                 /* reshape_position must be on a new-stripe boundary, and one
                  * further up in new geometry must map after here in old
                  * geometry.
                  */
                 here_new = mddev->reshape_position;
-               if (sector_div(here_new, (mddev->chunk_size>>9)*
+               if (sector_div(here_new, (mddev->new_chunk>>9)*
                                (mddev->raid_disks - max_degraded))) {
                         printk(KERN_ERR "raid5: reshape_position not "
                                "on a stripe boundary\n");
@@ -4361,10 +4496,12 @@ static int run(mddev_t *mddev)
                 BUG_ON(mddev->chunk_size != mddev->new_chunk);
                 BUG_ON(mddev->delta_disks != 0);
         }
-       conf = setup_conf(mddev);
  
-       if (conf == NULL)
-               return -EIO;
+       if (mddev->private == NULL)
+               conf = setup_conf(mddev);
+       else
+               conf = mddev->private;
+
         if (IS_ERR(conf))
                 return PTR_ERR(conf);
  
@@ -4410,20 +4547,20 @@ static int run(mddev_t *mddev)
  
         if (mddev->degraded == 0)
                 printk("raid5: raid level %d set %s active with %d out of %d"
-                       " devices, algorithm %d\n", conf->level, mdname(mddev), 
-                       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-                       conf->algorithm);
+                      " devices, algorithm %d\n", conf->level, mdname(mddev),
+                      mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+                      mddev->new_layout);
         else
                 printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
                         " out of %d devices, algorithm %d\n", conf->level,
                         mdname(mddev), mddev->raid_disks - mddev->degraded,
-                       mddev->raid_disks, conf->algorithm);
+                       mddev->raid_disks, mddev->new_layout);
  
         print_raid5_conf(conf);
  
-       if (conf->expand_progress != MaxSector) {
+       if (conf->reshape_progress != MaxSector) {
                 printk("...ok start reshape thread\n");
-               conf->expand_lo = conf->expand_progress;
+               conf->reshape_safe = conf->reshape_progress;
                 atomic_set(&conf->reshape_stripes, 0);
                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4456,8 +4593,7 @@ static int run(mddev_t *mddev)
         mddev->queue->backing_dev_info.congested_data = mddev;
         mddev->queue->backing_dev_info.congested_fn = raid5_congested;
  
-       mddev->array_sectors = mddev->dev_sectors *
-               (conf->previous_raid_disks - conf->max_degraded);
+       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
  
         blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
  
@@ -4604,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
         print_raid5_conf(conf);
         rdev = p->rdev;
         if (rdev) {
+               if (number >= conf->raid_disks &&
+                   conf->reshape_progress == MaxSector)
+                       clear_bit(In_sync, &rdev->flags);
+
                 if (test_bit(In_sync, &rdev->flags) ||
                     atomic_read(&rdev->nr_pending)) {
                         err = -EBUSY;
@@ -4613,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                  * isn't possible.
                  */
                 if (!test_bit(Faulty, &rdev->flags) &&
-                   mddev->degraded <= conf->max_degraded) {
+                   mddev->degraded <= conf->max_degraded &&
+                   number < conf->raid_disks) {
                         err = -EBUSY;
                         goto abort;
                 }
@@ -4680,11 +4821,12 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
          * any io in the removed space completes, but it hardly seems
          * worth it.
          */
-       raid5_conf_t *conf = mddev_to_conf(mddev);
-
         sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-       mddev->array_sectors = sectors * (mddev->raid_disks
-                                         - conf->max_degraded);
+       md_set_array_sectors(mddev, raid5_size(mddev, sectors,
+                                              mddev->raid_disks));
+       if (mddev->array_sectors >
+           raid5_size(mddev, sectors, mddev->raid_disks))
+               return -EINVAL;
         set_capacity(mddev->gendisk, mddev->array_sectors);
         mddev->changed = 1;
         if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
@@ -4696,20 +4838,31 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
         return 0;
  }
  
-#ifdef CONFIG_MD_RAID5_RESHAPE
  static int raid5_check_reshape(mddev_t *mddev)
  {
         raid5_conf_t *conf = mddev_to_conf(mddev);
-       int err;
  
-       if (mddev->delta_disks < 0 ||
-           mddev->new_level != mddev->level)
-               return -EINVAL; /* Cannot shrink array or change level yet */
-       if (mddev->delta_disks == 0)
-               return 0; /* nothing to do */
+       if (mddev->delta_disks == 0 &&
+           mddev->new_layout == mddev->layout &&
+           mddev->new_chunk == mddev->chunk_size)
+               return -EINVAL; /* nothing to do */
         if (mddev->bitmap)
                 /* Cannot grow a bitmap yet */
                 return -EBUSY;
+       if (mddev->degraded > conf->max_degraded)
+               return -EINVAL;
+       if (mddev->delta_disks < 0) {
+               /* We might be able to shrink, but the devices must
+                * be made bigger first.
+                * For raid6, 4 is the minimum number of devices.
+                * Otherwise 2 is the minimum.
+                */
+               int min = 2;
+               if (mddev->level == 6)
+                       min = 4;
+               if (mddev->raid_disks + mddev->delta_disks < min)
+                       return -EINVAL;
+       }
  
         /* Can only proceed if there are plenty of stripe_heads.
          * We need a minimum of one full stripe,, and for sensible progress
@@ -4722,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
         if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
             (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
                 printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-                      (mddev->chunk_size / STRIPE_SIZE)*4);
+                      (max(mddev->chunk_size, mddev->new_chunk)
+                       / STRIPE_SIZE)*4);
                 return -ENOSPC;
         }
  
-       err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
-       if (err)
-               return err;
-
-       if (mddev->degraded > conf->max_degraded)
-               return -EINVAL;
-       /* looks like we might be able to manage this */
-       return 0;
+       return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
  }
  
  static int raid5_start_reshape(mddev_t *mddev)
@@ -4758,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
                  */
                 return -EINVAL;
  
+       /* Refuse to reduce size of the array.  Any reductions in
+        * array size must be through explicit setting of array_size
+        * attribute.
+        */
+       if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
+           < mddev->array_sectors) {
+               printk(KERN_ERR "md: %s: array size must be reduced "
+                      "before number of disks\n", mdname(mddev));
+               return -EINVAL;
+       }
+
         atomic_set(&conf->reshape_stripes, 0);
         spin_lock_irq(&conf->device_lock);
         conf->previous_raid_disks = conf->raid_disks;
         conf->raid_disks += mddev->delta_disks;
-       conf->expand_progress = 0;
-       conf->expand_lo = 0;
+       conf->prev_chunk = conf->chunk_size;
+       conf->chunk_size = mddev->new_chunk;
+       conf->prev_algo = conf->algorithm;
+       conf->algorithm = mddev->new_layout;
+       if (mddev->delta_disks < 0)
+               conf->reshape_progress = raid5_size(mddev, 0, 0);
+       else
+               conf->reshape_progress = 0;
+       conf->reshape_safe = conf->reshape_progress;
+       conf->generation++;
         spin_unlock_irq(&conf->device_lock);
  
         /* Add some new drives, as many as will fit.
@@ -4788,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
                                 break;
                 }
  
-       spin_lock_irqsave(&conf->device_lock, flags);
-       mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
-       spin_unlock_irqrestore(&conf->device_lock, flags);
+       if (mddev->delta_disks > 0) {
+               spin_lock_irqsave(&conf->device_lock, flags);
+               mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
+                       - added_devices;
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+       }
         mddev->raid_disks = conf->raid_disks;
         mddev->reshape_position = 0;
         set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4805,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev)
                 mddev->recovery = 0;
                 spin_lock_irq(&conf->device_lock);
                 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-               conf->expand_progress = MaxSector;
+               conf->reshape_progress = MaxSector;
                 spin_unlock_irq(&conf->device_lock);
                 return -EAGAIN;
         }
+       conf->reshape_checkpoint = jiffies;
         md_wakeup_thread(mddev->sync_thread);
         md_new_event(mddev);
         return 0;
  }
-#endif
  
+/* This is called from the reshape thread and should make any
+ * changes needed in 'conf'
+ */
  static void end_reshape(raid5_conf_t *conf)
  {
-       struct block_device *bdev;
  
         if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-               conf->mddev->array_sectors = conf->mddev->dev_sectors *
-                       (conf->raid_disks - conf->max_degraded);
-               set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
-               conf->mddev->changed = 1;
-
-               bdev = bdget_disk(conf->mddev->gendisk, 0);
-               if (bdev) {
-                       mutex_lock(&bdev->bd_inode->i_mutex);
-                       i_size_write(bdev->bd_inode,
-                                    (loff_t)conf->mddev->array_sectors << 9);
-                       mutex_unlock(&bdev->bd_inode->i_mutex);
-                       bdput(bdev);
-               }
+
                 spin_lock_irq(&conf->device_lock);
-               conf->expand_progress = MaxSector;
+               conf->previous_raid_disks = conf->raid_disks;
+               conf->reshape_progress = MaxSector;
                 spin_unlock_irq(&conf->device_lock);
-               conf->mddev->reshape_position = MaxSector;
+               wake_up(&conf->wait_for_overlap);
  
                 /* read-ahead size must cover two whole stripes, which is
                  * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
                  */
                 {
-                       int data_disks = conf->previous_raid_disks - conf->max_degraded;
-                       int stripe = data_disks *
-                               (conf->mddev->chunk_size / PAGE_SIZE);
+                       int data_disks = conf->raid_disks - conf->max_degraded;
+                       int stripe = data_disks * (conf->chunk_size
+                                                  / PAGE_SIZE);
                         if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
                                 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
                 }
         }
  }
  
+/* This is called from the raid5d thread with mddev_lock held.
+ * It makes config changes to the device.
+ */
+static void raid5_finish_reshape(mddev_t *mddev)
+{
+       struct block_device *bdev;
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+
+               if (mddev->delta_disks > 0) {
+                       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+                       set_capacity(mddev->gendisk, mddev->array_sectors);
+                       mddev->changed = 1;
+
+                       bdev = bdget_disk(mddev->gendisk, 0);
+                       if (bdev) {
+                               mutex_lock(&bdev->bd_inode->i_mutex);
+                               i_size_write(bdev->bd_inode,
+                                            (loff_t)mddev->array_sectors << 9);
+                               mutex_unlock(&bdev->bd_inode->i_mutex);
+                               bdput(bdev);
+                       }
+               } else {
+                       int d;
+                       mddev->degraded = conf->raid_disks;
+                       for (d = 0; d < conf->raid_disks ; d++)
+                               if (conf->disks[d].rdev &&
+                                   test_bit(In_sync,
+                                            &conf->disks[d].rdev->flags))
+                                       mddev->degraded--;
+                       for (d = conf->raid_disks ;
+                            d < conf->raid_disks - mddev->delta_disks;
+                            d++)
+                               raid5_remove_disk(mddev, d);
+               }
+               mddev->layout = conf->algorithm;
+               mddev->chunk_size = conf->chunk_size;
+               mddev->reshape_position = MaxSector;
+               mddev->delta_disks = 0;
+       }
+}
+
  static void raid5_quiesce(mddev_t *mddev, int state)
  {
         raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4880,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
         }
  }
  
+
+static void *raid5_takeover_raid1(mddev_t *mddev)
+{
+       int chunksect;
+
+       if (mddev->raid_disks != 2 ||
+           mddev->degraded > 1)
+               return ERR_PTR(-EINVAL);
+
+       /* Should check if there are write-behind devices? */
+
+       chunksect = 64*2; /* 64K by default */
+
+       /* The array must be an exact multiple of chunksize */
+       while (chunksect && (mddev->array_sectors & (chunksect-1)))
+               chunksect >>= 1;
+
+       if ((chunksect<<9) < STRIPE_SIZE)
+               /* array size does not allow a suitable chunk size */
+               return ERR_PTR(-EINVAL);
+
+       mddev->new_level = 5;
+       mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+       mddev->new_chunk = chunksect << 9;
+
+       return setup_conf(mddev);
+}
+
+static void *raid5_takeover_raid6(mddev_t *mddev)
+{
+       int new_layout;
+
+       switch (mddev->layout) {
+       case ALGORITHM_LEFT_ASYMMETRIC_6:
+               new_layout = ALGORITHM_LEFT_ASYMMETRIC;
+               break;
+       case ALGORITHM_RIGHT_ASYMMETRIC_6:
+               new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
+               break;
+       case ALGORITHM_LEFT_SYMMETRIC_6:
+               new_layout = ALGORITHM_LEFT_SYMMETRIC;
+               break;
+       case ALGORITHM_RIGHT_SYMMETRIC_6:
+               new_layout = ALGORITHM_RIGHT_SYMMETRIC;
+               break;
+       case ALGORITHM_PARITY_0_6:
+               new_layout = ALGORITHM_PARITY_0;
+               break;
+       case ALGORITHM_PARITY_N:
+               new_layout = ALGORITHM_PARITY_N;
+               break;
+       default:
+               return ERR_PTR(-EINVAL);
+       }
+       mddev->new_level = 5;
+       mddev->new_layout = new_layout;
+       mddev->delta_disks = -1;
+       mddev->raid_disks -= 1;
+       return setup_conf(mddev);
+}
+
+
+static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+       /* For a 2-drive array, the layout and chunk size can be changed
+        * immediately as no restriping is needed.
+        * For larger arrays we record the new value - after validation
+        * to be used by a reshape pass.
+        */
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+               return -EINVAL;
+       if (new_chunk > 0) {
+               if (new_chunk & (new_chunk-1))
+                       /* not a power of 2 */
+                       return -EINVAL;
+               if (new_chunk < PAGE_SIZE)
+                       return -EINVAL;
+               if (mddev->array_sectors & ((new_chunk>>9)-1))
+                       /* not factor of array size */
+                       return -EINVAL;
+       }
+
+       /* They look valid */
+
+       if (mddev->raid_disks == 2) {
+
+               if (new_layout >= 0) {
+                       conf->algorithm = new_layout;
+                       mddev->layout = mddev->new_layout = new_layout;
+               }
+               if (new_chunk > 0) {
+                       conf->chunk_size = new_chunk;
+                       mddev->chunk_size = mddev->new_chunk = new_chunk;
+               }
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               md_wakeup_thread(mddev->thread);
+       } else {
+               if (new_layout >= 0)
+                       mddev->new_layout = new_layout;
+               if (new_chunk > 0)
+                       mddev->new_chunk = new_chunk;
+       }
+       return 0;
+}
+
+static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+       if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+               return -EINVAL;
+       if (new_chunk > 0) {
+               if (new_chunk & (new_chunk-1))
+                       /* not a power of 2 */
+                       return -EINVAL;
+               if (new_chunk < PAGE_SIZE)
+                       return -EINVAL;
+               if (mddev->array_sectors & ((new_chunk>>9)-1))
+                       /* not factor of array size */
+                       return -EINVAL;
+       }
+
+       /* They look valid */
+
+       if (new_layout >= 0)
+               mddev->new_layout = new_layout;
+       if (new_chunk > 0)
+               mddev->new_chunk = new_chunk;
+
+       return 0;
+}
+
+static void *raid5_takeover(mddev_t *mddev)
+{
+       /* raid5 can take over:
+        *  raid0 - if all devices are the same - make it a raid4 layout
+        *  raid1 - if there are two drives.  We need to know the chunk size
+        *  raid4 - trivial - just use a raid4 layout.
+        *  raid6 - Providing it is a *_6 layout
+        *
+        * For now, raid1, raid4 and raid6 are handled; raid0 is not.
+        */
+
+       if (mddev->level == 1)
+               return raid5_takeover_raid1(mddev);
+       if (mddev->level == 4) {
+               mddev->new_layout = ALGORITHM_PARITY_N;
+               mddev->new_level = 5;
+               return setup_conf(mddev);
+       }
+       if (mddev->level == 6)
+               return raid5_takeover_raid6(mddev);
+
+       return ERR_PTR(-EINVAL);
+}
+
+
+static struct mdk_personality raid5_personality;
+
+static void *raid6_takeover(mddev_t *mddev)
+{
+       /* Currently can only take over a raid5.  We map the
+        * raid5 layout to the equivalent raid6 layout
+        * with the Q block at the end.
+        */
+       int new_layout;
+
+       if (mddev->pers != &raid5_personality)
+               return ERR_PTR(-EINVAL);
+       if (mddev->degraded > 1)
+               return ERR_PTR(-EINVAL);
+       if (mddev->raid_disks > 253)
+               return ERR_PTR(-EINVAL);
+       if (mddev->raid_disks < 3)
+               return ERR_PTR(-EINVAL);
+
+       switch (mddev->layout) {
+       case ALGORITHM_LEFT_ASYMMETRIC:
+               new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+               break;
+       case ALGORITHM_RIGHT_ASYMMETRIC:
+               new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+               break;
+       case ALGORITHM_LEFT_SYMMETRIC:
+               new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
+               break;
+       case ALGORITHM_RIGHT_SYMMETRIC:
+               new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+               break;
+       case ALGORITHM_PARITY_0:
+               new_layout = ALGORITHM_PARITY_0_6;
+               break;
+       case ALGORITHM_PARITY_N:
+               new_layout = ALGORITHM_PARITY_N;
+               break;
+       default:
+               return ERR_PTR(-EINVAL);
+       }
+       mddev->new_level = 6;
+       mddev->new_layout = new_layout;
+       mddev->delta_disks = 1;
+       mddev->raid_disks += 1;
+       return setup_conf(mddev);
+}
+
+
  static struct mdk_personality raid6_personality =
  {
         .name           = "raid6",
@@ -4895,11 +5304,13 @@ static struct mdk_personality raid6_personality =
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+       .size           = raid5_size,
         .check_reshape  = raid5_check_reshape,
         .start_reshape  = raid5_start_reshape,
-#endif
+       .finish_reshape = raid5_finish_reshape,
         .quiesce        = raid5_quiesce,
+       .takeover       = raid6_takeover,
+       .reconfig       = raid6_reconfig,
  };
  static struct mdk_personality raid5_personality =
  {
@@ -4916,11 +5327,13 @@ static struct mdk_personality raid5_personality =
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+       .size           = raid5_size,
         .check_reshape  = raid5_check_reshape,
         .start_reshape  = raid5_start_reshape,
-#endif
+       .finish_reshape = raid5_finish_reshape,
         .quiesce        = raid5_quiesce,
+       .takeover       = raid5_takeover,
+       .reconfig       = raid5_reconfig,
  };
  
  static struct mdk_personality raid4_personality =
@@ -4938,20 +5351,15 @@ static struct mdk_personality raid4_personality =
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+       .size           = raid5_size,
         .check_reshape  = raid5_check_reshape,
         .start_reshape  = raid5_start_reshape,
-#endif
+       .finish_reshape = raid5_finish_reshape,
         .quiesce        = raid5_quiesce,
  };
  
  static int __init raid5_init(void)
  {
-       int e;
-
-       e = raid6_select_algo();
-       if ( e )
-               return e;
         register_md_personality(&raid6_personality);
         register_md_personality(&raid5_personality);
         register_md_personality(&raid4_personality);