Pull ec into release branch

[pandora-kernel.git] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 467c169..8d59914 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1050,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method)
  static void compute_parity6(struct stripe_head *sh, int method)
  {
         raid6_conf_t *conf = sh->raid_conf;
-       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
         struct bio *chosen;
         /**** FIX THIS: This could be very bad if disks is close to 256 ****/
         void *ptrs[disks];
@@ -1131,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method)
  /* Compute one missing block */
  static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
  {
-       raid6_conf_t *conf = sh->raid_conf;
-       int i, count, disks = conf->raid_disks;
+       int i, count, disks = sh->disks;
         void *ptr[MAX_XOR_BLOCKS], *p;
         int pd_idx = sh->pd_idx;
         int qd_idx = raid6_next_disk(pd_idx, disks);
@@ -1170,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
  /* Compute two missing blocks */
  static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
  {
-       raid6_conf_t *conf = sh->raid_conf;
-       int i, count, disks = conf->raid_disks;
+       int i, count, disks = sh->disks;
         int pd_idx = sh->pd_idx;
         int qd_idx = raid6_next_disk(pd_idx, disks);
         int d0_idx = raid6_next_disk(qd_idx, disks);
@@ -1887,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh)
  static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
  {
         raid6_conf_t *conf = sh->raid_conf;
-       int disks = conf->raid_disks;
+       int disks = sh->disks;
         struct bio *return_bi= NULL;
         struct bio *bi;
         int i;
-       int syncing;
+       int syncing, expanding, expanded;
         int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
         int non_overwrite = 0;
         int failed_num[2] = {0, 0};
@@ -1909,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         clear_bit(STRIPE_DELAYED, &sh->state);
  
         syncing = test_bit(STRIPE_SYNCING, &sh->state);
+       expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+       expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
         /* Now to look around and see what can be done */
  
         rcu_read_lock();
@@ -2114,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
          * parity, or to satisfy requests
          * or to load a block that is being partially written.
          */
-       if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
+       if (to_read || non_overwrite || (to_write && failed) ||
+           (syncing && (uptodate < disks)) || expanding) {
                 for (i=disks; i--;) {
                         dev = &sh->dev[i];
                         if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
                             (dev->toread ||
                              (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
                              syncing ||
+                            expanding ||
                              (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
                              (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
                                     )
@@ -2355,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                 }
                         }
                 }
+
+       if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+               /* Need to write out all blocks after computing P&Q */
+               sh->disks = conf->raid_disks;
+               sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
+                                            conf->raid_disks);
+               compute_parity6(sh, RECONSTRUCT_WRITE);
+               for (i = conf->raid_disks ; i-- ;  ) {
+                       set_bit(R5_LOCKED, &sh->dev[i].flags);
+                       locked++;
+                       set_bit(R5_Wantwrite, &sh->dev[i].flags);
+               }
+               clear_bit(STRIPE_EXPANDING, &sh->state);
+       } else if (expanded) {
+               clear_bit(STRIPE_EXPAND_READY, &sh->state);
+               atomic_dec(&conf->reshape_stripes);
+               wake_up(&conf->wait_for_overlap);
+               md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+       }
+
+       if (expanding && locked == 0) {
+               /* We have read all the blocks in this stripe and now we need to
+                * copy some of them into a target stripe for expand.
+                */
+               clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+               for (i = 0; i < sh->disks ; i++)
+                       if (i != pd_idx && i != qd_idx) {
+                               int dd_idx2, pd_idx2, j;
+                               struct stripe_head *sh2;
+
+                               sector_t bn = compute_blocknr(sh, i);
+                               sector_t s = raid5_compute_sector(
+                                       bn, conf->raid_disks,
+                                       conf->raid_disks - conf->max_degraded,
+                                       &dd_idx2, &pd_idx2, conf);
+                               sh2 = get_active_stripe(conf, s,
+                                                       conf->raid_disks,
+                                                      pd_idx2, 1);
+                               if (sh2 == NULL)
+                                       /* so far only the early blocks of
+                                        * this stripe have been requested.
+                                        * When later blocks get requested, we
+                                        * will try again
+                                        */
+                                       continue;
+                               if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+                                   test_bit(R5_Expanded,
+                                            &sh2->dev[dd_idx2].flags)) {
+                                       /* must have already done this block */
+                                       release_stripe(sh2);
+                                       continue;
+                               }
+                               memcpy(page_address(sh2->dev[dd_idx2].page),
+                                      page_address(sh->dev[i].page),
+                                      STRIPE_SIZE);
+                               set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags);
+                               set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags);
+                               for (j = 0 ; j < conf->raid_disks ; j++)
+                                       if (j != sh2->pd_idx &&
+                                           j != raid6_next_disk(sh2->pd_idx,
+                                                          sh2->disks) &&
+                                           !test_bit(R5_Expanded,
+                                                     &sh2->dev[j].flags))
+                                               break;
+                               if (j == conf->raid_disks) {
+                                       set_bit(STRIPE_EXPAND_READY,
+                                               &sh2->state);
+                                       set_bit(STRIPE_HANDLE, &sh2->state);
+                               }
+                               release_stripe(sh2);
+                       }
+       }
+
         spin_unlock(&sh->lock);
  
         while ((bi=return_bi)) {
@@ -2395,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 rcu_read_unlock();
  
                 if (rdev) {
-                       if (syncing)
+                       if (syncing || expanding || expanded)
                                 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
  
                         bi->bi_bdev = rdev->bdev;
@@ -2620,7 +2695,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
         }
         bi = conf->retry_read_aligned_list;
         if(bi) {
-               conf->retry_read_aligned = bi->bi_next;
+               conf->retry_read_aligned_list = bi->bi_next;
                 bi->bi_next = NULL;
                 bi->bi_phys_segments = 1; /* biased count of active stripes */
                 bi->bi_hw_segments = 0; /* count of processed stripes */
@@ -2669,6 +2744,27 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error)
         return 0;
  }
  
+static int bio_fits_rdev(struct bio *bi) /* returns 1 if bi fits, else 0 */
+{
+       request_queue_t *q = bdev_get_queue(bi->bi_bdev);
+
+       if ((bi->bi_size>>9) > q->max_sectors)
+               return 0; /* larger than the queue's max_sectors */
+       blk_recount_segments(q, bi); /* presumably refreshes segment counts */
+       if (bi->bi_phys_segments > q->max_phys_segments ||
+           bi->bi_hw_segments > q->max_hw_segments)
+               return 0; /* too many phys or hw segments for the queue */
+
+       if (q->merge_bvec_fn)
+               /* it's too hard to apply the merge_bvec_fn at this stage,
+                * just give up
+                */
+               return 0;
+
+       return 1; /* passed every check above */
+}
+
+
  static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
  {
         mddev_t *mddev = q->queuedata;
@@ -2715,6 +2811,13 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio)
                 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
                 align_bi->bi_sector += rdev->data_offset;
  
+               if (!bio_fits_rdev(align_bi)) {
+                       /* too big in some way */
+                       bio_put(align_bi);
+                       rdev_dec_pending(rdev, mddev);
+                       return 0;
+               }
+
                 spin_lock_irq(&conf->device_lock);
                 wait_event_lock_irq(conf->wait_for_stripe,
                                     conf->quiesce == 0,
@@ -2887,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         struct stripe_head *sh;
         int pd_idx;
         sector_t first_sector, last_sector;
-       int raid_disks;
-       int data_disks;
+       int raid_disks = conf->previous_raid_disks;
+       int data_disks = raid_disks - conf->max_degraded;
+       int new_data_disks = conf->raid_disks - conf->max_degraded;
         int i;
         int dd_idx;
         sector_t writepos, safepos, gap;
@@ -2897,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
             conf->expand_progress != 0) {
                 /* restarting in the middle, skip the initial sectors */
                 sector_nr = conf->expand_progress;
-               sector_div(sector_nr, conf->raid_disks-1);
+               sector_div(sector_nr, new_data_disks);
                 *skipped = 1;
                 return sector_nr;
         }
@@ -2911,14 +3015,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
          * to after where expand_lo old_maps to
          */
         writepos = conf->expand_progress +
-               conf->chunk_size/512*(conf->raid_disks-1);
-       sector_div(writepos, conf->raid_disks-1);
+               conf->chunk_size/512*(new_data_disks);
+       sector_div(writepos, new_data_disks);
         safepos = conf->expand_lo;
-       sector_div(safepos, conf->previous_raid_disks-1);
+       sector_div(safepos, data_disks);
         gap = conf->expand_progress - conf->expand_lo;
  
         if (writepos >= safepos ||
-           gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+           gap > (new_data_disks)*3000*2 /*3Meg*/) {
                 /* Cannot proceed until we've updated the superblock... */
                 wait_event(conf->wait_for_overlap,
                            atomic_read(&conf->reshape_stripes)==0);
@@ -2948,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                         sector_t s;
                         if (j == sh->pd_idx)
                                 continue;
+                       if (conf->level == 6 &&
+                           j == raid6_next_disk(sh->pd_idx, sh->disks))
+                               continue;
                         s = compute_blocknr(sh, j);
                         if (s < (mddev->array_size<<1)) {
                                 skipped = 1;
@@ -2964,28 +3071,27 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                 release_stripe(sh);
         }
         spin_lock_irq(&conf->device_lock);
-       conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+       conf->expand_progress = (sector_nr + i) * new_data_disks;
         spin_unlock_irq(&conf->device_lock);
         /* Ok, those stripe are ready. We can start scheduling
          * reads on the source stripes.
          * The source stripes are determined by mapping the first and last
          * block on the destination stripes.
          */
-       raid_disks = conf->previous_raid_disks;
-       data_disks = raid_disks - 1;
         first_sector =
-               raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+               raid5_compute_sector(sector_nr*(new_data_disks),
                                      raid_disks, data_disks,
                                      &dd_idx, &pd_idx, conf);
         last_sector =
                 raid5_compute_sector((sector_nr+conf->chunk_size/512)
-                                    *(conf->raid_disks-1) -1,
+                                    *(new_data_disks) -1,
                                      raid_disks, data_disks,
                                      &dd_idx, &pd_idx, conf);
         if (last_sector >= (mddev->size<<1))
                 last_sector = (mddev->size<<1)-1;
         while (first_sector <= last_sector) {
-               pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+               pd_idx = stripe_to_pdidx(first_sector, conf,
+                                        conf->previous_raid_disks);
                 sh = get_active_stripe(conf, first_sector,
                                        conf->previous_raid_disks, pd_idx, 0);
                 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
@@ -3107,7 +3213,9 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
         last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
  
         for (; logical_sector < last_sector;
-            logical_sector += STRIPE_SECTORS, scnt++) {
+            logical_sector += STRIPE_SECTORS,
+                    sector += STRIPE_SECTORS,
+                    scnt++) {
  
                 if (scnt < raid_bio->bi_hw_segments)
                         /* already done this stripe */
@@ -3123,7 +3231,13 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                 }
  
                 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
-               add_stripe_bio(sh, raid_bio, dd_idx, 0);
+               if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+                       release_stripe(sh);
+                       raid_bio->bi_hw_segments = scnt;
+                       conf->retry_read_aligned = raid_bio;
+                       return handled;
+               }
+
                 handle_stripe(sh, NULL);
                 release_stripe(sh);
                 handled++;
@@ -3312,35 +3426,44 @@ static int run(mddev_t *mddev)
                  */
                 sector_t here_new, here_old;
                 int old_disks;
+               int max_degraded = (mddev->level == 5 ? 1 : 2);
  
                 if (mddev->new_level != mddev->level ||
                     mddev->new_layout != mddev->layout ||
                     mddev->new_chunk != mddev->chunk_size) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+                       printk(KERN_ERR "raid5: %s: unsupported reshape "
+                              "required - aborting.\n",
                                mdname(mddev));
                         return -EINVAL;
                 }
                 if (mddev->delta_disks <= 0) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+                       printk(KERN_ERR "raid5: %s: unsupported reshape "
+                              "(reduce disks) required - aborting.\n",
                                mdname(mddev));
                         return -EINVAL;
                 }
                 old_disks = mddev->raid_disks - mddev->delta_disks;
                 /* reshape_position must be on a new-stripe boundary, and one
-                * further up in new geometry must map after here in old geometry.
+                * further up in new geometry must map after here in old
+                * geometry.
                  */
                 here_new = mddev->reshape_position;
-               if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
-                       printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+               if (sector_div(here_new, (mddev->chunk_size>>9)*
+                              (mddev->raid_disks - max_degraded))) {
+                       printk(KERN_ERR "raid5: reshape_position not "
+                              "on a stripe boundary\n");
                         return -EINVAL;
                 }
                 /* here_new is the stripe we will write to */
                 here_old = mddev->reshape_position;
-               sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
-               /* here_old is the first stripe that we might need to read from */
+               sector_div(here_old, (mddev->chunk_size>>9)*
+                          (old_disks-max_degraded));
+               /* here_old is the first stripe that we might need to read
+                * from */
                 if (here_new >= here_old) {
                         /* Reading from the same stripe as writing to - bad */
-                       printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+                       printk(KERN_ERR "raid5: reshape_position too early for "
+                              "auto-recovery - aborting.\n");
                         return -EINVAL;
                 }
                 printk(KERN_INFO "raid5: reshape will continue\n");
@@ -3519,12 +3642,15 @@ static int run(mddev_t *mddev)
         }
  
         /* Ok, everything is just fine now */
-       sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
+       if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
+               printk(KERN_WARNING
+                      "raid5: failed to create sysfs attributes for %s\n",
+                      mdname(mddev));
  
         mddev->queue->unplug_fn = raid5_unplug_device;
         mddev->queue->issue_flush_fn = raid5_issue_flush;
-       mddev->queue->backing_dev_info.congested_fn = raid5_congested;
         mddev->queue->backing_dev_info.congested_data = mddev;
+       mddev->queue->backing_dev_info.congested_fn = raid5_congested;
  
         mddev->array_size =  mddev->size * (conf->previous_raid_disks -
                                             conf->max_degraded);
@@ -3555,6 +3681,7 @@ static int stop(mddev_t *mddev)
         mddev->thread = NULL;
         shrink_stripes(conf);
         kfree(conf->stripe_hashtbl);
+       mddev->queue->backing_dev_info.congested_fn = NULL;
         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
         sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
         kfree(conf->disks);
@@ -3778,6 +3905,8 @@ static int raid5_check_reshape(mddev_t *mddev)
         if (err)
                 return err;
  
+       if (mddev->degraded > conf->max_degraded)
+               return -EINVAL;
         /* looks like we might be able to manage this */
         return 0;
  }
@@ -3791,8 +3920,7 @@ static int raid5_start_reshape(mddev_t *mddev)
         int added_devices = 0;
         unsigned long flags;
  
-       if (mddev->degraded ||
-           test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+       if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                 return -EBUSY;
  
         ITERATE_RDEV(mddev, rdev, rtmp)
@@ -3800,7 +3928,7 @@ static int raid5_start_reshape(mddev_t *mddev)
                     !test_bit(Faulty, &rdev->flags))
                         spares++;
  
-       if (spares < mddev->delta_disks-1)
+       if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
                 /* Not enough devices even to make a degraded array
                  * of that size
                  */
@@ -3826,7 +3954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
                                 added_devices++;
                                 rdev->recovery_offset = 0;
                                 sprintf(nm, "rd%d", rdev->raid_disk);
-                               sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+                               if (sysfs_create_link(&mddev->kobj,
+                                                     &rdev->kobj, nm))
+                                       printk(KERN_WARNING
+                                              "raid5: failed to create "
+                                              " link %s for %s\n",
+                                              nm, mdname(mddev));
                         } else
                                 break;
                 }
@@ -3863,7 +3996,8 @@ static void end_reshape(raid5_conf_t *conf)
         struct block_device *bdev;
  
         if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-               conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+               conf->mddev->array_size = conf->mddev->size *
+                       (conf->raid_disks - conf->max_degraded);
                 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
                 conf->mddev->changed = 1;
  
@@ -3936,6 +4070,10 @@ static struct mdk_personality raid6_personality =
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+       .check_reshape  = raid5_check_reshape,
+       .start_reshape  = raid5_start_reshape,
+#endif
         .quiesce        = raid5_quiesce,
  };
  static struct mdk_personality raid5_personality =
@@ -3975,6 +4113,10 @@ static struct mdk_personality raid4_personality =
         .spare_active   = raid5_spare_active,
         .sync_request   = sync_request,
         .resize         = raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+       .check_reshape  = raid5_check_reshape,
+       .start_reshape  = raid5_start_reshape,
+#endif
         .quiesce        = raid5_quiesce,
  };