init_stripe(sh, sector, previous);
} else {
if (atomic_read(&sh->count)) {
- BUG_ON(!list_empty(&sh->lru));
+ BUG_ON(!list_empty(&sh->lru)
+ && !test_bit(STRIPE_EXPANDING, &sh->state));
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
return 0;
}
-#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize)
{
/* Make all the stripes able to hold 'newsize' devices.
conf->pool_size = newsize;
return err;
}
-#endif
static int drop_one_stripe(raid5_conf_t *conf)
{
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
+ struct stripe_head *sh2
+ = get_active_stripe(conf, sh->sector, 1, 1);
+ if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+ /* sh cannot be written until sh2 has been read,
+ * so arrange for sh to be delayed a little.
+ */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh2->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe(sh2);
+ goto unlock;
+ }
+ if (sh2)
+ release_stripe(sh2);
+
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
for (i = conf->raid_disks; i--; ) {
}
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ struct stripe_head *sh2
+ = get_active_stripe(conf, sh->sector, 1, 1);
+ if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+ /* sh cannot be written until sh2 has been read,
+ * so arrange for sh to be delayed a little.
+ */
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+ &sh2->state))
+ atomic_inc(&conf->preread_active_stripes);
+ release_stripe(sh2);
+ goto unlock;
+ }
+ if (sh2)
+ release_stripe(sh2);
+
/* Need to write out all blocks after computing P&Q */
sh->disks = conf->raid_disks;
stripe_set_idx(sh->sector, conf, 0, sh);
retry:
previous = 0;
+ disks = conf->raid_disks;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
- if (likely(conf->reshape_progress == MaxSector))
- disks = conf->raid_disks;
- else {
+ if (unlikely(conf->reshape_progress != MaxSector)) {
/* spinlock is needed as reshape_progress may be
* 64bit on a 32bit platform, and so it might be
* possible to see a half-updated value
* to check again.
*/
spin_lock_irq(&conf->device_lock);
- disks = conf->raid_disks;
if (mddev->delta_disks < 0
? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) {
sh = get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK));
if (sh) {
- if (unlikely(conf->reshape_progress != MaxSector)) {
+ if (unlikely(previous)) {
/* expansion might have moved on while waiting for a
* stripe, so we must do the range check again.
* Expansion could still move past after this
*/
int must_retry = 0;
spin_lock_irq(&conf->device_lock);
- if ((mddev->delta_disks < 0
- ? logical_sector >= conf->reshape_progress
- : logical_sector < conf->reshape_progress)
- && previous)
+ if (mddev->delta_disks < 0
+ ? logical_sector >= conf->reshape_progress
+ : logical_sector < conf->reshape_progress)
/* mismatch, need to try again */
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
int new_data_disks = conf->raid_disks - conf->max_degraded;
int i;
int dd_idx;
- sector_t writepos, safepos, gap;
+ sector_t writepos, readpos, safepos;
sector_t stripe_addr;
int reshape_sectors;
+ struct list_head stripes;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
*/
writepos = conf->reshape_progress;
sector_div(writepos, new_data_disks);
+ readpos = conf->reshape_progress;
+ sector_div(readpos, data_disks);
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->delta_disks < 0) {
writepos -= reshape_sectors;
+ readpos += reshape_sectors;
safepos += reshape_sectors;
- gap = conf->reshape_safe - conf->reshape_progress;
} else {
writepos += reshape_sectors;
+ readpos -= reshape_sectors;
safepos -= reshape_sectors;
- gap = conf->reshape_progress - conf->reshape_safe;
}
+ /* 'writepos' is the most advanced device address we might write.
+ * 'readpos' is the least advanced device address we might read.
+ * 'safepos' is the least address recorded in the metadata as having
+ * been reshaped.
+ * If 'readpos' is behind 'writepos', then there is no way that we can
+ * ensure safety in the face of a crash - that must be done by userspace
+ * making a backup of the data. So in that case there is no particular
+ * rush to update metadata.
+ * Otherwise if 'safepos' is behind 'writepos', then we really need to
+ * update the metadata to advance 'safepos' to match 'readpos' so that
+ * we can be safe in the event of a crash.
+ * So we insist on updating metadata if safepos is behind writepos and
+ * readpos is beyond writepos.
+ * In any case, update the metadata every 10 seconds.
+ * Maybe that number should be configurable, but I'm not sure it is
+ * worth it.... maybe it could be a multiple of safemode_delay???
+ */
if ((mddev->delta_disks < 0
- ? writepos < safepos
- : writepos > safepos) ||
- gap > (new_data_disks)*3000*2 /*3Meg*/) {
+ ? (safepos > writepos && readpos < writepos)
+ : (safepos < writepos && readpos > writepos)) ||
+ time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes)==0);
mddev->reshape_position = conf->reshape_progress;
+ conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->flags == 0 ||
BUG_ON(writepos != sector_nr + reshape_sectors);
stripe_addr = sector_nr;
}
+ INIT_LIST_HEAD(&stripes);
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped = 0;
set_bit(STRIPE_EXPAND_READY, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
- release_stripe(sh);
+ list_add(&sh->lru, &stripes);
}
spin_lock_irq(&conf->device_lock);
if (mddev->delta_disks < 0)
release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
+ /* Now that the sources are clearly marked, we can release
+ * the destination stripes
+ */
+ while (!list_empty(&stripes)) {
+ sh = list_entry(stripes.next, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ release_stripe(sh);
+ }
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
*/
wait_event(conf->wait_for_overlap,
atomic_read(&conf->reshape_stripes) == 0);
mddev->reshape_position = conf->reshape_progress;
+ conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
return 0;
}
-#ifdef CONFIG_MD_RAID5_RESHAPE
static int raid5_check_reshape(mddev_t *mddev)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}
+ conf->reshape_checkpoint = jiffies;
md_wakeup_thread(mddev->sync_thread);
md_new_event(mddev);
return 0;
}
-#endif
/* This is called from the reshape thread and should make any
* changes needed in 'conf'
conf->previous_raid_disks = conf->raid_disks;
conf->reshape_progress = MaxSector;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_for_overlap);
/* read-ahead size must cover two whole stripes, which is
* 2 * (datadisks) * chunksize where 'n' is the number of raid devices
.sync_request = sync_request,
.resize = raid5_resize,
.size = raid5_size,
-#ifdef CONFIG_MD_RAID5_RESHAPE
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
-#endif
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.reconfig = raid6_reconfig,
.sync_request = sync_request,
.resize = raid5_resize,
.size = raid5_size,
-#ifdef CONFIG_MD_RAID5_RESHAPE
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
-#endif
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.reconfig = raid5_reconfig,
.sync_request = sync_request,
.resize = raid5_resize,
.size = raid5_size,
-#ifdef CONFIG_MD_RAID5_RESHAPE
.check_reshape = raid5_check_reshape,
.start_reshape = raid5_start_reshape,
.finish_reshape = raid5_finish_reshape,
-#endif
.quiesce = raid5_quiesce,
};