*
* RAID-10 support for md.
*
- * Base on code in raid1.c. See raid1.c for futher copyright information.
+ * Base on code in raid1.c. See raid1.c for further copyright information.
*
*
* This program is free software; you can redistribute it and/or modify
*/
#define NR_RAID10_BIOS 256
-static void unplug_slaves(mddev_t *mddev);
-
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
conf_t *conf = data;
- r10bio_t *r10_bio;
int size = offsetof(struct r10bio_s, devs[conf->copies]);
/* allocate a r10bio with room for raid_disks entries in the bios array */
- r10_bio = kzalloc(size, gfp_flags);
- if (!r10_bio && conf->mddev)
- unplug_slaves(conf->mddev);
-
- return r10_bio;
+ return kzalloc(size, gfp_flags);
}
static void r10bio_pool_free(void *r10_bio, void *data)
int nalloc;
r10_bio = r10bio_pool_alloc(gfp_flags, conf);
- if (!r10_bio) {
- unplug_slaves(conf->mddev);
+ if (!r10_bio)
return NULL;
- }
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
nalloc = conf->copies; /* resync */
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
+ rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
} else {
/*
- * oops, read error:
+ * oops, read error - keep the refcount on the rdev
*/
char b[BDEVNAME_SIZE];
if (printk_ratelimit())
bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
reschedule_retry(r10_bio);
}
-
- rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
static void raid10_end_write_request(struct bio *bio, int error)
/*
* RAID10 layout manager
- * Aswell as the chunksize and raid_disks count, there are two
+ * As well as the chunksize and raid_disks count, there are two
* parameters: near_copies and far_copies.
* near_copies * far_copies must be <= raid_disks.
* Normally one of these will be 1.
* If both are 1, we get raid0.
* If near_copies == raid_disks, we get raid1.
*
- * Chunks are layed out in raid0 style with near_copies copies of the
+ * Chunks are laid out in raid0 style with near_copies copies of the
* first chunk, followed by near_copies copies of the next chunk and
* so on.
* If far_copies > 1, then after 1/far_copies of the array has been assigned
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
const sector_t this_sector = r10_bio->sector;
- int disk, slot, nslot;
+ int disk, slot;
const int sectors = r10_bio->sectors;
- sector_t new_distance, current_distance;
+ sector_t new_distance, best_dist;
mdk_rdev_t *rdev;
+ int do_balance;
+ int best_slot;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
+retry:
+ best_slot = -1;
+ best_dist = MaxSector;
+ do_balance = 1;
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on (recovery is ok), or below
* above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
- && (this_sector + sectors >= conf->next_resync)) {
- /* make sure that disk is operational */
- slot = 0;
- disk = r10_bio->devs[slot].devnum;
-
- while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
- r10_bio->devs[slot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags)) {
- slot++;
- if (slot == conf->copies) {
- slot = 0;
- disk = -1;
- break;
- }
- disk = r10_bio->devs[slot].devnum;
- }
- goto rb_out;
- }
+ && (this_sector + sectors >= conf->next_resync))
+ do_balance = 0;
-
- /* make sure the disk is operational */
- slot = 0;
- disk = r10_bio->devs[slot].devnum;
- while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
- r10_bio->devs[slot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags)) {
- slot ++;
- if (slot == conf->copies) {
- disk = -1;
- goto rb_out;
- }
+ for (slot = 0; slot < conf->copies ; slot++) {
+ if (r10_bio->devs[slot].bio == IO_BLOCKED)
+ continue;
disk = r10_bio->devs[slot].devnum;
- }
-
-
- current_distance = abs(r10_bio->devs[slot].addr -
- conf->mirrors[disk].head_position);
-
- /* Find the disk whose head is closest,
- * or - for far > 1 - find the closest to partition beginning */
-
- for (nslot = slot; nslot < conf->copies; nslot++) {
- int ndisk = r10_bio->devs[nslot].devnum;
-
-
- if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
- r10_bio->devs[nslot].bio == IO_BLOCKED ||
- !test_bit(In_sync, &rdev->flags))
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (rdev == NULL)
continue;
+ if (!test_bit(In_sync, &rdev->flags))
+ continue;
+
+ if (!do_balance)
+ break;
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
- if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
- disk = ndisk;
- slot = nslot;
+ if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
break;
- }
/* for far > 1 always use the lowest address */
if (conf->far_copies > 1)
- new_distance = r10_bio->devs[nslot].addr;
+ new_distance = r10_bio->devs[slot].addr;
else
- new_distance = abs(r10_bio->devs[nslot].addr -
- conf->mirrors[ndisk].head_position);
- if (new_distance < current_distance) {
- current_distance = new_distance;
- disk = ndisk;
- slot = nslot;
+ new_distance = abs(r10_bio->devs[slot].addr -
+ conf->mirrors[disk].head_position);
+ if (new_distance < best_dist) {
+ best_dist = new_distance;
+ best_slot = slot;
}
}
+ if (slot == conf->copies)
+ slot = best_slot;
-rb_out:
- r10_bio->read_slot = slot;
-/* conf->next_seq_sect = this_sector + sectors;*/
-
- if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
- atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
- else
+ if (slot >= 0) {
+ disk = r10_bio->devs[slot].devnum;
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* Cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ r10_bio->read_slot = slot;
+ } else
disk = -1;
rcu_read_unlock();
return disk;
}
-static void unplug_slaves(mddev_t *mddev)
-{
- conf_t *conf = mddev->private;
- int i;
-
- rcu_read_lock();
- for (i=0; i < conf->raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
- struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
-
- blk_unplug(r_queue);
-
- rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
-}
-
-static void raid10_unplug(struct request_queue *q)
-{
- mddev_t *mddev = q->queuedata;
-
- unplug_slaves(q->queuedata);
- md_wakeup_thread(mddev->thread);
-}
-
static int raid10_congested(void *data, int bits)
{
mddev_t *mddev = data;
return ret;
}
-static int flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(conf_t *conf)
{
/* Any writes that have been queued but are awaiting
* bitmap updates get flushed here.
- * We return 1 if any requests were actually submitted.
*/
- int rv = 0;
-
spin_lock_irq(&conf->device_lock);
if (conf->pending_bio_list.head) {
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
- /* Spinlock only taken to quiet a warning */
- spin_lock(conf->mddev->queue->queue_lock);
- blk_remove_plug(conf->mddev->queue);
- spin_unlock(conf->mddev->queue->queue_lock);
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to disk
* before proceeding w/ I/O */
generic_make_request(bio);
bio = next;
}
- rv = 1;
} else
spin_unlock_irq(&conf->device_lock);
- return rv;
}
+
/* Barriers....
* Sometimes we need to suspend IO while we do something else,
* either some resync/recovery, or reconfigure the array.
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
- conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ conf->resync_lock, );
/* block any new IO from starting */
conf->barrier++;
- /* No wait for all pending IO to complete */
+ /* Now wait for all pending IO to complete */
wait_event_lock_irq(conf->wait_barrier,
!conf->nr_pending && conf->barrier < RESYNC_DEPTH,
- conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ conf->resync_lock, );
spin_unlock_irq(&conf->resync_lock);
}
conf->nr_waiting++;
wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
conf->resync_lock,
- raid10_unplug(conf->mddev->queue));
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
wait_event_lock_irq(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+1,
conf->resync_lock,
- ({ flush_pending_writes(conf);
- raid10_unplug(conf->mddev->queue); }));
+ flush_pending_writes(conf));
+
spin_unlock_irq(&conf->resync_lock);
}
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
unsigned long flags;
mdk_rdev_t *blocked_rdev;
+ int plugged;
if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bio);
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
+ plugged = mddev_check_plugged(mddev);
+
raid10_find_phys(conf, r10_bio);
retry_write:
blocked_rdev = NULL;
atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
- blk_plug_device_unlocked(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
- if (do_sync)
+ if (do_sync || !mddev->bitmap || !plugged)
md_wakeup_thread(mddev->thread);
-
return 0;
}
p->rdev = rdev;
goto abort;
}
- md_integrity_register(mddev);
+ err = md_integrity_register(mddev);
}
abort:
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev) { /* If rdev is not NULL */
- char b[BDEVNAME_SIZE];
- int cur_read_error_count = 0;
+ /* still own a reference to this rdev, so it cannot
+ * have been cleared recently.
+ */
+ rdev = conf->mirrors[d].rdev;
- bdevname(rdev->bdev, b);
+ if (test_bit(Faulty, &rdev->flags))
+ /* drive has already been failed, just ignore any
+ more fix_read_error() attempts */
+ return;
- if (test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
- /* drive has already been failed, just ignore any
- more fix_read_error() attempts */
- return;
- }
+ check_decay_read_errors(mddev, rdev);
+ atomic_inc(&rdev->read_errors);
+ if (atomic_read(&rdev->read_errors) > max_read_errors) {
+ char b[BDEVNAME_SIZE];
+ bdevname(rdev->bdev, b);
- check_decay_read_errors(mddev, rdev);
- atomic_inc(&rdev->read_errors);
- cur_read_error_count = atomic_read(&rdev->read_errors);
- if (cur_read_error_count > max_read_errors) {
- rcu_read_unlock();
- printk(KERN_NOTICE
- "md/raid10:%s: %s: Raid device exceeded "
- "read_error threshold "
- "[cur %d:max %d]\n",
- mdname(mddev),
- b, cur_read_error_count, max_read_errors);
- printk(KERN_NOTICE
- "md/raid10:%s: %s: Failing raid "
- "device\n", mdname(mddev), b);
- md_error(mddev, conf->mirrors[d].rdev);
- return;
- }
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Raid device exceeded "
+ "read_error threshold [cur %d:max %d]\n",
+ mdname(mddev), b,
+ atomic_read(&rdev->read_errors), max_read_errors);
+ printk(KERN_NOTICE
+ "md/raid10:%s: %s: Failing raid device\n",
+ mdname(mddev), b);
+ md_error(mddev, conf->mirrors[d].rdev);
+ return;
}
- rcu_read_unlock();
while(sectors) {
int s = sectors;
"write failed"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing "
"drive\n",
"corrected sectors"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
mdname(mddev),
"md/raid10:%s: read error corrected"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
- (unsigned long long)(sect+
- rdev->data_offset),
+ (unsigned long long)(
+ sect + rdev->data_offset),
bdevname(rdev->bdev, b));
}
unsigned long flags;
conf_t *conf = mddev->private;
struct list_head *head = &conf->retry_list;
- int unplug=0;
mdk_rdev_t *rdev;
+ struct blk_plug plug;
md_check_recovery(mddev);
+ blk_start_plug(&plug);
for (;;) {
char b[BDEVNAME_SIZE];
- unplug += flush_pending_writes(conf);
+ flush_pending_writes(conf);
spin_lock_irqsave(&conf->device_lock, flags);
if (list_empty(head)) {
mddev = r10_bio->mddev;
conf = mddev->private;
- if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
+ if (test_bit(R10BIO_IsSync, &r10_bio->state))
sync_request_write(mddev, r10_bio);
- unplug = 1;
- } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
+ else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
recovery_request_write(mddev, r10_bio);
- unplug = 1;
- } else {
- int mirror;
+ else {
+ int slot = r10_bio->read_slot;
+ int mirror = r10_bio->devs[slot].devnum;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
* We freeze all other IO, and try reading the block from
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
}
+ rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
bio = r10_bio->devs[r10_bio->read_slot].bio;
r10_bio->devs[r10_bio->read_slot].bio =
bio->bi_rw = READ | do_sync;
bio->bi_private = r10_bio;
bio->bi_end_io = raid10_end_read_request;
- unplug = 1;
generic_make_request(bio);
}
}
cond_resched();
}
- if (unplug)
- unplug_slaves(mddev);
+ blk_finish_plug(&plug);
}
md_set_array_sectors(mddev, size);
mddev->resync_max_sectors = size;
- mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->backing_dev_info.congested_fn = raid10_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
if (conf->near_copies < conf->raid_disks)
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
- md_integrity_register(mddev);
+
+ if (md_integrity_register(mddev))
+ goto out_free_conf;
+
return 0;
out_free_conf: