spin_lock_irqsave(&conf->device_lock, flags);
if (r1_bio->mddev->degraded == conf->raid_disks ||
(r1_bio->mddev->degraded == conf->raid_disks-1 &&
- !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
uptodate = 1;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
r1_bio->bios[mirror] = NULL;
to_put = bio;
- set_bit(R1BIO_Uptodate, &r1_bio->state);
+ /*
+ * Do not set R1BIO_Uptodate if the current device is
+ * rebuilding or Faulty. This is because we cannot use
+ * such device for properly reading the data back (we could
+ * potentially use it, if the current write would have felt
+ * before rdev->recovery_offset, but for simplicity we don't
+ * check this here.
+ */
+ if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
+ !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */
if (is_badblock(conf->mirrors[mirror].rdev,
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
- if (best_disk < 0)
+ if (best_disk < 0) {
+ if (is_badblock(rdev, this_sector, sectors,
+ &first_bad, &bad_sectors)) {
+ if (first_bad < this_sector)
+ /* Cannot use this */
+ continue;
+ best_good_sectors = first_bad - this_sector;
+ } else
+ best_good_sectors = sectors;
best_disk = disk;
+ }
continue;
}
/* This is a reasonable device to use. It might
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
conf->nr_waiting++;
- wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ /* Wait for the barrier to drop.
+ * However if there are already pending
+ * requests (preventing the barrier from
+ * rising completely), and the
+ * pre-process bio queue isn't empty,
+ * then don't wait, as we need to empty
+ * that queue to get the nr_pending
+ * count down.
+ */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->barrier ||
+ (conf->nr_pending &&
+ current->bio_list &&
+ !bio_list_empty(current->bio_list)),
conf->resync_lock,
- );
+ );
conf->nr_waiting--;
}
conf->nr_pending++;
*/
DEFINE_WAIT(w);
for (;;) {
- flush_signals(current);
+ sigset_t full, old;
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
bio->bi_sector >= mddev->suspend_hi)
break;
+ sigfillset(&full);
+ sigprocmask(SIG_BLOCK, &full, &old);
schedule();
+ sigprocmask(SIG_SETMASK, &old, NULL);
}
finish_wait(&conf->wait_barrier, &w);
}
* know the original bi_idx, so we just free
* them all
*/
- __bio_for_each_segment(bvec, mbio, j, 0)
+ bio_for_each_segment_all(bvec, mbio, j)
bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
{
char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private;
+ unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
return;
}
set_bit(Blocked, &rdev->flags);
+ spin_lock_irqsave(&conf->device_lock, flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
"md/raid1:%s: Operation continuing on %d devices.\n",
* Find all failed disks within the RAID1 configuration
* and mark them readable.
* Called under mddev lock, so rcu protection not needed.
+ * device_lock used to avoid races with raid1_end_read_request
+ * which expects 'In_sync' flags and ->degraded to be consistent.
*/
+ spin_lock_irqsave(&conf->device_lock, flags);
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
if (rdev
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
- spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded -= count;
spin_unlock_irqrestore(&conf->device_lock, flags);
if (atomic_dec_and_test(&r1_bio->remaining)) {
/* if we're here, all write(s) have completed, so clean up */
- md_done_sync(mddev, r1_bio->sectors, 1);
- put_buf(r1_bio);
+ int s = r1_bio->sectors;
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, 1);
+ }
}
}
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
{
int m;
+ bool fail = false;
for (m = 0; m < conf->raid_disks ; m++)
if (r1_bio->bios[m] == IO_MADE_GOOD) {
struct md_rdev *rdev = conf->mirrors[m].rdev;
* narrow down and record precise write
* errors.
*/
+ fail = true;
if (!narrow_write_error(r1_bio, m)) {
md_error(conf->mddev,
conf->mirrors[m].rdev);
rdev_dec_pending(conf->mirrors[m].rdev,
conf->mddev);
}
- if (test_bit(R1BIO_WriteError, &r1_bio->state))
- close_write(r1_bio);
- raid_end_bio_io(r1_bio);
+ if (fail) {
+ spin_lock_irq(&conf->device_lock);
+ list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+ conf->nr_queued++;
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else {
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+ }
}
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
md_check_recovery(mddev);
+ if (!list_empty_careful(&conf->bio_end_io_list) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ LIST_HEAD(tmp);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+ while (!list_empty(&conf->bio_end_io_list)) {
+ list_move(conf->bio_end_io_list.prev, &tmp);
+ conf->nr_queued--;
+ }
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ while (!list_empty(&tmp)) {
+ r1_bio = list_first_entry(&conf->bio_end_io_list,
+ struct r1bio, retry_list);
+ list_del(&r1_bio->retry_list);
+ if (mddev->degraded)
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (test_bit(R1BIO_WriteError, &r1_bio->state))
+ close_write(r1_bio);
+ raid_end_bio_io(r1_bio);
+ }
+ }
+
blk_start_plug(&plug);
for (;;) {
/* There is nowhere to write, so all non-sync
* drives must be failed - so we are finished
*/
- sector_t rv = max_sector - sector_nr;
+ sector_t rv;
+ if (min_bad > 0)
+ max_sector = sector_nr + min_bad;
+ rv = max_sector - sector_nr;
*skipped = 1;
put_buf(r1_bio);
return rv;
*/
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
atomic_set(&r1_bio->remaining, read_targets);
- for (i=0; i<conf->raid_disks; i++) {
+ for (i = 0; i < conf->raid_disks && read_targets; i++) {
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
+ read_targets--;
md_sync_acct(bio->bi_bdev, nr_sectors);
generic_make_request(bio);
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
+ INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);