X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=drivers%2Fmd%2Fraid1.c;h=dc9d2def0270df7460a4eb0e20c4cf494ae94319;hb=1e9abb5b1dc90cfcf0e6f1e10f7aaf34d0f33b6b;hp=87bfe9e7d8cad704fc598a727beb14cc978c0088;hpb=aa43f77939c97bf9d3580c6a5e71a5a40290e451;p=pandora-kernel.git diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 87bfe9e7d8ca..dc9d2def0270 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -271,7 +271,7 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { + if (uptodate || (conf->raid_disks - conf->mddev->degraded) <= 1) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -601,6 +601,32 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } +static int raid1_congested(void *data, int bits) +{ + mddev_t *mddev = data; + conf_t *conf = mddev_to_conf(mddev); + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + request_queue_t *q = bdev_get_queue(rdev->bdev); + + /* Note the '|| 1' - when read_balance prefers + * non-congested targets, it can be removed + */ + if ((bits & (1<backing_dev_info, bits); + else + ret &= bdi_congested(&q->backing_dev_info, bits); + } + } + rcu_read_unlock(); + return ret; +} + + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -929,11 +955,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); - for (i = 0; i < conf->raid_disks; i++) + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); + rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -950,49 +979,52 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); - mddev->sb_dirty = 1; + set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) { int i; - mirror_info_t *tmp; printk("RAID1 conf printout:\n"); if (!conf) { printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } + rcu_read_unlock(); } static void close_sync(conf_t *conf) @@ -1008,20 +1040,21 @@ static int raid1_spare_active(mddev_t *mddev) { int i; conf_t *conf = mddev->private; - mirror_info_t *tmp; /* * Find all failed disks within the RAID1 configuration - * and mark them readable + * and mark them readable. + * Called under mddev lock, so rcu protection not needed. */ for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + mdk_rdev_t *rdev = conf->mirrors[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1237,7 +1270,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) /* ouch - failed to read all of that. * Try some synchronous reads of other devices to get * good data, much like with normal read errors. Only - * read into the pages we already have so they we don't + * read into the pages we already have so we don't * need to re-issue the read request. * We don't need to freeze the array, because being in an * active sync request, there is no normal IO, and @@ -1257,6 +1290,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) s = PAGE_SIZE >> 9; do { if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev->bdev, sect + rdev->data_offset, @@ -1359,6 +1396,95 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * 3. Performs writes following reads for array syncronising. */ +static void fix_read_error(conf_t *conf, int read_disk, + sector_t sect, int sectors) +{ + mddev_t *mddev = conf->mddev; + while(sectors) { + int s = sectors; + int d = read_disk; + int success = 0; + int start; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + /* Note: no rcu protection needed here + * as this is synchronous in the raid1d thread + * which is the thread that might remove + * a device. If raid1d ever becomes multi-threaded.... + */ + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != read_disk); + + if (!success) { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[read_disk].rdev); + break; + } + /* write it back and re-read */ + start = d; + while (d != read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != read_disk) { + char b[BDEVNAME_SIZE]; + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) + == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + else { + atomic_add(s, &rdev->corrected_errors); + printk(KERN_INFO + "raid1:%s: read error corrected " + "(%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)sect + + rdev->data_offset, + bdevname(rdev->bdev, b)); + } + } + } + sectors -= s; + sect += s; + } +} + static void raid1d(mddev_t *mddev) { r1bio_t *r1_bio; @@ -1451,80 +1577,14 @@ static void raid1d(mddev_t *mddev) * This is all done synchronously while the array is * frozen */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - freeze_array(conf); - if (mddev->ro == 0) while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - - do { - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) - success = 1; - else { - d++; - if (d == conf->raid_disks) - d = 0; - } - } while (!success && d != r1_bio->read_disk); - - if (success) { - /* write it back and re-read */ - int start = d; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } - } - d = start; - while (d != r1_bio->read_disk) { - if (d==0) - d = conf->raid_disks; - d--; - rdev = conf->mirrors[d].rdev; - if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else - printk(KERN_INFO "raid1:%s: read error corrected (%d sectors at %llu on %s)\n", - mdname(mddev), s, (unsigned long long)(sect + rdev->data_offset), bdevname(rdev->bdev, b)); - } - } - } else { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - break; - } - sectors -= s; - sect += s; + if (mddev->ro == 0) { + freeze_array(conf); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, + r1_bio->sectors); + unfreeze_array(conf); } - unfreeze_array(conf); - bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" @@ -1787,19 +1847,17 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0; iraid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } } } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, - nr_sectors); + md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } - return nr_sectors; } @@ -1871,15 +1929,11 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1887,11 +1941,6 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -1904,6 +1953,13 @@ static int run(mddev_t *mddev) mddev->degraded++; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -1935,6 +1991,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; mddev->queue->issue_flush_fn = raid1_issue_flush; + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; return 0; @@ -2022,7 +2080,7 @@ static int raid1_reshape(mddev_t *mddev) mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; - + unsigned long flags; int d, d2; /* Cannot change chunk_size, layout, or level */ @@ -2081,7 +2139,9 @@ static int raid1_reshape(mddev_t *mddev) kfree(conf->poolinfo); conf->poolinfo = newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0;