X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=drivers%2Fmd%2Fraid10.c;h=8e4f469a75b0a962598cd440ba300ed5c724e12e;hb=7c4e06ff2b6a4c09638551dfde76f37f9fca5c0c;hp=2da83d5665925eea27b441e954ca145569278055;hpb=8b9686ff4ddfdf45662024edd567920e6db87beb;p=pandora-kernel.git diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2da83d566592..8e4f469a75b0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error) */ set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); + rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } else { /* - * oops, read error: + * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; if (printk_ratelimit()) @@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error) bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); reschedule_retry(r10_bio); } - - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); } static void raid10_end_write_request(struct bio *bio, int error) @@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q, static int read_balance(conf_t *conf, r10bio_t *r10_bio) { const sector_t this_sector = r10_bio->sector; - int disk, slot, nslot; + int disk, slot; const int sectors = r10_bio->sectors; - sector_t new_distance, current_distance; + sector_t new_distance, best_dist; mdk_rdev_t *rdev; + int do_balance; + int best_slot; raid10_find_phys(conf, r10_bio); rcu_read_lock(); +retry: + best_slot = -1; + best_dist = MaxSector; + do_balance = 1; /* * Check if we can balance. We can balance on the whole * device if no resync is going on (recovery is ok), or below @@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) * above the resync window. */ if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) { - /* make sure that disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; + && (this_sector + sectors >= conf->next_resync)) + do_balance = 0; - while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot++; - if (slot == conf->copies) { - slot = 0; - disk = -1; - break; - } - disk = r10_bio->devs[slot].devnum; - } - goto rb_out; - } - - - /* make sure the disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; - while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot ++; - if (slot == conf->copies) { - disk = -1; - goto rb_out; - } + for (slot = 0; slot < conf->copies ; slot++) { + if (r10_bio->devs[slot].bio == IO_BLOCKED) + continue; disk = r10_bio->devs[slot].devnum; - } - - - current_distance = abs(r10_bio->devs[slot].addr - - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest, - * or - for far > 1 - find the closest to partition beginning */ - - for (nslot = slot; nslot < conf->copies; nslot++) { - int ndisk = r10_bio->devs[nslot].devnum; - - - if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || - r10_bio->devs[nslot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (rdev == NULL) + continue; + if (!test_bit(In_sync, &rdev->flags)) continue; + if (!do_balance) + break; + /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { - disk = ndisk; - slot = nslot; + if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) break; - } /* for far > 1 always use the lowest address */ if (conf->far_copies > 1) - new_distance = r10_bio->devs[nslot].addr; + new_distance = r10_bio->devs[slot].addr; else - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - disk = ndisk; - slot = nslot; + new_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); + if (new_distance < best_dist) { + best_dist = new_distance; + best_slot = slot; } } + if (slot == conf->copies) + slot = best_slot; -rb_out: - r10_bio->read_slot = slot; -/* conf->next_seq_sect = this_sector + sectors;*/ - - if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) - atomic_inc(&conf->mirrors[disk].rdev->nr_pending); - else + if (slot >= 0) { + disk = r10_bio->devs[slot].devnum; + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { + /* Cannot risk returning a device that failed + * before we inc'ed nr_pending + */ + rdev_dec_pending(rdev, conf->mddev); + goto retry; + } + r10_bio->read_slot = slot; + } else disk = -1; rcu_read_unlock(); @@ -634,12 +611,6 @@ static void flush_pending_writes(conf_t *conf) spin_unlock_irq(&conf->device_lock); } -static void md_kick_device(mddev_t *mddev) -{ - blk_flush_plug(current); - md_wakeup_thread(mddev->thread); -} - /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -669,15 +640,15 @@ static void raise_barrier(conf_t *conf, int force) /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock, md_kick_device(conf->mddev)); + conf->resync_lock, ); /* block any new IO from starting */ conf->barrier++; - /* No wait for all pending IO to complete */ + /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, md_kick_device(conf->mddev)); + conf->resync_lock, ); spin_unlock_irq(&conf->resync_lock); } @@ -698,7 +669,7 @@ static void wait_barrier(conf_t *conf) conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, - md_kick_device(conf->mddev)); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -734,8 +705,8 @@ static void freeze_array(conf_t *conf) wait_event_lock_irq(conf->wait_barrier, conf->nr_pending == conf->nr_queued+1, conf->resync_lock, - ({ flush_pending_writes(conf); - md_kick_device(conf->mddev); })); + flush_pending_writes(conf)); + spin_unlock_irq(&conf->resync_lock); } @@ -762,6 +733,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); unsigned long flags; mdk_rdev_t *blocked_rdev; + int plugged; if (unlikely(bio->bi_rw & REQ_FLUSH)) { md_flush_request(mddev, bio); @@ -870,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) * inc refcount on their rdev. Record them by setting * bios[x] to bio */ + plugged = mddev_check_plugged(mddev); + raid10_find_phys(conf, r10_bio); retry_write: blocked_rdev = NULL; @@ -946,9 +920,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (do_sync || !mddev->bitmap) + if (do_sync || !mddev->bitmap || !plugged) md_wakeup_thread(mddev->thread); - return 0; } @@ -1464,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[r10_bio->read_slot].devnum; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev) { /* If rdev is not NULL */ - char b[BDEVNAME_SIZE]; - int cur_read_error_count = 0; + /* still own a reference to this rdev, so it cannot + * have been cleared recently. + */ + rdev = conf->mirrors[d].rdev; - bdevname(rdev->bdev, b); + if (test_bit(Faulty, &rdev->flags)) + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; - if (test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); - /* drive has already been failed, just ignore any - more fix_read_error() attempts */ - return; - } + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + if (atomic_read(&rdev->read_errors) > max_read_errors) { + char b[BDEVNAME_SIZE]; + bdevname(rdev->bdev, b); - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - cur_read_error_count = atomic_read(&rdev->read_errors); - if (cur_read_error_count > max_read_errors) { - rcu_read_unlock(); - printk(KERN_NOTICE - "md/raid10:%s: %s: Raid device exceeded " - "read_error threshold " - "[cur %d:max %d]\n", - mdname(mddev), - b, cur_read_error_count, max_read_errors); - printk(KERN_NOTICE - "md/raid10:%s: %s: Failing raid " - "device\n", mdname(mddev), b); - md_error(mddev, conf->mirrors[d].rdev); - return; - } + printk(KERN_NOTICE + "md/raid10:%s: %s: Raid device exceeded " + "read_error threshold [cur %d:max %d]\n", + mdname(mddev), b, + atomic_read(&rdev->read_errors), max_read_errors); + printk(KERN_NOTICE + "md/raid10:%s: %s: Failing raid device\n", + mdname(mddev), b); + md_error(mddev, conf->mirrors[d].rdev); + return; } - rcu_read_unlock(); while(sectors) { int s = sectors; @@ -1566,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "write failed" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing " "drive\n", @@ -1603,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "corrected sectors" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", mdname(mddev), @@ -1616,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) "md/raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), + (unsigned long long)( + sect + rdev->data_offset), bdevname(rdev->bdev, b)); } @@ -1640,9 +1606,11 @@ static void raid10d(mddev_t *mddev) conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; mdk_rdev_t *rdev; + struct blk_plug plug; md_check_recovery(mddev); + blk_start_plug(&plug); for (;;) { char b[BDEVNAME_SIZE]; @@ -1665,7 +1633,8 @@ static void raid10d(mddev_t *mddev) else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); else { - int mirror; + int slot = r10_bio->read_slot; + int mirror = r10_bio->devs[slot].devnum; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. * We freeze all other IO, and try reading the block from @@ -1679,6 +1648,7 @@ static void raid10d(mddev_t *mddev) fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } + rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = @@ -1716,6 +1686,7 @@ static void raid10d(mddev_t *mddev) } cond_resched(); } + blk_finish_plug(&plug); }