From: Linus Torvalds Date: Mon, 21 Jul 2008 17:29:12 +0000 (-0700) Subject: Merge branch 'for-linus' of git://neil.brown.name/md X-Git-Tag: v2.6.27-rc1~957 X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=commitdiff_plain;h=8a392625b665c676a77c62f8608d10ff430bcb83;hp=-c Merge branch 'for-linus' of git://neil.brown.name/md * 'for-linus' of git://neil.brown.name/md: (52 commits) md: Protect access to mddev->disks list using RCU md: only count actual openers as access which prevent a 'stop' md: linear: Make array_size sector-based and rename it to array_sectors. md: Make mddev->array_size sector-based. md: Make super_type->rdev_size_change() take sector-based sizes. md: Fix check for overlapping devices. md: Tidy up rdev_size_store a bit: md: Remove some unused macros. md: Turn rdev->sb_offset into a sector-based quantity. md: Make calc_dev_sboffset() return a sector count. md: Replace calc_dev_size() by calc_num_sectors(). md: Make update_size() take the number of sectors. md: Better control of when do_md_stop is allowed to stop the array. md: get_disk_info(): Don't convert between signed and unsigned and back. md: Simplify restart_array(). md: alloc_disk_sb(): Return proper error value. md: Simplify sb_equal(). md: Simplify uuid_equal(). md: sb_equal(): Fix misleading printk. md: Fix a typo in the comment to cmd_match(). ... --- 8a392625b665c676a77c62f8608d10ff430bcb83 diff --combined drivers/md/linear.c index 6a866d7c8ae5,1cafaa959443..b1eebf88c209 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@@ -50,19 -50,17 +50,19 @@@ static inline dev_info_t *which_dev(mdd /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); @@@ -122,13 -120,13 +122,13 @@@ static linear_conf_t *linear_conf(mddev return NULL; cnt = 0; - conf->array_size = 0; + conf->array_sectors = 0; rdev_for_each(rdev, tmp, mddev) { int j = rdev->raid_disk; dev_info_t *disk = conf->disks + j; - if (j < 0 || j > raid_disks || disk->rdev) { + if (j < 0 || j >= raid_disks || disk->rdev) { printk("linear: disk numbering problem. Aborting!\n"); goto out; } @@@ -146,7 -144,7 +146,7 @@@ blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->size = rdev->size; - conf->array_size += rdev->size; + conf->array_sectors += rdev->size * 2; cnt++; } @@@ -155,7 -153,7 +155,7 @@@ goto out; } - min_spacing = conf->array_size; + min_spacing = conf->array_sectors / 2; sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); /* min_spacing is the minimum spacing that will fit the hash @@@ -164,7 -162,7 +164,7 @@@ * that is larger than min_spacing as use the size of that as * the actual spacing */ - conf->hash_spacing = conf->array_size; + conf->hash_spacing = conf->array_sectors / 2; for (i=0; i < cnt-1 ; i++) { sector_t sz = 0; int j; @@@ -194,7 -192,7 +194,7 @@@ unsigned round; unsigned long base; - sz = conf->array_size >> conf->preshift; + sz = conf->array_sectors >> (conf->preshift + 1); sz += 1; /* force round-up */ base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); @@@ -221,7 -219,7 +221,7 @@@ curr_offset = 0; i = 0; for (curr_offset = 0; - curr_offset < conf->array_size; + curr_offset < conf->array_sectors / 2; curr_offset += conf->hash_spacing) { while (i < raid_disks-1 && @@@ -258,7 -256,7 +258,7 @@@ static int linear_run (mddev_t *mddev if (!conf) return 1; mddev->private = conf; - mddev->array_size = conf->array_size; + mddev->array_sectors = conf->array_sectors; blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@@ -292,8 -290,8 +292,8 @@@ static int linear_add(mddev_t *mddev, m newconf->prev = mddev_to_conf(mddev); mddev->private = newconf; mddev->raid_disks++; - mddev->array_size = newconf->array_size; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = newconf->array_sectors; + set_capacity(mddev->gendisk, mddev->array_sectors); return 0; } diff --combined drivers/md/raid0.c index bcbb82594a19,2f30ebd8b7ab..183610635661 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@@ -241,20 -241,18 +241,20 @@@ static int create_strip_zones (mddev_t /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ @@@ -295,16 -293,16 +295,16 @@@ static int raid0_run (mddev_t *mddev goto out_free_conf; /* calculate array device size */ - mddev->array_size = 0; + mddev->array_sectors = 0; rdev_for_each(rdev, tmp, mddev) - mddev->array_size += rdev->size; + mddev->array_sectors += rdev->size * 2; printk("raid0 : md_size is %llu blocks.\n", - (unsigned long long)mddev->array_size); + (unsigned long long)mddev->array_sectors / 2); printk("raid0 : conf->hash_spacing is %llu blocks.\n", (unsigned long long)conf->hash_spacing); { - sector_t s = mddev->array_size; + sector_t s = mddev->array_sectors / 2; sector_t space = conf->hash_spacing; int round; conf->preshift = 0; diff --combined drivers/md/raid10.c index 22bb2b1b886d,2acea4025243..159535d73567 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@@ -439,27 -439,26 +439,27 @@@ static sector_t raid10_find_virt(conf_ /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. */ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } @@@ -1114,24 -1113,30 +1114,30 @@@ static int raid10_spare_active(mddev_t static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int mirror; mirror_info_t *p; + int first = 0; + int last = mddev->raid_disks - 1; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ - return 0; + return -EBUSY; if (!enough(conf)) - return 0; + return -EINVAL; + + if (rdev->raid_disk) + first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else - mirror = 0; - for ( ; mirror < mddev->raid_disks; mirror++) + mirror = first; + for ( ; mirror <= last ; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, @@@ -1146,7 -1151,7 +1152,7 @@@ p->head_position = 0; rdev->raid_disk = mirror; - found = 1; + err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); @@@ -1154,7 -1159,7 +1160,7 @@@ } print_conf(conf); - return found; + return err; } static int raid10_remove_disk(mddev_t *mddev, int number) @@@ -2159,7 -2164,7 +2165,7 @@@ static int run(mddev_t *mddev /* * Ok, everything is just fine now */ - mddev->array_size = size << (conf->chunk_shift-1); + mddev->array_sectors = size << conf->chunk_shift; mddev->resync_max_sectors = size << conf->chunk_shift; mddev->queue->unplug_fn = raid10_unplug; diff --combined drivers/md/raid5.c index 9ce7154845c6,42a480ba767b..55e7c56045a0 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@@ -115,15 -115,20 +115,20 @@@ static void return_io(struct bio *retur return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } static void print_raid5_conf (raid5_conf_t *conf); + static int stripe_operations_active(struct stripe_head *sh) + { + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); + } + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@@ -143,7 -148,7 +148,7 @@@ } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@@ -245,7 -250,7 +250,7 @@@ static void init_stripe(struct stripe_h BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@@ -346,62 -351,18 +351,18 @@@ static struct stripe_head *get_active_s return sh; } - /* test_and_ack_op() ensures that we only dequeue an operation once */ - #define test_and_ack_op(op, pend) \ - do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ - } while (0) - - /* find new work to run, do not resubmit work that is already - * in flight - */ - static unsigned long get_stripe_work(struct stripe_head *sh) - { - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; - } - static void raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); - static void ops_run_io(struct stripe_head *sh) + static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; might_sleep(); - set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@@ -430,11 -391,11 +391,11 @@@ rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, @@@ -528,38 -489,34 +489,34 @@@ static void ops_complete_biofill(void * (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); if (--rbi->bi_phys_segments == 0) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@@ -610,13 -567,14 +567,14 @@@ static void ops_complete_compute5(void set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } - static struct dma_async_tx_descriptor * - ops_run_compute5(struct stripe_head *sh, unsigned long pending) + static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@@ -646,10 -604,6 +604,6 @@@ ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) - async_tx_ack(tx); - return tx; } @@@ -659,8 -613,6 +613,6 @@@ static void ops_complete_prexor(void *s pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@@ -680,7 -632,7 +632,7 @@@ ops_run_prexor(struct stripe_head *sh, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@@ -692,16 -644,10 +644,10 @@@ } static struct dma_async_tx_descriptor * - ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) + ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@@ -709,20 -655,8 +655,8 @@@ for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@@ -745,18 -679,6 +679,6 @@@ } static void ops_complete_postxor(void *stripe_head_ref) - { - struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); - } - - static void ops_complete_write(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; @@@ -770,16 -692,21 +692,21 @@@ set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void - ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) + ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@@ -787,9 -714,8 +714,8 @@@ int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@@ -797,7 -723,8 +723,8 @@@ /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@@ -813,10 -740,6 +740,6 @@@ } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? - ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@@ -830,25 -753,20 +753,20 @@@ if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@@ -875,46 -793,42 +793,42 @@@ static void ops_run_check(struct stripe tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); } - static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) + static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) - tx = ops_run_compute5(sh, pending); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@@ -997,14 -911,16 +911,16 @@@ static int resize_stripes(raid5_conf_t struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@@ -1703,11 -1619,11 +1619,11 @@@ static void compute_block_2(struct stri } } - static int - handle_write_operations5(struct stripe_head *sh, int rcw, int expand) + static void + schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@@ -1715,53 -1631,48 +1631,48 @@@ * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } - if (locked + 1 == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@@ -1771,13 -1682,11 +1682,11 @@@ */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - locked++; + s->locked++; - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, - locked, sh->ops.pending); - - return locked; + s->locked, s->ops_request); } /* @@@ -1876,7 -1785,7 +1785,7 @@@ static int stripe_to_pdidx(sector_t str } static void - handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, + handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@@ -1967,48 -1876,38 +1876,38 @@@ md_wakeup_thread(conf->mddev->thread); } - /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process + /* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ - static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) + static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + (s->failed && disk_idx == s->failed_num)) { + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@@ -2016,53 -1915,40 +1915,40 @@@ * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ + return 1; /* uptodate + compute == disks */ } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); } } - return ~0; + return 0; } - static void handle_issuing_new_read_requests5(struct stripe_head *sh, + /** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ + static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - /* Clear completed compute operations. Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } - static void handle_issuing_new_read_requests6(struct stripe_head *sh, + static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@@ -2121,12 -2007,12 +2007,12 @@@ } - /* handle_completed_write_requests + /* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */ - static void handle_completed_write_requests(raid5_conf_t *conf, + static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@@ -2171,7 -2057,7 +2057,7 @@@ md_wakeup_thread(conf->mddev->thread); } - static void handle_issuing_new_write_requests5(raid5_conf_t *conf, + static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@@ -2215,9 -2101,6 +2101,6 @@@ "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@@ -2241,9 -2124,6 +2124,6 @@@ "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@@ -2261,14 -2141,13 +2141,13 @@@ * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction5(sh, s, rcw == 0, 0); } - static void handle_issuing_new_write_requests6(raid5_conf_t *conf, + static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@@ -2371,92 -2250,86 +2250,86 @@@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back. If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@@ -2641,15 -2514,14 +2514,14 @@@ static void handle_stripe5(struct strip struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@@ -2658,15 -2530,8 +2530,8 @@@ s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@@ -2680,10 -2545,10 +2545,10 @@@ /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@@ -2727,8 -2592,10 +2592,10 @@@ goto unlock; } - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", @@@ -2738,8 -2605,7 +2605,7 @@@ * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@@ -2755,48 -2621,25 +2621,25 @@@ !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || - test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) - handle_issuing_new_read_requests5(sh, &s, disks); + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + handle_stripe_fill5(sh, &s, disks); /* Now we check to see if any write operations have recently * completed */ - - /* leave prexor set until postxor is done, allows us to distinguish - * a rmw from a rcw during biodrain - */ prexor = 0; - if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) prexor = 1; - clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - - for (i = disks; i--; ) - clear_bit(R5_Wantprexor, &sh->dev[i].flags); - } - - /* if only POSTXOR is set then this is an 'expand' postxor */ - if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; /* All the 'written' buffers and the parity block are ready to * be written back to disk @@@ -2808,9 -2651,6 +2651,6 @@@ (i == sh->pd_idx || dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; if (prexor) continue; if (!test_bit(R5_Insync, &dev->flags) || @@@ -2832,20 -2672,18 +2672,18 @@@ * 2/ A 'check' operation is in flight, as it may clobber the parity * block. */ - if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - handle_issuing_new_write_requests5(conf, sh, &s, disks); + if (s.to_write && !sh->reconstruct_state && !sh->check_state) + handle_stripe_dirtying5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough * data is available. The parity check is held off while parity * dependent operations are in flight. */ - if ((s.syncing && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_INSYNC, &sh->state)) || - test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@@ -2864,52 -2702,35 +2702,35 @@@ dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); s.locked++; } } - /* Finish postxor operations initiated by the expansion - * process - */ - if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && - !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { - + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - - for (i = conf->raid_disks; i--; ) { + for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; - } } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - s.locked += handle_write_operations5(sh, 1, 1); - } else if (s.expanded && - s.locked == 0 && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + schedule_reconstruction5(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@@ -2917,12 -2738,9 +2738,9 @@@ } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); - if (sh->ops.count) - pending = get_stripe_work(sh); - unlock: spin_unlock(&sh->lock); @@@ -2930,11 -2748,12 +2748,12 @@@ if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending) - raid5_run_ops(sh, pending); + if (s.ops_request) + raid5_run_ops(sh, s.ops_request); - return_io(return_bi); + ops_run_io(sh, &s); + return_io(return_bi); } static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) @@@ -3042,8 -2861,7 +2861,7 @@@ * might need to be failed */ if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 2 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@@ -3068,7 -2886,7 +2886,7 @@@ ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@@ -3076,11 -2894,11 +2894,11 @@@ */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || (s.syncing && (s.uptodate < disks)) || s.expanding) - handle_issuing_new_read_requests6(sh, &s, &r6s, disks); + handle_stripe_fill6(sh, &s, &r6s, disks); /* now to consider writing and what else, if anything should be read */ if (s.to_write) - handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); + handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@@ -3136,7 -2954,7 +2954,7 @@@ } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, &r6s); unlock: @@@ -3146,68 -2964,9 +2964,9 @@@ if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - return_io(return_bi); - - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - set_bit(STRIPE_IO_STARTED, &sh->state); - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); + ops_run_io(sh, &s); - if (rdev) { - if (s.syncing || s.expanding || s.expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - pr_debug("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - pr_debug("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } + return_io(return_bi); } static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) @@@ -3314,17 -3073,15 +3073,17 @@@ static int raid5_congested(void *data, /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; @@@ -3697,9 -3454,7 +3456,7 @@@ static int make_request(struct request_ if ( rw == WRITE ) md_write_end(mddev); - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); } return 0; } @@@ -3785,7 -3540,7 +3542,7 @@@ static sector_t reshape_request(mddev_ j == raid6_next_disk(sh->pd_idx, sh->disks)) continue; s = compute_blocknr(sh, j); - if (s < (mddev->array_size<<1)) { + if (s < mddev->array_sectors) { skipped = 1; continue; } @@@ -4002,12 -3757,8 +3759,8 @@@ static int retry_aligned_read(raid5_co spin_lock_irq(&conf->device_lock); remaining = --raid_bio->bi_phys_segments; spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - - raid_bio->bi_end_io(raid_bio, - test_bit(BIO_UPTODATE, &raid_bio->bi_flags) - ? 0 : -EIO); - } + if (remaining == 0) + bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; @@@ -4094,6 -3845,8 +3847,8 @@@ raid5_store_stripe_cache_size(mddev_t * { raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; if (!conf) @@@ -4109,7 -3862,9 +3864,9 @@@ else break; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; @@@ -4434,7 -4189,7 +4191,7 @@@ static int run(mddev_t *mddev mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_size = mddev->size * (conf->previous_raid_disks - + mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - conf->max_degraded); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@@ -4609,35 -4364,41 +4366,41 @@@ abort static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { raid5_conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int disk; struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; if (mddev->degraded > conf->max_degraded) /* no point adding a device */ - return 0; + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; /* * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) disk = rdev->saved_raid_disk; else - disk = 0; - for ( ; disk < conf->raid_disks; disk++) + disk = first; + for ( ; disk <= last ; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - found = 1; + err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); - return found; + return err; } static int raid5_resize(mddev_t *mddev, sector_t sectors) @@@ -4652,8 -4413,9 +4415,9 @@@ raid5_conf_t *conf = mddev_to_conf(mddev); sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = sectors * (mddev->raid_disks + - conf->max_degraded); + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->size << 1; @@@ -4738,7 -4500,7 +4502,7 @@@ static int raid5_start_reshape(mddev_t rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev)) { + if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; set_bit(In_sync, &rdev->flags); added_devices++; @@@ -4786,15 -4548,16 +4550,16 @@@ static void end_reshape(raid5_conf_t *c struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_size = conf->mddev->size * + conf->mddev->array_sectors = 2 * conf->mddev->size * (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); conf->mddev->changed = 1; bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); + i_size_write(bdev->bd_inode, + (loff_t)conf->mddev->array_sectors << 9); mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); }