2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some explanation.
28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->bm_write is the number of the last batch successfully written.
31 * conf->bm_flush is the number of the last batch that was closed to new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is bm_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment bm_flush, thus closing the current batch.
40 * When we notice that bm_flush > bm_write, we write out all pending updates
41 * to the bitmap, and advance bm_write to where bm_flush was.
42 * This may occasionally write a bit out twice, but is sure never to lose an update.
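/*
 * Illustrative example (not part of the original source): suppose
 * bm_write == 5 and bm_flush == 5.  A new write arrives, so its stripe
 * records sh->bm_seq = 6 (bm_flush + 1).  An unplug bumps bm_flush to 6;
 * raid5d then sees bm_flush(6) > bm_write(5), writes out the pending
 * bitmap updates, advances bm_write to 6, and only then lets stripes
 * with bm_seq <= 6 proceed with their writes.
 */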
46 #include <linux/blkdev.h>
47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h>
50 #include <linux/async.h>
51 #include <linux/seq_file.h>
52 #include <linux/cpu.h>
61 #define NR_STRIPES 256
62 #define STRIPE_SIZE PAGE_SIZE
63 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
64 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
65 #define IO_THRESHOLD 1
66 #define BYPASS_THRESHOLD 1
67 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
68 #define HASH_MASK (NR_HASH - 1)
70 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
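/*
 * Worked example (illustrative, assuming a 64-bit build with
 * PAGE_SIZE == 4096): NR_HASH = 4096 / 8 = 512 buckets, so
 * HASH_MASK = 0x1ff.  With STRIPE_SHIFT = 12 - 9 = 3, a stripe at
 * sector 0x12345 hashes to bucket (0x12345 >> 3) & 0x1ff = 0x68,
 * i.e. one bucket per 4K stripe, wrapping every 512 stripes.
 */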
72 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
73 * order without overlap. There may be several bio's per stripe+device, and
74 * a bio could span several devices.
75 * When walking this list for a particular stripe+device, we must never proceed
76 * beyond a bio that extends past this device, as the next bio might no longer be valid.
78 * This macro is used to determine the 'next' bio in the list, given the sector
79 * of the current stripe+device
81 #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
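/*
 * Illustrative use (not part of the original source; handle() is a
 * placeholder): walking all bios queued against one stripe+device
 * typically looks like
 *
 *	for (bi = dev->toread;
 *	     bi && bi->bi_sector < dev->sector + STRIPE_SECTORS;
 *	     bi = r5_next_bio(bi, dev->sector))
 *		handle(bi);
 *
 * r5_next_bio() returns NULL as soon as the current bio ends at or
 * beyond the end of this stripe's chunk, which terminates the walk
 * before it can step onto a bio belonging to another device.
 */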
83 * The following can be used to debug the driver
85 #define RAID5_PARANOIA 1
86 #if RAID5_PARANOIA && defined(CONFIG_SMP)
87 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
89 # define CHECK_DEVLOCK()
97 #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
100 * We maintain a biased count of active stripes in the bottom 16 bits of
101 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
103 static inline int raid5_bi_phys_segments(struct bio *bio)
105 return bio->bi_phys_segments & 0xffff;
108 static inline int raid5_bi_hw_segments(struct bio *bio)
110 return (bio->bi_phys_segments >> 16) & 0xffff;
113 static inline int raid5_dec_bi_phys_segments(struct bio *bio)
115 --bio->bi_phys_segments;
116 return raid5_bi_phys_segments(bio);
119 static inline int raid5_dec_bi_hw_segments(struct bio *bio)
121 unsigned short val = raid5_bi_hw_segments(bio);
124 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
128 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
130 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
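/*
 * Layout sketch (illustrative): while raid5 owns the bio,
 * bi_phys_segments is treated as two packed 16-bit counters:
 *
 *	 31             16 15              0
 *	+-----------------+-----------------+
 *	|  "hw" segments  | "phys" segments |
 *	| (stripes done)  | (active stripes)|
 *	+-----------------+-----------------+
 *
 * so raid5_set_bi_hw_segments() must merge (cnt << 16) with a
 * bitwise '|', preserving the low 16 bits.
 */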
133 /* Find first data disk in a raid6 stripe */
134 static inline int raid6_d0(struct stripe_head *sh)
137 /* ddf always starts from the first device */
139 /* md starts just after Q block */
140 if (sh->qd_idx == sh->disks - 1)
143 return sh->qd_idx + 1;
145 static inline int raid6_next_disk(int disk, int raid_disks)
148 return (disk < raid_disks) ? disk : 0;
151 /* When walking through the disks in a raid5, starting at raid6_d0,
152 * we need to map each disk to a 'slot', where the data disks are slot
153 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
154 * is raid_disks-1. This helper does that mapping.
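/*
 * Example (illustrative): with 5 devices in the md/raid6 layout,
 * pd_idx == 3 and qd_idx == 4, raid6_d0() returns 0 and walking from
 * there maps device indexes to slots as
 *
 *	idx:   0  1  2  3(P)  4(Q)
 *	slot:  0  1  2  3     4
 *
 * i.e. data disks take slots 0..syndrome_disks-1 in walk order, P is
 * always slot syndrome_disks and Q is slot syndrome_disks+1.
 */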
156 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
157 int *count, int syndrome_disks)
161 if (idx == sh->pd_idx)
162 return syndrome_disks;
163 if (idx == sh->qd_idx)
164 return syndrome_disks + 1;
169 static void return_io(struct bio *return_bi)
171 struct bio *bi = return_bi;
174 return_bi = bi->bi_next;
182 static void print_raid5_conf (raid5_conf_t *conf);
184 static int stripe_operations_active(struct stripe_head *sh)
186 return sh->check_state || sh->reconstruct_state ||
187 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
188 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
191 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
193 if (atomic_dec_and_test(&sh->count)) {
194 BUG_ON(!list_empty(&sh->lru));
195 BUG_ON(atomic_read(&conf->active_stripes)==0);
196 if (test_bit(STRIPE_HANDLE, &sh->state)) {
197 if (test_bit(STRIPE_DELAYED, &sh->state)) {
198 list_add_tail(&sh->lru, &conf->delayed_list);
199 blk_plug_device(conf->mddev->queue);
200 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
201 sh->bm_seq - conf->seq_write > 0) {
202 list_add_tail(&sh->lru, &conf->bitmap_list);
203 blk_plug_device(conf->mddev->queue);
205 clear_bit(STRIPE_BIT_DELAY, &sh->state);
206 list_add_tail(&sh->lru, &conf->handle_list);
208 md_wakeup_thread(conf->mddev->thread);
210 BUG_ON(stripe_operations_active(sh));
211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
212 atomic_dec(&conf->preread_active_stripes);
213 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
214 md_wakeup_thread(conf->mddev->thread);
216 atomic_dec(&conf->active_stripes);
217 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 list_add_tail(&sh->lru, &conf->inactive_list);
219 wake_up(&conf->wait_for_stripe);
220 if (conf->retry_read_aligned)
221 md_wakeup_thread(conf->mddev->thread);
227 static void release_stripe(struct stripe_head *sh)
229 raid5_conf_t *conf = sh->raid_conf;
232 spin_lock_irqsave(&conf->device_lock, flags);
233 __release_stripe(conf, sh);
234 spin_unlock_irqrestore(&conf->device_lock, flags);
237 static inline void remove_hash(struct stripe_head *sh)
239 pr_debug("remove_hash(), stripe %llu\n",
240 (unsigned long long)sh->sector);
242 hlist_del_init(&sh->hash);
245 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
247 struct hlist_head *hp = stripe_hash(conf, sh->sector);
249 pr_debug("insert_hash(), stripe %llu\n",
250 (unsigned long long)sh->sector);
253 hlist_add_head(&sh->hash, hp);
257 /* find an idle stripe, make sure it is unhashed, and return it. */
258 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
260 struct stripe_head *sh = NULL;
261 struct list_head *first;
264 if (list_empty(&conf->inactive_list))
266 first = conf->inactive_list.next;
267 sh = list_entry(first, struct stripe_head, lru);
268 list_del_init(first);
270 atomic_inc(&conf->active_stripes);
275 static void shrink_buffers(struct stripe_head *sh, int num)
280 for (i=0; i<num ; i++) {
284 sh->dev[i].page = NULL;
289 static int grow_buffers(struct stripe_head *sh, int num)
293 for (i=0; i<num; i++) {
296 if (!(page = alloc_page(GFP_KERNEL))) {
299 sh->dev[i].page = page;
304 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
305 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
306 struct stripe_head *sh);
308 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
310 raid5_conf_t *conf = sh->raid_conf;
313 BUG_ON(atomic_read(&sh->count) != 0);
314 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
315 BUG_ON(stripe_operations_active(sh));
318 pr_debug("init_stripe called, stripe %llu\n",
319 (unsigned long long)sh->sector);
323 sh->generation = conf->generation - previous;
324 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
326 stripe_set_idx(sector, conf, previous, sh);
330 for (i = sh->disks; i--; ) {
331 struct r5dev *dev = &sh->dev[i];
333 if (dev->toread || dev->read || dev->towrite || dev->written ||
334 test_bit(R5_LOCKED, &dev->flags)) {
335 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
336 (unsigned long long)sh->sector, i, dev->toread,
337 dev->read, dev->towrite, dev->written,
338 test_bit(R5_LOCKED, &dev->flags));
342 raid5_build_block(sh, i, previous);
344 insert_hash(conf, sh);
347 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
350 struct stripe_head *sh;
351 struct hlist_node *hn;
354 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
355 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
356 if (sh->sector == sector && sh->generation == generation)
358 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
362 static void unplug_slaves(mddev_t *mddev);
363 static void raid5_unplug_device(struct request_queue *q);
365 static struct stripe_head *
366 get_active_stripe(raid5_conf_t *conf, sector_t sector,
367 int previous, int noblock)
369 struct stripe_head *sh;
371 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
373 spin_lock_irq(&conf->device_lock);
376 wait_event_lock_irq(conf->wait_for_stripe,
378 conf->device_lock, /* nothing */);
379 sh = __find_stripe(conf, sector, conf->generation - previous);
381 if (!conf->inactive_blocked)
382 sh = get_free_stripe(conf);
383 if (noblock && sh == NULL)
386 conf->inactive_blocked = 1;
387 wait_event_lock_irq(conf->wait_for_stripe,
388 !list_empty(&conf->inactive_list) &&
389 (atomic_read(&conf->active_stripes)
390 < (conf->max_nr_stripes *3/4)
391 || !conf->inactive_blocked),
393 raid5_unplug_device(conf->mddev->queue)
395 conf->inactive_blocked = 0;
397 init_stripe(sh, sector, previous);
399 if (atomic_read(&sh->count)) {
400 BUG_ON(!list_empty(&sh->lru)
401 && !test_bit(STRIPE_EXPANDING, &sh->state));
403 if (!test_bit(STRIPE_HANDLE, &sh->state))
404 atomic_inc(&conf->active_stripes);
405 if (list_empty(&sh->lru) &&
406 !test_bit(STRIPE_EXPANDING, &sh->state))
408 list_del_init(&sh->lru);
411 } while (sh == NULL);
414 atomic_inc(&sh->count);
416 spin_unlock_irq(&conf->device_lock);
421 raid5_end_read_request(struct bio *bi, int error);
423 raid5_end_write_request(struct bio *bi, int error);
425 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
427 raid5_conf_t *conf = sh->raid_conf;
428 int i, disks = sh->disks;
432 for (i = disks; i--; ) {
436 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
438 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
443 bi = &sh->dev[i].req;
447 bi->bi_end_io = raid5_end_write_request;
449 bi->bi_end_io = raid5_end_read_request;
452 rdev = rcu_dereference(conf->disks[i].rdev);
453 if (rdev && test_bit(Faulty, &rdev->flags))
456 atomic_inc(&rdev->nr_pending);
460 if (s->syncing || s->expanding || s->expanded)
461 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
463 set_bit(STRIPE_IO_STARTED, &sh->state);
465 bi->bi_bdev = rdev->bdev;
466 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
467 __func__, (unsigned long long)sh->sector,
469 atomic_inc(&sh->count);
470 bi->bi_sector = sh->sector + rdev->data_offset;
471 bi->bi_flags = 1 << BIO_UPTODATE;
475 bi->bi_io_vec = &sh->dev[i].vec;
476 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
477 bi->bi_io_vec[0].bv_offset = 0;
478 bi->bi_size = STRIPE_SIZE;
481 test_bit(R5_ReWrite, &sh->dev[i].flags))
482 atomic_add(STRIPE_SECTORS,
483 &rdev->corrected_errors);
484 generic_make_request(bi);
487 set_bit(STRIPE_DEGRADED, &sh->state);
488 pr_debug("skip op %ld on disc %d for sector %llu\n",
489 bi->bi_rw, i, (unsigned long long)sh->sector);
490 clear_bit(R5_LOCKED, &sh->dev[i].flags);
491 set_bit(STRIPE_HANDLE, &sh->state);
496 static struct dma_async_tx_descriptor *
497 async_copy_data(int frombio, struct bio *bio, struct page *page,
498 sector_t sector, struct dma_async_tx_descriptor *tx)
501 struct page *bio_page;
504 struct async_submit_ctl submit;
506 if (bio->bi_sector >= sector)
507 page_offset = (signed)(bio->bi_sector - sector) * 512;
509 page_offset = (signed)(sector - bio->bi_sector) * -512;
511 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
512 bio_for_each_segment(bvl, bio, i) {
513 int len = bio_iovec_idx(bio, i)->bv_len;
517 if (page_offset < 0) {
518 b_offset = -page_offset;
519 page_offset += b_offset;
523 if (len > 0 && page_offset + len > STRIPE_SIZE)
524 clen = STRIPE_SIZE - page_offset;
529 b_offset += bio_iovec_idx(bio, i)->bv_offset;
530 bio_page = bio_iovec_idx(bio, i)->bv_page;
532 tx = async_memcpy(page, bio_page, page_offset,
533 b_offset, clen, &submit);
535 tx = async_memcpy(bio_page, page, b_offset,
536 page_offset, clen, &submit);
538 /* chain the operations */
539 submit.depend_tx = tx;
541 if (clen < len) /* hit end of page */
549 static void ops_complete_biofill(void *stripe_head_ref)
551 struct stripe_head *sh = stripe_head_ref;
552 struct bio *return_bi = NULL;
553 raid5_conf_t *conf = sh->raid_conf;
556 pr_debug("%s: stripe %llu\n", __func__,
557 (unsigned long long)sh->sector);
559 /* clear completed biofills */
560 spin_lock_irq(&conf->device_lock);
561 for (i = sh->disks; i--; ) {
562 struct r5dev *dev = &sh->dev[i];
564 /* acknowledge completion of a biofill operation */
565 /* and check if we need to reply to a read request,
566 * new R5_Wantfill requests are held off until
567 * !STRIPE_BIOFILL_RUN
569 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
570 struct bio *rbi, *rbi2;
575 while (rbi && rbi->bi_sector <
576 dev->sector + STRIPE_SECTORS) {
577 rbi2 = r5_next_bio(rbi, dev->sector);
578 if (!raid5_dec_bi_phys_segments(rbi)) {
579 rbi->bi_next = return_bi;
586 spin_unlock_irq(&conf->device_lock);
587 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
589 return_io(return_bi);
591 set_bit(STRIPE_HANDLE, &sh->state);
595 static void ops_run_biofill(struct stripe_head *sh)
597 struct dma_async_tx_descriptor *tx = NULL;
598 raid5_conf_t *conf = sh->raid_conf;
599 struct async_submit_ctl submit;
602 pr_debug("%s: stripe %llu\n", __func__,
603 (unsigned long long)sh->sector);
605 for (i = sh->disks; i--; ) {
606 struct r5dev *dev = &sh->dev[i];
607 if (test_bit(R5_Wantfill, &dev->flags)) {
609 spin_lock_irq(&conf->device_lock);
610 dev->read = rbi = dev->toread;
612 spin_unlock_irq(&conf->device_lock);
613 while (rbi && rbi->bi_sector <
614 dev->sector + STRIPE_SECTORS) {
615 tx = async_copy_data(0, rbi, dev->page,
617 rbi = r5_next_bio(rbi, dev->sector);
622 atomic_inc(&sh->count);
623 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
624 async_trigger_callback(&submit);
627 static void mark_target_uptodate(struct stripe_head *sh, int target)
634 tgt = &sh->dev[target];
635 set_bit(R5_UPTODATE, &tgt->flags);
636 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
637 clear_bit(R5_Wantcompute, &tgt->flags);
640 static void ops_complete_compute(void *stripe_head_ref)
642 struct stripe_head *sh = stripe_head_ref;
644 pr_debug("%s: stripe %llu\n", __func__,
645 (unsigned long long)sh->sector);
647 /* mark the computed target(s) as uptodate */
648 mark_target_uptodate(sh, sh->ops.target);
649 mark_target_uptodate(sh, sh->ops.target2);
651 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
652 if (sh->check_state == check_state_compute_run)
653 sh->check_state = check_state_compute_result;
654 set_bit(STRIPE_HANDLE, &sh->state);
658 /* return a pointer to the address conversion region of the scribble buffer */
659 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
660 struct raid5_percpu *percpu)
662 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
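/*
 * Scribble layout sketch (illustrative): for a stripe with
 * sh->disks == 6 the per-cpu scribble region is used as
 *
 *	[ 8 x struct page * ]  <- source/destination page pointers
 *	[ 8 x addr_conv_t   ]  <- region returned by to_addr_conv()
 *
 * i.e. (disks + 2) entries of each, matching scribble_len() below.
 */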
665 static struct dma_async_tx_descriptor *
666 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
668 int disks = sh->disks;
669 struct page **xor_srcs = percpu->scribble;
670 int target = sh->ops.target;
671 struct r5dev *tgt = &sh->dev[target];
672 struct page *xor_dest = tgt->page;
674 struct dma_async_tx_descriptor *tx;
675 struct async_submit_ctl submit;
678 pr_debug("%s: stripe %llu block: %d\n",
679 __func__, (unsigned long long)sh->sector, target);
680 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
682 for (i = disks; i--; )
684 xor_srcs[count++] = sh->dev[i].page;
686 atomic_inc(&sh->count);
688 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
689 ops_complete_compute, sh, to_addr_conv(sh, percpu));
690 if (unlikely(count == 1))
691 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
693 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
698 /* set_syndrome_sources - populate source buffers for gen_syndrome
699 * @srcs - (struct page *) array of size sh->disks
700 * @sh - stripe_head to parse
702 * Populates srcs in proper layout order for the stripe and returns the
703 * 'count' of sources to be used in a call to async_gen_syndrome. The P
704 * destination buffer is recorded in srcs[count] and the Q destination
705 * is recorded in srcs[count+1].
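/*
 * Example (illustrative): for a 6-device, non-ddf raid6 stripe with
 * pd_idx == 4 and qd_idx == 5, syndrome_disks == 4 and srcs[] is
 * filled as
 *
 *	srcs[0..3] = data pages in d0 walk order
 *	srcs[4]    = P page (sh->dev[pd_idx].page)
 *	srcs[5]    = Q page (sh->dev[qd_idx].page)
 *
 * so the returned count is 4 and callers pass count+2 blocks to
 * async_gen_syndrome().
 */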
707 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
709 int disks = sh->disks;
710 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
711 int d0_idx = raid6_d0(sh);
715 for (i = 0; i < disks; i++)
716 srcs[i] = (void *)raid6_empty_zero_page;
721 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
723 srcs[slot] = sh->dev[i].page;
724 i = raid6_next_disk(i, disks);
725 } while (i != d0_idx);
726 BUG_ON(count != syndrome_disks);
731 static struct dma_async_tx_descriptor *
732 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
734 int disks = sh->disks;
735 struct page **blocks = percpu->scribble;
737 int qd_idx = sh->qd_idx;
738 struct dma_async_tx_descriptor *tx;
739 struct async_submit_ctl submit;
745 if (sh->ops.target < 0)
746 target = sh->ops.target2;
747 else if (sh->ops.target2 < 0)
748 target = sh->ops.target;
750 /* we should only have one valid target */
753 pr_debug("%s: stripe %llu block: %d\n",
754 __func__, (unsigned long long)sh->sector, target);
756 tgt = &sh->dev[target];
757 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
760 atomic_inc(&sh->count);
762 if (target == qd_idx) {
763 count = set_syndrome_sources(blocks, sh);
764 blocks[count] = NULL; /* regenerating p is not necessary */
765 BUG_ON(blocks[count+1] != dest); /* q should already be set */
766 init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
767 to_addr_conv(sh, percpu));
768 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
770 /* Compute any data- or p-drive using XOR */
772 for (i = disks; i-- ; ) {
773 if (i == target || i == qd_idx)
775 blocks[count++] = sh->dev[i].page;
778 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
779 ops_complete_compute, sh,
780 to_addr_conv(sh, percpu));
781 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
787 static struct dma_async_tx_descriptor *
788 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
790 int i, count, disks = sh->disks;
791 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
792 int d0_idx = raid6_d0(sh);
793 int faila = -1, failb = -1;
794 int target = sh->ops.target;
795 int target2 = sh->ops.target2;
796 struct r5dev *tgt = &sh->dev[target];
797 struct r5dev *tgt2 = &sh->dev[target2];
798 struct dma_async_tx_descriptor *tx;
799 struct page **blocks = percpu->scribble;
800 struct async_submit_ctl submit;
802 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
803 __func__, (unsigned long long)sh->sector, target, target2);
804 BUG_ON(target < 0 || target2 < 0);
805 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
806 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
808 /* we need to open-code set_syndrome_sources to handle the
809 * slot number conversion for 'faila' and 'failb'
811 for (i = 0; i < disks ; i++)
812 blocks[i] = (void *)raid6_empty_zero_page;
816 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
818 blocks[slot] = sh->dev[i].page;
824 i = raid6_next_disk(i, disks);
825 } while (i != d0_idx);
826 BUG_ON(count != syndrome_disks);
828 BUG_ON(faila == failb);
831 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
832 __func__, (unsigned long long)sh->sector, faila, failb);
834 atomic_inc(&sh->count);
836 if (failb == syndrome_disks+1) {
837 /* Q disk is one of the missing disks */
838 if (faila == syndrome_disks) {
839 /* Missing P+Q, just recompute */
840 init_async_submit(&submit, 0, NULL, ops_complete_compute,
841 sh, to_addr_conv(sh, percpu));
842 return async_gen_syndrome(blocks, 0, count+2,
843 STRIPE_SIZE, &submit);
847 int qd_idx = sh->qd_idx;
849 /* Missing D+Q: recompute D from P, then recompute Q */
850 if (target == qd_idx)
851 data_target = target2;
853 data_target = target;
856 for (i = disks; i-- ; ) {
857 if (i == data_target || i == qd_idx)
859 blocks[count++] = sh->dev[i].page;
861 dest = sh->dev[data_target].page;
862 init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
863 NULL, NULL, to_addr_conv(sh, percpu));
864 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
867 count = set_syndrome_sources(blocks, sh);
868 init_async_submit(&submit, 0, tx, ops_complete_compute,
869 sh, to_addr_conv(sh, percpu));
870 return async_gen_syndrome(blocks, 0, count+2,
871 STRIPE_SIZE, &submit);
875 init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
876 to_addr_conv(sh, percpu));
877 if (failb == syndrome_disks) {
878 /* We're missing D+P. */
879 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
880 faila, blocks, &submit);
882 /* We're missing D+D. */
883 return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
884 faila, failb, blocks, &submit);
889 static void ops_complete_prexor(void *stripe_head_ref)
891 struct stripe_head *sh = stripe_head_ref;
893 pr_debug("%s: stripe %llu\n", __func__,
894 (unsigned long long)sh->sector);
897 static struct dma_async_tx_descriptor *
898 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
899 struct dma_async_tx_descriptor *tx)
901 int disks = sh->disks;
902 struct page **xor_srcs = percpu->scribble;
903 int count = 0, pd_idx = sh->pd_idx, i;
904 struct async_submit_ctl submit;
906 /* existing parity data subtracted */
907 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
909 pr_debug("%s: stripe %llu\n", __func__,
910 (unsigned long long)sh->sector);
912 for (i = disks; i--; ) {
913 struct r5dev *dev = &sh->dev[i];
914 /* Only process blocks that are known to be uptodate */
915 if (test_bit(R5_Wantdrain, &dev->flags))
916 xor_srcs[count++] = dev->page;
919 init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
920 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
921 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
926 static struct dma_async_tx_descriptor *
927 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
929 int disks = sh->disks;
932 pr_debug("%s: stripe %llu\n", __func__,
933 (unsigned long long)sh->sector);
935 for (i = disks; i--; ) {
936 struct r5dev *dev = &sh->dev[i];
939 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
942 spin_lock(&sh->lock);
943 chosen = dev->towrite;
945 BUG_ON(dev->written);
946 wbi = dev->written = chosen;
947 spin_unlock(&sh->lock);
949 while (wbi && wbi->bi_sector <
950 dev->sector + STRIPE_SECTORS) {
951 tx = async_copy_data(1, wbi, dev->page,
953 wbi = r5_next_bio(wbi, dev->sector);
961 static void ops_complete_reconstruct(void *stripe_head_ref)
963 struct stripe_head *sh = stripe_head_ref;
964 int disks = sh->disks;
965 int pd_idx = sh->pd_idx;
966 int qd_idx = sh->qd_idx;
969 pr_debug("%s: stripe %llu\n", __func__,
970 (unsigned long long)sh->sector);
972 for (i = disks; i--; ) {
973 struct r5dev *dev = &sh->dev[i];
975 if (dev->written || i == pd_idx || i == qd_idx)
976 set_bit(R5_UPTODATE, &dev->flags);
979 if (sh->reconstruct_state == reconstruct_state_drain_run)
980 sh->reconstruct_state = reconstruct_state_drain_result;
981 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
982 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
984 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
985 sh->reconstruct_state = reconstruct_state_result;
988 set_bit(STRIPE_HANDLE, &sh->state);
993 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
994 struct dma_async_tx_descriptor *tx)
996 int disks = sh->disks;
997 struct page **xor_srcs = percpu->scribble;
998 struct async_submit_ctl submit;
999 int count = 0, pd_idx = sh->pd_idx, i;
1000 struct page *xor_dest;
1002 unsigned long flags;
1004 pr_debug("%s: stripe %llu\n", __func__,
1005 (unsigned long long)sh->sector);
1007 /* check if prexor is active which means only process blocks
1008 * that are part of a read-modify-write (written)
1010 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1012 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1013 for (i = disks; i--; ) {
1014 struct r5dev *dev = &sh->dev[i];
1016 xor_srcs[count++] = dev->page;
1019 xor_dest = sh->dev[pd_idx].page;
1020 for (i = disks; i--; ) {
1021 struct r5dev *dev = &sh->dev[i];
1023 xor_srcs[count++] = dev->page;
1027 /* 1/ if we prexor'd then the dest is reused as a source
1028 * 2/ if we did not prexor then we are redoing the parity
1029 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1030 * for the synchronous xor case
1032 flags = ASYNC_TX_ACK |
1033 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1035 atomic_inc(&sh->count);
1037 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1038 to_addr_conv(sh, percpu));
1039 if (unlikely(count == 1))
1040 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1042 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1046 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1047 struct dma_async_tx_descriptor *tx)
1049 struct async_submit_ctl submit;
1050 struct page **blocks = percpu->scribble;
1053 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1055 count = set_syndrome_sources(blocks, sh);
1057 atomic_inc(&sh->count);
1059 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1060 sh, to_addr_conv(sh, percpu));
1061 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1064 static void ops_complete_check(void *stripe_head_ref)
1066 struct stripe_head *sh = stripe_head_ref;
1068 pr_debug("%s: stripe %llu\n", __func__,
1069 (unsigned long long)sh->sector);
1071 sh->check_state = check_state_check_result;
1072 set_bit(STRIPE_HANDLE, &sh->state);
1076 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1078 int disks = sh->disks;
1079 int pd_idx = sh->pd_idx;
1080 int qd_idx = sh->qd_idx;
1081 struct page *xor_dest;
1082 struct page **xor_srcs = percpu->scribble;
1083 struct dma_async_tx_descriptor *tx;
1084 struct async_submit_ctl submit;
1088 pr_debug("%s: stripe %llu\n", __func__,
1089 (unsigned long long)sh->sector);
1092 xor_dest = sh->dev[pd_idx].page;
1093 xor_srcs[count++] = xor_dest;
1094 for (i = disks; i--; ) {
1095 if (i == pd_idx || i == qd_idx)
1097 xor_srcs[count++] = sh->dev[i].page;
1100 init_async_submit(&submit, 0, NULL, NULL, NULL,
1101 to_addr_conv(sh, percpu));
1102 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1103 &sh->ops.zero_sum_result, &submit);
1105 atomic_inc(&sh->count);
1106 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1107 tx = async_trigger_callback(&submit);
1110 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1112 struct page **srcs = percpu->scribble;
1113 struct async_submit_ctl submit;
1116 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1117 (unsigned long long)sh->sector, checkp);
1119 count = set_syndrome_sources(srcs, sh);
1123 atomic_inc(&sh->count);
1124 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1125 sh, to_addr_conv(sh, percpu));
1126 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1127 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1130 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1132 int overlap_clear = 0, i, disks = sh->disks;
1133 struct dma_async_tx_descriptor *tx = NULL;
1134 raid5_conf_t *conf = sh->raid_conf;
1135 int level = conf->level;
1136 struct raid5_percpu *percpu;
1140 percpu = per_cpu_ptr(conf->percpu, cpu);
1141 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1142 ops_run_biofill(sh);
1146 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1148 tx = ops_run_compute5(sh, percpu);
1150 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1151 tx = ops_run_compute6_1(sh, percpu);
1153 tx = ops_run_compute6_2(sh, percpu);
1155 /* terminate the chain if reconstruct is not set to be run */
1156 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1160 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1161 tx = ops_run_prexor(sh, percpu, tx);
1163 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1164 tx = ops_run_biodrain(sh, tx);
1168 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1170 ops_run_reconstruct5(sh, percpu, tx);
1172 ops_run_reconstruct6(sh, percpu, tx);
1175 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1176 if (sh->check_state == check_state_run)
1177 ops_run_check_p(sh, percpu);
1178 else if (sh->check_state == check_state_run_q)
1179 ops_run_check_pq(sh, percpu, 0);
1180 else if (sh->check_state == check_state_run_pq)
1181 ops_run_check_pq(sh, percpu, 1);
1187 for (i = disks; i--; ) {
1188 struct r5dev *dev = &sh->dev[i];
1189 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1190 wake_up(&sh->raid_conf->wait_for_overlap);
1195 static int grow_one_stripe(raid5_conf_t *conf)
1197 struct stripe_head *sh;
1198 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1201 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
1202 sh->raid_conf = conf;
1203 spin_lock_init(&sh->lock);
1205 if (grow_buffers(sh, conf->raid_disks)) {
1206 shrink_buffers(sh, conf->raid_disks);
1207 kmem_cache_free(conf->slab_cache, sh);
1210 sh->disks = conf->raid_disks;
1211 /* we just created an active stripe so... */
1212 atomic_set(&sh->count, 1);
1213 atomic_inc(&conf->active_stripes);
1214 INIT_LIST_HEAD(&sh->lru);
1219 static int grow_stripes(raid5_conf_t *conf, int num)
1221 struct kmem_cache *sc;
1222 int devs = conf->raid_disks;
1224 sprintf(conf->cache_name[0],
1225 "raid%d-%s", conf->level, mdname(conf->mddev));
1226 sprintf(conf->cache_name[1],
1227 "raid%d-%s-alt", conf->level, mdname(conf->mddev));
1228 conf->active_name = 0;
1229 sc = kmem_cache_create(conf->cache_name[conf->active_name],
1230 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1234 conf->slab_cache = sc;
1235 conf->pool_size = devs;
1237 if (!grow_one_stripe(conf))
1243 * scribble_len - return the required size of the scribble region
1244 * @num - total number of disks in the array
1246 * The size must be enough to contain:
1247 * 1/ a struct page pointer for each device in the array +2
1248 * 2/ room to convert each entry in (1) to its corresponding dma
1249 * (dma_map_page()) or page (page_address()) address.
1251 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1252 * calculate over all devices (not just the data blocks), using zeros in place
1253 * of the P and Q blocks.
1255 static size_t scribble_len(int num)
1259 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
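/*
 * Worked example (illustrative, assuming sizeof(struct page *) ==
 * sizeof(addr_conv_t) == 8): for num == 8 devices,
 * len = 8 * (8 + 2) + 8 * (8 + 2) = 160 bytes, i.e. ten page
 * pointers followed by ten addr_conv_t slots, which is exactly the
 * split that to_addr_conv() assumes.
 */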
1264 static int resize_stripes(raid5_conf_t *conf, int newsize)
1266 /* Make all the stripes able to hold 'newsize' devices.
1267 * New slots in each stripe get 'page' set to a new page.
1269 * This happens in stages:
1270 * 1/ create a new kmem_cache and allocate the required number of
1272 * 2/ gather all the old stripe_heads and transfer the pages across
1273 * to the new stripe_heads. This will have the side effect of
1274 * freezing the array as once all stripe_heads have been collected,
1275 * no IO will be possible. Old stripe heads are freed once their
1276 * pages have been transferred over, and the old kmem_cache is
1277 * freed when all stripes are done.
1278 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
1279 * we simply return a failure status - no need to clean anything up.
1280 * 4/ allocate new pages for the new slots in the new stripe_heads.
1281 * If this fails, we don't bother trying to shrink the
1282 * stripe_heads down again, we just leave them as they are.
1283 * As each stripe_head is processed the new one is released into service.
1286 * Once step2 is started, we cannot afford to wait for a write,
1287 * so we use GFP_NOIO allocations.
1289 struct stripe_head *osh, *nsh;
1290 LIST_HEAD(newstripes);
1291 struct disk_info *ndisks;
1294 struct kmem_cache *sc;
1297 if (newsize <= conf->pool_size)
1298 return 0; /* never bother to shrink */
1300 err = md_allow_write(conf->mddev);
1305 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1306 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1311 for (i = conf->max_nr_stripes; i; i--) {
1312 nsh = kmem_cache_alloc(sc, GFP_KERNEL);
1316 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1318 nsh->raid_conf = conf;
1319 spin_lock_init(&nsh->lock);
1321 list_add(&nsh->lru, &newstripes);
1324 /* didn't get enough, give up */
1325 while (!list_empty(&newstripes)) {
1326 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1327 list_del(&nsh->lru);
1328 kmem_cache_free(sc, nsh);
1330 kmem_cache_destroy(sc);
1333 /* Step 2 - Must use GFP_NOIO now.
1334 * OK, we have enough stripes, start collecting inactive
1335 * stripes and copying them over
1337 list_for_each_entry(nsh, &newstripes, lru) {
1338 spin_lock_irq(&conf->device_lock);
1339 wait_event_lock_irq(conf->wait_for_stripe,
1340 !list_empty(&conf->inactive_list),
1342 unplug_slaves(conf->mddev)
1344 osh = get_free_stripe(conf);
1345 spin_unlock_irq(&conf->device_lock);
1346 atomic_set(&nsh->count, 1);
1347 for(i=0; i<conf->pool_size; i++)
1348 nsh->dev[i].page = osh->dev[i].page;
1349 for( ; i<newsize; i++)
1350 nsh->dev[i].page = NULL;
1351 kmem_cache_free(conf->slab_cache, osh);
1353 kmem_cache_destroy(conf->slab_cache);
1356 * At this point, we are holding all the stripes so the array
1357 * is completely stalled, so now is a good time to resize
1358 * conf->disks and the scribble region
1360 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1362 for (i=0; i<conf->raid_disks; i++)
1363 ndisks[i] = conf->disks[i];
1365 conf->disks = ndisks;
1370 conf->scribble_len = scribble_len(newsize);
1371 for_each_present_cpu(cpu) {
1372 struct raid5_percpu *percpu;
1375 percpu = per_cpu_ptr(conf->percpu, cpu);
1376 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1379 kfree(percpu->scribble);
1380 percpu->scribble = scribble;
1388 /* Step 4, return new stripes to service */
1389 while(!list_empty(&newstripes)) {
1390 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1391 list_del_init(&nsh->lru);
1393 for (i=conf->raid_disks; i < newsize; i++)
1394 if (nsh->dev[i].page == NULL) {
1395 struct page *p = alloc_page(GFP_NOIO);
1396 nsh->dev[i].page = p;
1400 release_stripe(nsh);
1402 /* critical section passed, GFP_NOIO no longer needed */
1404 conf->slab_cache = sc;
1405 conf->active_name = 1-conf->active_name;
1406 conf->pool_size = newsize;
1410 static int drop_one_stripe(raid5_conf_t *conf)
1412 struct stripe_head *sh;
1414 spin_lock_irq(&conf->device_lock);
1415 sh = get_free_stripe(conf);
1416 spin_unlock_irq(&conf->device_lock);
1419 BUG_ON(atomic_read(&sh->count));
1420 shrink_buffers(sh, conf->pool_size);
1421 kmem_cache_free(conf->slab_cache, sh);
1422 atomic_dec(&conf->active_stripes);
1426 static void shrink_stripes(raid5_conf_t *conf)
1428 while (drop_one_stripe(conf))
1431 if (conf->slab_cache)
1432 kmem_cache_destroy(conf->slab_cache);
1433 conf->slab_cache = NULL;
1436 static void raid5_end_read_request(struct bio * bi, int error)
1438 struct stripe_head *sh = bi->bi_private;
1439 raid5_conf_t *conf = sh->raid_conf;
1440 int disks = sh->disks, i;
1441 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1442 char b[BDEVNAME_SIZE];
1446 for (i=0 ; i<disks; i++)
1447 if (bi == &sh->dev[i].req)
1450 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1451 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1459 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1460 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1461 rdev = conf->disks[i].rdev;
1462 printk_rl(KERN_INFO "raid5:%s: read error corrected"
1463 " (%lu sectors at %llu on %s)\n",
1464 mdname(conf->mddev), STRIPE_SECTORS,
1465 (unsigned long long)(sh->sector
1466 + rdev->data_offset),
1467 bdevname(rdev->bdev, b));
1468 clear_bit(R5_ReadError, &sh->dev[i].flags);
1469 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1471 if (atomic_read(&conf->disks[i].rdev->read_errors))
1472 atomic_set(&conf->disks[i].rdev->read_errors, 0);
1474 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1476 rdev = conf->disks[i].rdev;
1478 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1479 atomic_inc(&rdev->read_errors);
1480 if (conf->mddev->degraded)
1481 printk_rl(KERN_WARNING
1482 "raid5:%s: read error not correctable "
1483 "(sector %llu on %s).\n",
1484 mdname(conf->mddev),
1485 (unsigned long long)(sh->sector
1486 + rdev->data_offset),
1488 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1490 printk_rl(KERN_WARNING
1491 "raid5:%s: read error NOT corrected!! "
1492 "(sector %llu on %s).\n",
1493 mdname(conf->mddev),
1494 (unsigned long long)(sh->sector
1495 + rdev->data_offset),
1497 else if (atomic_read(&rdev->read_errors)
1498 > conf->max_nr_stripes)
1500 "raid5:%s: Too many read errors, failing device %s.\n",
1501 mdname(conf->mddev), bdn);
1505 set_bit(R5_ReadError, &sh->dev[i].flags);
1507 clear_bit(R5_ReadError, &sh->dev[i].flags);
1508 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1509 md_error(conf->mddev, rdev);
1512 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1513 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1514 set_bit(STRIPE_HANDLE, &sh->state);
1518 static void raid5_end_write_request(struct bio *bi, int error)
1520 struct stripe_head *sh = bi->bi_private;
1521 raid5_conf_t *conf = sh->raid_conf;
1522 int disks = sh->disks, i;
1523 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1525 for (i=0 ; i<disks; i++)
1526 if (bi == &sh->dev[i].req)
1529 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1530 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1538 md_error(conf->mddev, conf->disks[i].rdev);
1540 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1542 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1543 set_bit(STRIPE_HANDLE, &sh->state);
1548 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1550 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1552 struct r5dev *dev = &sh->dev[i];
1554 bio_init(&dev->req);
1555 dev->req.bi_io_vec = &dev->vec;
1557 dev->req.bi_max_vecs++;
1558 dev->vec.bv_page = dev->page;
1559 dev->vec.bv_len = STRIPE_SIZE;
1560 dev->vec.bv_offset = 0;
1562 dev->req.bi_sector = sh->sector;
1563 dev->req.bi_private = sh;
1566 dev->sector = compute_blocknr(sh, i, previous);
1569 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1571 char b[BDEVNAME_SIZE];
1572 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1573 pr_debug("raid5: error called\n");
1575 if (!test_bit(Faulty, &rdev->flags)) {
1576 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1577 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1578 unsigned long flags;
1579 spin_lock_irqsave(&conf->device_lock, flags);
1581 spin_unlock_irqrestore(&conf->device_lock, flags);
1583 * if recovery was running, make sure it aborts.
1585 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1587 set_bit(Faulty, &rdev->flags);
1589 "raid5: Disk failure on %s, disabling device.\n"
1590 "raid5: Operation continuing on %d devices.\n",
1591 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1596 * Input: a 'big' sector number,
1597 * Output: index of the data and parity disk, and the sector # in them.
1599 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1600 int previous, int *dd_idx,
1601 struct stripe_head *sh)
1604 unsigned long chunk_number;
1605 unsigned int chunk_offset;
1608 sector_t new_sector;
1609 int algorithm = previous ? conf->prev_algo
1611 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
1612 : (conf->chunk_size >> 9);
1613 int raid_disks = previous ? conf->previous_raid_disks
1615 int data_disks = raid_disks - conf->max_degraded;
1617 /* First compute the information on this sector */
1620 * Compute the chunk number and the sector offset inside the chunk
1622 chunk_offset = sector_div(r_sector, sectors_per_chunk);
1623 chunk_number = r_sector;
1624 BUG_ON(r_sector != chunk_number);
1627 * Compute the stripe number
1629 stripe = chunk_number / data_disks;
1632 * Compute the data disk and parity disk indexes inside the stripe
1634 *dd_idx = chunk_number % data_disks;
1637 * Select the parity disk based on the user selected algorithm.
1639 pd_idx = qd_idx = ~0;
1640 switch(conf->level) {
1642 pd_idx = data_disks;
1645 switch (algorithm) {
1646 case ALGORITHM_LEFT_ASYMMETRIC:
1647 pd_idx = data_disks - stripe % raid_disks;
1648 if (*dd_idx >= pd_idx)
1651 case ALGORITHM_RIGHT_ASYMMETRIC:
1652 pd_idx = stripe % raid_disks;
1653 if (*dd_idx >= pd_idx)
1656 case ALGORITHM_LEFT_SYMMETRIC:
1657 pd_idx = data_disks - stripe % raid_disks;
1658 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1660 case ALGORITHM_RIGHT_SYMMETRIC:
1661 pd_idx = stripe % raid_disks;
1662 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1664 case ALGORITHM_PARITY_0:
1668 case ALGORITHM_PARITY_N:
1669 pd_idx = data_disks;
1672 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1679 switch (algorithm) {
1680 case ALGORITHM_LEFT_ASYMMETRIC:
1681 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1682 qd_idx = pd_idx + 1;
1683 if (pd_idx == raid_disks-1) {
1684 (*dd_idx)++; /* Q D D D P */
1686 } else if (*dd_idx >= pd_idx)
1687 (*dd_idx) += 2; /* D D P Q D */
1689 case ALGORITHM_RIGHT_ASYMMETRIC:
1690 pd_idx = stripe % raid_disks;
1691 qd_idx = pd_idx + 1;
1692 if (pd_idx == raid_disks-1) {
1693 (*dd_idx)++; /* Q D D D P */
1695 } else if (*dd_idx >= pd_idx)
1696 (*dd_idx) += 2; /* D D P Q D */
1698 case ALGORITHM_LEFT_SYMMETRIC:
1699 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1700 qd_idx = (pd_idx + 1) % raid_disks;
1701 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1703 case ALGORITHM_RIGHT_SYMMETRIC:
1704 pd_idx = stripe % raid_disks;
1705 qd_idx = (pd_idx + 1) % raid_disks;
1706 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1709 case ALGORITHM_PARITY_0:
1714 case ALGORITHM_PARITY_N:
1715 pd_idx = data_disks;
1716 qd_idx = data_disks + 1;
1719 case ALGORITHM_ROTATING_ZERO_RESTART:
1720 * Exactly the same as RIGHT_ASYMMETRIC, but the order
1721 * of blocks for computing Q is different.
1723 pd_idx = stripe % raid_disks;
1724 qd_idx = pd_idx + 1;
1725 if (pd_idx == raid_disks-1) {
1726 (*dd_idx)++; /* Q D D D P */
1728 } else if (*dd_idx >= pd_idx)
1729 (*dd_idx) += 2; /* D D P Q D */
1733 case ALGORITHM_ROTATING_N_RESTART:
1734 * Same as left_asymmetric, but the first stripe is
1735 * D D D P Q rather than
1738 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
1739 qd_idx = pd_idx + 1;
1740 if (pd_idx == raid_disks-1) {
1741 (*dd_idx)++; /* Q D D D P */
1743 } else if (*dd_idx >= pd_idx)
1744 (*dd_idx) += 2; /* D D P Q D */
1748 case ALGORITHM_ROTATING_N_CONTINUE:
1749 /* Same as left_symmetric but Q is before P */
1750 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1751 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1752 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1756 case ALGORITHM_LEFT_ASYMMETRIC_6:
1757 /* RAID5 left_asymmetric, with Q on last device */
1758 pd_idx = data_disks - stripe % (raid_disks-1);
1759 if (*dd_idx >= pd_idx)
1761 qd_idx = raid_disks - 1;
1764 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1765 pd_idx = stripe % (raid_disks-1);
1766 if (*dd_idx >= pd_idx)
1768 qd_idx = raid_disks - 1;
1771 case ALGORITHM_LEFT_SYMMETRIC_6:
1772 pd_idx = data_disks - stripe % (raid_disks-1);
1773 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1774 qd_idx = raid_disks - 1;
1777 case ALGORITHM_RIGHT_SYMMETRIC_6:
1778 pd_idx = stripe % (raid_disks-1);
1779 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1780 qd_idx = raid_disks - 1;
1783 case ALGORITHM_PARITY_0_6:
1786 qd_idx = raid_disks - 1;
1791 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1799 sh->pd_idx = pd_idx;
1800 sh->qd_idx = qd_idx;
1801 sh->ddf_layout = ddf_layout;
1804 * Finally, compute the new sector number
1806 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
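/*
 * Worked example (illustrative): 4-disk RAID5, left-symmetric,
 * 64K chunks (sectors_per_chunk == 128), r_sector == 300:
 *   chunk_offset = 300 % 128 = 44, chunk_number = 2,
 *   stripe = 2 / 3 = 0, dd_idx = 2 % 3 = 2,
 *   pd_idx = 3 - 0 % 4 = 3, dd_idx = (3 + 1 + 2) % 4 = 2,
 *   new_sector = 0 * 128 + 44 = 44,
 * so logical sector 300 lives at sector 44 of device 2, with parity
 * on device 3 for that stripe.
 */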
1811 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1813 raid5_conf_t *conf = sh->raid_conf;
1814 int raid_disks = sh->disks;
1815 int data_disks = raid_disks - conf->max_degraded;
1816 sector_t new_sector = sh->sector, check;
1817 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
1818 : (conf->chunk_size >> 9);
1819 int algorithm = previous ? conf->prev_algo
1823 int chunk_number, dummy1, dd_idx = i;
1825 struct stripe_head sh2;
1828 chunk_offset = sector_div(new_sector, sectors_per_chunk);
1829 stripe = new_sector;
1830 BUG_ON(new_sector != stripe);
1832 if (i == sh->pd_idx)
1834 switch(conf->level) {
1837 switch (algorithm) {
1838 case ALGORITHM_LEFT_ASYMMETRIC:
1839 case ALGORITHM_RIGHT_ASYMMETRIC:
1843 case ALGORITHM_LEFT_SYMMETRIC:
1844 case ALGORITHM_RIGHT_SYMMETRIC:
1847 i -= (sh->pd_idx + 1);
1849 case ALGORITHM_PARITY_0:
1852 case ALGORITHM_PARITY_N:
1855 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1861 if (i == sh->qd_idx)
1862 return 0; /* It is the Q disk */
1863 switch (algorithm) {
1864 case ALGORITHM_LEFT_ASYMMETRIC:
1865 case ALGORITHM_RIGHT_ASYMMETRIC:
1866 case ALGORITHM_ROTATING_ZERO_RESTART:
1867 case ALGORITHM_ROTATING_N_RESTART:
1868 if (sh->pd_idx == raid_disks-1)
1869 i--; /* Q D D D P */
1870 else if (i > sh->pd_idx)
1871 i -= 2; /* D D P Q D */
1873 case ALGORITHM_LEFT_SYMMETRIC:
1874 case ALGORITHM_RIGHT_SYMMETRIC:
1875 if (sh->pd_idx == raid_disks-1)
1876 i--; /* Q D D D P */
1881 i -= (sh->pd_idx + 2);
1884 case ALGORITHM_PARITY_0:
1887 case ALGORITHM_PARITY_N:
1889 case ALGORITHM_ROTATING_N_CONTINUE:
1890 if (sh->pd_idx == 0)
1891 i--; /* P D D D Q */
1892 else if (i > sh->pd_idx)
1893 i -= 2; /* D D Q P D */
1895 case ALGORITHM_LEFT_ASYMMETRIC_6:
1896 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1900 case ALGORITHM_LEFT_SYMMETRIC_6:
1901 case ALGORITHM_RIGHT_SYMMETRIC_6:
1903 i += data_disks + 1;
1904 i -= (sh->pd_idx + 1);
1906 case ALGORITHM_PARITY_0_6:
1910 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1917 chunk_number = stripe * data_disks + i;
1918 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1920 check = raid5_compute_sector(conf, r_sector,
1921 previous, &dummy1, &sh2);
1922 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1923 || sh2.qd_idx != sh->qd_idx) {
1924 printk(KERN_ERR "compute_blocknr: map not correct\n");
1932 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1933 int rcw, int expand)
1935 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1936 raid5_conf_t *conf = sh->raid_conf;
1937 int level = conf->level;
1940 /* if we are not expanding this is a proper write request, and
1941 * there will be bios with new data to be drained into the
1945 sh->reconstruct_state = reconstruct_state_drain_run;
1946 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1948 sh->reconstruct_state = reconstruct_state_run;
1950 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1952 for (i = disks; i--; ) {
1953 struct r5dev *dev = &sh->dev[i];
1956 set_bit(R5_LOCKED, &dev->flags);
1957 set_bit(R5_Wantdrain, &dev->flags);
1959 clear_bit(R5_UPTODATE, &dev->flags);
1963 if (s->locked + conf->max_degraded == disks)
1964 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1965 atomic_inc(&conf->pending_full_writes);
1968 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1969 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1971 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1972 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1973 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1974 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1976 for (i = disks; i--; ) {
1977 struct r5dev *dev = &sh->dev[i];
1982 (test_bit(R5_UPTODATE, &dev->flags) ||
1983 test_bit(R5_Wantcompute, &dev->flags))) {
1984 set_bit(R5_Wantdrain, &dev->flags);
1985 set_bit(R5_LOCKED, &dev->flags);
1986 clear_bit(R5_UPTODATE, &dev->flags);
1992 /* keep the parity disk(s) locked while asynchronous operations
1995 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1996 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2000 int qd_idx = sh->qd_idx;
2001 struct r5dev *dev = &sh->dev[qd_idx];
2003 set_bit(R5_LOCKED, &dev->flags);
2004 clear_bit(R5_UPTODATE, &dev->flags);
2008 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2009 __func__, (unsigned long long)sh->sector,
2010 s->locked, s->ops_request);
2014 * Each stripe/dev can have one or more bion attached.
2015 * toread/towrite point to the first in a chain.
2016 * The bi_next chain must be in order.
2018 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2021 raid5_conf_t *conf = sh->raid_conf;
2024 pr_debug("adding bh b#%llu to stripe s#%llu\n",
2025 (unsigned long long)bi->bi_sector,
2026 (unsigned long long)sh->sector);
2029 spin_lock(&sh->lock);
2030 spin_lock_irq(&conf->device_lock);
2032 bip = &sh->dev[dd_idx].towrite;
2033 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2036 bip = &sh->dev[dd_idx].toread;
2037 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2038 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2040 bip = & (*bip)->bi_next;
2042 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2045 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2049 bi->bi_phys_segments++;
2050 spin_unlock_irq(&conf->device_lock);
2051 spin_unlock(&sh->lock);
2053 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2054 (unsigned long long)bi->bi_sector,
2055 (unsigned long long)sh->sector, dd_idx);
2057 if (conf->mddev->bitmap && firstwrite) {
2058 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2060 sh->bm_seq = conf->seq_flush+1;
2061 set_bit(STRIPE_BIT_DELAY, &sh->state);
2065 /* check if page is covered */
2066 sector_t sector = sh->dev[dd_idx].sector;
2067 for (bi=sh->dev[dd_idx].towrite;
2068 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2069 bi && bi->bi_sector <= sector;
2070 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2071 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2072 sector = bi->bi_sector + (bi->bi_size>>9);
2074 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2075 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
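/*
 * Example (illustrative): with STRIPE_SECTORS == 8 and
 * dev->sector == 100, two queued writes covering sectors 100-103 and
 * 104-107 advance 'sector' to 104 and then 108, so the loop ends with
 * sector >= 108 and R5_OVERWRITE is set; a single 4-sector write
 * would leave sector == 104 and the flag clear.
 */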
2080 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2081 spin_unlock_irq(&conf->device_lock);
2082 spin_unlock(&sh->lock);
2086 static void end_reshape(raid5_conf_t *conf);
2088 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2089 struct stripe_head *sh)
2091 int sectors_per_chunk =
2092 previous ? (conf->prev_chunk >> 9)
2093 : (conf->chunk_size >> 9);
2095 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2096 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2098 raid5_compute_sector(conf,
2099 stripe * (disks - conf->max_degraded)
2100 *sectors_per_chunk + chunk_offset,
2106 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2107 struct stripe_head_state *s, int disks,
2108 struct bio **return_bi)
2111 for (i = disks; i--; ) {
2115 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2118 rdev = rcu_dereference(conf->disks[i].rdev);
2119 if (rdev && test_bit(In_sync, &rdev->flags))
2120 /* multiple read failures in one stripe */
2121 md_error(conf->mddev, rdev);
2124 spin_lock_irq(&conf->device_lock);
2125 /* fail all writes first */
2126 bi = sh->dev[i].towrite;
2127 sh->dev[i].towrite = NULL;
2133 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2134 wake_up(&conf->wait_for_overlap);
2136 while (bi && bi->bi_sector <
2137 sh->dev[i].sector + STRIPE_SECTORS) {
2138 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2139 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2140 if (!raid5_dec_bi_phys_segments(bi)) {
2141 md_write_end(conf->mddev);
2142 bi->bi_next = *return_bi;
2147 /* and fail all 'written' */
2148 bi = sh->dev[i].written;
2149 sh->dev[i].written = NULL;
2150 if (bi) bitmap_end = 1;
2151 while (bi && bi->bi_sector <
2152 sh->dev[i].sector + STRIPE_SECTORS) {
2153 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2154 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2155 if (!raid5_dec_bi_phys_segments(bi)) {
2156 md_write_end(conf->mddev);
2157 bi->bi_next = *return_bi;
2163 /* fail any reads if this device is non-operational and
2164 * the data has not reached the cache yet.
2166 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2167 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2168 test_bit(R5_ReadError, &sh->dev[i].flags))) {
2169 bi = sh->dev[i].toread;
2170 sh->dev[i].toread = NULL;
2171 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2172 wake_up(&conf->wait_for_overlap);
2173 if (bi) s->to_read--;
2174 while (bi && bi->bi_sector <
2175 sh->dev[i].sector + STRIPE_SECTORS) {
2176 struct bio *nextbi =
2177 r5_next_bio(bi, sh->dev[i].sector);
2178 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2179 if (!raid5_dec_bi_phys_segments(bi)) {
2180 bi->bi_next = *return_bi;
2186 spin_unlock_irq(&conf->device_lock);
2188 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2189 STRIPE_SECTORS, 0, 0);
2192 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2193 if (atomic_dec_and_test(&conf->pending_full_writes))
2194 md_wakeup_thread(conf->mddev->thread);
2197 /* fetch_block5 - checks the given member device to see if its data needs
2198 * to be read or computed to satisfy a request.
2200 * Returns 1 when no more member devices need to be checked, otherwise returns
2201 * 0 to tell the loop in handle_stripe_fill5 to continue
2203 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2204 int disk_idx, int disks)
2206 struct r5dev *dev = &sh->dev[disk_idx];
2207 struct r5dev *failed_dev = &sh->dev[s->failed_num];
2209 /* is the data in this block needed, and can we get it? */
2210 if (!test_bit(R5_LOCKED, &dev->flags) &&
2211 !test_bit(R5_UPTODATE, &dev->flags) &&
2213 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2214 s->syncing || s->expanding ||
2216 (failed_dev->toread ||
2217 (failed_dev->towrite &&
2218 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
2219 /* We would like to get this block, possibly by computing it,
2220 * otherwise read it if the backing disk is insync
2222 if ((s->uptodate == disks - 1) &&
2223 (s->failed && disk_idx == s->failed_num)) {
2224 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2225 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2226 set_bit(R5_Wantcompute, &dev->flags);
2227 sh->ops.target = disk_idx;
2228 sh->ops.target2 = -1;
2230 /* Careful: from this point on 'uptodate' is in the eye
2231 * of raid_run_ops which services 'compute' operations
2232 * before writes. R5_Wantcompute flags a block that will
2233 * be R5_UPTODATE by the time it is needed for a
2234 * subsequent operation.
2237 return 1; /* uptodate + compute == disks */
2238 } else if (test_bit(R5_Insync, &dev->flags)) {
2239 set_bit(R5_LOCKED, &dev->flags);
2240 set_bit(R5_Wantread, &dev->flags);
2242 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2251 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
2253 static void handle_stripe_fill5(struct stripe_head *sh,
2254 struct stripe_head_state *s, int disks)
2258 /* look for blocks to read/compute, skip this if a compute
2259 * is already in flight, or if the stripe contents are in the
2260 * midst of changing due to a write
2262 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2263 !sh->reconstruct_state)
2264 for (i = disks; i--; )
2265 if (fetch_block5(sh, s, i, disks))
2267 set_bit(STRIPE_HANDLE, &sh->state);
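/*
 * Illustrative sketch (not part of the driver): when fetch_block5() finds
 * that every block except one is up to date (s->uptodate == disks - 1), it
 * asks raid_run_ops to compute the missing block instead of reading it.  For
 * RAID-5 that computation is a plain XOR of all the other blocks, as the
 * hypothetical helper below shows for in-memory buffers.
 */
static void ex_xor_compute_missing(unsigned char **blocks, int nblocks,
				   int missing, unsigned long len)
{
	unsigned long off;
	int i;

	for (off = 0; off < len; off++) {
		unsigned char v = 0;

		for (i = 0; i < nblocks; i++)
			if (i != missing)
				v ^= blocks[i][off];	/* XOR of data and parity */
		blocks[missing][off] = v;		/* the reconstructed block */
	}
}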
2270 /* fetch_block6 - checks the given member device to see if its data needs
2271 * to be read or computed to satisfy a request.
2273 * Returns 1 when no more member devices need to be checked, otherwise returns
2274 * 0 to tell the loop in handle_stripe_fill6 to continue
2276 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2277 struct r6_state *r6s, int disk_idx, int disks)
2279 struct r5dev *dev = &sh->dev[disk_idx];
2280 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2281 &sh->dev[r6s->failed_num[1]] };
2283 if (!test_bit(R5_LOCKED, &dev->flags) &&
2284 !test_bit(R5_UPTODATE, &dev->flags) &&
2286 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2287 s->syncing || s->expanding ||
2289 (fdev[0]->toread || s->to_write)) ||
2291 (fdev[1]->toread || s->to_write)))) {
2292 /* We would like to get this block, possibly by computing it;
2293 * otherwise read it if the backing disk is insync
2295 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2296 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2297 if ((s->uptodate == disks - 1) &&
2298 (s->failed && (disk_idx == r6s->failed_num[0] ||
2299 disk_idx == r6s->failed_num[1]))) {
2300 /* the disk has failed and we have been asked to fetch its data;
2303 pr_debug("Computing stripe %llu block %d\n",
2304 (unsigned long long)sh->sector, disk_idx);
2305 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2306 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2307 set_bit(R5_Wantcompute, &dev->flags);
2308 sh->ops.target = disk_idx;
2309 sh->ops.target2 = -1; /* no 2nd target */
2313 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2314 /* Computing 2-failure is *very* expensive; only
2315 * do it if failed >= 2
2318 for (other = disks; other--; ) {
2319 if (other == disk_idx)
2321 if (!test_bit(R5_UPTODATE,
2322 &sh->dev[other].flags))
2326 pr_debug("Computing stripe %llu blocks %d,%d\n",
2327 (unsigned long long)sh->sector,
2329 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2330 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2331 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2332 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2333 sh->ops.target = disk_idx;
2334 sh->ops.target2 = other;
2338 } else if (test_bit(R5_Insync, &dev->flags)) {
2339 set_bit(R5_LOCKED, &dev->flags);
2340 set_bit(R5_Wantread, &dev->flags);
2342 pr_debug("Reading block %d (sync=%d)\n",
2343 disk_idx, s->syncing);
2351 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2353 static void handle_stripe_fill6(struct stripe_head *sh,
2354 struct stripe_head_state *s, struct r6_state *r6s,
2359 /* look for blocks to read/compute, skip this if a compute
2360 * is already in flight, or if the stripe contents are in the
2361 * midst of changing due to a write
2363 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2364 !sh->reconstruct_state)
2365 for (i = disks; i--; )
2366 if (fetch_block6(sh, s, r6s, i, disks))
2368 set_bit(STRIPE_HANDLE, &sh->state);
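/*
 * Illustrative sketch (not part of the driver): the two-target compute branch
 * in fetch_block6() runs only when exactly two blocks are missing
 * (s->uptodate == disks - 2), so the second compute target is simply the one
 * other block that is not up to date.  The hypothetical helper below mirrors
 * that scan over an array of per-block "uptodate" flags.
 */
static int ex_find_other_missing(const int *uptodate, int disks, int disk_idx)
{
	int other;

	for (other = disks; other--; ) {
		if (other == disk_idx)
			continue;
		if (!uptodate[other])
			return other;	/* the only other missing block */
	}
	return -1;	/* cannot happen when exactly two blocks are missing */
}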
2372 /* handle_stripe_clean_event
2373 * any written block on an uptodate or failed drive can be returned.
2374 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2375 * never LOCKED, so we don't need to test 'failed' directly.
2377 static void handle_stripe_clean_event(raid5_conf_t *conf,
2378 struct stripe_head *sh, int disks, struct bio **return_bi)
2383 for (i = disks; i--; )
2384 if (sh->dev[i].written) {
2386 if (!test_bit(R5_LOCKED, &dev->flags) &&
2387 test_bit(R5_UPTODATE, &dev->flags)) {
2388 /* We can return any write requests */
2389 struct bio *wbi, *wbi2;
2391 pr_debug("Return write for disc %d\n", i);
2392 spin_lock_irq(&conf->device_lock);
2394 dev->written = NULL;
2395 while (wbi && wbi->bi_sector <
2396 dev->sector + STRIPE_SECTORS) {
2397 wbi2 = r5_next_bio(wbi, dev->sector);
2398 if (!raid5_dec_bi_phys_segments(wbi)) {
2399 md_write_end(conf->mddev);
2400 wbi->bi_next = *return_bi;
2405 if (dev->towrite == NULL)
2407 spin_unlock_irq(&conf->device_lock);
2409 bitmap_endwrite(conf->mddev->bitmap,
2412 !test_bit(STRIPE_DEGRADED, &sh->state),
2417 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2418 if (atomic_dec_and_test(&conf->pending_full_writes))
2419 md_wakeup_thread(conf->mddev->thread);
2422 static void handle_stripe_dirtying5(raid5_conf_t *conf,
2423 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2425 int rmw = 0, rcw = 0, i;
2426 for (i = disks; i--; ) {
2427 /* would I have to read this buffer for read_modify_write */
2428 struct r5dev *dev = &sh->dev[i];
2429 if ((dev->towrite || i == sh->pd_idx) &&
2430 !test_bit(R5_LOCKED, &dev->flags) &&
2431 !(test_bit(R5_UPTODATE, &dev->flags) ||
2432 test_bit(R5_Wantcompute, &dev->flags))) {
2433 if (test_bit(R5_Insync, &dev->flags))
2436 rmw += 2*disks; /* cannot read it */
2438 /* Would I have to read this buffer for reconstruct_write */
2439 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2440 !test_bit(R5_LOCKED, &dev->flags) &&
2441 !(test_bit(R5_UPTODATE, &dev->flags) ||
2442 test_bit(R5_Wantcompute, &dev->flags))) {
2443 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2448 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2449 (unsigned long long)sh->sector, rmw, rcw);
2450 set_bit(STRIPE_HANDLE, &sh->state);
2451 if (rmw < rcw && rmw > 0)
2452 /* prefer read-modify-write, but need to get some data */
2453 for (i = disks; i--; ) {
2454 struct r5dev *dev = &sh->dev[i];
2455 if ((dev->towrite || i == sh->pd_idx) &&
2456 !test_bit(R5_LOCKED, &dev->flags) &&
2457 !(test_bit(R5_UPTODATE, &dev->flags) ||
2458 test_bit(R5_Wantcompute, &dev->flags)) &&
2459 test_bit(R5_Insync, &dev->flags)) {
2461 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2462 pr_debug("Read_old block "
2463 "%d for r-m-w\n", i);
2464 set_bit(R5_LOCKED, &dev->flags);
2465 set_bit(R5_Wantread, &dev->flags);
2468 set_bit(STRIPE_DELAYED, &sh->state);
2469 set_bit(STRIPE_HANDLE, &sh->state);
2473 if (rcw <= rmw && rcw > 0)
2474 /* want reconstruct write, but need to get some data */
2475 for (i = disks; i--; ) {
2476 struct r5dev *dev = &sh->dev[i];
2477 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2479 !test_bit(R5_LOCKED, &dev->flags) &&
2480 !(test_bit(R5_UPTODATE, &dev->flags) ||
2481 test_bit(R5_Wantcompute, &dev->flags)) &&
2482 test_bit(R5_Insync, &dev->flags)) {
2484 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2485 pr_debug("Read_old block "
2486 "%d for Reconstruct\n", i);
2487 set_bit(R5_LOCKED, &dev->flags);
2488 set_bit(R5_Wantread, &dev->flags);
2491 set_bit(STRIPE_DELAYED, &sh->state);
2492 set_bit(STRIPE_HANDLE, &sh->state);
2496 /* now if nothing is locked, and if we have enough data,
2497 * we can start a write request
2499 /* since handle_stripe can be called at any time we need to handle the
2500 * case where a compute block operation has been submitted and then a
2501 * subsequent call wants to start a write request. raid_run_ops only
2502 * handles the case where compute block and reconstruct are requested
2503 * simultaneously. If this is not the case then new writes need to be
2504 * held off until the compute completes.
2506 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2507 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2508 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2509 schedule_reconstruction(sh, s, rcw == 0, 0);
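/*
 * Illustrative sketch (not part of the driver): handle_stripe_dirtying5()
 * chooses between read-modify-write (pre-read the old data being overwritten
 * plus the old parity) and reconstruct-write (pre-read every data block that
 * is not fully overwritten) by counting the reads each plan would need and
 * taking the cheaper one before calling schedule_reconstruction().  The
 * hypothetical helper below redoes that count for simplified per-block
 * descriptors, ignoring the locked and compute-in-flight cases for brevity.
 */
struct ex_blk {
	int towrite;	/* a write is pending for this block */
	int overwrite;	/* the pending write covers the whole block */
	int uptodate;	/* current contents already in the stripe cache */
	int insync;	/* backing device is usable */
};

static int ex_prefer_rmw(const struct ex_blk *blk, int disks, int pd_idx)
{
	int rmw = 0, rcw = 0, i;

	for (i = 0; i < disks; i++) {
		/* r-m-w needs the old copy of each block (and parity) we change */
		if ((blk[i].towrite || i == pd_idx) && !blk[i].uptodate)
			rmw += blk[i].insync ? 1 : 2 * disks;	/* unreadable: big penalty */
		/* reconstruct-write needs every data block we won't overwrite */
		if (!blk[i].overwrite && i != pd_idx && !blk[i].uptodate)
			rcw += blk[i].insync ? 1 : 2 * disks;
	}
	/* prefer r-m-w only when it is strictly cheaper and actually needs reads */
	return rmw > 0 && rmw < rcw;
}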
2512 static void handle_stripe_dirtying6(raid5_conf_t *conf,
2513 struct stripe_head *sh, struct stripe_head_state *s,
2514 struct r6_state *r6s, int disks)
2516 int rcw = 0, pd_idx = sh->pd_idx, i;
2517 int qd_idx = sh->qd_idx;
2519 set_bit(STRIPE_HANDLE, &sh->state);
2520 for (i = disks; i--; ) {
2521 struct r5dev *dev = &sh->dev[i];
2522 /* check whether we are missing the data for this block */
2523 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2524 i != pd_idx && i != qd_idx &&
2525 !test_bit(R5_LOCKED, &dev->flags) &&
2526 !(test_bit(R5_UPTODATE, &dev->flags) ||
2527 test_bit(R5_Wantcompute, &dev->flags))) {
2529 if (!test_bit(R5_Insync, &dev->flags))
2530 continue; /* it's a failed drive */
2533 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2534 pr_debug("Read_old stripe %llu "
2535 "block %d for Reconstruct\n",
2536 (unsigned long long)sh->sector, i);
2537 set_bit(R5_LOCKED, &dev->flags);
2538 set_bit(R5_Wantread, &dev->flags);
2541 pr_debug("Request delayed stripe %llu "
2542 "block %d for Reconstruct\n",
2543 (unsigned long long)sh->sector, i);
2544 set_bit(STRIPE_DELAYED, &sh->state);
2545 set_bit(STRIPE_HANDLE, &sh->state);
2549 /* now if nothing is locked, and if we have enough data, we can start a
2552 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2553 s->locked == 0 && rcw == 0 &&
2554 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2555 schedule_reconstruction(sh, s, 1, 0);
2559 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2560 struct stripe_head_state *s, int disks)
2562 struct r5dev *dev = NULL;
2564 set_bit(STRIPE_HANDLE, &sh->state);
2566 switch (sh->check_state) {
2567 case check_state_idle:
2568 /* start a new check operation if there are no failures */
2569 if (s->failed == 0) {
2570 BUG_ON(s->uptodate != disks);
2571 sh->check_state = check_state_run;
2572 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2573 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2577 dev = &sh->dev[s->failed_num];
2579 case check_state_compute_result:
2580 sh->check_state = check_state_idle;
2582 dev = &sh->dev[sh->pd_idx];
2584 /* check that a write has not made the stripe insync */
2585 if (test_bit(STRIPE_INSYNC, &sh->state))
2588 /* either failed parity check, or recovery is happening */
2589 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2590 BUG_ON(s->uptodate != disks);
2592 set_bit(R5_LOCKED, &dev->flags);
2594 set_bit(R5_Wantwrite, &dev->flags);
2596 clear_bit(STRIPE_DEGRADED, &sh->state);
2597 set_bit(STRIPE_INSYNC, &sh->state);
2599 case check_state_run:
2600 break; /* we will be called again upon completion */
2601 case check_state_check_result:
2602 sh->check_state = check_state_idle;
2604 /* if a failure occurred during the check operation, leave
2605 * STRIPE_INSYNC not set and let the stripe be handled again
2610 /* handle a successful check operation: if parity is correct
2611 * we are done.  Otherwise update the mismatch count and repair
2612 * parity if !MD_RECOVERY_CHECK
2614 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2615 /* parity is correct (on disc,
2616 * not in buffer any more)
2618 set_bit(STRIPE_INSYNC, &sh->state);
2620 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2621 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2622 /* don't try to repair!! */
2623 set_bit(STRIPE_INSYNC, &sh->state);
2625 sh->check_state = check_state_compute_run;
2626 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2627 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2628 set_bit(R5_Wantcompute,
2629 &sh->dev[sh->pd_idx].flags);
2630 sh->ops.target = sh->pd_idx;
2631 sh->ops.target2 = -1;
2636 case check_state_compute_run:
2639 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2640 __func__, sh->check_state,
2641 (unsigned long long) sh->sector);
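/*
 * Illustrative sketch (not part of the driver): the check operation scheduled
 * by handle_parity_checks5() verifies RAID-5 parity by XOR-summing the data
 * blocks together with the parity block; the result must be all zeroes.  The
 * hypothetical helper below performs the same zero-sum test on in-memory
 * buffers (the driver does the real check asynchronously via raid_run_ops).
 */
static int ex_p_parity_ok(unsigned char * const *blocks, int nblocks,
			  unsigned long len)
{
	unsigned long off;
	int i;

	for (off = 0; off < len; off++) {
		unsigned char sum = 0;

		for (i = 0; i < nblocks; i++)
			sum ^= blocks[i][off];	/* data blocks XOR parity */
		if (sum)
			return 0;	/* mismatch: parity needs repair */
	}
	return 1;
}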
2647 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2648 struct stripe_head_state *s,
2649 struct r6_state *r6s, int disks)
2651 int pd_idx = sh->pd_idx;
2652 int qd_idx = sh->qd_idx;
2655 set_bit(STRIPE_HANDLE, &sh->state);
2657 BUG_ON(s->failed > 2);
2659 /* Want to check and possibly repair P and Q.
2660 * However there could be one 'failed' device, in which
2661 * case we can only check one of them, possibly using the
2662 * other to generate missing data
2665 switch (sh->check_state) {
2666 case check_state_idle:
2667 /* start a new check operation if there are < 2 failures */
2668 if (s->failed == r6s->q_failed) {
2669 /* The only possible failed device holds Q, so it
2670 * makes sense to check P (if anything else had failed,
2671 * we would have used P to recreate it).
2673 sh->check_state = check_state_run;
2675 if (!r6s->q_failed && s->failed < 2) {
2676 /* Q is not failed, and we didn't use it to generate
2677 * anything, so it makes sense to check it
2679 if (sh->check_state == check_state_run)
2680 sh->check_state = check_state_run_pq;
2682 sh->check_state = check_state_run_q;
2685 /* discard potentially stale zero_sum_result */
2686 sh->ops.zero_sum_result = 0;
2688 if (sh->check_state == check_state_run) {
2689 /* async_xor_zero_sum destroys the contents of P */
2690 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2693 if (sh->check_state >= check_state_run &&
2694 sh->check_state <= check_state_run_pq) {
2695 /* async_syndrome_zero_sum preserves P and Q, so
2696 * no need to mark them !uptodate here
2698 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2702 /* we have 2-disk failure */
2703 BUG_ON(s->failed != 2);
2705 case check_state_compute_result:
2706 sh->check_state = check_state_idle;
2708 /* check that a write has not made the stripe insync */
2709 if (test_bit(STRIPE_INSYNC, &sh->state))
2712 /* now write out any block on a failed drive,
2713 * or P or Q if they were recomputed
2715 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2716 if (s->failed == 2) {
2717 dev = &sh->dev[r6s->failed_num[1]];
2719 set_bit(R5_LOCKED, &dev->flags);
2720 set_bit(R5_Wantwrite, &dev->flags);
2722 if (s->failed >= 1) {
2723 dev = &sh->dev[r6s->failed_num[0]];
2725 set_bit(R5_LOCKED, &dev->flags);
2726 set_bit(R5_Wantwrite, &dev->flags);
2728 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2729 dev = &sh->dev[pd_idx];
2731 set_bit(R5_LOCKED, &dev->flags);
2732 set_bit(R5_Wantwrite, &dev->flags);
2734 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2735 dev = &sh->dev[qd_idx];
2737 set_bit(R5_LOCKED, &dev->flags);
2738 set_bit(R5_Wantwrite, &dev->flags);
2740 clear_bit(STRIPE_DEGRADED, &sh->state);
2742 set_bit(STRIPE_INSYNC, &sh->state);
2744 case check_state_run:
2745 case check_state_run_q:
2746 case check_state_run_pq:
2747 break; /* we will be called again upon completion */
2748 case check_state_check_result:
2749 sh->check_state = check_state_idle;
2751 /* handle a successful check operation: if parity is correct
2752 * we are done.  Otherwise update the mismatch count and repair
2753 * parity if !MD_RECOVERY_CHECK
2755 if (sh->ops.zero_sum_result == 0) {
2756 /* both parities are correct */
2758 set_bit(STRIPE_INSYNC, &sh->state);
2760 /* in contrast to the raid5 case we can validate
2761 * parity, but still have a failure to write
2764 sh->check_state = check_state_compute_result;
2765 /* Returning at this point means that we may go
2766 * off and bring p and/or q uptodate again, so
2767 * we make sure to check zero_sum_result again
2768 * to verify whether p or q need writeback
2772 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2773 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2774 /* don't try to repair!! */
2775 set_bit(STRIPE_INSYNC, &sh->state);
2777 int *target = &sh->ops.target;
2779 sh->ops.target = -1;
2780 sh->ops.target2 = -1;
2781 sh->check_state = check_state_compute_run;
2782 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2783 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2784 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2785 set_bit(R5_Wantcompute,
2786 &sh->dev[pd_idx].flags);
2788 target = &sh->ops.target2;
2791 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2792 set_bit(R5_Wantcompute,
2793 &sh->dev[qd_idx].flags);
2800 case check_state_compute_run:
2803 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2804 __func__, sh->check_state,
2805 (unsigned long long) sh->sector);
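/*
 * Illustrative sketch (not part of the driver): for RAID-6 the check must
 * also verify the Q syndrome, Q = sum over GF(2^8) of g^i * D_i, where g is
 * the generator {02} and the field is reduced by the RAID-6 polynomial 0x11d.
 * The hypothetical helpers below recompute Q byte by byte and compare it with
 * the stored Q; the driver itself does this on whole pages through the async
 * syndrome zero-sum operation referenced above.
 */
static unsigned char ex_gf_mul2(unsigned char v)
{
	/* multiply by {02} in GF(2^8), reducing by x^8 + x^4 + x^3 + x^2 + 1 */
	return (unsigned char)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

static int ex_q_syndrome_ok(unsigned char * const *data, int ndata,
			    const unsigned char *q, unsigned long len)
{
	unsigned long off;
	int i;

	for (off = 0; off < len; off++) {
		unsigned char acc = 0;

		/* Horner evaluation of sum over i of g^i * data[i][off] */
		for (i = ndata - 1; i >= 0; i--)
			acc = (unsigned char)(ex_gf_mul2(acc) ^ data[i][off]);
		if (acc != q[off])
			return 0;	/* Q mismatch: syndrome needs repair */
	}
	return 1;
}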
2810 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2811 struct r6_state *r6s)
2815 /* We have read all the blocks in this stripe and now we need to
2816 * copy some of them into a target stripe for expand.
2818 struct dma_async_tx_descriptor *tx = NULL;
2819 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2820 for (i = 0; i < sh->disks; i++)
2821 if (i != sh->pd_idx && i != sh->qd_idx) {
2823 struct stripe_head *sh2;
2824 struct async_submit_ctl submit;
2826 sector_t bn = compute_blocknr(sh, i, 1);
2827 sector_t s = raid5_compute_sector(conf, bn, 0,
2829 sh2 = get_active_stripe(conf, s, 0, 1);
2831 /* so far only the early blocks of this stripe
2832 * have been requested. When later blocks
2833 * get requested, we will try again
2836 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2837 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2838 /* must have already done this block */
2839 release_stripe(sh2);
2843 /* place all the copies on one channel */
2844 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2845 tx = async_memcpy(sh2->dev[dd_idx].page,
2846 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2849 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2850 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2851 for (j = 0; j < conf->raid_disks; j++)
2852 if (j != sh2->pd_idx &&
2853 (!r6s || j != sh2->qd_idx) &&
2854 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2856 if (j == conf->raid_disks) {
2857 set_bit(STRIPE_EXPAND_READY, &sh2->state);
2858 set_bit(STRIPE_HANDLE, &sh2->state);
2860 release_stripe(sh2);
2863 /* done submitting copies, wait for them to complete */
2866 dma_wait_for_async_tx(tx);
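/*
 * Illustrative sketch (not part of the driver): a target stripe of the new
 * geometry becomes ready for writing only once every data block (everything
 * except the P and Q slots) carries R5_Expanded, which is what the loop over
 * conf->raid_disks above decides.  The hypothetical helper below mirrors that
 * scan over an array of per-block "expanded" flags; pass qd_idx < 0 for a
 * RAID-4/5 layout that has no Q block.
 */
static int ex_expand_target_ready(const int *expanded, int raid_disks,
				  int pd_idx, int qd_idx)
{
	int j;

	for (j = 0; j < raid_disks; j++)
		if (j != pd_idx && j != qd_idx && !expanded[j])
			return 0;	/* still waiting for this block's copy */
	return 1;	/* all data blocks copied: stripe can be handled */
}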
2872 * handle_stripe - do things to a stripe.
2874 * We lock the stripe and then examine the state of various bits
2875 * to see what needs to be done.
2877 * return some read requests which now have data
2878 * return some write requests which are safely on disc
2879 * schedule a read on some buffers
2880 * schedule a write of some buffers
2881 * return confirmation of parity correctness
2883 * buffers are taken off read_list or write_list, and bh_cache buffers
2884 * get BH_Lock set before the stripe lock is released.
2888 static bool handle_stripe5(struct stripe_head *sh)
2890 raid5_conf_t *conf = sh->raid_conf;
2891 int disks = sh->disks, i;
2892 struct bio *return_bi = NULL;
2893 struct stripe_head_state s;
2895 mdk_rdev_t *blocked_rdev = NULL;
2898 memset(&s, 0, sizeof(s));
2899 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2900 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2901 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2902 sh->reconstruct_state);
2904 spin_lock(&sh->lock);
2905 clear_bit(STRIPE_HANDLE, &sh->state);
2906 clear_bit(STRIPE_DELAYED, &sh->state);
2908 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2909 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2910 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2912 /* Now to look around and see what can be done */
2914 for (i=disks; i--; ) {
2916 struct r5dev *dev = &sh->dev[i];
2917 clear_bit(R5_Insync, &dev->flags);
2919 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2920 "written %p\n", i, dev->flags, dev->toread, dev->read,
2921 dev->towrite, dev->written);
2923 /* maybe we can request a biofill operation
2925 * new wantfill requests are only permitted while
2926 * ops_complete_biofill is guaranteed to be inactive
2928 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2929 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2930 set_bit(R5_Wantfill, &dev->flags);
2932 /* now count some things */
2933 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2934 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2935 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2937 if (test_bit(R5_Wantfill, &dev->flags))
2939 else if (dev->toread)
2943 if (!test_bit(R5_OVERWRITE, &dev->flags))
2948 rdev = rcu_dereference(conf->disks[i].rdev);
2949 if (blocked_rdev == NULL &&
2950 rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2951 blocked_rdev = rdev;
2952 atomic_inc(&rdev->nr_pending);
2954 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2955 /* The ReadError flag will just be confusing now */