Merge branch 'for-linus' of git://neil.brown.name/md
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 23 May 2011 05:03:03 +0000 (22:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 23 May 2011 05:03:03 +0000 (22:03 -0700)
* 'for-linus' of git://neil.brown.name/md:
  md: allow resync_start to be set while an array is active.
  md/raid10:  reformat some loops with less indenting.
  md/raid10: remove unused variable.
  md/raid10: make more use of 'slot' in raid10d.
  md/raid10: some tidying up in fix_read_error
  md/raid1: improve handling of pages allocated for write-behind.
  md/raid1: try fix_sync_read_error before process_checks.
  md/raid1: tidy up new functions: process_checks and fix_sync_read_error.
  md/raid1: split out two sub-functions from sync_request_write
  md: make error_handler functions more uniform and correct.
  md/multipath: discard ->working_disks in favour of ->degraded
  md/raid1: clean up read_balance.
  md: simplify raid10 read_balance
  md/bitmap: fix saving of events_cleared and other state.
  md: reject a re-add request that cannot be honoured.
  md: Fix race when creating a new md device.

drivers/md/bitmap.c
drivers/md/md.c
drivers/md/multipath.c
drivers/md/multipath.h
drivers/md/raid1.c
drivers/md/raid1.h
drivers/md/raid10.c
drivers/md/raid5.c

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5c93627..70bd738 100644 (file)
@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
        spin_unlock_irqrestore(&bitmap->lock, flags);
        sb = kmap_atomic(bitmap->sb_page, KM_USER0);
        sb->events = cpu_to_le64(bitmap->mddev->events);
-       if (bitmap->mddev->events < bitmap->events_cleared) {
+       if (bitmap->mddev->events < bitmap->events_cleared)
                /* rocking back to read-only */
                bitmap->events_cleared = bitmap->mddev->events;
-               sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
-       }
+       sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+       sb->state = cpu_to_le32(bitmap->flags);
        /* Just in case these have been changed via sysfs: */
        sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
        sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -618,7 +618,7 @@ success:
        if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
                bitmap->flags |= BITMAP_HOSTENDIAN;
        bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
-       if (sb->state & cpu_to_le32(BITMAP_STALE))
+       if (bitmap->flags & BITMAP_STALE)
                bitmap->events_cleared = bitmap->mddev->events;
        err = 0;
 out:
@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
        switch (op) {
        case MASK_SET:
                sb->state |= cpu_to_le32(bits);
+               bitmap->flags |= bits;
                break;
        case MASK_UNSET:
                sb->state &= cpu_to_le32(~bits);
+               bitmap->flags &= ~bits;
                break;
        default:
                BUG();
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7d6f7f1..aa640a8 100644 (file)
@@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
        char *e;
        unsigned long long n = simple_strtoull(buf, &e, 10);
 
-       if (mddev->pers)
+       if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
                return -EBUSY;
        if (cmd_match(buf, "none"))
                n = MaxSector;
@@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
        disk->fops = &md_fops;
        disk->private_data = mddev;
        disk->queue = mddev->queue;
+       blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
        /* Allow extended partitions.  This makes the
         * 'mdp' device redundant, but we can't really
         * remove it now.
         */
        disk->flags |= GENHD_FL_EXT_DEVT;
-       add_disk(disk);
        mddev->gendisk = disk;
+       /* As soon as we call add_disk(), another thread could get
+        * through to md_open, so make sure it doesn't get too far
+        */
+       mutex_lock(&mddev->open_mutex);
+       add_disk(disk);
+
        error = kobject_init_and_add(&mddev->kobj, &md_ktype,
                                     &disk_to_dev(disk)->kobj, "%s", "md");
        if (error) {
@@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
        if (mddev->kobj.sd &&
            sysfs_create_group(&mddev->kobj, &md_bitmap_group))
                printk(KERN_DEBUG "pointless warning\n");
-
-       blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+       mutex_unlock(&mddev->open_mutex);
  abort:
        mutex_unlock(&disks_mutex);
        if (!error && mddev->kobj.sd) {
@@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                } else
                        super_types[mddev->major_version].
                                validate_super(mddev, rdev);
+               if ((info->state & (1<<MD_DISK_SYNC)) &&
+                   (!test_bit(In_sync, &rdev->flags) ||
+                    rdev->raid_disk != info->raid_disk)) {
+                       /* This was a hot-add request, but events doesn't
+                        * match, so reject it.
+                        */
+                       export_rdev(rdev);
+                       return -EINVAL;
+               }
+
                if (test_bit(In_sync, &rdev->flags))
                        rdev->saved_raid_disk = rdev->raid_disk;
                else
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c358909..3535c23 100644 (file)
@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
        int i;
        
        seq_printf (seq, " [%d/%d] [", conf->raid_disks,
-                                                conf->working_disks);
+                   conf->raid_disks - mddev->degraded);
        for (i = 0; i < conf->raid_disks; i++)
                seq_printf (seq, "%s",
                               conf->multipaths[i].rdev && 
@@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits)
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
        multipath_conf_t *conf = mddev->private;
+       char b[BDEVNAME_SIZE];
 
-       if (conf->working_disks <= 1) {
+       if (conf->raid_disks - mddev->degraded <= 1) {
                /*
                 * Uh oh, we can do nothing if this is our last path, but
                 * first check if this is a queued request for a device
                 * which has just failed.
                 */
                printk(KERN_ALERT 
-                       "multipath: only one IO path left and IO error.\n");
+                      "multipath: only one IO path left and IO error.\n");
                /* leave it active... it's all we have */
-       } else {
-               /*
-                * Mark disk as unusable
-                */
-               if (!test_bit(Faulty, &rdev->flags)) {
-                       char b[BDEVNAME_SIZE];
-                       clear_bit(In_sync, &rdev->flags);
-                       set_bit(Faulty, &rdev->flags);
-                       set_bit(MD_CHANGE_DEVS, &mddev->flags);
-                       conf->working_disks--;
-                       mddev->degraded++;
-                       printk(KERN_ALERT "multipath: IO failure on %s,"
-                               " disabling IO path.\n"
-                               "multipath: Operation continuing"
-                               " on %d IO paths.\n",
-                               bdevname (rdev->bdev,b),
-                               conf->working_disks);
-               }
+               return;
+       }
+       /*
+        * Mark disk as unusable
+        */
+       if (test_and_clear_bit(In_sync, &rdev->flags)) {
+               unsigned long flags;
+               spin_lock_irqsave(&conf->device_lock, flags);
+               mddev->degraded++;
+               spin_unlock_irqrestore(&conf->device_lock, flags);
        }
+       set_bit(Faulty, &rdev->flags);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       printk(KERN_ALERT "multipath: IO failure on %s,"
+              " disabling IO path.\n"
+              "multipath: Operation continuing"
+              " on %d IO paths.\n",
+              bdevname(rdev->bdev, b),
+              conf->raid_disks - mddev->degraded);
 }
 
 static void print_multipath_conf (multipath_conf_t *conf)
@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
                printk("(conf==NULL)\n");
                return;
        }
-       printk(" --- wd:%d rd:%d\n", conf->working_disks,
+       printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
                         conf->raid_disks);
 
        for (i = 0; i < conf->raid_disks; i++) {
@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
                                                           PAGE_CACHE_SIZE - 1);
                        }
 
-                       conf->working_disks++;
+                       spin_lock_irq(&conf->device_lock);
                        mddev->degraded--;
                        rdev->raid_disk = path;
                        set_bit(In_sync, &rdev->flags);
+                       spin_unlock_irq(&conf->device_lock);
                        rcu_assign_pointer(p->rdev, rdev);
                        err = 0;
                        md_integrity_add_rdev(rdev, mddev);
@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
        int disk_idx;
        struct multipath_info *disk;
        mdk_rdev_t *rdev;
+       int working_disks;
 
        if (md_check_no_bitmap(mddev))
                return -EINVAL;
@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
                goto out_free_conf;
        }
 
-       conf->working_disks = 0;
+       working_disks = 0;
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                disk_idx = rdev->raid_disk;
                if (disk_idx < 0 ||
@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
                }
 
                if (!test_bit(Faulty, &rdev->flags))
-                       conf->working_disks++;
+                       working_disks++;
        }
 
        conf->raid_disks = mddev->raid_disks;
@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
        spin_lock_init(&conf->device_lock);
        INIT_LIST_HEAD(&conf->retry_list);
 
-       if (!conf->working_disks) {
+       if (!working_disks) {
                printk(KERN_ERR "multipath: no operational IO paths for %s\n",
                        mdname(mddev));
                goto out_free_conf;
        }
-       mddev->degraded = conf->raid_disks - conf->working_disks;
+       mddev->degraded = conf->raid_disks - working_disks;
 
        conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
                                                 sizeof(struct multipath_bh));
@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
 
        printk(KERN_INFO 
                "multipath: array %s active with %d out of %d IO paths\n",
-               mdname(mddev), conf->working_disks, mddev->raid_disks);
+               mdname(mddev), conf->raid_disks - mddev->degraded,
+              mddev->raid_disks);
        /*
         * Ok, everything is just fine now
         */
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index d1c2a8d..3c5a45e 100644 (file)
@@ -9,7 +9,6 @@ struct multipath_private_data {
        mddev_t                 *mddev;
        struct multipath_info   *multipaths;
        int                     raid_disks;
-       int                     working_disks;
        spinlock_t              device_lock;
        struct list_head        retry_list;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2b7a7ff..5d09609 100644 (file)
@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
        rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
-static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
-                             int behind)
+static void r1_bio_write_done(r1bio_t *r1_bio)
 {
        if (atomic_dec_and_test(&r1_bio->remaining))
        {
                /* it really is the end of this request */
                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
                        /* free extra copy of the data pages */
-                       int i = vcnt;
+                       int i = r1_bio->behind_page_count;
                        while (i--)
-                               safe_put_page(bv[i].bv_page);
+                               safe_put_page(r1_bio->behind_pages[i]);
+                       kfree(r1_bio->behind_pages);
+                       r1_bio->behind_pages = NULL;
                }
                /* clear the bitmap if all writes complete successfully */
                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
                                r1_bio->sectors,
                                !test_bit(R1BIO_Degraded, &r1_bio->state),
-                               behind);
+                               test_bit(R1BIO_BehindIO, &r1_bio->state));
                md_write_end(r1_bio->mddev);
                raid_end_bio_io(r1_bio);
        }
@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
         * Let's see if all mirrored write operations have finished
         * already.
         */
-       r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
+       r1_bio_write_done(r1_bio);
 
        if (to_put)
                bio_put(to_put);
@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
        const sector_t this_sector = r1_bio->sector;
        const int sectors = r1_bio->sectors;
-       int new_disk = -1;
        int start_disk;
+       int best_disk;
        int i;
-       sector_t new_distance, current_distance;
+       sector_t best_dist;
        mdk_rdev_t *rdev;
        int choose_first;
 
@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
         * We take the first readable disk when above the resync window.
         */
  retry:
+       best_disk = -1;
+       best_dist = MaxSector;
        if (conf->mddev->recovery_cp < MaxSector &&
            (this_sector + sectors >= conf->next_resync)) {
                choose_first = 1;
@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
                start_disk = conf->last_used;
        }
 
-       /* make sure the disk is operational */
        for (i = 0 ; i < conf->raid_disks ; i++) {
+               sector_t dist;
                int disk = start_disk + i;
                if (disk >= conf->raid_disks)
                        disk -= conf->raid_disks;
@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
                rdev = rcu_dereference(conf->mirrors[disk].rdev);
                if (r1_bio->bios[disk] == IO_BLOCKED
                    || rdev == NULL
-                   || !test_bit(In_sync, &rdev->flags))
+                   || test_bit(Faulty, &rdev->flags))
                        continue;
-
-               new_disk = disk;
-               if (!test_bit(WriteMostly, &rdev->flags))
-                       break;
-       }
-
-       if (new_disk < 0 || choose_first)
-               goto rb_out;
-
-       /*
-        * Don't change to another disk for sequential reads:
-        */
-       if (conf->next_seq_sect == this_sector)
-               goto rb_out;
-       if (this_sector == conf->mirrors[new_disk].head_position)
-               goto rb_out;
-
-       current_distance = abs(this_sector 
-                              - conf->mirrors[new_disk].head_position);
-
-       /* look for a better disk - i.e. head is closer */
-       start_disk = new_disk;
-       for (i = 1; i < conf->raid_disks; i++) {
-               int disk = start_disk + 1;
-               if (disk >= conf->raid_disks)
-                       disk -= conf->raid_disks;
-
-               rdev = rcu_dereference(conf->mirrors[disk].rdev);
-               if (r1_bio->bios[disk] == IO_BLOCKED
-                   || rdev == NULL
-                   || !test_bit(In_sync, &rdev->flags)
-                   || test_bit(WriteMostly, &rdev->flags))
+               if (!test_bit(In_sync, &rdev->flags) &&
+                   rdev->recovery_offset < this_sector + sectors)
                        continue;
-
-               if (!atomic_read(&rdev->nr_pending)) {
-                       new_disk = disk;
+               if (test_bit(WriteMostly, &rdev->flags)) {
+                       /* Don't balance among write-mostly, just
+                        * use the first as a last resort */
+                       if (best_disk < 0)
+                               best_disk = disk;
+                       continue;
+               }
+               /* This is a reasonable device to use.  It might
+                * even be best.
+                */
+               dist = abs(this_sector - conf->mirrors[disk].head_position);
+               if (choose_first
+                   /* Don't change to another disk for sequential reads */
+                   || conf->next_seq_sect == this_sector
+                   || dist == 0
+                   /* If device is idle, use it */
+                   || atomic_read(&rdev->nr_pending) == 0) {
+                       best_disk = disk;
                        break;
                }
-               new_distance = abs(this_sector - conf->mirrors[disk].head_position);
-               if (new_distance < current_distance) {
-                       current_distance = new_distance;
-                       new_disk = disk;
+               if (dist < best_dist) {
+                       best_dist = dist;
+                       best_disk = disk;
                }
        }
 
- rb_out:
-       if (new_disk >= 0) {
-               rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+       if (best_disk >= 0) {
+               rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
                if (!rdev)
                        goto retry;
                atomic_inc(&rdev->nr_pending);
-               if (!test_bit(In_sync, &rdev->flags)) {
+               if (test_bit(Faulty, &rdev->flags)) {
                        /* cannot risk returning a device that failed
                         * before we inc'ed nr_pending
                         */
@@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
                        goto retry;
                }
                conf->next_seq_sect = this_sector + sectors;
-               conf->last_used = new_disk;
+               conf->last_used = best_disk;
        }
        rcu_read_unlock();
 
-       return new_disk;
+       return best_disk;
 }
 
 static int raid1_congested(void *data, int bits)
@@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
 
 
 /* duplicate the data pages for behind I/O 
- * We return a list of bio_vec rather than just page pointers
- * as it makes freeing easier
  */
-static struct bio_vec *alloc_behind_pages(struct bio *bio)
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 {
        int i;
        struct bio_vec *bvec;
-       struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
+       struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
                                        GFP_NOIO);
        if (unlikely(!pages))
-               goto do_sync_io;
+               return;
 
        bio_for_each_segment(bvec, bio, i) {
-               pages[i].bv_page = alloc_page(GFP_NOIO);
-               if (unlikely(!pages[i].bv_page))
+               pages[i] = alloc_page(GFP_NOIO);
+               if (unlikely(!pages[i]))
                        goto do_sync_io;
-               memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
+               memcpy(kmap(pages[i]) + bvec->bv_offset,
                        kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-               kunmap(pages[i].bv_page);
+               kunmap(pages[i]);
                kunmap(bvec->bv_page);
        }
-
-       return pages;
+       r1_bio->behind_pages = pages;
+       r1_bio->behind_page_count = bio->bi_vcnt;
+       set_bit(R1BIO_BehindIO, &r1_bio->state);
+       return;
 
 do_sync_io:
-       if (pages)
-               for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
-                       put_page(pages[i].bv_page);
+       for (i = 0; i < bio->bi_vcnt; i++)
+               if (pages[i])
+                       put_page(pages[i]);
        kfree(pages);
        PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
-       return NULL;
 }
 
 static int make_request(mddev_t *mddev, struct bio * bio)
@@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        int i, targets = 0, disks;
        struct bitmap *bitmap;
        unsigned long flags;
-       struct bio_vec *behind_pages = NULL;
        const int rw = bio_data_dir(bio);
        const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        if (bitmap &&
            (atomic_read(&bitmap->behind_writes)
             < mddev->bitmap_info.max_write_behind) &&
-           !waitqueue_active(&bitmap->behind_wait) &&
-           (behind_pages = alloc_behind_pages(bio)) != NULL)
-               set_bit(R1BIO_BehindIO, &r1_bio->state);
+           !waitqueue_active(&bitmap->behind_wait))
+               alloc_behind_pages(bio, r1_bio);
 
        atomic_set(&r1_bio->remaining, 1);
        atomic_set(&r1_bio->behind_remaining, 0);
@@ -893,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
                mbio->bi_private = r1_bio;
 
-               if (behind_pages) {
+               if (r1_bio->behind_pages) {
                        struct bio_vec *bvec;
                        int j;
 
@@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                         * them all
                         */
                        __bio_for_each_segment(bvec, mbio, j, 0)
-                               bvec->bv_page = behind_pages[j].bv_page;
+                               bvec->bv_page = r1_bio->behind_pages[j];
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
                }
@@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                bio_list_add(&conf->pending_bio_list, mbio);
                spin_unlock_irqrestore(&conf->device_lock, flags);
        }
-       r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
-       kfree(behind_pages); /* the behind pages are attached to the bios now */
+       r1_bio_write_done(r1_bio);
 
        /* In case raid1d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);
@@ -1196,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error)
        }
 }
 
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+static int fix_sync_read_error(r1bio_t *r1_bio)
 {
+       /* Try some synchronous reads of other devices to get
+        * good data, much like with normal read errors.  Only
+        * read into the pages we already have so we don't
+        * need to re-issue the read request.
+        * We don't need to freeze the array, because being in an
+        * active sync request, there is no normal IO, and
+        * no overlapping syncs.
+        */
+       mddev_t *mddev = r1_bio->mddev;
        conf_t *conf = mddev->private;
-       int i;
-       int disks = conf->raid_disks;
-       struct bio *bio, *wbio;
-
-       bio = r1_bio->bios[r1_bio->read_disk];
+       struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+       sector_t sect = r1_bio->sector;
+       int sectors = r1_bio->sectors;
+       int idx = 0;
 
+       while(sectors) {
+               int s = sectors;
+               int d = r1_bio->read_disk;
+               int success = 0;
+               mdk_rdev_t *rdev;
+               int start;
 
-       if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-               /* We have read all readable devices.  If we haven't
-                * got the block, then there is no hope left.
-                * If we have, then we want to do a comparison
-                * and skip the write if everything is the same.
-                * If any blocks failed to read, then we need to
-                * attempt an over-write
-                */
-               int primary;
-               if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-                       for (i=0; i<mddev->raid_disks; i++)
-                               if (r1_bio->bios[i]->bi_end_io == end_sync_read)
-                                       md_error(mddev, conf->mirrors[i].rdev);
+               if (s > (PAGE_SIZE>>9))
+                       s = PAGE_SIZE >> 9;
+               do {
+                       if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+                               /* No rcu protection needed here devices
+                                * can only be removed when no resync is
+                                * active, and resync is currently active
+                                */
+                               rdev = conf->mirrors[d].rdev;
+                               if (sync_page_io(rdev,
+                                                sect,
+                                                s<<9,
+                                                bio->bi_io_vec[idx].bv_page,
+                                                READ, false)) {
+                                       success = 1;
+                                       break;
+                               }
+                       }
+                       d++;
+                       if (d == conf->raid_disks)
+                               d = 0;
+               } while (!success && d != r1_bio->read_disk);
 
-                       md_done_sync(mddev, r1_bio->sectors, 1);
+               if (!success) {
+                       char b[BDEVNAME_SIZE];
+                       /* Cannot read from anywhere, array is toast */
+                       md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+                       printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+                              " for block %llu\n",
+                              mdname(mddev),
+                              bdevname(bio->bi_bdev, b),
+                              (unsigned long long)r1_bio->sector);
+                       md_done_sync(mddev, r1_bio->sectors, 0);
                        put_buf(r1_bio);
-                       return;
+                       return 0;
                }
-               for (primary=0; primary<mddev->raid_disks; primary++)
-                       if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-                           test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
-                               r1_bio->bios[primary]->bi_end_io = NULL;
-                               rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
-                               break;
-                       }
-               r1_bio->read_disk = primary;
-               for (i=0; i<mddev->raid_disks; i++)
-                       if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
-                               int j;
-                               int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
-                               struct bio *pbio = r1_bio->bios[primary];
-                               struct bio *sbio = r1_bio->bios[i];
-
-                               if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
-                                       for (j = vcnt; j-- ; ) {
-                                               struct page *p, *s;
-                                               p = pbio->bi_io_vec[j].bv_page;
-                                               s = sbio->bi_io_vec[j].bv_page;
-                                               if (memcmp(page_address(p),
-                                                          page_address(s),
-                                                          PAGE_SIZE))
-                                                       break;
-                                       }
-                               } else
-                                       j = 0;
-                               if (j >= 0)
-                                       mddev->resync_mismatches += r1_bio->sectors;
-                               if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-                                             && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
-                                       sbio->bi_end_io = NULL;
-                                       rdev_dec_pending(conf->mirrors[i].rdev, mddev);
-                               } else {
-                                       /* fixup the bio for reuse */
-                                       int size;
-                                       sbio->bi_vcnt = vcnt;
-                                       sbio->bi_size = r1_bio->sectors << 9;
-                                       sbio->bi_idx = 0;
-                                       sbio->bi_phys_segments = 0;
-                                       sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-                                       sbio->bi_flags |= 1 << BIO_UPTODATE;
-                                       sbio->bi_next = NULL;
-                                       sbio->bi_sector = r1_bio->sector +
-                                               conf->mirrors[i].rdev->data_offset;
-                                       sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-                                       size = sbio->bi_size;
-                                       for (j = 0; j < vcnt ; j++) {
-                                               struct bio_vec *bi;
-                                               bi = &sbio->bi_io_vec[j];
-                                               bi->bv_offset = 0;
-                                               if (size > PAGE_SIZE)
-                                                       bi->bv_len = PAGE_SIZE;
-                                               else
-                                                       bi->bv_len = size;
-                                               size -= PAGE_SIZE;
-                                               memcpy(page_address(bi->bv_page),
-                                                      page_address(pbio->bi_io_vec[j].bv_page),
-                                                      PAGE_SIZE);
-                                       }
 
-                               }
-                       }
+               start = d;
+               /* write it back and re-read */
+               while (d != r1_bio->read_disk) {
+                       if (d == 0)
+                               d = conf->raid_disks;
+                       d--;
+                       if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+                               continue;
+                       rdev = conf->mirrors[d].rdev;
+                       if (sync_page_io(rdev,
+                                        sect,
+                                        s<<9,
+                                        bio->bi_io_vec[idx].bv_page,
+                                        WRITE, false) == 0) {
+                               r1_bio->bios[d]->bi_end_io = NULL;
+                               rdev_dec_pending(rdev, mddev);
+                               md_error(mddev, rdev);
+                       } else
+                               atomic_add(s, &rdev->corrected_errors);
+               }
+               d = start;
+               while (d != r1_bio->read_disk) {
+                       if (d == 0)
+                               d = conf->raid_disks;
+                       d--;
+                       if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+                               continue;
+                       rdev = conf->mirrors[d].rdev;
+                       if (sync_page_io(rdev,
+                                        sect,
+                                        s<<9,
+                                        bio->bi_io_vec[idx].bv_page,
+                                        READ, false) == 0)
+                               md_error(mddev, rdev);
+               }
+               sectors -= s;
+               sect += s;
+               idx ++;
        }
-       if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-               /* ouch - failed to read all of that.
-                * Try some synchronous reads of other devices to get
-                * good data, much like with normal read errors.  Only
-                * read into the pages we already have so we don't
-                * need to re-issue the read request.
-                * We don't need to freeze the array, because being in an
-                * active sync request, there is no normal IO, and
-                * no overlapping syncs.
-                */
-               sector_t sect = r1_bio->sector;
-               int sectors = r1_bio->sectors;
-               int idx = 0;
-
-               while(sectors) {
-                       int s = sectors;
-                       int d = r1_bio->read_disk;
-                       int success = 0;
-                       mdk_rdev_t *rdev;
-
-                       if (s > (PAGE_SIZE>>9))
-                               s = PAGE_SIZE >> 9;
-                       do {
-                               if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
-                                       /* No rcu protection needed here devices
-                                        * can only be removed when no resync is
-                                        * active, and resync is currently active
-                                        */
-                                       rdev = conf->mirrors[d].rdev;
-                                       if (sync_page_io(rdev,
-                                                        sect,
-                                                        s<<9,
-                                                        bio->bi_io_vec[idx].bv_page,
-                                                        READ, false)) {
-                                               success = 1;
-                                               break;
-                                       }
-                               }
-                               d++;
-                               if (d == conf->raid_disks)
-                                       d = 0;
-                       } while (!success && d != r1_bio->read_disk);
-
-                       if (success) {
-                               int start = d;
-                               /* write it back and re-read */
-                               set_bit(R1BIO_Uptodate, &r1_bio->state);
-                               while (d != r1_bio->read_disk) {
-                                       if (d == 0)
-                                               d = conf->raid_disks;
-                                       d--;
-                                       if (r1_bio->bios[d]->bi_end_io != end_sync_read)
-                                               continue;
-                                       rdev = conf->mirrors[d].rdev;
-                                       atomic_add(s, &rdev->corrected_errors);
-                                       if (sync_page_io(rdev,
-                                                        sect,
-                                                        s<<9,
-                                                        bio->bi_io_vec[idx].bv_page,
-                                                        WRITE, false) == 0)
-                                               md_error(mddev, rdev);
-                               }
-                               d = start;
-                               while (d != r1_bio->read_disk) {
-                                       if (d == 0)
-                                               d = conf->raid_disks;
-                                       d--;
-                                       if (r1_bio->bios[d]->bi_end_io != end_sync_read)
-                                               continue;
-                                       rdev = conf->mirrors[d].rdev;
-                                       if (sync_page_io(rdev,
-                                                        sect,
-                                                        s<<9,
-                                                        bio->bi_io_vec[idx].bv_page,
-                                                        READ, false) == 0)
-                                               md_error(mddev, rdev);
-                               }
-                       } else {
-                               char b[BDEVNAME_SIZE];
-                               /* Cannot read from anywhere, array is toast */
-                               md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
-                               printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
-                                      " for block %llu\n",
-                                      mdname(mddev),
-                                      bdevname(bio->bi_bdev, b),
-                                      (unsigned long long)r1_bio->sector);
-                               md_done_sync(mddev, r1_bio->sectors, 0);
-                               put_buf(r1_bio);
-                               return;
+       set_bit(R1BIO_Uptodate, &r1_bio->state);
+       set_bit(BIO_UPTODATE, &bio->bi_flags);
+       return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+       /* All readable devices have been read.  The first one whose
+        * read succeeded becomes 'primary'; every other read bio is
+        * compared against it.  A mismatch or failed read is added to
+        * resync_mismatches.  A copy that matches, or that read OK
+        * while MD_RECOVERY_CHECK is set, is not written; any other
+        * copy is reset for reuse with the primary's data.  Returns 0.
+        */
+       mddev_t *mddev = r1_bio->mddev;
+       conf_t *conf = mddev->private;
+       int primary;
+       int i;
+
+       for (primary = 0; primary < conf->raid_disks; primary++)
+               if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+                   test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+                       r1_bio->bios[primary]->bi_end_io = NULL; /* so the compare loop below skips the primary itself */
+                       rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+                       break;
+               }
+       r1_bio->read_disk = primary; /* NOTE(review): equals raid_disks if no device qualified; confirm the caller rules that out */
+       for (i = 0; i < conf->raid_disks; i++) {
+               int j;
+               int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); /* whole pages only; NOTE(review): a partial trailing page would not be compared or copied */
+               struct bio *pbio = r1_bio->bios[primary];
+               struct bio *sbio = r1_bio->bios[i];
+               int size;
+
+               if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+                       continue;
+
+               if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+                       for (j = vcnt; j-- ; ) { /* last page first; j ends at -1 only if every page matched */
+                               struct page *p, *s;
+                               p = pbio->bi_io_vec[j].bv_page;
+                               s = sbio->bi_io_vec[j].bv_page;
+                               if (memcmp(page_address(p),
+                                          page_address(s),
+                                          PAGE_SIZE))
+                                       break;
                        }
-                       sectors -= s;
-                       sect += s;
-                       idx ++;
+               } else
+                       j = 0; /* failed read: handled like a mismatch */
+               if (j >= 0)
+                       mddev->resync_mismatches += r1_bio->sectors;
+               if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+                             && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+                       /* No need to write to this device. */
+                       sbio->bi_end_io = NULL;
+                       rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+                       continue;
+               }
+               /* fixup the bio for reuse */
+               sbio->bi_vcnt = vcnt;
+               sbio->bi_size = r1_bio->sectors << 9;
+               sbio->bi_idx = 0;
+               sbio->bi_phys_segments = 0;
+               sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+               sbio->bi_flags |= 1 << BIO_UPTODATE;
+               sbio->bi_next = NULL;
+               sbio->bi_sector = r1_bio->sector +
+                       conf->mirrors[i].rdev->data_offset;
+               sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+               size = sbio->bi_size;
+               for (j = 0; j < vcnt ; j++) { /* reset each vec and copy the primary's page into it */
+                       struct bio_vec *bi;
+                       bi = &sbio->bi_io_vec[j];
+                       bi->bv_offset = 0;
+                       if (size > PAGE_SIZE)
+                               bi->bv_len = PAGE_SIZE;
+                       else
+                               bi->bv_len = size;
+                       size -= PAGE_SIZE;
+                       memcpy(page_address(bi->bv_page),
+                              page_address(pbio->bi_io_vec[j].bv_page),
+                              PAGE_SIZE);
                }
        }
+       return 0;
+}
 
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+       conf_t *conf = mddev->private;
+       int i;
+       int disks = conf->raid_disks;
+       struct bio *bio, *wbio;
+
+       bio = r1_bio->bios[r1_bio->read_disk];
+
+       if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+               /* ouch - failed to read all of that. */
+               if (!fix_sync_read_error(r1_bio))
+                       return;
+
+       if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+               if (process_checks(r1_bio) < 0)
+                       return;
        /*
         * schedule writes
         */
@@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
        set_capacity(mddev->gendisk, mddev->array_sectors);
        revalidate_disk(mddev->gendisk);
        if (sectors > mddev->dev_sectors &&
-           mddev->recovery_cp == MaxSector) {
+           mddev->recovery_cp > mddev->dev_sectors) {
                mddev->recovery_cp = mddev->dev_sectors;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        }
index cbfdf1a..5fc4ca1 100644 (file)
@@ -94,7 +94,9 @@ struct r1bio_s {
        int                     read_disk;
 
        struct list_head        retry_list;
-       struct bitmap_update    *bitmap_update;
+       /* Next two are only valid when R1BIO_BehindIO is set */
+       struct page             **behind_pages;
+       int                     behind_page_count;
        /*
         * if the IO is in WRITE direction, then multiple bios are used.
         * We choose the number when they are allocated.
index 8e94626..6e84668 100644 (file)
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
                 */
                set_bit(R10BIO_Uptodate, &r10_bio->state);
                raid_end_bio_io(r10_bio);
+               rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
        } else {
                /*
-                * oops, read error:
+                * oops, read error - keep the refcount on the rdev
                 */
                char b[BDEVNAME_SIZE];
                if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
                               bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
                reschedule_retry(r10_bio);
        }
-
-       rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
        const sector_t this_sector = r10_bio->sector;
-       int disk, slot, nslot;
+       int disk, slot;
        const int sectors = r10_bio->sectors;
-       sector_t new_distance, current_distance;
+       sector_t new_distance, best_dist;
        mdk_rdev_t *rdev;
+       int do_balance;
+       int best_slot;
 
        raid10_find_phys(conf, r10_bio);
        rcu_read_lock();
+retry:
+       best_slot = -1;
+       best_dist = MaxSector;
+       do_balance = 1;
        /*
         * Check if we can balance. We can balance on the whole
         * device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
         * above the resync window.
         */
        if (conf->mddev->recovery_cp < MaxSector
-           && (this_sector + sectors >= conf->next_resync)) {
-               /* make sure that disk is operational */
-               slot = 0;
-               disk = r10_bio->devs[slot].devnum;
-
-               while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-                      r10_bio->devs[slot].bio == IO_BLOCKED ||
-                      !test_bit(In_sync, &rdev->flags)) {
-                       slot++;
-                       if (slot == conf->copies) {
-                               slot = 0;
-                               disk = -1;
-                               break;
-                       }
-                       disk = r10_bio->devs[slot].devnum;
-               }
-               goto rb_out;
-       }
-
+           && (this_sector + sectors >= conf->next_resync))
+               do_balance = 0;
 
-       /* make sure the disk is operational */
-       slot = 0;
-       disk = r10_bio->devs[slot].devnum;
-       while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-              r10_bio->devs[slot].bio == IO_BLOCKED ||
-              !test_bit(In_sync, &rdev->flags)) {
-               slot ++;
-               if (slot == conf->copies) {
-                       disk = -1;
-                       goto rb_out;
-               }
+       for (slot = 0; slot < conf->copies ; slot++) {
+               if (r10_bio->devs[slot].bio == IO_BLOCKED)
+                       continue;
                disk = r10_bio->devs[slot].devnum;
-       }
-
-
-       current_distance = abs(r10_bio->devs[slot].addr -
-                              conf->mirrors[disk].head_position);
-
-       /* Find the disk whose head is closest,
-        * or - for far > 1 - find the closest to partition beginning */
-
-       for (nslot = slot; nslot < conf->copies; nslot++) {
-               int ndisk = r10_bio->devs[nslot].devnum;
-
-
-               if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-                   r10_bio->devs[nslot].bio == IO_BLOCKED ||
-                   !test_bit(In_sync, &rdev->flags))
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (rdev == NULL)
                        continue;
+               if (!test_bit(In_sync, &rdev->flags))
+                       continue;
+
+               if (!do_balance)
+                       break;
 
                /* This optimisation is debatable, and completely destroys
                 * sequential read speed for 'far copies' arrays.  So only
                 * keep it for 'near' arrays, and review those later.
                 */
-               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-                       disk = ndisk;
-                       slot = nslot;
+               if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
                        break;
-               }
 
                /* for far > 1 always use the lowest address */
                if (conf->far_copies > 1)
-                       new_distance = r10_bio->devs[nslot].addr;
+                       new_distance = r10_bio->devs[slot].addr;
                else
-                       new_distance = abs(r10_bio->devs[nslot].addr -
-                                          conf->mirrors[ndisk].head_position);
-               if (new_distance < current_distance) {
-                       current_distance = new_distance;
-                       disk = ndisk;
-                       slot = nslot;
+                       new_distance = abs(r10_bio->devs[slot].addr -
+                                          conf->mirrors[disk].head_position);
+               if (new_distance < best_dist) {
+                       best_dist = new_distance;
+                       best_slot = slot;
                }
        }
+       if (slot == conf->copies)
+               slot = best_slot;
 
-rb_out:
-       r10_bio->read_slot = slot;
-/*     conf->next_seq_sect = this_sector + sectors;*/
-
-       if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-               atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-       else
+       if (slot >= 0) {
+               disk = r10_bio->devs[slot].devnum;
+               rdev = rcu_dereference(conf->mirrors[disk].rdev);
+               if (!rdev)
+                       goto retry;
+               atomic_inc(&rdev->nr_pending);
+               if (test_bit(Faulty, &rdev->flags)) {
+                       /* Cannot risk returning a device that failed
+                        * before we inc'ed nr_pending
+                        */
+                       rdev_dec_pending(rdev, conf->mddev);
+                       goto retry;
+               }
+               r10_bio->read_slot = slot;
+       } else
                disk = -1;
        rcu_read_unlock();
 
@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
        int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
        int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-       rcu_read_lock();
-       rdev = rcu_dereference(conf->mirrors[d].rdev);
-       if (rdev) { /* If rdev is not NULL */
-               char b[BDEVNAME_SIZE];
-               int cur_read_error_count = 0;
+       /* still own a reference to this rdev, so it cannot
+        * have been cleared recently.
+        */
+       rdev = conf->mirrors[d].rdev;
 
-               bdevname(rdev->bdev, b);
+       if (test_bit(Faulty, &rdev->flags))
+               /* drive has already been failed, just ignore any
+                  more fix_read_error() attempts */
+               return;
 
-               if (test_bit(Faulty, &rdev->flags)) {
-                       rcu_read_unlock();
-                       /* drive has already been failed, just ignore any
-                          more fix_read_error() attempts */
-                       return;
-               }
+       check_decay_read_errors(mddev, rdev);
+       atomic_inc(&rdev->read_errors);
+       if (atomic_read(&rdev->read_errors) > max_read_errors) {
+               char b[BDEVNAME_SIZE];
+               bdevname(rdev->bdev, b);
 
-               check_decay_read_errors(mddev, rdev);
-               atomic_inc(&rdev->read_errors);
-               cur_read_error_count = atomic_read(&rdev->read_errors);
-               if (cur_read_error_count > max_read_errors) {
-                       rcu_read_unlock();
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Raid device exceeded "
-                              "read_error threshold "
-                              "[cur %d:max %d]\n",
-                              mdname(mddev),
-                              b, cur_read_error_count, max_read_errors);
-                       printk(KERN_NOTICE
-                              "md/raid10:%s: %s: Failing raid "
-                              "device\n", mdname(mddev), b);
-                       md_error(mddev, conf->mirrors[d].rdev);
-                       return;
-               }
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Raid device exceeded "
+                      "read_error threshold [cur %d:max %d]\n",
+                      mdname(mddev), b,
+                      atomic_read(&rdev->read_errors), max_read_errors);
+               printk(KERN_NOTICE
+                      "md/raid10:%s: %s: Failing raid device\n",
+                      mdname(mddev), b);
+               md_error(mddev, conf->mirrors[d].rdev);
+               return;
        }
-       rcu_read_unlock();
 
        while(sectors) {
                int s = sectors;
@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "write failed"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                              rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                        printk(KERN_NOTICE "md/raid10:%s: %s: failing "
                                               "drive\n",
@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "corrected sectors"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                        printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
                                               mdname(mddev),
@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
                                               "md/raid10:%s: read error corrected"
                                               " (%d sectors at %llu on %s)\n",
                                               mdname(mddev), s,
-                                              (unsigned long long)(sect+
-                                                   rdev->data_offset),
+                                              (unsigned long long)(
+                                                      sect + rdev->data_offset),
                                               bdevname(rdev->bdev, b));
                                }
 
@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
                else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                        recovery_request_write(mddev, r10_bio);
                else {
-                       int mirror;
+                       int slot = r10_bio->read_slot;
+                       int mirror = r10_bio->devs[slot].devnum;
                        /* we got a read error. Maybe the drive is bad.  Maybe just
                         * the block and we can fix it.
                         * We freeze all other IO, and try reading the block from
@@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
                                fix_read_error(conf, mddev, r10_bio);
                                unfreeze_array(conf);
                        }
+                       rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
-                       bio = r10_bio->devs[r10_bio->read_slot].bio;
-                       r10_bio->devs[r10_bio->read_slot].bio =
+                       bio = r10_bio->devs[slot].bio;
+                       r10_bio->devs[slot].bio =
                                mddev->ro ? IO_BLOCKED : NULL;
                        mirror = read_balance(conf, r10_bio);
                        if (mirror == -1) {
@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
                        } else {
                                const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
                                bio_put(bio);
+                               slot = r10_bio->read_slot;
                                rdev = conf->mirrors[mirror].rdev;
                                if (printk_ratelimit())
                                        printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
                                               (unsigned long long)r10_bio->sector);
                                bio = bio_clone_mddev(r10_bio->master_bio,
                                                      GFP_NOIO, mddev);
-                               r10_bio->devs[r10_bio->read_slot].bio = bio;
-                               bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
+                               r10_bio->devs[slot].bio = bio;
+                               bio->bi_sector = r10_bio->devs[slot].addr
                                        + rdev->data_offset;
                                bio->bi_bdev = rdev->bdev;
                                bio->bi_rw = READ | do_sync;
@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf)
  *
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+                            int *skipped, int go_faster)
 {
        conf_t *conf = mddev->private;
        r10bio_t *r10_bio;
        struct bio *biolist = NULL, *bio;
        sector_t max_sector, nr_sectors;
-       int disk;
        int i;
        int max_sync;
        sector_t sync_blocks;
@@ -1858,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                int j, k;
                r10_bio = NULL;
 
-               for (i=0 ; i<conf->raid_disks; i++)
-                       if (conf->mirrors[i].rdev &&
-                           !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
-                               int still_degraded = 0;
-                               /* want to reconstruct this device */
-                               r10bio_t *rb2 = r10_bio;
-                               sector_t sect = raid10_find_virt(conf, sector_nr, i);
-                               int must_sync;
-                               /* Unless we are doing a full sync, we only need
-                                * to recover the block if it is set in the bitmap
-                                */
-                               must_sync = bitmap_start_sync(mddev->bitmap, sect,
-                                                             &sync_blocks, 1);
-                               if (sync_blocks < max_sync)
-                                       max_sync = sync_blocks;
-                               if (!must_sync &&
-                                   !conf->fullsync) {
-                                       /* yep, skip the sync_blocks here, but don't assume
-                                        * that there will never be anything to do here
-                                        */
-                                       chunks_skipped = -1;
-                                       continue;
-                               }
+               for (i=0 ; i<conf->raid_disks; i++) {
+                       int still_degraded;
+                       r10bio_t *rb2;
+                       sector_t sect;
+                       int must_sync;
 
-                               r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-                               raise_barrier(conf, rb2 != NULL);
-                               atomic_set(&r10_bio->remaining, 0);
+                       if (conf->mirrors[i].rdev == NULL ||
+                           test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 
+                               continue;
 
-                               r10_bio->master_bio = (struct bio*)rb2;
-                               if (rb2)
-                                       atomic_inc(&rb2->remaining);
-                               r10_bio->mddev = mddev;
-                               set_bit(R10BIO_IsRecover, &r10_bio->state);
-                               r10_bio->sector = sect;
+                       still_degraded = 0;
+                       /* want to reconstruct this device */
+                       rb2 = r10_bio;
+                       sect = raid10_find_virt(conf, sector_nr, i);
+                       /* Unless we are doing a full sync, we only need
+                        * to recover the block if it is set in the bitmap
+                        */
+                       must_sync = bitmap_start_sync(mddev->bitmap, sect,
+                                                     &sync_blocks, 1);
+                       if (sync_blocks < max_sync)
+                               max_sync = sync_blocks;
+                       if (!must_sync &&
+                           !conf->fullsync) {
+                               /* yep, skip the sync_blocks here, but don't assume
+                                * that there will never be anything to do here
+                                */
+                               chunks_skipped = -1;
+                               continue;
+                       }
 
-                               raid10_find_phys(conf, r10_bio);
+                       r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
+                       raise_barrier(conf, rb2 != NULL);
+                       atomic_set(&r10_bio->remaining, 0);
 
-                               /* Need to check if the array will still be
-                                * degraded
-                                */
-                               for (j=0; j<conf->raid_disks; j++)
-                                       if (conf->mirrors[j].rdev == NULL ||
-                                           test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
-                                               still_degraded = 1;
-                                               break;
-                                       }
-
-                               must_sync = bitmap_start_sync(mddev->bitmap, sect,
-                                                             &sync_blocks, still_degraded);
-
-                               for (j=0; j<conf->copies;j++) {
-                                       int d = r10_bio->devs[j].devnum;
-                                       if (conf->mirrors[d].rdev &&
-                                           test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
-                                               /* This is where we read from */
-                                               bio = r10_bio->devs[0].bio;
-                                               bio->bi_next = biolist;
-                                               biolist = bio;
-                                               bio->bi_private = r10_bio;
-                                               bio->bi_end_io = end_sync_read;
-                                               bio->bi_rw = READ;
-                                               bio->bi_sector = r10_bio->devs[j].addr +
-                                                       conf->mirrors[d].rdev->data_offset;
-                                               bio->bi_bdev = conf->mirrors[d].rdev->bdev;
-                                               atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-                                               atomic_inc(&r10_bio->remaining);
-                                               /* and we write to 'i' */
-
-                                               for (k=0; k<conf->copies; k++)
-                                                       if (r10_bio->devs[k].devnum == i)
-                                                               break;
-                                               BUG_ON(k == conf->copies);
-                                               bio = r10_bio->devs[1].bio;
-                                               bio->bi_next = biolist;
-                                               biolist = bio;
-                                               bio->bi_private = r10_bio;
-                                               bio->bi_end_io = end_sync_write;
-                                               bio->bi_rw = WRITE;
-                                               bio->bi_sector = r10_bio->devs[k].addr +
-                                                       conf->mirrors[i].rdev->data_offset;
-                                               bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-
-                                               r10_bio->devs[0].devnum = d;
-                                               r10_bio->devs[1].devnum = i;
+                       r10_bio->master_bio = (struct bio*)rb2;
+                       if (rb2)
+                               atomic_inc(&rb2->remaining);
+                       r10_bio->mddev = mddev;
+                       set_bit(R10BIO_IsRecover, &r10_bio->state);
+                       r10_bio->sector = sect;
 
-                                               break;
-                                       }
-                               }
-                               if (j == conf->copies) {
-                                       /* Cannot recover, so abort the recovery */
-                                       put_buf(r10_bio);
-                                       if (rb2)
-                                               atomic_dec(&rb2->remaining);
-                                       r10_bio = rb2;
-                                       if (!test_and_set_bit(MD_RECOVERY_INTR,
-                                                             &mddev->recovery))
-                                               printk(KERN_INFO "md/raid10:%s: insufficient "
-                                                      "working devices for recovery.\n",
-                                                      mdname(mddev));
+                       raid10_find_phys(conf, r10_bio);
+
+                       /* Need to check if the array will still be
+                        * degraded
+                        */
+                       for (j=0; j<conf->raid_disks; j++)
+                               if (conf->mirrors[j].rdev == NULL ||
+                                   test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
+                                       still_degraded = 1;
                                        break;
                                }
+
+                       must_sync = bitmap_start_sync(mddev->bitmap, sect,
+                                                     &sync_blocks, still_degraded);
+
+                       for (j=0; j<conf->copies;j++) {
+                               int d = r10_bio->devs[j].devnum;
+                               if (!conf->mirrors[d].rdev ||
+                                   !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
+                                       continue;
+                               /* This is where we read from */
+                               bio = r10_bio->devs[0].bio;
+                               bio->bi_next = biolist;
+                               biolist = bio;
+                               bio->bi_private = r10_bio;
+                               bio->bi_end_io = end_sync_read;
+                               bio->bi_rw = READ;
+                               bio->bi_sector = r10_bio->devs[j].addr +
+                                       conf->mirrors[d].rdev->data_offset;
+                               bio->bi_bdev = conf->mirrors[d].rdev->bdev;
+                               atomic_inc(&conf->mirrors[d].rdev->nr_pending);
+                               atomic_inc(&r10_bio->remaining);
+                               /* and we write to 'i' */
+
+                               for (k=0; k<conf->copies; k++)
+                                       if (r10_bio->devs[k].devnum == i)
+                                               break;
+                               BUG_ON(k == conf->copies);
+                               bio = r10_bio->devs[1].bio;
+                               bio->bi_next = biolist;
+                               biolist = bio;
+                               bio->bi_private = r10_bio;
+                               bio->bi_end_io = end_sync_write;
+                               bio->bi_rw = WRITE;
+                               bio->bi_sector = r10_bio->devs[k].addr +
+                                       conf->mirrors[i].rdev->data_offset;
+                               bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+
+                               r10_bio->devs[0].devnum = d;
+                               r10_bio->devs[1].devnum = i;
+
+                               break;
+                       }
+                       if (j == conf->copies) {
+                               /* Cannot recover, so abort the recovery */
+                               put_buf(r10_bio);
+                               if (rb2)
+                                       atomic_dec(&rb2->remaining);
+                               r10_bio = rb2;
+                               if (!test_and_set_bit(MD_RECOVERY_INTR,
+                                                     &mddev->recovery))
+                                       printk(KERN_INFO "md/raid10:%s: insufficient "
+                                              "working devices for recovery.\n",
+                                              mdname(mddev));
+                               break;
                        }
+               }
                if (biolist == NULL) {
                        while (r10_bio) {
                                r10bio_t *rb2 = r10_bio;
@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
                if (!bitmap_start_sync(mddev->bitmap, sector_nr,
                                       &sync_blocks, mddev->degraded) &&
-                   !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+                   !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
+                                                &mddev->recovery)) {
                        /* We can skip this block */
                        *skipped = 1;
                        return sync_blocks + sectors_skipped;
@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                        for (i=0; i<conf->copies; i++) {
                                int d = r10_bio->devs[i].devnum;
                                if (r10_bio->devs[i].bio->bi_end_io)
-                                       rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+                                       rdev_dec_pending(conf->mirrors[d].rdev,
+                                                        mddev);
                        }
                        put_buf(r10_bio);
                        biolist = NULL;
@@ -2047,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
        do {
                struct page *page;
                int len = PAGE_SIZE;
-               disk = 0;
                if (sector_nr + (len>>9) > max_sector)
                        len = (max_sector - sector_nr) << 9;
                if (len == 0)
                        break;
                for (bio= biolist ; bio ; bio=bio->bi_next) {
+                       struct bio *bio2;
                        page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
-                       if (bio_add_page(bio, page, len, 0) == 0) {
-                               /* stop here */
-                               struct bio *bio2;
-                               bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
-                               for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
-                                       /* remove last page from this bio */
-                                       bio2->bi_vcnt--;
-                                       bio2->bi_size -= len;
-                                       bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
-                               }
-                               goto bio_full;
+                       if (bio_add_page(bio, page, len, 0))
+                               continue;
+
+                       /* stop here */
+                       bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+                       for (bio2 = biolist;
+                            bio2 && bio2 != bio;
+                            bio2 = bio2->bi_next) {
+                               /* remove last page from this bio */
+                               bio2->bi_vcnt--;
+                               bio2->bi_size -= len;
+                               bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
                        }
-                       disk = i;
+                       goto bio_full;
                }
                nr_sectors += len>>9;
                sector_nr += len>>9;
index 49bf5f8..34dd545 100644 (file)
@@ -1700,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
        raid5_conf_t *conf = mddev->private;
        pr_debug("raid456: error called\n");
 
-       if (!test_bit(Faulty, &rdev->flags)) {
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
-               if (test_and_clear_bit(In_sync, &rdev->flags)) {
-                       unsigned long flags;
-                       spin_lock_irqsave(&conf->device_lock, flags);
-                       mddev->degraded++;
-                       spin_unlock_irqrestore(&conf->device_lock, flags);
-                       /*
-                        * if recovery was running, make sure it aborts.
-                        */
-                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-               }
-               set_bit(Faulty, &rdev->flags);
-               printk(KERN_ALERT
-                      "md/raid:%s: Disk failure on %s, disabling device.\n"
-                      "md/raid:%s: Operation continuing on %d devices.\n",
-                      mdname(mddev),
-                      bdevname(rdev->bdev, b),
-                      mdname(mddev),
-                      conf->raid_disks - mddev->degraded);
+       if (test_and_clear_bit(In_sync, &rdev->flags)) {
+               unsigned long flags;
+               spin_lock_irqsave(&conf->device_lock, flags);
+               mddev->degraded++;
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+               /*
+                * if recovery was running, make sure it aborts.
+                */
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        }
+       set_bit(Faulty, &rdev->flags);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       printk(KERN_ALERT
+              "md/raid:%s: Disk failure on %s, disabling device.\n"
+              "md/raid:%s: Operation continuing on %d devices.\n",
+              mdname(mddev),
+              bdevname(rdev->bdev, b),
+              mdname(mddev),
+              conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
                return -EINVAL;
        set_capacity(mddev->gendisk, mddev->array_sectors);
        revalidate_disk(mddev->gendisk);
-       if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+       if (sectors > mddev->dev_sectors &&
+           mddev->recovery_cp > mddev->dev_sectors) {
                mddev->recovery_cp = mddev->dev_sectors;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        }