md/raid1: handle merge_bvec_fn in member devices.
Author:     NeilBrown <neilb@suse.de>
AuthorDate: Mon, 19 Mar 2012 01:46:39 +0000 (12:46 +1100)
Commit:     NeilBrown <neilb@suse.de>
CommitDate: Mon, 19 Mar 2012 01:46:39 +0000 (12:46 +1100)
Currently we don't honour merge_bvec_fn in member devices, so if any
member device has one we force all requests to be single-page at most.
This is not ideal.

So create a raid1 merge_bvec_fn to check that function in children
as well.
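
A minimal sketch of the idea, assuming the names used in the patch
below (the real raid1_mergeable_bvec() additionally takes
rcu_read_lock(), skips missing/Faulty/Unmerged devices, and remaps
bi_sector/bi_bdev into each child's address space):

	int max = biovec->bv_len;	/* start from "the whole bvec fits" */
	int disk;

	for (disk = 0; disk < conf->raid_disks * 2; disk++) {
		struct request_queue *child =
			bdev_get_queue(conf->mirrors[disk].rdev->bdev);

		if (child->merge_bvec_fn)
			/* the array may only accept what every member accepts */
			max = min(max, child->merge_bvec_fn(child, bvm, biovec));
	}
	return max;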

This introduces a small problem.  There is no locking around calls
to ->merge_bvec_fn and the subsequent calls to ->make_request, so a
device added between those two calls could end up receiving a request
which violates its merge_bvec_fn.

Currently the best we can do is synchronize_sched().  It waits for all
code that is currently running (and not preempted) to finish, so as
long as no preemption happens between the merge check and the request
submission, anything that has already tested ->merge_bvec_fn will have
submitted its request before we proceed.  If there is preemption, we
just have to hope that new devices are largely consistent with old
devices.
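
Condensed, the window is closed in raid1_add_disk() roughly like this
(a sketch of the two hunks below, not the exact code):

	if (bdev_get_queue(rdev->bdev)->merge_bvec_fn) {
		set_bit(Unmerged, &rdev->flags);	/* keep I/O paths off the new device */
		mddev->merge_check_needed = 1;		/* array-level check must consult children */
	}

	/* ... slot the rdev into conf->mirrors[] ... */

	if (test_bit(Unmerged, &rdev->flags)) {
		synchronize_sched();	/* anyone already past the merge check has submitted */
		raise_barrier(conf);	/* drain those outstanding requests */
		lower_barrier(conf);
		clear_bit(Unmerged, &rdev->flags);	/* device can now take normal I/O */
	}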

Signed-off-by: NeilBrown <neilb@suse.de>

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a933bd4..4a40a20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                rdev = rcu_dereference(conf->mirrors[disk].rdev);
                if (r1_bio->bios[disk] == IO_BLOCKED
                    || rdev == NULL
+                   || test_bit(Unmerged, &rdev->flags)
                    || test_bit(Faulty, &rdev->flags))
                        continue;
                if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
        return best_disk;
 }
 
+static int raid1_mergeable_bvec(struct request_queue *q,
+                               struct bvec_merge_data *bvm,
+                               struct bio_vec *biovec)
+{
+       struct mddev *mddev = q->queuedata;
+       struct r1conf *conf = mddev->private;
+       sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+       int max = biovec->bv_len;
+
+       if (mddev->merge_check_needed) {
+               int disk;
+               rcu_read_lock();
+               for (disk = 0; disk < conf->raid_disks * 2; disk++) {
+                       struct md_rdev *rdev = rcu_dereference(
+                               conf->mirrors[disk].rdev);
+                       if (rdev && !test_bit(Faulty, &rdev->flags)) {
+                               struct request_queue *q =
+                                       bdev_get_queue(rdev->bdev);
+                               if (q->merge_bvec_fn) {
+                                       bvm->bi_sector = sector +
+                                               rdev->data_offset;
+                                       bvm->bi_bdev = rdev->bdev;
+                                       max = min(max, q->merge_bvec_fn(
+                                                         q, bvm, biovec));
+                               }
+                       }
+               }
+               rcu_read_unlock();
+       }
+       return max;
+
+}
+
 int md_raid1_congested(struct mddev *mddev, int bits)
 {
        struct r1conf *conf = mddev->private;
@@ -1015,7 +1049,8 @@ read_again:
                        break;
                }
                r1_bio->bios[i] = NULL;
-               if (!rdev || test_bit(Faulty, &rdev->flags)) {
+               if (!rdev || test_bit(Faulty, &rdev->flags)
+                   || test_bit(Unmerged, &rdev->flags)) {
                        if (i < conf->raid_disks)
                                set_bit(R1BIO_Degraded, &r1_bio->state);
                        continue;
@@ -1335,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        struct mirror_info *p;
        int first = 0;
        int last = conf->raid_disks - 1;
+       struct request_queue *q = bdev_get_queue(rdev->bdev);
 
        if (mddev->recovery_disabled == conf->recovery_disabled)
                return -EBUSY;
@@ -1342,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        if (rdev->raid_disk >= 0)
                first = last = rdev->raid_disk;
 
+       if (q->merge_bvec_fn) {
+               set_bit(Unmerged, &rdev->flags);
+               mddev->merge_check_needed = 1;
+       }
+
        for (mirror = first; mirror <= last; mirror++) {
                p = conf->mirrors+mirror;
                if (!p->rdev) {
 
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);
-                       /* as we don't honour merge_bvec_fn, we must
-                        * never risk violating it, so limit
-                        * ->max_segments to one lying with a single
-                        * page, as a one page request is never in
-                        * violation.
-                        */
-                       if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-                               blk_queue_max_segments(mddev->queue, 1);
-                               blk_queue_segment_boundary(mddev->queue,
-                                                          PAGE_CACHE_SIZE - 1);
-                       }
 
                        p->head_position = 0;
                        rdev->raid_disk = mirror;
@@ -1383,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                        break;
                }
        }
+       if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+               /* Some requests might not have seen this new
+                * merge_bvec_fn.  We must wait for them to complete
+                * before merging the device fully.
+                * First we make sure any code which has tested
+                * our function has submitted the request, then
+                * we wait for all outstanding requests to complete.
+                */
+               synchronize_sched();
+               raise_barrier(conf);
+               lower_barrier(conf);
+               clear_bit(Unmerged, &rdev->flags);
+       }
        md_integrity_add_rdev(rdev, mddev);
        print_conf(conf);
        return err;
@@ -2627,15 +2670,6 @@ static int run(struct mddev *mddev)
                        continue;
                disk_stack_limits(mddev->gendisk, rdev->bdev,
                                  rdev->data_offset << 9);
-               /* as we don't honour merge_bvec_fn, we must never risk
-                * violating it, so limit ->max_segments to 1 lying within
-                * a single page, as a one page request is never in violation.
-                */
-               if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-                       blk_queue_max_segments(mddev->queue, 1);
-                       blk_queue_segment_boundary(mddev->queue,
-                                                  PAGE_CACHE_SIZE - 1);
-               }
        }
 
        mddev->degraded = 0;
@@ -2669,6 +2703,7 @@ static int run(struct mddev *mddev)
        if (mddev->queue) {
                mddev->queue->backing_dev_info.congested_fn = raid1_congested;
                mddev->queue->backing_dev_info.congested_data = mddev;
+               blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
        }
        return md_integrity_register(mddev);
 }