pandora-kernel.git: drivers/md/raid1.c
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cae8746..7b75a19 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/blkdev.h>
+#include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include "md.h"
@@ -312,7 +313,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
                spin_lock_irqsave(&conf->device_lock, flags);
                if (r1_bio->mddev->degraded == conf->raid_disks ||
                    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
-                    !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+                    test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
                        uptodate = 1;
                spin_unlock_irqrestore(&conf->device_lock, flags);
        }
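
The hunk above replaces the "!Faulty" check with an "In_sync" check when
deciding whether a failed read came from the last working device: a spare
that is still rebuilding is not Faulty, but it is not a usable source for
the data either, so it must not be counted as the last working mirror.
Below is a rough userspace model of just that decision (plain ints and a
bool stand in for mddev->degraded, conf->raid_disks and the rdev flag;
this is a sketch, not the kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Model of the new test in raid1_end_read_request(): a failed read is
     * still reported as "uptodate" only when there is no better device
     * left, i.e. every mirror has already failed, or this mirror is the
     * one remaining In_sync device.
     */
    static bool report_uptodate(int degraded, int raid_disks, bool this_in_sync)
    {
            if (degraded == raid_disks)
                    return true;            /* nothing left at all */
            if (degraded == raid_disks - 1 && this_in_sync)
                    return true;            /* last working mirror */
            return false;                   /* another mirror can retry */
    }

    int main(void)
    {
            /* 2-disk array, one disk failed, the read came from a device
             * that is not In_sync (e.g. a rebuilding spare): the old
             * !Faulty test would have treated it as the last working
             * device; the new test does not. */
            printf("%d\n", report_uptodate(1, 2, false));   /* 0 */
            printf("%d\n", report_uptodate(1, 2, true));    /* 1 */
            return 0;
    }
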
@@ -406,7 +407,17 @@ static void raid1_end_write_request(struct bio *bio, int error)
 
                r1_bio->bios[mirror] = NULL;
                to_put = bio;
-               set_bit(R1BIO_Uptodate, &r1_bio->state);
+               /*
+                * Do not set R1BIO_Uptodate if the current device is
+                * rebuilding or Faulty. This is because we cannot use
+                * such a device for properly reading the data back (we could
+                * potentially use it, if the current write would have fallen
+                * before rdev->recovery_offset, but for simplicity we don't
+                * check this here).
+                */
+               if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) &&
+                   !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))
+                       set_bit(R1BIO_Uptodate, &r1_bio->state);
 
                /* Maybe we can clear some bad blocks. */
                if (is_badblock(conf->mirrors[mirror].rdev,
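
The write-completion hunk applies the same reasoning to R1BIO_Uptodate: a
write that only reached a rebuilding or Faulty device must not make the
request look safely stored, because that copy cannot be read back. A
minimal sketch of the per-device filter, with a plain flags word standing
in for rdev->flags:

    #include <stdbool.h>
    #include <stdio.h>

    #define FLAG_IN_SYNC  (1u << 0)
    #define FLAG_FAULTY   (1u << 1)

    /* Should this completed write mark the request as uptodate? */
    static bool write_counts_as_uptodate(unsigned int rdev_flags)
    {
            return (rdev_flags & FLAG_IN_SYNC) && !(rdev_flags & FLAG_FAULTY);
    }

    int main(void)
    {
            printf("%d\n", write_counts_as_uptodate(FLAG_IN_SYNC));               /* 1 */
            printf("%d\n", write_counts_as_uptodate(0));                          /* 0: rebuilding */
            printf("%d\n", write_counts_as_uptodate(FLAG_IN_SYNC | FLAG_FAULTY)); /* 0: failed */
            return 0;
    }
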
@@ -524,8 +535,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                if (test_bit(WriteMostly, &rdev->flags)) {
                        /* Don't balance among write-mostly, just
                         * use the first as a last resort */
-                       if (best_disk < 0)
+                       if (best_disk < 0) {
+                               if (is_badblock(rdev, this_sector, sectors,
+                                               &first_bad, &bad_sectors)) {
+                                       if (first_bad <= this_sector)
+                                               /* Cannot use this */
+                                               continue;
+                                       best_good_sectors = first_bad - this_sector;
+                               } else
+                                       best_good_sectors = sectors;
                                best_disk = disk;
+                       }
                        continue;
                }
                /* This is a reasonable device to use.  It might
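
In read_balance(), the write-mostly fallback now consults the bad-block
list before the device is picked: if the bad range begins at or before the
requested sector the device is skipped, otherwise the request is clamped
to the good sectors in front of it. A rough sketch of that clamping, with
a hypothetical bad_range struct standing in for the rdev bad-block lookup:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* Hypothetical stand-in for the rdev bad-block table lookup. */
    struct bad_range {
            bool      present;
            sector_t  first_bad;    /* first bad sector at/after the request */
    };

    /*
     * How many sectors starting at this_sector can safely be read from a
     * write-mostly device?  0 means "skip this device".
     */
    static sector_t readable_sectors(sector_t this_sector, sector_t sectors,
                                     struct bad_range bad)
    {
            if (!bad.present)
                    return sectors;                 /* whole request is clean */
            if (bad.first_bad <= this_sector)
                    return 0;                       /* cannot use this device */
            return bad.first_bad - this_sector;     /* clamp to the good prefix */
    }

    int main(void)
    {
            struct bad_range none  = { false, 0 };
            struct bad_range at_40 = { true, 40 };

            printf("%llu\n", readable_sectors(32, 16, none));   /* 16 */
            printf("%llu\n", readable_sectors(32, 16, at_40));  /* 8  */
            printf("%llu\n", readable_sectors(40, 16, at_40));  /* 0  */
            return 0;
    }
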
@@ -721,9 +741,22 @@ static void wait_barrier(struct r1conf *conf)
        spin_lock_irq(&conf->resync_lock);
        if (conf->barrier) {
                conf->nr_waiting++;
-               wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+               /* Wait for the barrier to drop.
+                * However if there are already pending
+                * requests (preventing the barrier from
+                * rising completely), and the
+                * pre-process bio queue isn't empty,
+                * then don't wait, as we need to empty
+                * that queue to get the nr_pending
+                * count down.
+                */
+               wait_event_lock_irq(conf->wait_barrier,
+                                   !conf->barrier ||
+                                   (conf->nr_pending &&
+                                    current->bio_list &&
+                                    !bio_list_empty(current->bio_list)),
                                    conf->resync_lock,
-                                   );
+                       );
                conf->nr_waiting--;
        }
        conf->nr_pending++;
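
The wait_barrier() change is a deadlock-avoidance rule: a thread that
still holds unsubmitted bios on current->bio_list must not sleep on the
resync barrier while requests are pending, because submitting those queued
bios is exactly what lets nr_pending drain and the barrier finish rising.
A sketch of only the wake-up predicate, with the conf fields passed in as
plain values rather than taken from the real structures:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * May this thread proceed past the barrier?  Mirrors the new
     * wait_event_lock_irq() condition: either no barrier is raised, or
     * requests are already pending and the caller still has queued bios
     * that must be submitted to let nr_pending drop.
     */
    static bool may_proceed(bool barrier_raised, int nr_pending,
                            bool caller_has_queued_bios)
    {
            return !barrier_raised ||
                   (nr_pending > 0 && caller_has_queued_bios);
    }

    int main(void)
    {
            printf("%d\n", may_proceed(false, 0, false)); /* 1: no barrier */
            printf("%d\n", may_proceed(true, 3, true));   /* 1: avoid deadlock */
            printf("%d\n", may_proceed(true, 3, false));  /* 0: wait normally */
            return 0;
    }
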
@@ -1106,7 +1139,7 @@ read_again:
                         * know the original bi_idx, so we just free
                         * them all
                         */
-                       __bio_for_each_segment(bvec, mbio, j, 0)
+                       bio_for_each_segment_all(bvec, mbio, j)
                                bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
@@ -1175,6 +1208,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
        char b[BDEVNAME_SIZE];
        struct r1conf *conf = mddev->private;
+       unsigned long flags;
 
        /*
         * If it is not operational, then we have already marked it as dead
@@ -1194,19 +1228,19 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
                return;
        }
        set_bit(Blocked, &rdev->flags);
+       spin_lock_irqsave(&conf->device_lock, flags);
        if (test_and_clear_bit(In_sync, &rdev->flags)) {
-               unsigned long flags;
-               spin_lock_irqsave(&conf->device_lock, flags);
                mddev->degraded++;
                set_bit(Faulty, &rdev->flags);
-               spin_unlock_irqrestore(&conf->device_lock, flags);
-               /*
-                * if recovery is running, make sure it aborts.
-                */
-               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        } else
                set_bit(Faulty, &rdev->flags);
+       spin_unlock_irqrestore(&conf->device_lock, flags);
+       /*
+        * if recovery is running, make sure it aborts.
+        */
+       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       set_bit(MD_CHANGE_PENDING, &mddev->flags);
        printk(KERN_ALERT
               "md/raid1:%s: Disk failure on %s, disabling device.\n"
               "md/raid1:%s: Operation continuing on %d devices.\n",
@@ -1259,7 +1293,10 @@ static int raid1_spare_active(struct mddev *mddev)
         * Find all failed disks within the RAID1 configuration 
         * and mark them readable.
         * Called under mddev lock, so rcu protection not needed.
+        * device_lock used to avoid races with raid1_end_read_request
+        * which expects 'In_sync' flags and ->degraded to be consistent.
         */
+       spin_lock_irqsave(&conf->device_lock, flags);
        for (i = 0; i < conf->raid_disks; i++) {
                struct md_rdev *rdev = conf->mirrors[i].rdev;
                if (rdev
@@ -1269,7 +1306,6 @@ static int raid1_spare_active(struct mddev *mddev)
                        sysfs_notify_dirent_safe(rdev->sysfs_state);
                }
        }
-       spin_lock_irqsave(&conf->device_lock, flags);
        mddev->degraded -= count;
        spin_unlock_irqrestore(&conf->device_lock, flags);
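
The error() and raid1_spare_active() hunks widen conf->device_lock so that
the In_sync flag and mddev->degraded always change together, which is what
the new raid1_end_read_request() test relies on; error() additionally sets
MD_CHANGE_PENDING so the failure reaches the superblock before dependent
writes are allowed to complete. A toy pthread sketch of the consistency
point only (the toy_* names are made up; this is not the kernel locking
code):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in for the r1conf/mddev fields that must stay consistent. */
    struct toy_conf {
            pthread_mutex_t device_lock;
            int             raid_disks;
            int             degraded;
            bool            in_sync[2];
    };

    /* Mark disk 'i' failed; both fields change under the same lock. */
    static void toy_mark_faulty(struct toy_conf *c, int i)
    {
            pthread_mutex_lock(&c->device_lock);
            if (c->in_sync[i]) {
                    c->in_sync[i] = false;
                    c->degraded++;
            }
            pthread_mutex_unlock(&c->device_lock);
    }

    /* Reader: "is disk i the last working mirror?" sees a consistent pair. */
    static bool toy_last_working(struct toy_conf *c, int i)
    {
            bool last;

            pthread_mutex_lock(&c->device_lock);
            last = (c->degraded == c->raid_disks - 1) && c->in_sync[i];
            pthread_mutex_unlock(&c->device_lock);
            return last;
    }

    int main(void)
    {
            struct toy_conf c = { PTHREAD_MUTEX_INITIALIZER, 2, 0, { true, true } };

            toy_mark_faulty(&c, 1);
            printf("%d\n", toy_last_working(&c, 0));  /* 1 */
            return 0;
    }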
 
@@ -1690,8 +1726,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 
        if (atomic_dec_and_test(&r1_bio->remaining)) {
                /* if we're here, all write(s) have completed, so clean up */
-               md_done_sync(mddev, r1_bio->sectors, 1);
-               put_buf(r1_bio);
+               int s = r1_bio->sectors;
+               if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+                   test_bit(R1BIO_WriteError, &r1_bio->state))
+                       reschedule_retry(r1_bio);
+               else {
+                       put_buf(r1_bio);
+                       md_done_sync(mddev, s, 1);
+               }
        }
 }
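
In sync_request_write(), completion is now split: if any copy was repaired
(MadeGood) or failed (WriteError), the r1_bio goes back to raid1d via
reschedule_retry() so the bad-block records are updated before the range
is reported as synced; only the clean case ends immediately, calling
md_done_sync() with the sector count saved before put_buf(). A small
sketch of that split, with hypothetical hooks in place of the md helpers:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical completion hooks standing in for the md/raid1 helpers. */
    static void defer_to_worker(void)  { puts("reschedule_retry: fix bad-block records first"); }
    static void release_buffers(void)  { puts("put_buf"); }
    static void report_sync_done(int sectors) { printf("md_done_sync(%d)\n", sectors); }

    static void finish_sync_write(int sectors, bool made_good, bool write_error)
    {
            if (made_good || write_error) {
                    /* Bad-block bookkeeping must happen before the resync
                     * range is reported done, so hand off to the worker. */
                    defer_to_worker();
            } else {
                    release_buffers();
                    report_sync_done(sectors);
            }
    }

    int main(void)
    {
            finish_sync_write(128, false, false);   /* clean: complete now */
            finish_sync_write(128, true,  false);   /* defer to the worker */
            return 0;
    }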
 
@@ -1908,6 +1950,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
        int m;
+       bool fail = false;
        for (m = 0; m < conf->raid_disks ; m++)
                if (r1_bio->bios[m] == IO_MADE_GOOD) {
                        struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -1920,6 +1963,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
                         * narrow down and record precise write
                         * errors.
                         */
+                       fail = true;
                        if (!narrow_write_error(r1_bio, m)) {
                                md_error(conf->mddev,
                                         conf->mirrors[m].rdev);
@@ -1929,9 +1973,17 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
                        rdev_dec_pending(conf->mirrors[m].rdev,
                                         conf->mddev);
                }
-       if (test_bit(R1BIO_WriteError, &r1_bio->state))
-               close_write(r1_bio);
-       raid_end_bio_io(r1_bio);
+       if (fail) {
+               spin_lock_irq(&conf->device_lock);
+               list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+               conf->nr_queued++;
+               spin_unlock_irq(&conf->device_lock);
+               md_wakeup_thread(conf->mddev->thread);
+       } else {
+               if (test_bit(R1BIO_WriteError, &r1_bio->state))
+                       close_write(r1_bio);
+               raid_end_bio_io(r1_bio);
+       }
 }
 
 static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
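
handle_write_finished() now parks any r1_bio that saw a write error on
conf->bio_end_io_list and wakes the md thread, instead of ending the bio
right away; the bio only completes once the failure has been recorded (see
the raid1d hunk below). A sketch of that enqueue-and-wake pattern, using a
plain linked list and toy_* names in place of the kernel structures:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-ins for r1bio, conf->bio_end_io_list and md_wakeup_thread(). */
    struct toy_r1bio { int id; struct toy_r1bio *next; };

    static pthread_mutex_t   device_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct toy_r1bio *bio_end_io_list;
    static int               nr_queued;

    static void end_bio_now(struct toy_r1bio *r) { printf("end bio %d now\n", r->id); }
    static void wake_md_thread(void)             { puts("wake raid1d"); }

    static void toy_handle_write_finished(struct toy_r1bio *r, bool had_write_error)
    {
            if (had_write_error) {
                    /* Defer completion: the failure must reach the
                     * superblock before the caller sees this bio finish. */
                    pthread_mutex_lock(&device_lock);
                    r->next = bio_end_io_list;
                    bio_end_io_list = r;
                    nr_queued++;
                    pthread_mutex_unlock(&device_lock);
                    wake_md_thread();
            } else {
                    end_bio_now(r);
            }
    }

    int main(void)
    {
            struct toy_r1bio a = { 1, NULL };
            struct toy_r1bio b = { 2, NULL };

            toy_handle_write_finished(&a, false);   /* completes immediately */
            toy_handle_write_finished(&b, true);    /* parked for raid1d     */
            printf("queued: %d\n", nr_queued);      /* 1 */
            return 0;
    }
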
@@ -2034,6 +2086,29 @@ static void raid1d(struct mddev *mddev)
 
        md_check_recovery(mddev);
 
+       if (!list_empty_careful(&conf->bio_end_io_list) &&
+           !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+               LIST_HEAD(tmp);
+               spin_lock_irqsave(&conf->device_lock, flags);
+               if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+                       while (!list_empty(&conf->bio_end_io_list)) {
+                               list_move(conf->bio_end_io_list.prev, &tmp);
+                               conf->nr_queued--;
+                       }
+               }
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+               while (!list_empty(&tmp)) {
+                       r1_bio = list_first_entry(&tmp, struct r1bio,
+                                                 retry_list);
+                       list_del(&r1_bio->retry_list);
+                       if (mddev->degraded)
+                               set_bit(R1BIO_Degraded, &r1_bio->state);
+                       if (test_bit(R1BIO_WriteError, &r1_bio->state))
+                               close_write(r1_bio);
+                       raid_end_bio_io(r1_bio);
+               }
+       }
+
        blk_start_plug(&plug);
        for (;;) {
 
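raid1d() drains bio_end_io_list only after MD_CHANGE_PENDING clears, i.e.
once the superblock carrying the failure has been written, and it does so
by detaching the entries under device_lock and completing them outside it;
note that the drain loop must take requests from the private tmp list,
which is what the corrected list_first_entry(&tmp, ...) above does. A
userspace sketch of the detach-then-process pattern, again with toy_*
stand-ins for the kernel structures:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_r1bio { int id; struct toy_r1bio *next; };

    static pthread_mutex_t   device_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct toy_r1bio *bio_end_io_list;      /* filled by write completion */
    static int               nr_queued;
    static bool              change_pending;       /* MD_CHANGE_PENDING stand-in */

    static void end_bio(struct toy_r1bio *r) { printf("end bio %d\n", r->id); }

    static void toy_raid1d_drain(void)
    {
            struct toy_r1bio *tmp = NULL;

            if (change_pending)
                    return;         /* superblock not written yet, keep waiting */

            /* Detach the whole list under the lock ... */
            pthread_mutex_lock(&device_lock);
            tmp = bio_end_io_list;
            bio_end_io_list = NULL;
            nr_queued = 0;
            pthread_mutex_unlock(&device_lock);

            /* ... and complete the entries from the private list, unlocked. */
            while (tmp) {
                    struct toy_r1bio *r = tmp;

                    tmp = tmp->next;
                    end_bio(r);
            }
    }

    int main(void)
    {
            struct toy_r1bio a = { 1, NULL }, b = { 2, &a };

            bio_end_io_list = &b;
            nr_queued = 2;

            change_pending = true;
            toy_raid1d_drain();     /* nothing happens yet */
            change_pending = false;
            toy_raid1d_drain();     /* ends bio 2, then bio 1 */
            return 0;
    }
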
@@ -2292,7 +2367,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                /* There is nowhere to write, so all non-sync
                 * drives must be failed - so we are finished
                 */
-               sector_t rv = max_sector - sector_nr;
+               sector_t rv;
+               if (min_bad > 0)
+                       max_sector = sector_nr + min_bad;
+               rv = max_sector - sector_nr;
                *skipped = 1;
                put_buf(r1_bio);
                return rv;
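
When sync_request() finds nowhere to write, it used to skip all the way to
max_sector; the hunk above limits the skip to min_bad sectors when the
blockage is a bad-block range at the current position on every usable
device, so resync resumes right after the shortest bad range instead of
giving up on the rest of the array. The arithmetic, as a one-function
sketch:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* How many sectors may be skipped when no drive can be written? */
    static sector_t skip_length(sector_t sector_nr, sector_t max_sector,
                                sector_t min_bad)
    {
            if (min_bad > 0)
                    max_sector = sector_nr + min_bad;  /* stop after the bad range */
            return max_sector - sector_nr;
    }

    int main(void)
    {
            printf("%llu\n", skip_length(1000, 2048, 0));   /* 1048: skip it all      */
            printf("%llu\n", skip_length(1000, 2048, 64));  /* 64: just the bad blocks */
            return 0;
    }
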
@@ -2355,9 +2433,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
         */
        if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                atomic_set(&r1_bio->remaining, read_targets);
-               for (i=0; i<conf->raid_disks; i++) {
+               for (i = 0; i < conf->raid_disks && read_targets; i++) {
                        bio = r1_bio->bios[i];
                        if (bio->bi_end_io == end_sync_read) {
+                               read_targets--;
                                md_sync_acct(bio->bi_bdev, nr_sectors);
                                generic_make_request(bio);
                        }
@@ -2428,6 +2507,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->raid_disks = mddev->raid_disks;
        conf->mddev = mddev;
        INIT_LIST_HEAD(&conf->retry_list);
+       INIT_LIST_HEAD(&conf->bio_end_io_list);
 
        spin_lock_init(&conf->resync_lock);
        init_waitqueue_head(&conf->wait_barrier);