Merge branch 'master' into for-2.6.35
author Jens Axboe <jens.axboe@oracle.com>
Fri, 21 May 2010 19:27:26 +0000 (21:27 +0200)
committer Jens Axboe <jens.axboe@oracle.com>
Fri, 21 May 2010 19:27:26 +0000 (21:27 +0200)
Conflicts:
fs/ext3/fsync.c

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
93 files changed:
Documentation/cgroups/blkio-controller.txt
block/Kconfig
block/Kconfig.iosched
block/Makefile
block/blk-barrier.c
block/blk-cgroup.c
block/blk-cgroup.h
block/blk-core.c
block/blk-lib.c [new file with mode: 0644]
block/cfq-iosched.c
block/elevator.c
block/genhd.c
block/ioctl.c
drivers/block/Kconfig
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_proc.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_strings.c
drivers/block/drbd/drbd_worker.c
drivers/block/drbd/drbd_wrappers.h
drivers/ide/ide-disk.c
drivers/ide/ide-gd.c
fs/block_dev.c
fs/btrfs/extent-tree.c
fs/buffer.c
fs/ext3/fsync.c
fs/ext4/fsync.c
fs/fcntl.c
fs/fs-writeback.c
fs/gfs2/rgrp.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/nilfs2/the_nilfs.c
fs/partitions/acorn.c
fs/partitions/acorn.h
fs/partitions/amiga.c
fs/partitions/amiga.h
fs/partitions/atari.c
fs/partitions/atari.h
fs/partitions/check.c
fs/partitions/check.h
fs/partitions/efi.c
fs/partitions/efi.h
fs/partitions/ibm.c
fs/partitions/ibm.h
fs/partitions/karma.c
fs/partitions/karma.h
fs/partitions/ldm.c
fs/partitions/ldm.h
fs/partitions/mac.c
fs/partitions/mac.h
fs/partitions/msdos.c
fs/partitions/msdos.h
fs/partitions/osf.c
fs/partitions/osf.h
fs/partitions/sgi.c
fs/partitions/sgi.h
fs/partitions/sun.c
fs/partitions/sun.h
fs/partitions/sysv68.c
fs/partitions/sysv68.h
fs/partitions/ultrix.c
fs/partitions/ultrix.h
fs/pipe.c
fs/reiserfs/file.c
fs/splice.c
fs/sync.c
fs/xfs/linux-2.6/xfs_super.c
include/linux/backing-dev.h
include/linux/blkdev.h
include/linux/drbd.h
include/linux/drbd_limits.h
include/linux/drbd_nl.h
include/linux/elevator.h
include/linux/fcntl.h
include/linux/fs.h
include/linux/ide.h
include/linux/pipe_fs_i.h
include/linux/splice.h
include/linux/writeback.h
init/Kconfig
kernel/relay.c
kernel/sched_clock.c
kernel/sysctl.c
kernel/trace/trace.c
mm/backing-dev.c
mm/page-writeback.c
mm/swapfile.c
net/core/skbuff.c

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879c..48e0b21 100644
@@ -17,6 +17,9 @@ HOWTO
 You can run a very simple test with two dd threads in two different
 cgroups. Here is what you can do.
 
+- Enable Block IO controller
+       CONFIG_BLK_CGROUP=y
+
 - Enable group scheduling in CFQ
        CONFIG_CFQ_GROUP_IOSCHED=y
 
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
 
 Various user visible config options
 ===================================
-CONFIG_CFQ_GROUP_IOSCHED
-       - Enables group scheduling in CFQ. Currently only 1 level of group
-         creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
-       - Enables some debugging messages in blktrace. Also creates extra
-         cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
 CONFIG_BLK_CGROUP
-       - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+       - Block IO controller.
 
 CONFIG_DEBUG_BLK_CGROUP
-       - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+       - Debug help. Right now some additional stats files show up in the
+         cgroup if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+       - Enables group scheduling in CFQ. Currently only 1 level of group
+         creation is allowed.
 
 Details of cgroup files
 =======================
 - blkio.weight
-       - Specifies per cgroup weight.
-
+       - Specifies per cgroup weight. This is the default weight of the group
+         on all devices, unless overridden by a per-device rule
+         (see blkio.weight_device).
          Currently allowed range of weights is from 100 to 1000.
 
+- blkio.weight_device
+       - One can specify per cgroup per device rules using this interface.
+         These rules override the default value of group weight as specified
+         by blkio.weight.
+
+         The format is as follows:
+
+         # echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+         Configure weight=300 on /dev/sdb (8:16) in this cgroup
+         # echo 8:16 300 > blkio.weight_device
+         # cat blkio.weight_device
+         dev     weight
+         8:16    300
+
+         Configure weight=500 on /dev/sda (8:0) in this cgroup
+         # echo 8:0 500 > blkio.weight_device
+         # cat blkio.weight_device
+         dev     weight
+         8:0     500
+         8:16    300
+
+         Remove specific weight for /dev/sda in this cgroup
+         # echo 8:0 0 > blkio.weight_device
+         # cat blkio.weight_device
+         dev     weight
+         8:16    300
+
 - blkio.time
        - disk time allocated to cgroup per device in milliseconds. First
          two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
          third field specifies the number of sectors transferred by the
          group to/from the device.
 
+- blkio.io_service_bytes
+       - Number of bytes transferred to/from the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of bytes.
+
+- blkio.io_serviced
+       - Number of IOs completed to/from the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of IOs.
+
+- blkio.io_service_time
+       - Total amount of time between request dispatch and request completion
+         for the IOs done by this cgroup. This is in nanoseconds to make it
+         meaningful for flash devices too. For devices with queue depth of 1,
+         this time represents the actual service time. When queue_depth > 1,
+         that is no longer true as requests may be served out of order. This
+         may cause the service time for a given IO to include the service time
+         of multiple IOs when served out of order, which may result in total
+         io_service_time > actual time elapsed. This time is further divided by
+         the type of operation - read or write, sync or async. First two fields
+         specify the major and minor number of the device, third field
+         specifies the operation type and the fourth field specifies the
+         io_service_time in ns.
+
+- blkio.io_wait_time
+       - Total amount of time the IOs for this cgroup spent waiting in the
+         scheduler queues for service. This can be greater than the total time
+         elapsed since it is cumulative io_wait_time for all IOs. It is not a
+         measure of total time the cgroup spent waiting but rather a measure of
+         the wait_time for its individual IOs. For devices with queue_depth > 1,
+         this metric does not include the time an IO spends waiting for service
+         once it has been dispatched to the device (there might be a time lag
+         here due to re-ordering of requests by the device). This is in
+         nanoseconds to make it meaningful for flash
+         devices too. This time is further divided by the type of operation -
+         read or write, sync or async. First two fields specify the major and
+         minor number of the device, third field specifies the operation type
+         and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+       - Total number of bios/requests merged into requests belonging to this
+         cgroup. This is further divided by the type of operation - read or
+         write, sync or async.
+
+- blkio.io_queued
+       - Total number of requests queued up at any given instant for this
+         cgroup. This is further divided by the type of operation - read or
+         write, sync or async.
+
+- blkio.avg_queue_size
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         The average queue size for this cgroup over the entire time of this
+         cgroup's existence. Queue size samples are taken each time one of the
+         queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         This is the amount of time the cgroup had to wait since it became busy
+         (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+         its queues. This is different from the io_wait_time which is the
+         cumulative total of the amount of time spent by each IO in that cgroup
+         waiting in the scheduler queue. This is in nanoseconds. If this is
+         read when the cgroup is in a waiting (for timeslice) state, the stat
+         will only report the group_wait_time accumulated till the last time it
+         got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         This is the amount of time a cgroup spends without any pending
+         requests when not being served, i.e., it does not include any time
+         spent idling for one of the queues of the cgroup. This is in
+         nanoseconds. If this is read when the cgroup is in an empty state,
+         the stat will only report the empty_time accumulated till the last
+         time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         This is the amount of time spent by the IO scheduler idling for a
+         given cgroup in anticipation of a better request than the existing ones
+         from other queues/cgroups. This is in nanoseconds. If this is read
+         when the cgroup is in an idling state, the stat will only report the
+         idle_time accumulated till the last idle period and will not include
+         the current delta.
+
 - blkio.dequeue
-       - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
          gives the statistics about how many times a group was dequeued
          from the service tree of the device. First two fields specify the major
          and minor number of the device and third field specifies the number
          of times a group was dequeued from a particular device.
 
+- blkio.reset_stats
+       - Writing an int to this file will result in resetting all the stats
+         for that cgroup.
+
 CFQ sysfs tunable
 =================
 /sys/block/<disk>/queue/iosched/group_isolation
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4..9be0b56 100644
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
        T10/SCSI Data Integrity Field or the T13/ATA External Path
        Protection.  If in doubt, say N.
 
-config BLK_CGROUP
-       tristate "Block cgroup support"
-       depends on CGROUPS
-       depends on CFQ_GROUP_IOSCHED
-       default n
-       ---help---
-       Generic block IO controller cgroup interface. This is the common
-       cgroup interface which should be used by various IO controlling
-       policies.
-
-       Currently, CFQ IO scheduler uses it to recognize task groups and
-       control disk bandwidth allocation (proportional time slice allocation)
-       to such task groups.
-
-config DEBUG_BLK_CGROUP
-       bool
-       depends on BLK_CGROUP
-       default n
-       ---help---
-       Enable some debugging help. Currently it stores the cgroup path
-       in the blk group which can be used by cfq for tracing various
-       group related activity.
-
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf0..3199b76 100644
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
        tristate "CFQ I/O scheduler"
-       select BLK_CGROUP if CFQ_GROUP_IOSCHED
+       # If BLK_CGROUP is a module, CFQ has to be built as module.
+       depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
        default y
        ---help---
          The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
 
          This is the default I/O scheduler.
 
+         Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
        bool "CFQ Group Scheduling support"
-       depends on IOSCHED_CFQ && CGROUPS
+       depends on IOSCHED_CFQ && BLK_CGROUP
        default n
        ---help---
          Enable group IO scheduling in CFQ.
 
-config DEBUG_CFQ_IOSCHED
-       bool "Debug CFQ Scheduling"
-       depends on CFQ_GROUP_IOSCHED
-       select DEBUG_BLK_CGROUP
-       default n
-       ---help---
-         Enable CFQ IO scheduling debugging in CFQ. Currently it makes
-         blktrace output more verbose.
-
 choice
        prompt "Default I/O scheduler"
        default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515..0bb499a 100644
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-                       blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+                       blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_BLK_CGROUP)       += blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544..0d710c9 100644
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
                        set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
        }
-
-       complete(bio->bi_private);
+       if (bio->bi_private)
+               complete(bio->bi_private);
+       bio_put(bio);
 }
 
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:      blockdev to issue flush for
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
  * @error_sector:      error sector
+ * @flags:     BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to.
+ *    wish to. If the WAIT flag is not passed, the caller may only assume
+ *    that the request has been queued internally for later handling.
  */
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+               sector_t *error_sector, unsigned long flags)
 {
        DECLARE_COMPLETION_ONSTACK(wait);
        struct request_queue *q;
        struct bio *bio;
-       int ret;
+       int ret = 0;
 
        if (bdev->bd_disk == NULL)
                return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
        if (!q)
                return -ENXIO;
 
-       bio = bio_alloc(GFP_KERNEL, 0);
+       bio = bio_alloc(gfp_mask, 0);
        bio->bi_end_io = bio_end_empty_barrier;
-       bio->bi_private = &wait;
        bio->bi_bdev = bdev;
-       submit_bio(WRITE_BARRIER, bio);
-
-       wait_for_completion(&wait);
+       if (test_bit(BLKDEV_WAIT, &flags))
+               bio->bi_private = &wait;
 
-       /*
-        * The driver must store the error location in ->bi_sector, if
-        * it supports it. For non-stacked drivers, this should be copied
-        * from blk_rq_pos(rq).
-        */
-       if (error_sector)
-               *error_sector = bio->bi_sector;
+       bio_get(bio);
+       submit_bio(WRITE_BARRIER, bio);
+       if (test_bit(BLKDEV_WAIT, &flags)) {
+               wait_for_completion(&wait);
+               /*
+                * The driver must store the error location in ->bi_sector, if
+                * it supports it. For non-stacked drivers, this should be
+                * copied from blk_rq_pos(rq).
+                */
+               if (error_sector)
+                       *error_sector = bio->bi_sector;
+       }
 
-       ret = 0;
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
        else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
        return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
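
For context, a minimal caller sketch against the new signature (illustrative, not part of the patch; the example_sync_flush name is hypothetical): BLKDEV_WAIT is the flag bit tested by blkdev_issue_flush() above, and error_sector may be NULL since it is only dereferenced when supplied.

	static int example_sync_flush(struct block_device *bdev)
	{
		unsigned long flags = 0;

		/* request synchronous behaviour: wait for the barrier bio */
		set_bit(BLKDEV_WAIT, &flags);
		return blkdev_issue_flush(bdev, GFP_KERNEL, NULL, flags);
	}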
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-               clear_bit(BIO_UPTODATE, &bio->bi_flags);
-       }
-
-       if (bio->bi_private)
-               complete(bio->bi_private);
-       __free_page(bio_page(bio));
-
-       bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev:      blockdev to issue discard for
- * @sector:    start sector
- * @nr_sects:  number of sectors to discard
- * @gfp_mask:  memory allocation flags (for bio_alloc)
- * @flags:     DISCARD_FL_* flags to control behaviour
- *
- * Description:
- *    Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-               sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
-       DECLARE_COMPLETION_ONSTACK(wait);
-       struct request_queue *q = bdev_get_queue(bdev);
-       int type = flags & DISCARD_FL_BARRIER ?
-               DISCARD_BARRIER : DISCARD_NOBARRIER;
-       struct bio *bio;
-       struct page *page;
-       int ret = 0;
-
-       if (!q)
-               return -ENXIO;
-
-       if (!blk_queue_discard(q))
-               return -EOPNOTSUPP;
-
-       while (nr_sects && !ret) {
-               unsigned int sector_size = q->limits.logical_block_size;
-               unsigned int max_discard_sectors =
-                       min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
-               bio = bio_alloc(gfp_mask, 1);
-               if (!bio)
-                       goto out;
-               bio->bi_sector = sector;
-               bio->bi_end_io = blkdev_discard_end_io;
-               bio->bi_bdev = bdev;
-               if (flags & DISCARD_FL_WAIT)
-                       bio->bi_private = &wait;
-
-               /*
-                * Add a zeroed one-sector payload as that's what
-                * our current implementations need.  If we'll ever need
-                * more the interface will need revisiting.
-                */
-               page = alloc_page(gfp_mask | __GFP_ZERO);
-               if (!page)
-                       goto out_free_bio;
-               if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
-                       goto out_free_page;
-
-               /*
-                * And override the bio size - the way discard works we
-                * touch many more blocks on disk than the actual payload
-                * length.
-                */
-               if (nr_sects > max_discard_sectors) {
-                       bio->bi_size = max_discard_sectors << 9;
-                       nr_sects -= max_discard_sectors;
-                       sector += max_discard_sectors;
-               } else {
-                       bio->bi_size = nr_sects << 9;
-                       nr_sects = 0;
-               }
-
-               bio_get(bio);
-               submit_bio(type, bio);
-
-               if (flags & DISCARD_FL_WAIT)
-                       wait_for_completion(&wait);
-
-               if (bio_flagged(bio, BIO_EOPNOTSUPP))
-                       ret = -EOPNOTSUPP;
-               else if (!bio_flagged(bio, BIO_UPTODATE))
-                       ret = -EIO;
-               bio_put(bio);
-       }
-       return ret;
-out_free_page:
-       __free_page(page);
-out_free_bio:
-       bio_put(bio);
-out:
-       return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2cc682b..a680964 100644
 #include <linux/kdev_t.h>
 #include <linux/module.h>
 #include <linux/err.h>
+#include <linux/blkdev.h>
 #include <linux/slab.h>
 #include "blk-cgroup.h"
+#include <linux/genhd.h>
+
+#define MAX_KEY_LEN 100
 
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
+static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
+                                           struct blkio_policy_node *pn)
+{
+       list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */
+static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
+{
+       list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+{
+       struct blkio_policy_node *pn;
+
+       list_for_each_entry(pn, &blkcg->policy_list, node) {
+               if (pn->dev == dev)
+                       return pn;
+       }
+
+       return NULL;
+}
+
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 {
        return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-                       unsigned long time, unsigned long sectors)
+/*
+ * Add to the appropriate stat variable depending on the request type.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
+                               bool sync)
+{
+       if (direction)
+               stat[BLKIO_STAT_WRITE] += add;
+       else
+               stat[BLKIO_STAT_READ] += add;
+       if (sync)
+               stat[BLKIO_STAT_SYNC] += add;
+       else
+               stat[BLKIO_STAT_ASYNC] += add;
+}
+
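
As an illustration (not part of the patch), accounting one serviced write that was synchronous increments both the WRITE and SYNC buckets of the SERVICED array, exactly as blkiocg_update_dispatch_stats() below does:

	/* one completed IO: direction == true (write), sync == true */
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, true, true);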
+/*
+ * Decrements the appropriate stat variable if non-zero depending on the
+ * request type. Panics on value being zero.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
+{
+       if (direction) {
+               BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
+               stat[BLKIO_STAT_WRITE]--;
+       } else {
+               BUG_ON(stat[BLKIO_STAT_READ] == 0);
+               stat[BLKIO_STAT_READ]--;
+       }
+       if (sync) {
+               BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
+               stat[BLKIO_STAT_SYNC]--;
+       } else {
+               BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
+               stat[BLKIO_STAT_ASYNC]--;
+       }
+}
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+                                               struct blkio_group *curr_blkg)
+{
+       if (blkio_blkg_waiting(&blkg->stats))
+               return;
+       if (blkg == curr_blkg)
+               return;
+       blkg->stats.start_group_wait_time = sched_clock();
+       blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+       unsigned long long now;
+
+       if (!blkio_blkg_waiting(stats))
+               return;
+
+       now = sched_clock();
+       if (time_after64(now, stats->start_group_wait_time))
+               stats->group_wait_time += now - stats->start_group_wait_time;
+       blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+       unsigned long long now;
+
+       if (!blkio_blkg_empty(stats))
+               return;
+
+       now = sched_clock();
+       if (time_after64(now, stats->start_empty_time))
+               stats->empty_time += now - stats->start_empty_time;
+       blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       BUG_ON(blkio_blkg_idling(&blkg->stats));
+       blkg->stats.start_idle_time = sched_clock();
+       blkio_mark_blkg_idling(&blkg->stats);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+       unsigned long flags;
+       unsigned long long now;
+       struct blkio_group_stats *stats;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       stats = &blkg->stats;
+       if (blkio_blkg_idling(stats)) {
+               now = sched_clock();
+               if (time_after64(now, stats->start_idle_time))
+                       stats->idle_time += now - stats->start_idle_time;
+               blkio_clear_blkg_idling(stats);
+       }
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+{
+       unsigned long flags;
+       struct blkio_group_stats *stats;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       stats = &blkg->stats;
+       stats->avg_queue_size_sum +=
+                       stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
+                       stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
+       stats->avg_queue_size_samples++;
+       blkio_update_group_wait_time(stats);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
+
+void blkiocg_set_start_empty_time(struct blkio_group *blkg)
+{
+       unsigned long flags;
+       struct blkio_group_stats *stats;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       stats = &blkg->stats;
+
+       if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+                       stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+               spin_unlock_irqrestore(&blkg->stats_lock, flags);
+               return;
+       }
+
+       /*
+        * The group is already marked empty. This can happen if a cfqq got a
+        * new request in the parent group and moved to this group while being
+        * added to the service tree. Just ignore the event and move on.
+        */
+       if (blkio_blkg_empty(stats)) {
+               spin_unlock_irqrestore(&blkg->stats_lock, flags);
+               return;
+       }
+
+       stats->start_empty_time = sched_clock();
+       blkio_mark_blkg_empty(stats);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+                       unsigned long dequeue)
+{
+       blkg->stats.dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+                                       struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
+#endif
+
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+                       struct blkio_group *curr_blkg, bool direction,
+                       bool sync)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
+                       sync);
+       blkio_end_empty_time(&blkg->stats);
+       blkio_set_start_group_wait_time(blkg, curr_blkg);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
+
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+                                               bool direction, bool sync)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
+                                       direction, sync);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
+
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       blkg->stats.time += time;
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+                               uint64_t bytes, bool direction, bool sync)
 {
-       blkg->time += time;
-       blkg->sectors += sectors;
+       struct blkio_group_stats *stats;
+       unsigned long flags;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       stats = &blkg->stats;
+       stats->sectors += bytes >> 9;
+       blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
+                       sync);
+       blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
+                       direction, sync);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
+
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+       uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+{
+       struct blkio_group_stats *stats;
+       unsigned long flags;
+       unsigned long long now = sched_clock();
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       stats = &blkg->stats;
+       if (time_after64(now, io_start_time))
+               blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
+                               now - io_start_time, direction, sync);
+       if (time_after64(io_start_time, start_time))
+               blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
+                               io_start_time - start_time, direction, sync);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
+
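
A hedged sketch of how a scheduler might feed this hook, assuming it stamps each request with sched_clock() when queued and when dispatched; the rq_start/rq_io_start names are illustrative, not part of this patch:

	u64 rq_start = sched_clock();	/* stamped when the request is queued */
	u64 rq_io_start = sched_clock();	/* stamped at dispatch to the driver */

	/* at completion: wait time = rq_io_start - rq_start,
	 * service time = now - rq_io_start, split by direction and sync */
	blkiocg_update_completion_stats(blkg, rq_start, rq_io_start,
					rq_data_dir(rq), rq_is_sync(rq));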
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+                                       bool sync)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&blkg->stats_lock, flags);
+       blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
+                       sync);
+       spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
                        struct blkio_group *blkg, void *key, dev_t dev)
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
        unsigned long flags;
 
        spin_lock_irqsave(&blkcg->lock, flags);
+       spin_lock_init(&blkg->stats_lock);
        rcu_assign_pointer(blkg->key, key);
        blkg->blkcg_id = css_id(&blkcg->css);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
        spin_unlock_irqrestore(&blkcg->lock, flags);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
        /* Need to take css reference ? */
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-#endif
        blkg->dev = dev;
 }
 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -101,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
 
        rcu_read_lock();
        css = css_lookup(&blkio_subsys, blkg->blkcg_id);
-       if (!css)
-               goto out;
-
-       blkcg = container_of(css, struct blkio_cgroup, css);
-       spin_lock_irqsave(&blkcg->lock, flags);
-       if (!hlist_unhashed(&blkg->blkcg_node)) {
-               __blkiocg_del_blkio_group(blkg);
-               ret = 0;
+       if (css) {
+               blkcg = container_of(css, struct blkio_cgroup, css);
+               spin_lock_irqsave(&blkcg->lock, flags);
+               if (!hlist_unhashed(&blkg->blkcg_node)) {
+                       __blkiocg_del_blkio_group(blkg);
+                       ret = 0;
+               }
+               spin_unlock_irqrestore(&blkcg->lock, flags);
        }
-       spin_unlock_irqrestore(&blkcg->lock, flags);
-out:
+
        rcu_read_unlock();
        return ret;
 }
@@ -154,6 +428,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
        struct blkio_group *blkg;
        struct hlist_node *n;
        struct blkio_policy_type *blkiop;
+       struct blkio_policy_node *pn;
 
        if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
                return -EINVAL;
@@ -162,7 +437,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
        spin_lock(&blkio_list_lock);
        spin_lock_irq(&blkcg->lock);
        blkcg->weight = (unsigned int)val;
+
        hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+               pn = blkio_policy_search_node(blkcg, blkg->dev);
+
+               if (pn)
+                       continue;
+
                list_for_each_entry(blkiop, &blkio_list, list)
                        blkiop->ops.blkio_update_group_weight_fn(blkg,
                                        blkcg->weight);
@@ -172,13 +453,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
        return 0;
 }
 
-#define SHOW_FUNCTION_PER_GROUP(__VAR)                                 \
+static int
+blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+       struct blkio_cgroup *blkcg;
+       struct blkio_group *blkg;
+       struct blkio_group_stats *stats;
+       struct hlist_node *n;
+       uint64_t queued[BLKIO_STAT_TOTAL];
+       int i;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       bool idling, waiting, empty;
+       unsigned long long now = sched_clock();
+#endif
+
+       blkcg = cgroup_to_blkio_cgroup(cgroup);
+       spin_lock_irq(&blkcg->lock);
+       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+               spin_lock(&blkg->stats_lock);
+               stats = &blkg->stats;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+               idling = blkio_blkg_idling(stats);
+               waiting = blkio_blkg_waiting(stats);
+               empty = blkio_blkg_empty(stats);
+#endif
+               for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+                       queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+               memset(stats, 0, sizeof(struct blkio_group_stats));
+               for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+                       stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+               if (idling) {
+                       blkio_mark_blkg_idling(stats);
+                       stats->start_idle_time = now;
+               }
+               if (waiting) {
+                       blkio_mark_blkg_waiting(stats);
+                       stats->start_group_wait_time = now;
+               }
+               if (empty) {
+                       blkio_mark_blkg_empty(stats);
+                       stats->start_empty_time = now;
+               }
+#endif
+               spin_unlock(&blkg->stats_lock);
+       }
+       spin_unlock_irq(&blkcg->lock);
+       return 0;
+}
+
+static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
+                               int chars_left, bool diskname_only)
+{
+       snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
+       chars_left -= strlen(str);
+       if (chars_left <= 0) {
+               printk(KERN_WARNING
+                       "Possibly incorrect cgroup stat display format\n");
+               return;
+       }
+       if (diskname_only)
+               return;
+       switch (type) {
+       case BLKIO_STAT_READ:
+               strlcat(str, " Read", chars_left);
+               break;
+       case BLKIO_STAT_WRITE:
+               strlcat(str, " Write", chars_left);
+               break;
+       case BLKIO_STAT_SYNC:
+               strlcat(str, " Sync", chars_left);
+               break;
+       case BLKIO_STAT_ASYNC:
+               strlcat(str, " Async", chars_left);
+               break;
+       case BLKIO_STAT_TOTAL:
+               strlcat(str, " Total", chars_left);
+               break;
+       default:
+               strlcat(str, " Invalid", chars_left);
+       }
+}
+
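
For illustration (not part of the patch), the key strings this helper produces for device 8:16:

	char key[MAX_KEY_LEN];

	blkio_get_key_name(BLKIO_STAT_READ, MKDEV(8, 16), key, MAX_KEY_LEN, false);
	/* key now holds "8:16 Read"; with diskname_only == true it would be
	 * just "8:16" */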
+static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
+                               struct cgroup_map_cb *cb, dev_t dev)
+{
+       blkio_get_key_name(0, dev, str, chars_left, true);
+       cb->fill(cb, str, val);
+       return val;
+}
+
+/* This should be called with blkg->stats_lock held */
+static uint64_t blkio_get_stat(struct blkio_group *blkg,
+               struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
+{
+       uint64_t disk_total;
+       char key_str[MAX_KEY_LEN];
+       enum stat_sub_type sub_type;
+
+       if (type == BLKIO_STAT_TIME)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.time, cb, dev);
+       if (type == BLKIO_STAT_SECTORS)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.sectors, cb, dev);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
+               uint64_t sum = blkg->stats.avg_queue_size_sum;
+               uint64_t samples = blkg->stats.avg_queue_size_samples;
+               if (samples)
+                       do_div(sum, samples);
+               else
+                       sum = 0;
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
+       }
+       if (type == BLKIO_STAT_GROUP_WAIT_TIME)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.group_wait_time, cb, dev);
+       if (type == BLKIO_STAT_IDLE_TIME)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.idle_time, cb, dev);
+       if (type == BLKIO_STAT_EMPTY_TIME)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.empty_time, cb, dev);
+       if (type == BLKIO_STAT_DEQUEUE)
+               return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+                                       blkg->stats.dequeue, cb, dev);
+#endif
+
+       for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+                       sub_type++) {
+               blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+               cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
+       }
+       disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
+                       blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
+       blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+       cb->fill(cb, key_str, disk_total);
+       return disk_total;
+}
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)               \
 static int blkiocg_##__VAR##_read(struct cgroup *cgroup,               \
-                       struct cftype *cftype, struct seq_file *m)      \
+               struct cftype *cftype, struct cgroup_map_cb *cb)        \
 {                                                                      \
        struct blkio_cgroup *blkcg;                                     \
        struct blkio_group *blkg;                                       \
        struct hlist_node *n;                                           \
+       uint64_t cgroup_total = 0;                                      \
                                                                        \
        if (!cgroup_lock_live_group(cgroup))                            \
                return -ENODEV;                                         \
@@ -186,32 +608,231 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup,         \
        blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
        rcu_read_lock();                                                \
        hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
-               if (blkg->dev)                                          \
-                       seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev),  \
-                                MINOR(blkg->dev), blkg->__VAR);        \
+               if (blkg->dev) {                                        \
+                       spin_lock_irq(&blkg->stats_lock);               \
+                       cgroup_total += blkio_get_stat(blkg, cb,        \
+                                               blkg->dev, type);       \
+                       spin_unlock_irq(&blkg->stats_lock);             \
+               }                                                       \
        }                                                               \
+       if (show_total)                                                 \
+               cb->fill(cb, "Total", cgroup_total);                    \
        rcu_read_unlock();                                              \
        cgroup_unlock();                                                \
        return 0;                                                       \
 }
 
-SHOW_FUNCTION_PER_GROUP(time);
-SHOW_FUNCTION_PER_GROUP(sectors);
+SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
+SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
+SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
+SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
+SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue);
+SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
+SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
 #endif
 #undef SHOW_FUNCTION_PER_GROUP
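
For reference, a hand-expanded sketch (not part of the patch) of what SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0) generates:

	static int blkiocg_time_read(struct cgroup *cgroup,
			struct cftype *cftype, struct cgroup_map_cb *cb)
	{
		struct blkio_cgroup *blkcg;
		struct blkio_group *blkg;
		struct hlist_node *n;
		uint64_t cgroup_total = 0;

		if (!cgroup_lock_live_group(cgroup))
			return -ENODEV;

		blkcg = cgroup_to_blkio_cgroup(cgroup);
		rcu_read_lock();
		hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
			if (blkg->dev) {
				spin_lock_irq(&blkg->stats_lock);
				cgroup_total += blkio_get_stat(blkg, cb,
						blkg->dev, BLKIO_STAT_TIME);
				spin_unlock_irq(&blkg->stats_lock);
			}
		}
		/* show_total == 0 for "time", so no "Total" key is emitted */
		rcu_read_unlock();
		cgroup_unlock();
		return 0;
	}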
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
-                       unsigned long dequeue)
+static int blkio_check_dev_num(dev_t dev)
 {
-       blkg->dequeue += dequeue;
+       int part = 0;
+       struct gendisk *disk;
+
+       disk = get_gendisk(dev, &part);
+       if (!disk || part)
+               return -ENODEV;
+
+       return 0;
+}
+
+static int blkio_policy_parse_and_set(char *buf,
+                                     struct blkio_policy_node *newpn)
+{
+       char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+       int ret;
+       unsigned long major, minor, temp;
+       int i = 0;
+       dev_t dev;
+
+       memset(s, 0, sizeof(s));
+
+       while ((p = strsep(&buf, " ")) != NULL) {
+               if (!*p)
+                       continue;
+
+               s[i++] = p;
+
+               /* Prevent too many fields from being input */
+               if (i == 3)
+                       break;
+       }
+
+       if (i != 2)
+               return -EINVAL;
+
+       p = strsep(&s[0], ":");
+       if (p != NULL)
+               major_s = p;
+       else
+               return -EINVAL;
+
+       minor_s = s[0];
+       if (!minor_s)
+               return -EINVAL;
+
+       ret = strict_strtoul(major_s, 10, &major);
+       if (ret)
+               return -EINVAL;
+
+       ret = strict_strtoul(minor_s, 10, &minor);
+       if (ret)
+               return -EINVAL;
+
+       dev = MKDEV(major, minor);
+
+       ret = blkio_check_dev_num(dev);
+       if (ret)
+               return ret;
+
+       newpn->dev = dev;
+
+       if (s[1] == NULL)
+               return -EINVAL;
+
+       ret = strict_strtoul(s[1], 10, &temp);
+       if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+           temp > BLKIO_WEIGHT_MAX)
+               return -EINVAL;
+
+       newpn->weight = temp;
+
+       return 0;
+}
+
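
For illustration (hypothetical snippet, not in the patch), parsing the rule string "8:16 300" as written to blkio.weight_device, assuming device 8:16 exists so blkio_check_dev_num() succeeds:

	struct blkio_policy_node pn = { };
	char buf[] = "8:16 300";	/* major 8, minor 16, weight 300 */
	int ret = blkio_policy_parse_and_set(buf, &pn);

	/* on success: ret == 0, pn.dev == MKDEV(8, 16), pn.weight == 300;
	 * a weight of 0 also parses and later means "delete this rule" */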
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+                             dev_t dev)
+{
+       struct blkio_policy_node *pn;
+
+       pn = blkio_policy_search_node(blkcg, dev);
+       if (pn)
+               return pn->weight;
+       else
+               return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+
+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
+{
+       int ret = 0;
+       char *buf;
+       struct blkio_policy_node *newpn, *pn;
+       struct blkio_cgroup *blkcg;
+       struct blkio_group *blkg;
+       int keep_newpn = 0;
+       struct hlist_node *n;
+       struct blkio_policy_type *blkiop;
+
+       buf = kstrdup(buffer, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+       if (!newpn) {
+               ret = -ENOMEM;
+               goto free_buf;
+       }
+
+       ret = blkio_policy_parse_and_set(buf, newpn);
+       if (ret)
+               goto free_newpn;
+
+       blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+       spin_lock_irq(&blkcg->lock);
+
+       pn = blkio_policy_search_node(blkcg, newpn->dev);
+       if (!pn) {
+               if (newpn->weight != 0) {
+                       blkio_policy_insert_node(blkcg, newpn);
+                       keep_newpn = 1;
+               }
+               spin_unlock_irq(&blkcg->lock);
+               goto update_io_group;
+       }
+
+       if (newpn->weight == 0) {
+               /* weight == 0 means deleting a specific weight */
+               blkio_policy_delete_node(pn);
+               spin_unlock_irq(&blkcg->lock);
+               goto update_io_group;
+       }
+       spin_unlock_irq(&blkcg->lock);
+
+       pn->weight = newpn->weight;
+
+update_io_group:
+       /* update weight for each cfqg */
+       spin_lock(&blkio_list_lock);
+       spin_lock_irq(&blkcg->lock);
+
+       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+               if (newpn->dev == blkg->dev) {
+                       list_for_each_entry(blkiop, &blkio_list, list)
+                               blkiop->ops.blkio_update_group_weight_fn(blkg,
+                                                        newpn->weight ?
+                                                        newpn->weight :
+                                                        blkcg->weight);
+               }
+       }
+
+       spin_unlock_irq(&blkcg->lock);
+       spin_unlock(&blkio_list_lock);
+
+free_newpn:
+       if (!keep_newpn)
+               kfree(newpn);
+free_buf:
+       kfree(buf);
+       return ret;
+}
+
+static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
+                                     struct seq_file *m)
+{
+       struct blkio_cgroup *blkcg;
+       struct blkio_policy_node *pn;
+
+       seq_printf(m, "dev\tweight\n");
+
+       blkcg = cgroup_to_blkio_cgroup(cgrp);
+       if (!list_empty(&blkcg->policy_list)) {
+               spin_lock_irq(&blkcg->lock);
+               list_for_each_entry(pn, &blkcg->policy_list, node) {
+                       seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+                                  MINOR(pn->dev), pn->weight);
+               }
+               spin_unlock_irq(&blkcg->lock);
+       }
+
+       return 0;
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
-#endif
 
 struct cftype blkio_files[] = {
+       {
+               .name = "weight_device",
+               .read_seq_string = blkiocg_weight_device_read,
+               .write_string = blkiocg_weight_device_write,
+               .max_write_len = 256,
+       },
        {
                .name = "weight",
                .read_u64 = blkiocg_weight_read,
@@ -219,17 +840,61 @@ struct cftype blkio_files[] = {
        },
        {
                .name = "time",
-               .read_seq_string = blkiocg_time_read,
+               .read_map = blkiocg_time_read,
        },
        {
                .name = "sectors",
-               .read_seq_string = blkiocg_sectors_read,
+               .read_map = blkiocg_sectors_read,
+       },
+       {
+               .name = "io_service_bytes",
+               .read_map = blkiocg_io_service_bytes_read,
+       },
+       {
+               .name = "io_serviced",
+               .read_map = blkiocg_io_serviced_read,
+       },
+       {
+               .name = "io_service_time",
+               .read_map = blkiocg_io_service_time_read,
+       },
+       {
+               .name = "io_wait_time",
+               .read_map = blkiocg_io_wait_time_read,
+       },
+       {
+               .name = "io_merged",
+               .read_map = blkiocg_io_merged_read,
+       },
+       {
+               .name = "io_queued",
+               .read_map = blkiocg_io_queued_read,
+       },
+       {
+               .name = "reset_stats",
+               .write_u64 = blkiocg_reset_stats,
        },
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       {
+       {
+               .name = "avg_queue_size",
+               .read_map = blkiocg_avg_queue_size_read,
+       },
+       {
+               .name = "group_wait_time",
+               .read_map = blkiocg_group_wait_time_read,
+       },
+       {
+               .name = "idle_time",
+               .read_map = blkiocg_idle_time_read,
+       },
+       {
+               .name = "empty_time",
+               .read_map = blkiocg_empty_time_read,
+       },
+       {
                .name = "dequeue",
-               .read_seq_string = blkiocg_dequeue_read,
-       },
+               .read_map = blkiocg_dequeue_read,
+       },
 #endif
 };
 
@@ -246,37 +911,42 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
        struct blkio_group *blkg;
        void *key;
        struct blkio_policy_type *blkiop;
+       struct blkio_policy_node *pn, *pntmp;
 
        rcu_read_lock();
-remove_entry:
-       spin_lock_irqsave(&blkcg->lock, flags);
+       do {
+               spin_lock_irqsave(&blkcg->lock, flags);
+
+               if (hlist_empty(&blkcg->blkg_list)) {
+                       spin_unlock_irqrestore(&blkcg->lock, flags);
+                       break;
+               }
+
+               blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
+                                       blkcg_node);
+               key = rcu_dereference(blkg->key);
+               __blkiocg_del_blkio_group(blkg);
 
-       if (hlist_empty(&blkcg->blkg_list)) {
                spin_unlock_irqrestore(&blkcg->lock, flags);
-               goto done;
-       }
 
-       blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
-                               blkcg_node);
-       key = rcu_dereference(blkg->key);
-       __blkiocg_del_blkio_group(blkg);
+               /*
+                * This blkio_group is being unlinked as associated cgroup is
+                * going away. Let all the IO controlling policies know about
+                * this event. Currently this is static call to one io
+                * controlling policy. Once we have more policies in place, we
+                * need some dynamic registration of callback function.
+                */
+               spin_lock(&blkio_list_lock);
+               list_for_each_entry(blkiop, &blkio_list, list)
+                       blkiop->ops.blkio_unlink_group_fn(key, blkg);
+               spin_unlock(&blkio_list_lock);
+       } while (1);
 
-       spin_unlock_irqrestore(&blkcg->lock, flags);
+       list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+               blkio_policy_delete_node(pn);
+               kfree(pn);
+       }
 
-       /*
-        * This blkio_group is being unlinked as associated cgroup is going
-        * away. Let all the IO controlling policies know about this event.
-        *
-        * Currently this is static call to one io controlling policy. Once
-        * we have more policies in place, we need some dynamic registration
-        * of callback function.
-        */
-       spin_lock(&blkio_list_lock);
-       list_for_each_entry(blkiop, &blkio_list, list)
-               blkiop->ops.blkio_unlink_group_fn(key, blkg);
-       spin_unlock(&blkio_list_lock);
-       goto remove_entry;
-done:
        free_css_id(&blkio_subsys, &blkcg->css);
        rcu_read_unlock();
        if (blkcg != &blkio_root_cgroup)
@@ -307,6 +977,7 @@ done:
        spin_lock_init(&blkcg->lock);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
 
+       INIT_LIST_HEAD(&blkcg->policy_list);
        return &blkcg->css;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc204..2b866ec 100644
@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys;
 #define blkio_subsys_id blkio_subsys.subsys_id
 #endif
 
+enum stat_type {
+       /* Total time spent (in ns) between request dispatch to the driver and
+        * request completion for IOs done by this cgroup. This may not be
+        * accurate when NCQ is turned on. */
+       BLKIO_STAT_SERVICE_TIME = 0,
+       /* Total bytes transferred */
+       BLKIO_STAT_SERVICE_BYTES,
+       /* Total IOs serviced, post merge */
+       BLKIO_STAT_SERVICED,
+       /* Total time spent waiting in scheduler queue in ns */
+       BLKIO_STAT_WAIT_TIME,
+       /* Number of IOs merged */
+       BLKIO_STAT_MERGED,
+       /* Number of IOs queued up */
+       BLKIO_STAT_QUEUED,
+       /* All the single valued stats go below this */
+       BLKIO_STAT_TIME,
+       BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       BLKIO_STAT_AVG_QUEUE_SIZE,
+       BLKIO_STAT_IDLE_TIME,
+       BLKIO_STAT_EMPTY_TIME,
+       BLKIO_STAT_GROUP_WAIT_TIME,
+       BLKIO_STAT_DEQUEUE
+#endif
+};
+
+enum stat_sub_type {
+       BLKIO_STAT_READ = 0,
+       BLKIO_STAT_WRITE,
+       BLKIO_STAT_SYNC,
+       BLKIO_STAT_ASYNC,
+       BLKIO_STAT_TOTAL
+};
+
+/* blkg state flags */
+enum blkg_state_flags {
+       BLKG_waiting = 0,
+       BLKG_idling,
+       BLKG_empty,
+};
+
 struct blkio_cgroup {
        struct cgroup_subsys_state css;
        unsigned int weight;
        spinlock_t lock;
        struct hlist_head blkg_list;
+       struct list_head policy_list; /* list of blkio_policy_node */
+};
+
+struct blkio_group_stats {
+       /* total disk time and nr sectors dispatched by this group */
+       uint64_t time;
+       uint64_t sectors;
+       uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       /* Sum of number of IOs queued across all samples */
+       uint64_t avg_queue_size_sum;
+       /* Count of samples taken for average */
+       uint64_t avg_queue_size_samples;
+       /* How many times this group has been removed from service tree */
+       unsigned long dequeue;
+
+       /* Total time spent waiting for it to be assigned a timeslice. */
+       uint64_t group_wait_time;
+       uint64_t start_group_wait_time;
+
+       /* Time spent idling for this blkio_group */
+       uint64_t idle_time;
+       uint64_t start_idle_time;
+       /*
+        * Total time when we have requests queued and do not contain the
+        * current active queue.
+        */
+       uint64_t empty_time;
+       uint64_t start_empty_time;
+       uint16_t flags;
+#endif
 };
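
Illustration (not from the patch) of how the array-valued stats are addressed; stat_arr spans the enum entries up to and including BLKIO_STAT_QUEUED:

	/* e.g. currently queued reads for a group (blkg->stats_lock held) */
	uint64_t queued_reads =
		blkg->stats.stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ];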
 
 struct blkio_group {
@@ -35,20 +108,25 @@ struct blkio_group {
        void *key;
        struct hlist_node blkcg_node;
        unsigned short blkcg_id;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
        /* Store cgroup path */
        char path[128];
-       /* How many times this group has been removed from service tree */
-       unsigned long dequeue;
-#endif
        /* The device MKDEV(major, minor), this group has been created for */
-       dev_t   dev;
+       dev_t dev;
 
-       /* total disk time and nr sectors dispatched by this group */
-       unsigned long time;
-       unsigned long sectors;
+       /* Need to serialize the stats in the case of reset/update */
+       spinlock_t stats_lock;
+       struct blkio_group_stats stats;
 };
 
+struct blkio_policy_node {
+       struct list_head node;
+       dev_t dev;
+       unsigned int weight;
+};
+
+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+                                    dev_t dev);
+
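blkcg_get_weight() is expected to prefer a per-device rule from policy_list over the cgroup-wide weight (CFQ calls it when instantiating a group, later in this merge). A plausible sketch, ignoring the locking the real implementation needs:

	unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev)
	{
		struct blkio_policy_node *pn;

		/* a per-device policy node overrides the default weight */
		list_for_each_entry(pn, &blkcg->policy_list, node)
			if (pn->dev == dev)
				return pn->weight;
		return blkcg->weight;
	}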
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
 typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
                                                unsigned int weight);
@@ -67,6 +145,11 @@ struct blkio_policy_type {
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 
+static inline char *blkg_path(struct blkio_group *blkg)
+{
+       return blkg->path;
+}
+
 #else
 
 struct blkio_group {
@@ -78,6 +161,8 @@ struct blkio_policy_type {
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 
+static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+
 #endif
 
 #define BLKIO_WEIGHT_MIN       100
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 #define BLKIO_WEIGHT_DEFAULT   500
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-static inline char *blkg_path(struct blkio_group *blkg)
-{
-       return blkg->path;
-}
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
                                unsigned long dequeue);
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_set_start_empty_time(struct blkio_group *blkg);
+
+#define BLKG_FLAG_FNS(name)                                            \
+static inline void blkio_mark_blkg_##name(                             \
+               struct blkio_group_stats *stats)                        \
+{                                                                      \
+       stats->flags |= (1 << BLKG_##name);                             \
+}                                                                      \
+static inline void blkio_clear_blkg_##name(                            \
+               struct blkio_group_stats *stats)                        \
+{                                                                      \
+       stats->flags &= ~(1 << BLKG_##name);                            \
+}                                                                      \
+static inline int blkio_blkg_##name(struct blkio_group_stats *stats)   \
+{                                                                      \
+       return (stats->flags & (1 << BLKG_##name)) != 0;                \
+}
+
+BLKG_FLAG_FNS(waiting)
+BLKG_FLAG_FNS(idling)
+BLKG_FLAG_FNS(empty)
+#undef BLKG_FLAG_FNS
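For reference, BLKG_FLAG_FNS(waiting) expands mechanically to these three helpers:

	static inline void blkio_mark_blkg_waiting(struct blkio_group_stats *stats)
	{
		stats->flags |= (1 << BLKG_waiting);
	}
	static inline void blkio_clear_blkg_waiting(struct blkio_group_stats *stats)
	{
		stats->flags &= ~(1 << BLKG_waiting);
	}
	static inline int blkio_blkg_waiting(struct blkio_group_stats *stats)
	{
		return (stats->flags & (1 << BLKG_waiting)) != 0;
	}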
 #else
-static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
-static inline void blkiocg_update_blkio_group_dequeue_stats(
-                       struct blkio_group *blkg, unsigned long dequeue) {}
+static inline void blkiocg_update_avg_queue_size_stats(
+                                               struct blkio_group *blkg) {}
+static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+                                               unsigned long dequeue) {}
+static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{}
+static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
+static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
 #endif
 
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
                                                void *key);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-                       unsigned long time, unsigned long sectors);
+void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+                                       unsigned long time);
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
+                                               bool direction, bool sync);
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+       uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+                                       bool sync);
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+               struct blkio_group *curr_blkg, bool direction, bool sync);
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+                                       bool direction, bool sync);
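For orientation: start_time and io_start_time are the nanosecond stamps recorded in blk-core by set_start_time_ns() (request initialization) and set_io_start_time_ns() (dispatch), both added later in this merge. A sketch of the arithmetic the completion stats imply, assuming sched_clock() as the time source:

	u64 now = sched_clock();
	u64 service_time = now - io_start_time;     /* BLKIO_STAT_SERVICE_TIME */
	u64 wait_time = io_start_time - start_time; /* BLKIO_STAT_WAIT_TIME */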
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
 cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 
 static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-                       struct blkio_group *blkg, void *key, dev_t dev)
-{
-}
+                       struct blkio_group *blkg, void *key, dev_t dev) {}
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 
 static inline struct blkio_group *
 blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
-static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-                       unsigned long time, unsigned long sectors)
-{
-}
+static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+                                               unsigned long time) {}
+static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+                               uint64_t bytes, bool direction, bool sync) {}
+static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
+               uint64_t start_time, uint64_t io_start_time, bool direction,
+               bool sync) {}
+static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+                                               bool direction, bool sync) {}
+static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+               struct blkio_group *curr_blkg, bool direction, bool sync) {}
+static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+                                               bool direction, bool sync) {}
 #endif
 #endif /* _BLK_CGROUP_H */
index 9fe174d..3bc5579 100644 (file)
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        rq->tag = -1;
        rq->ref_count = 1;
        rq->start_time = jiffies;
+       set_start_time_ns(rq);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
         */
        blk_sync_queue(q);
 
+       del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
        mutex_lock(&q->sysfs_lock);
        queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
        mutex_unlock(&q->sysfs_lock);
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
                return NULL;
        }
 
+       setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
+                   laptop_mode_timer_fn, (unsigned long) q);
        init_timer(&q->unplug_timer);
        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
        INIT_LIST_HEAD(&q->timeout_list);
@@ -568,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
        struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
 
+       return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+struct request_queue *
+blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
+                        spinlock_t *lock)
+{
+       return blk_init_allocated_queue_node(q, rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_allocated_queue);
+
+struct request_queue *
+blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
+                             spinlock_t *lock, int node_id)
+{
        if (!q)
                return NULL;
 
@@ -601,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
        blk_put_queue(q);
        return NULL;
 }
-EXPORT_SYMBOL(blk_init_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue_node);
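The split lets a driver allocate its queue early and finish the request_fn/elevator setup later; a sketch of the intended call sequence, with my_request_fn and my_lock as placeholders:

	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (q)
		q = blk_init_allocated_queue(q, my_request_fn, &my_lock);

blk_init_queue_node() above is now just this composition in one call.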
 
 int blk_get_queue(struct request_queue *q)
 {
@@ -1198,6 +1218,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                if (!blk_rq_cpu_valid(req))
                        req->cpu = bio->bi_comp_cpu;
                drive_stat_acct(req, 0);
+               elv_bio_merged(q, req, bio);
                if (!attempt_back_merge(q, req))
                        elv_merged_request(q, req, el_ret);
                goto out;
@@ -1231,6 +1252,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                if (!blk_rq_cpu_valid(req))
                        req->cpu = bio->bi_comp_cpu;
                drive_stat_acct(req, 0);
+               elv_bio_merged(q, req, bio);
                if (!attempt_front_merge(q, req))
                        elv_merged_request(q, req, el_ret);
                goto out;
@@ -1855,8 +1877,10 @@ void blk_dequeue_request(struct request *rq)
         * and the time it is freed is accounted as io that is in progress at
         * the driver side.
         */
-       if (blk_account_rq(rq))
+       if (blk_account_rq(rq)) {
                q->in_flight[rq_is_sync(rq)]++;
+               set_io_start_time_ns(rq);
+       }
 }
 
 /**
@@ -2098,7 +2122,7 @@ static void blk_finish_request(struct request *req, int error)
        BUG_ON(blk_queued_rq(req));
 
        if (unlikely(laptop_mode) && blk_fs_request(req))
-               laptop_io_completion();
+               laptop_io_completion(&req->q->backing_dev_info);
 
        blk_delete_timer(req);
 
@@ -2517,4 +2541,3 @@ int __init blk_dev_init(void)
 
        return 0;
 }
-
diff --git a/block/blk-lib.c b/block/blk-lib.c
new file mode 100644 (file)
index 0000000..d0216b9
--- /dev/null
@@ -0,0 +1,233 @@
+/*
+ * Generic block device helper functions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+       if (err) {
+               if (err == -EOPNOTSUPP)
+                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+       }
+
+       if (bio->bi_private)
+               complete(bio->bi_private);
+       __free_page(bio_page(bio));
+
+       bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:      blockdev to issue discard for
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to discard
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ * @flags:     BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+               sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+       struct request_queue *q = bdev_get_queue(bdev);
+       int type = flags & BLKDEV_IFL_BARRIER ?
+               DISCARD_BARRIER : DISCARD_NOBARRIER;
+       struct bio *bio;
+       struct page *page;
+       int ret = 0;
+
+       if (!q)
+               return -ENXIO;
+
+       if (!blk_queue_discard(q))
+               return -EOPNOTSUPP;
+
+       while (nr_sects && !ret) {
+               unsigned int sector_size = q->limits.logical_block_size;
+               unsigned int max_discard_sectors =
+                       min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+
+               bio = bio_alloc(gfp_mask, 1);
+               if (!bio)
+                       goto out;
+               bio->bi_sector = sector;
+               bio->bi_end_io = blkdev_discard_end_io;
+               bio->bi_bdev = bdev;
+               if (flags & BLKDEV_IFL_WAIT)
+                       bio->bi_private = &wait;
+
+               /*
+                * Add a zeroed one-sector payload as that's what
+                * our current implementations need. If we ever need
+                * more, the interface will need revisiting.
+                */
+               page = alloc_page(gfp_mask | __GFP_ZERO);
+               if (!page)
+                       goto out_free_bio;
+               if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
+                       goto out_free_page;
+
+               /*
+                * And override the bio size - the way discard works, we
+                * touch many more blocks on disk than the actual payload
+                * length.
+                */
+               if (nr_sects > max_discard_sectors) {
+                       bio->bi_size = max_discard_sectors << 9;
+                       nr_sects -= max_discard_sectors;
+                       sector += max_discard_sectors;
+               } else {
+                       bio->bi_size = nr_sects << 9;
+                       nr_sects = 0;
+               }
+
+               bio_get(bio);
+               submit_bio(type, bio);
+
+               if (flags & BLKDEV_IFL_WAIT)
+                       wait_for_completion(&wait);
+
+               if (bio_flagged(bio, BIO_EOPNOTSUPP))
+                       ret = -EOPNOTSUPP;
+               else if (!bio_flagged(bio, BIO_UPTODATE))
+                       ret = -EIO;
+               bio_put(bio);
+       }
+       return ret;
+out_free_page:
+       __free_page(page);
+out_free_bio:
+       bio_put(bio);
+out:
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
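A usage sketch, mirroring the blk_ioctl_discard() caller later in this merge: synchronously discard len sectors starting at start, with -EOPNOTSUPP or -EIO reported back to the caller:

	int err = blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
				       BLKDEV_IFL_WAIT);

Note how the loop above splits the range: with max_discard_sectors of, say, 2048, a 5000-sector discard goes out as three bios covering 2048, 2048 and 904 sectors.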
+
+struct bio_batch {
+       atomic_t                done;
+       unsigned long           flags;
+       struct completion       *wait;
+       bio_end_io_t            *end_io;
+};
+
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+       struct bio_batch *bb = bio->bi_private;
+
+       /* bb is NULL when the submitter did not ask to wait; check it
+        * before the error handling dereferences bb->flags. */
+       if (bb) {
+               if (err) {
+                       if (err == -EOPNOTSUPP)
+                               set_bit(BIO_EOPNOTSUPP, &bb->flags);
+                       else
+                               clear_bit(BIO_UPTODATE, &bb->flags);
+               }
+               if (bb->end_io)
+                       bb->end_io(bio, err);
+               atomic_inc(&bb->done);
+               complete(bb->wait);
+       }
+       bio_put(bio);
+}
+
+/**
+ * blkdev_issue_zeroout - generate a number of zero-filled write bios
+ * @bdev:      blockdev to issue
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to write
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ * @flags:     BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *  Generate and issue a number of bios with zero-filled pages.
+ *  Send a barrier at the beginning and at the end if requested; this
+ *  guarantees correct request ordering. An empty barrier allows us to
+ *  avoid a post-queue flush.
+ */
+int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+                       sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+       int ret = 0;
+       struct bio *bio;
+       struct bio_batch bb;
+       unsigned int sz, issued = 0;
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       atomic_set(&bb.done, 0);
+       bb.flags = 1 << BIO_UPTODATE;
+       bb.wait = &wait;
+       bb.end_io = NULL;
+
+       if (flags & BLKDEV_IFL_BARRIER) {
+               /* issue async barrier before the data */
+               ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
+               if (ret)
+                       return ret;
+       }
+submit:
+       while (nr_sects != 0) {
+               bio = bio_alloc(gfp_mask,
+                               min(nr_sects, (sector_t)BIO_MAX_PAGES));
+               if (!bio)
+                       break;
+
+               bio->bi_sector = sector;
+               bio->bi_bdev   = bdev;
+               bio->bi_end_io = bio_batch_end_io;
+               if (flags & BLKDEV_IFL_WAIT)
+                       bio->bi_private = &bb;
+
+               while (nr_sects != 0) {
+                       sz = min((sector_t) PAGE_SIZE >> 9, nr_sects);
+                       if (sz == 0)
+                               /* bio has maximum size possible */
+                               break;
+                       ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+                       nr_sects -= ret >> 9;
+                       sector += ret >> 9;
+                       if (ret < (sz << 9))
+                               break;
+               }
+               issued++;
+               submit_bio(WRITE, bio);
+       }
+       /*
+        * When all data bios are in flight, send the final barrier if requested.
+        */
+       if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
+               ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
+                                       flags & BLKDEV_IFL_WAIT);
+
+       if (flags & BLKDEV_IFL_WAIT)
+               /* Wait for in-flight bios */
+               while (issued != atomic_read(&bb.done))
+                       wait_for_completion(&wait);
+
+       if (!test_bit(BIO_UPTODATE, &bb.flags))
+               /* One of the bios in the batch completed with an error. */
+               ret = -EIO;
+
+       if (ret)
+               goto out;
+
+       if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+       if (nr_sects != 0)
+               goto submit;
+out:
+       return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_zeroout);
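The matching usage sketch for zeroout, assuming the caller wants both the ordering barriers and synchronous completion:

	int err = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
				       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);

Every bio in the batch reuses ZERO_PAGE(0) as its payload, so no data pages are allocated.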
index 5f127cf..ed897b5 100644 (file)
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
 #define RQ_CIC(rq)             \
        ((struct cfq_io_context *) (rq)->elevator_private)
 #define RQ_CFQQ(rq)            (struct cfq_queue *) ((rq)->elevator_private2)
+#define RQ_CFQG(rq)            (struct cfq_group *) ((rq)->elevator_private3)
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
@@ -143,8 +144,6 @@ struct cfq_queue {
        struct cfq_queue *new_cfqq;
        struct cfq_group *cfqg;
        struct cfq_group *orig_cfqg;
-       /* Sectors dispatched in current dispatch round */
-       unsigned long nr_sectors;
 };
 
 /*
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
-#ifdef CONFIG_DEBUG_CFQ_IOSCHED
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
        blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
                        cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
        if (!RB_EMPTY_NODE(&cfqg->rb_node))
                cfq_rb_erase(&cfqg->rb_node, st);
        cfqg->saved_workload_slice = 0;
-       blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
+       blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
                        slice_used = cfqq->allocated_slice;
        }
 
-       cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
-                               cfqq->nr_sectors);
+       cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
        return slice_used;
 }
 
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
                                        st->min_vdisktime);
-       blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
-                                               cfqq->nr_sectors);
+       blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+       blkiocg_set_start_empty_time(&cfqg->blkg);
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
        if (!cfqg)
                goto done;
 
-       cfqg->weight = blkcg->weight;
        for_each_cfqg_st(cfqg, i, j, st)
                *st = CFQ_RB_ROOT;
        RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
        blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
                                        MKDEV(major, minor));
+       cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
 
        /* Add group on cfqd list */
        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
        return cfqg;
 }
 
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+       atomic_inc(&cfqg->ref);
+       return cfqg;
+}
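The new helper pairs with cfq_put_cfqg() across a request's lifetime; a sketch of the intended pairing, with hypothetical wrapper names (the real hook-ups are in cfq_set_request() and cfq_put_request() below):

	static void pin_group(struct request *rq, struct cfq_queue *cfqq)
	{
		/* taken while the request references the group */
		rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
	}

	static void unpin_group(struct request *rq)
	{
		/* dropped when the request is torn down */
		cfq_put_cfqg(RQ_CFQG(rq));
		rq->elevator_private3 = NULL;
	}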
+
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
        /* Currently, all async queues are mapped to root group */
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 {
        return &cfqd->root_group;
 }
+
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+       return cfqg;
+}
+
 static inline void
 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
        cfqq->cfqg = cfqg;
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
        elv_rb_del(&cfqq->sort_list, rq);
        cfqq->queued[rq_is_sync(rq)]--;
+       blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
+                                               rq_is_sync(rq));
        cfq_add_rq_rb(rq);
+       blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+                       &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
+                       rq_is_sync(rq));
 }
 
 static struct request *
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq)
        cfq_del_rq_rb(rq);
 
        cfqq->cfqd->rq_queued--;
+       blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
+                                               rq_is_sync(rq));
        if (rq_is_meta(rq)) {
                WARN_ON(!cfqq->meta_pending);
                cfqq->meta_pending--;
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
        }
 }
 
+static void cfq_bio_merged(struct request_queue *q, struct request *req,
+                               struct bio *bio)
+{
+       blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
+                                       cfq_bio_sync(bio));
+}
+
 static void
 cfq_merged_requests(struct request_queue *q, struct request *rq,
                    struct request *next)
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
        if (cfqq->next_rq == next)
                cfqq->next_rq = rq;
        cfq_remove_request(next);
+       blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
+                                       rq_is_sync(next));
 }
 
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
        return cfqq == RQ_CFQQ(rq);
 }
 
+static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       del_timer(&cfqd->idle_slice_timer);
+       blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+}
+
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
                                   struct cfq_queue *cfqq)
 {
        if (cfqq) {
                cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
                                cfqd->serving_prio, cfqd->serving_type);
+               blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
                cfqq->slice_start = 0;
                cfqq->dispatch_start = jiffies;
                cfqq->allocated_slice = 0;
                cfqq->slice_end = 0;
                cfqq->slice_dispatch = 0;
-               cfqq->nr_sectors = 0;
 
                cfq_clear_cfqq_wait_request(cfqq);
                cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
                cfq_clear_cfqq_fifo_expire(cfqq);
                cfq_mark_cfqq_slice_new(cfqq);
 
-               del_timer(&cfqd->idle_slice_timer);
+               cfq_del_timer(cfqd, cfqq);
        }
 
        cfqd->active_queue = cfqq;
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 
        if (cfq_cfqq_wait_request(cfqq))
-               del_timer(&cfqd->idle_slice_timer);
+               cfq_del_timer(cfqd, cfqq);
 
        cfq_clear_cfqq_wait_request(cfqq);
        cfq_clear_cfqq_wait_busy(cfqq);
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
        sl = cfqd->cfq_slice_idle;
 
        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+       blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
        cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
 }
 
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
        elv_dispatch_sort(q, rq);
 
        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
-       cfqq->nr_sectors += blk_rq_sectors(rq);
+       blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
+                                       rq_data_dir(rq), rq_is_sync(rq));
 }
 
 /*
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                if (cfq_cfqq_wait_request(cfqq)) {
                        if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
                            cfqd->busy_queues > 1) {
-                               del_timer(&cfqd->idle_slice_timer);
+                               cfq_del_timer(cfqd, cfqq);
                                cfq_clear_cfqq_wait_request(cfqq);
                                __blk_run_queue(cfqd->queue);
-                       } else
+                       } else {
+                               blkiocg_update_idle_time_stats(
+                                               &cfqq->cfqg->blkg);
                                cfq_mark_cfqq_must_dispatch(cfqq);
+                       }
                }
        } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
                /*
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
        rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
        list_add_tail(&rq->queuelist, &cfqq->fifo);
        cfq_add_rq_rb(rq);
-
+       blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+                       &cfqd->serving_group->blkg, rq_data_dir(rq),
+                       rq_is_sync(rq));
        cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
        WARN_ON(!cfqq->dispatched);
        cfqd->rq_in_driver--;
        cfqq->dispatched--;
+       blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
+                       rq_io_start_time_ns(rq), rq_data_dir(rq),
+                       rq_is_sync(rq));
 
        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq)
                rq->elevator_private = NULL;
                rq->elevator_private2 = NULL;
 
+               /* Put down rq reference on cfqg */
+               cfq_put_cfqg(RQ_CFQG(rq));
+               rq->elevator_private3 = NULL;
+
                cfq_put_queue(cfqq);
        }
 }
@@ -3528,6 +3574,7 @@ new_queue:
 
        rq->elevator_private = cic;
        rq->elevator_private2 = cfqq;
+       rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
        return 0;
 
 queue_fail:
@@ -3743,7 +3790,6 @@ static void *cfq_init_queue(struct request_queue *q)
         * second, in order to have larger depth for async operations.
         */
        cfqd->last_delayed_sync = jiffies - HZ;
-       INIT_RCU_HEAD(&cfqd->rcu);
        return cfqd;
 }
 
@@ -3872,6 +3918,7 @@ static struct elevator_type iosched_cfq = {
                .elevator_merged_fn =           cfq_merged_request,
                .elevator_merge_req_fn =        cfq_merged_requests,
                .elevator_allow_merge_fn =      cfq_allow_merge,
+               .elevator_bio_merged_fn =       cfq_bio_merged,
                .elevator_dispatch_fn =         cfq_dispatch_requests,
                .elevator_add_req_fn =          cfq_insert_request,
                .elevator_activate_req_fn =     cfq_activate_request,
index 76e3702..6df2b50 100644 (file)
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
        q->last_merge = rq;
 }
 
+void elv_bio_merged(struct request_queue *q, struct request *rq,
+                       struct bio *bio)
+{
+       struct elevator_queue *e = q->elevator;
+
+       if (e->ops->elevator_bio_merged_fn)
+               e->ops->elevator_bio_merged_fn(q, rq, bio);
+}
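An elevator opts in by filling the new elevator_bio_merged_fn slot in its ops, as iosched_cfq does above; a minimal sketch with hypothetical names:

	static void my_bio_merged(struct request_queue *q, struct request *rq,
				  struct bio *bio)
	{
		/* called once per bio folded into rq; account it here */
	}

	static struct elevator_type my_iosched = {
		.ops = {
			.elevator_bio_merged_fn = my_bio_merged,
		},
		.elevator_name = "my-iosched",
	};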
+
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
        /*
@@ -921,6 +930,7 @@ int elv_register_queue(struct request_queue *q)
        }
        return error;
 }
+EXPORT_SYMBOL(elv_register_queue);
 
 static void __elv_unregister_queue(struct elevator_queue *e)
 {
@@ -933,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q)
        if (q)
                __elv_unregister_queue(q->elevator);
 }
+EXPORT_SYMBOL(elv_unregister_queue);
 
 void elv_register(struct elevator_type *e)
 {
index d13ba76..59a2db6 100644 (file)
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 
        return disk;
 }
+EXPORT_SYMBOL(get_gendisk);
 
 /**
  * bdget_disk - do bdget() by gendisk and partition number
@@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
        if (!new_ptbl)
                return -ENOMEM;
 
-       INIT_RCU_HEAD(&new_ptbl->rcu_head);
        new_ptbl->len = target;
 
        for (i = 0; i < len; i++)
index 8905d2a..e8eb679 100644 (file)
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
        if (start + len > (bdev->bd_inode->i_size >> 9))
                return -EINVAL;
        return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
-                                   DISCARD_FL_WAIT);
+                                   BLKDEV_IFL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
index 77bfce5..de27768 100644 (file)
@@ -76,6 +76,17 @@ config BLK_DEV_XD
 
          It's pretty unlikely that you have one of these: say N.
 
+config GDROM
+       tristate "SEGA Dreamcast GD-ROM drive"
+       depends on SH_DREAMCAST
+       help
+         A standard SEGA Dreamcast comes with a modified CD ROM drive called a
+         "GD-ROM" by SEGA to signify it is capable of reading special disks
+         with up to 1 GB of data. This drive will also read standard CD ROM
+         disks. Select this option to access any disks in your GD ROM drive.
+         Most users will want to say "Y" here.
+         You can also build this as a module which will be called gdrom.
+
 config PARIDE
        tristate "Parallel port IDE device support"
        depends on PARPORT_PC
@@ -103,17 +114,6 @@ config PARIDE
          "MicroSolutions backpack protocol", "DataStor Commuter protocol"
          etc.).
 
-config GDROM
-       tristate "SEGA Dreamcast GD-ROM drive"
-       depends on SH_DREAMCAST
-       help
-         A standard SEGA Dreamcast comes with a modified CD ROM drive called a
-         "GD-ROM" by SEGA to signify it is capable of reading special disks
-         with up to 1 GB of data. This drive will also read standard CD ROM
-         disks. Select this option to access any disks in your GD ROM drive.
-         Most users will want to say "Y" here.
-         You can also build this as a module which will be called gdrom.
-
 source "drivers/block/paride/Kconfig"
 
 config BLK_CPQ_DA
index 3390716..e3f88d6 100644 (file)
@@ -84,6 +84,9 @@ struct drbd_bitmap {
 #define BM_MD_IO_ERROR  1
 #define BM_P_VMALLOCED  2
 
+static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+                              unsigned long e, int val, const enum km_type km);
+
 static int bm_is_locked(struct drbd_bitmap *b)
 {
        return test_bit(BM_LOCKED, &b->bm_flags);
@@ -441,7 +444,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
  * In case this is actually a resize, we copy the old bitmap into the new one.
  * Otherwise, the bitmap is initialized to all bits set.
  */
-int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 {
        struct drbd_bitmap *b = mdev->bitmap;
        unsigned long bits, words, owords, obits, *p_addr, *bm;
@@ -516,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
        obits  = b->bm_bits;
 
        growing = bits > obits;
-       if (opages)
+       if (opages && growing && set_new_bits)
                bm_set_surplus(b);
 
        b->bm_pages = npages;
@@ -526,8 +529,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
        b->bm_dev_capacity = capacity;
 
        if (growing) {
-               bm_memset(b, owords, 0xff, words-owords);
-               b->bm_set += bits - obits;
+               if (set_new_bits) {
+                       bm_memset(b, owords, 0xff, words-owords);
+                       b->bm_set += bits - obits;
+               } else
+                       bm_memset(b, owords, 0x00, words-owords);
+
        }
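Callers now choose what the grown area looks like; roughly (illustrative call sites, not from this patch):

	drbd_bm_resize(mdev, new_cap, 1); /* new bits set: grown area will be resynced */
	drbd_bm_resize(mdev, new_cap, 0); /* new bits clear: skip resync (see DDSF_NO_RESYNC) */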
 
        if (want < have) {
@@ -773,7 +780,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
        /* nothing to do, on disk == in memory */
 # define bm_cpu_to_lel(x) ((void)0)
 # else
-void bm_cpu_to_lel(struct drbd_bitmap *b)
+static void bm_cpu_to_lel(struct drbd_bitmap *b)
 {
        /* need to cpu_to_lel all the pages ...
         * this may be optimized by using
@@ -1015,7 +1022,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f
  * wants bitnr, not sector.
  * expected to be called for only a few bits (e - s about BITS_PER_LONG).
  * Must hold bitmap lock already. */
-int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
        unsigned long e, int val, const enum km_type km)
 {
        struct drbd_bitmap *b = mdev->bitmap;
@@ -1053,7 +1060,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
  * for val != 0, we change 0 -> 1, return code positive
  * for val == 0, we change 1 -> 0, return code negative
  * wants bitnr, not sector */
-int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
        const unsigned long e, int val)
 {
        unsigned long flags;
index e5e86a7..e9654c8 100644 (file)
@@ -132,6 +132,7 @@ enum {
        DRBD_FAULT_DT_RA = 6,   /* data read ahead */
        DRBD_FAULT_BM_ALLOC = 7,        /* bitmap allocation */
        DRBD_FAULT_AL_EE = 8,   /* alloc ee */
+       DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
 
        DRBD_FAULT_MAX,
 };
@@ -208,8 +209,11 @@ enum drbd_packets {
        P_RS_IS_IN_SYNC       = 0x22, /* meta socket */
        P_SYNC_PARAM89        = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
        P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+       /* P_CKPT_FENCE_REQ      = 0x25, * currently reserved for protocol D */
+       /* P_CKPT_DISABLE_REQ    = 0x26, * currently reserved for protocol D */
+       P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
 
-       P_MAX_CMD             = 0x25,
+       P_MAX_CMD             = 0x28,
        P_MAY_IGNORE          = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
        P_MAX_OPT_CMD         = 0x101,
 
@@ -264,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd)
                [P_CSUM_RS_REQUEST]     = "CsumRSRequest",
                [P_RS_IS_IN_SYNC]       = "CsumRSIsInSync",
                [P_COMPRESSED_BITMAP]   = "CBitmap",
+               [P_DELAY_PROBE]         = "DelayProbe",
                [P_MAX_CMD]             = NULL,
        };
 
@@ -481,7 +486,8 @@ struct p_sizes {
        u64         u_size;  /* user requested size */
        u64         c_size;  /* current exported size */
        u32         max_segment_size;  /* Maximal size of a BIO */
-       u32         queue_order_type;
+       u16         queue_order_type;  /* not yet implemented in DRBD */
+       u16         dds_flags; /* use enum dds_flags here. */
 } __packed;
 
 struct p_state {
@@ -538,6 +544,18 @@ struct p_compressed_bm {
        u8 code[0];
 } __packed;
 
+struct p_delay_probe {
+       struct p_header head;
+       u32     seq_num; /* sequence number to match the two probe packets */
+       u32     offset;  /* usecs the probe was sent after the reference time point */
+} __packed;
+
+struct delay_probe {
+       struct list_head list;
+       unsigned int seq_num;
+       struct timeval time;
+};
+
 /* DCBP: Drbd Compressed Bitmap Packet ... */
 static inline enum drbd_bitmap_code
 DCBP_get_code(struct p_compressed_bm *p)
@@ -722,22 +740,6 @@ enum epoch_event {
        EV_CLEANUP = 32, /* used as flag */
 };
 
-struct drbd_epoch_entry {
-       struct drbd_work    w;
-       struct drbd_conf *mdev;
-       struct bio *private_bio;
-       struct hlist_node colision;
-       sector_t sector;
-       unsigned int size;
-       struct drbd_epoch *epoch;
-
-       /* up to here, the struct layout is identical to drbd_request;
-        * we might be able to use that to our advantage...  */
-
-       unsigned int flags;
-       u64    block_id;
-};
-
 struct drbd_wq_barrier {
        struct drbd_work w;
        struct completion done;
@@ -748,17 +750,49 @@ struct digest_info {
        void *digest;
 };
 
-/* ee flag bits */
+struct drbd_epoch_entry {
+       struct drbd_work w;
+       struct hlist_node colision;
+       struct drbd_epoch *epoch;
+       struct drbd_conf *mdev;
+       struct page *pages;
+       atomic_t pending_bios;
+       unsigned int size;
+       /* see comments on ee flag bits below */
+       unsigned long flags;
+       sector_t sector;
+       u64 block_id;
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
 enum {
        __EE_CALL_AL_COMPLETE_IO,
-       __EE_CONFLICT_PENDING,
        __EE_MAY_SET_IN_SYNC,
+
+       /* This epoch entry closes an epoch using a barrier.
+        * On successful completion, the epoch is released,
+        * and the P_BARRIER_ACK is sent. */
        __EE_IS_BARRIER,
+
+       /* In case a barrier failed,
+        * we need to resubmit without the barrier flag. */
+       __EE_RESUBMITTED,
+
+       /* we may have several bios per epoch entry.
+        * if any of those fail, we set this flag atomically
+        * from the endio callback */
+       __EE_WAS_ERROR,
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
-#define EE_CONFLICT_PENDING    (1<<__EE_CONFLICT_PENDING)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
 #define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
+#define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 
 /* global flag bits */
 enum {
@@ -908,9 +942,12 @@ struct drbd_conf {
        unsigned int ko_count;
        struct drbd_work  resync_work,
                          unplug_work,
-                         md_sync_work;
+                         md_sync_work,
+                         delay_probe_work,
+                         uuid_work;
        struct timer_list resync_timer;
        struct timer_list md_sync_timer;
+       struct timer_list delay_probe_timer;
 
        /* Used after attach while negotiating new disk state. */
        union drbd_state new_state_tmp;
@@ -1026,6 +1063,13 @@ struct drbd_conf {
        u64 ed_uuid; /* UUID of the exposed data */
        struct mutex state_mutex;
        char congestion_reason;  /* Why we were congested... */
+       struct list_head delay_probes; /* protected by peer_seq_lock */
+       int data_delay;   /* Delay of packets on the data-sock behind meta-sock */
+       unsigned int delay_seq; /* To generate sequence numbers of delay probes */
+       struct timeval dps_time; /* delay-probes-start-time */
+       unsigned int dp_volume_last;  /* send_cnt of last delay probe */
+       int c_sync_rate; /* current resync rate after delay_probe magic */
+       atomic_t new_c_uuid;
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1081,6 +1125,11 @@ enum chg_state_flags {
        CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
 };
 
+enum dds_flags {
+       DDSF_FORCED    = 1,
+       DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
 extern void drbd_init_set_defaults(struct drbd_conf *mdev);
 extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
                        union drbd_state mask, union drbd_state val);
@@ -1113,7 +1162,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev);
 extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
 extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
-extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
 extern int _drbd_send_state(struct drbd_conf *mdev);
 extern int drbd_send_state(struct drbd_conf *mdev);
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
@@ -1311,7 +1360,7 @@ struct bm_extent {
 #define APP_R_HSIZE 15
 
 extern int  drbd_bm_init(struct drbd_conf *mdev);
-extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
+extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
 extern void drbd_bm_cleanup(struct drbd_conf *mdev);
 extern void drbd_bm_set_all(struct drbd_conf *mdev);
 extern void drbd_bm_clear_all(struct drbd_conf *mdev);
@@ -1383,7 +1432,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
 extern char *ppsize(char *buf, unsigned long long size);
 extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
 enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
-extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local);
+extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_conf *);
 extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
 extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
@@ -1414,7 +1463,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev)
 }
 
 
-extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
 /* worker callbacks */
 extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
 extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
@@ -1438,6 +1488,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
 extern void resync_timer_fn(unsigned long data);
 
 /* drbd_receiver.c */
+extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+               const unsigned rw, const int fault_type);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
 extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
                                            u64 id,
@@ -1593,6 +1645,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
  * inline helper functions
  *************************/
 
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+       return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+       for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+                       page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+       for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+static inline int drbd_bio_has_active_page(struct bio *bio)
+{
+       struct bio_vec *bvec;
+       int i;
+
+       __bio_for_each_segment(bvec, bio, i, 0) {
+               if (page_count(bvec->bv_page) > 1)
+                       return 1;
+       }
+
+       return 0;
+}
+
+static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+{
+       struct page *page = e->pages;
+       page_chain_for_each(page) {
+               if (page_count(page) > 1)
+                       return 1;
+       }
+       return 0;
+}
+
 static inline void drbd_state_lock(struct drbd_conf *mdev)
 {
        wait_event(mdev->misc_wait,
@@ -2132,13 +2219,15 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
                return 0;
        if (test_bit(BITMAP_IO, &mdev->flags))
                return 0;
+       if (atomic_read(&mdev->new_c_uuid))
+               return 0;
        return 1;
 }
 
 /* I'd like to use wait_event_lock_irq,
  * but I'm not sure when it got introduced,
  * and not sure when it has 3 or 4 arguments */
-static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
+static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
 {
        /* compare with after_state_ch,
         * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
@@ -2152,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
         * to avoid races with the reconnect code,
         * we need to atomic_inc within the spinlock. */
 
+       if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
+               drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
+
        spin_lock_irq(&mdev->req_lock);
        while (!__inc_ap_bio_cond(mdev)) {
                prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
@@ -2160,7 +2252,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
                finish_wait(&mdev->misc_wait, &wait);
                spin_lock_irq(&mdev->req_lock);
        }
-       atomic_add(one_or_two, &mdev->ap_bio_cnt);
+       atomic_add(count, &mdev->ap_bio_cnt);
        spin_unlock_irq(&mdev->req_lock);
 }
 
@@ -2251,7 +2343,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
        if (test_bit(MD_NO_BARRIER, &mdev->flags))
                return;
 
-       r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+       r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
+                       BLKDEV_IFL_WAIT);
        if (r) {
                set_bit(MD_NO_BARRIER, &mdev->flags);
                dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
index 93d1f9b..be2d2da 100644 (file)
@@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
        else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
                rv = SS_NO_REMOTE_DISK;
 
+       else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+               rv = SS_NO_UP_TO_DATE_DISK;
+
        else if ((ns.conn == C_CONNECTED ||
                  ns.conn == C_WF_BITMAP_S ||
                  ns.conn == C_SYNC_SOURCE ||
@@ -840,7 +843,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
                        break;
                case C_WF_BITMAP_S:
                case C_PAUSED_SYNC_S:
-                       ns.pdsk = D_OUTDATED;
+                       /* remap any consistent state to D_OUTDATED,
+                        * but disallow "upgrading" states that are not even consistent.
+                        */
+                       ns.pdsk =
+                               (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
+                               ? os.pdsk : D_OUTDATED;
                        break;
                case C_SYNC_SOURCE:
                        ns.pdsk = D_INCONSISTENT;
@@ -1205,21 +1213,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
        &&  (ns.pdsk < D_INCONSISTENT ||
             ns.pdsk == D_UNKNOWN ||
             ns.pdsk == D_OUTDATED)) {
-               kfree(mdev->p_uuid);
-               mdev->p_uuid = NULL;
                if (get_ldev(mdev)) {
                        if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-                           mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-                               drbd_uuid_new_current(mdev);
-                               drbd_send_uuids(mdev);
-                       }
+                           mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
+                           !atomic_read(&mdev->new_c_uuid))
+                               atomic_set(&mdev->new_c_uuid, 2);
                        put_ldev(mdev);
                }
        }
 
        if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-               if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
-                       drbd_uuid_new_current(mdev);
+               /* Diskless peer becomes primary, or we got connected to a diskless, primary peer. */
+               if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
+                   !atomic_read(&mdev->new_c_uuid))
+                       atomic_set(&mdev->new_c_uuid, 2);
 
                /* D_DISKLESS Peer becomes secondary */
                if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1232,7 +1239,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
            os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
                kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
                mdev->p_uuid = NULL; /* ...to not use the old ones in the meantime */
-               drbd_send_sizes(mdev, 0);  /* to start sync... */
+               drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
                drbd_send_uuids(mdev);
                drbd_send_state(mdev);
        }
@@ -1343,6 +1350,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
        drbd_md_sync(mdev);
 }
 
+static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+       if (get_ldev(mdev)) {
+               if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+                       drbd_uuid_new_current(mdev);
+                       if (get_net_conf(mdev)) {
+                               drbd_send_uuids(mdev);
+                               put_net_conf(mdev);
+                       }
+                       drbd_md_sync(mdev);
+               }
+               put_ldev(mdev);
+       }
+       atomic_dec(&mdev->new_c_uuid);
+       wake_up(&mdev->misc_wait);
+
+       return 1;
+}
 
 static int drbd_thread_setup(void *arg)
 {
@@ -1755,7 +1780,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
                             (struct p_header *)&p, sizeof(p));
 }
 
-int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
 {
        struct p_sizes p;
        sector_t d_size, u_size;
@@ -1767,7 +1792,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
                d_size = drbd_get_max_capacity(mdev->ldev);
                u_size = mdev->ldev->dc.disk_size;
                q_order_type = drbd_queue_order_type(mdev);
-               p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
                put_ldev(mdev);
        } else {
                d_size = 0;
@@ -1779,7 +1803,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
        p.u_size = cpu_to_be64(u_size);
        p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
        p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
-       p.queue_order_type = cpu_to_be32(q_order_type);
+       p.queue_order_type = cpu_to_be16(q_order_type);
+       p.dds_flags = cpu_to_be16(flags);
 
        ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
                           (struct p_header *)&p, sizeof(p));
@@ -2180,6 +2205,43 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
        return ok;
 }
 
+static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
+{
+       struct p_delay_probe dp;
+       int offset, ok = 0;
+       struct timeval now;
+
+       mutex_lock(&ds->mutex);
+       if (likely(ds->socket)) {
+               do_gettimeofday(&now);
+               offset = now.tv_usec - mdev->dps_time.tv_usec +
+                        (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
+               dp.seq_num  = cpu_to_be32(mdev->delay_seq);
+               dp.offset   = cpu_to_be32(offset);
+
+               ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
+                                   (struct p_header *)&dp, sizeof(dp), 0);
+       }
+       mutex_unlock(&ds->mutex);
+
+       return ok;
+}
+
+static int drbd_send_delay_probes(struct drbd_conf *mdev)
+{
+       int ok;
+
+       mdev->delay_seq++;
+       do_gettimeofday(&mdev->dps_time);
+       ok = drbd_send_delay_probe(mdev, &mdev->meta);
+       ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
+
+       mdev->dp_volume_last = mdev->send_cnt;
+       mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
+
+       return ok;
+}
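The scheme sends the same probe on the meta and the data socket and lets the peer compare arrival times; a sketch of how a receiver could derive data_delay from a matched pair (the receive path is not part of this hunk, names are illustrative):

	static void got_delay_probe_pair(struct drbd_conf *mdev,
					 const struct timeval *meta,
					 const struct timeval *data)
	{
		/* usecs the data socket lags behind the meta socket */
		mdev->data_delay = (data->tv_sec - meta->tv_sec) * 1000000 +
				   (data->tv_usec - meta->tv_usec);
	}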
+
 /* called on sndtimeo
  * returns FALSE if we should retry,
  * TRUE if we think connection is dead
@@ -2309,6 +2371,44 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
        return 1;
 }
 
+static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+       struct page *page = e->pages;
+       unsigned len = e->size;
+       page_chain_for_each(page) {
+               unsigned l = min_t(unsigned, len, PAGE_SIZE);
+               if (!_drbd_send_page(mdev, page, 0, l))
+                       return 0;
+               len -= l;
+       }
+       return 1;
+}
+
+static void consider_delay_probes(struct drbd_conf *mdev)
+{
+       if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93)
+               return;
+
+       if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
+               drbd_send_delay_probes(mdev);
+}
+
+static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+       if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
+               drbd_send_delay_probes(mdev);
+
+       return 1;
+}
+
+static void delay_probe_timer_fn(unsigned long data)
+{
+       struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+       if (list_empty(&mdev->delay_probe_work.list))
+               drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
+}
+
 /* Used to send write requests
  * R_PRIMARY -> Peer   (P_DATA)
  */
@@ -2360,7 +2460,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
                drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
        if (ok && dgs) {
                dgb = mdev->int_dig_out;
-               drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+               drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
                ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
        }
        if (ok) {
@@ -2371,6 +2471,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
        }
 
        drbd_put_data_sock(mdev);
+
+       if (ok)
+               consider_delay_probes(mdev);
+
        return ok;
 }
 
@@ -2409,13 +2513,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
                                        sizeof(p), MSG_MORE);
        if (ok && dgs) {
                dgb = mdev->int_dig_out;
-               drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
+               drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
                ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
        }
        if (ok)
-               ok = _drbd_send_zc_bio(mdev, e->private_bio);
+               ok = _drbd_send_zc_ee(mdev, e);
 
        drbd_put_data_sock(mdev);
+
+       if (ok)
+               consider_delay_probes(mdev);
+
        return ok;
 }
 
@@ -2600,6 +2708,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
        atomic_set(&mdev->net_cnt, 0);
        atomic_set(&mdev->packet_seq, 0);
        atomic_set(&mdev->pp_in_use, 0);
+       atomic_set(&mdev->new_c_uuid, 0);
 
        mutex_init(&mdev->md_io_mutex);
        mutex_init(&mdev->data.mutex);
@@ -2628,16 +2737,26 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
        INIT_LIST_HEAD(&mdev->unplug_work.list);
        INIT_LIST_HEAD(&mdev->md_sync_work.list);
        INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+       INIT_LIST_HEAD(&mdev->delay_probes);
+       INIT_LIST_HEAD(&mdev->delay_probe_work.list);
+       INIT_LIST_HEAD(&mdev->uuid_work.list);
+
        mdev->resync_work.cb  = w_resync_inactive;
        mdev->unplug_work.cb  = w_send_write_hint;
        mdev->md_sync_work.cb = w_md_sync;
        mdev->bm_io_work.w.cb = w_bitmap_io;
+       mdev->delay_probe_work.cb = w_delay_probes;
+       mdev->uuid_work.cb = w_new_current_uuid;
        init_timer(&mdev->resync_timer);
        init_timer(&mdev->md_sync_timer);
+       init_timer(&mdev->delay_probe_timer);
        mdev->resync_timer.function = resync_timer_fn;
        mdev->resync_timer.data = (unsigned long) mdev;
        mdev->md_sync_timer.function = md_sync_timer_fn;
        mdev->md_sync_timer.data = (unsigned long) mdev;
+       mdev->delay_probe_timer.function = delay_probe_timer_fn;
+       mdev->delay_probe_timer.data = (unsigned long) mdev;
 
        init_waitqueue_head(&mdev->misc_wait);
        init_waitqueue_head(&mdev->state_wait);
@@ -2680,7 +2799,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
        drbd_set_my_capacity(mdev, 0);
        if (mdev->bitmap) {
                /* maybe never allocated. */
-               drbd_bm_resize(mdev, 0);
+               drbd_bm_resize(mdev, 0, 1);
                drbd_bm_cleanup(mdev);
        }
 
@@ -3129,7 +3248,7 @@ int __init drbd_init(void)
        if (err)
                goto Enomem;
 
-       drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+       drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
        if (!drbd_proc) {
                printk(KERN_ERR "drbd: unable to register proc file\n");
                goto Enomem;
@@ -3660,7 +3779,8 @@ _drbd_fault_str(unsigned int type) {
                [DRBD_FAULT_DT_RD] = "Data read",
                [DRBD_FAULT_DT_RA] = "Data read ahead",
                [DRBD_FAULT_BM_ALLOC] = "BM allocation",
-               [DRBD_FAULT_AL_EE] = "EE allocation"
+               [DRBD_FAULT_AL_EE] = "EE allocation",
+               [DRBD_FAULT_RECEIVE] = "receive data corruption",
        };
 
        return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
index 6429d2b..632e324 100644 (file)
@@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
  * Returns 0 on success, negative return values indicate errors.
  * You should call drbd_md_sync() after calling this function.
  */
-enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local)
+enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
 {
        sector_t prev_first_sect, prev_size; /* previous meta location */
        sector_t la_size;
@@ -541,12 +541,12 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force
        /* TODO: should only be some assert here, not (re)init... */
        drbd_md_set_sector_offsets(mdev, mdev->ldev);
 
-       size = drbd_new_dev_size(mdev, mdev->ldev, force);
+       size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
 
        if (drbd_get_capacity(mdev->this_bdev) != size ||
            drbd_bm_capacity(mdev) != size) {
                int err;
-               err = drbd_bm_resize(mdev, size);
+               err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
                if (unlikely(err)) {
                        /* currently there is only one error: ENOMEM! */
                        size = drbd_bm_capacity(mdev)>>1;
@@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
        struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
        int max_segments = mdev->ldev->dc.max_bio_bvecs;
 
-       if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
-               max_seg_s = PAGE_SIZE;
-
        max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
 
        blk_queue_max_hw_sectors(q, max_seg_s >> 9);
@@ -1199,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
        }
 
        /* allocation not in the IO path, cqueue thread context */
-       new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+       new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
        if (!new_conf) {
                retcode = ERR_NOMEM;
                goto fail;
        }
 
-       memset(new_conf, 0, sizeof(struct net_conf));
        new_conf->timeout          = DRBD_TIMEOUT_DEF;
        new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
        new_conf->ping_int         = DRBD_PING_INT_DEF;
@@ -1477,8 +1473,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 {
        struct resize rs;
        int retcode = NO_ERROR;
-       int ldsc = 0; /* local disk size changed */
        enum determine_dev_size dd;
+       enum dds_flags ddsf;
 
        memset(&rs, 0, sizeof(struct resize));
        if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
@@ -1502,13 +1498,17 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
                goto fail;
        }
 
-       if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
-               mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
-               ldsc = 1;
+       if (rs.no_resync && mdev->agreed_pro_version < 93) {
+               retcode = ERR_NEED_APV_93;
+               goto fail;
        }
 
+       if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
+               mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+
        mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
-       dd = drbd_determin_dev_size(mdev, rs.resize_force);
+       ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+       dd = drbd_determin_dev_size(mdev, ddsf);
        drbd_md_sync(mdev);
        put_ldev(mdev);
        if (dd == dev_size_error) {
@@ -1516,12 +1516,12 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
                goto fail;
        }
 
-       if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
+       if (mdev->state.conn == C_CONNECTED) {
                if (dd == grew)
                        set_bit(RESIZE_PENDING, &mdev->flags);
 
                drbd_send_uuids(mdev);
-               drbd_send_sizes(mdev, 1);
+               drbd_send_sizes(mdev, 1, ddsf);
        }
 
  fail:
@@ -1551,6 +1551,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
                sc.rate       = DRBD_RATE_DEF;
                sc.after      = DRBD_AFTER_DEF;
                sc.al_extents = DRBD_AL_EXTENTS_DEF;
+               sc.dp_volume  = DRBD_DP_VOLUME_DEF;
+               sc.dp_interval = DRBD_DP_INTERVAL_DEF;
+               sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
+               sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
        } else
                memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
@@ -2207,9 +2211,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 {
        struct cn_msg *cn_reply;
        struct drbd_nl_cfg_reply *reply;
-       struct bio_vec *bvec;
        unsigned short *tl;
-       int i;
+       struct page *page;
+       unsigned len;
 
        if (!e)
                return;
@@ -2247,11 +2251,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
        put_unaligned(T_ee_data, tl++);
        put_unaligned(e->size, tl++);
 
-       __bio_for_each_segment(bvec, e->private_bio, i, 0) {
-               void *d = kmap(bvec->bv_page);
-               memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
-               kunmap(bvec->bv_page);
-               tl=(unsigned short*)((char*)tl + bvec->bv_len);
+       len = e->size;
+       page = e->pages;
+       page_chain_for_each(page) {
+               void *d = kmap_atomic(page, KM_USER0);
+               unsigned l = min_t(unsigned, len, PAGE_SIZE);
+               memcpy(tl, d, l);
+               kunmap_atomic(d, KM_USER0);
+               tl = (unsigned short*)((char*)tl + l);
+               len -= l;
        }
        put_unaligned(TT_END, tl++); /* Close the tag list */
 
index be3374b..d0f1767 100644 (file)
@@ -73,14 +73,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
        seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
        /* if more than 1 GB display in MB */
        if (mdev->rs_total > 0x100000L)
-               seq_printf(seq, "(%lu/%lu)M\n\t",
+               seq_printf(seq, "(%lu/%lu)M",
                            (unsigned long) Bit2KB(rs_left >> 10),
                            (unsigned long) Bit2KB(mdev->rs_total >> 10));
        else
-               seq_printf(seq, "(%lu/%lu)K\n\t",
+               seq_printf(seq, "(%lu/%lu)K",
                            (unsigned long) Bit2KB(rs_left),
                            (unsigned long) Bit2KB(mdev->rs_total));
 
+       if (mdev->state.conn == C_SYNC_TARGET)
+               seq_printf(seq, " queue_delay: %d.%d ms\n\t",
+                          mdev->data_delay / 1000,
+                          (mdev->data_delay % 1000) / 100);
+       else if (mdev->state.conn == C_SYNC_SOURCE)
+               seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq);
+
        /* see drivers/md/md.c
         * We do not want to overflow, so the order of operands and
         * the * 100 / 100 trick are important. We do a +1 to be
@@ -128,6 +135,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
        else
                seq_printf(seq, " (%ld)", dbdt);
 
+       if (mdev->state.conn == C_SYNC_TARGET) {
+               if (mdev->c_sync_rate > 1000)
+                       seq_printf(seq, " want: %d,%03d",
+                                  mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
+               else
+                       seq_printf(seq, " want: %d", mdev->c_sync_rate);
+       }
+
        seq_printf(seq, " K/sec\n");
 }
 
index 3f096e7..bc9ab7f 100644 (file)
@@ -80,30 +80,128 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo
 
 #define GFP_TRY        (__GFP_HIGHMEM | __GFP_NOWARN)
 
-static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+       struct page *page;
+       struct page *tmp;
+
+       BUG_ON(!n);
+       BUG_ON(!head);
+
+       page = *head;
+
+       if (!page)
+               return NULL;
+
+       while (page) {
+               tmp = page_chain_next(page);
+               if (--n == 0)
+                       break; /* found sufficient pages */
+               if (tmp == NULL)
+                       /* insufficient pages, don't use any of them. */
+                       return NULL;
+               page = tmp;
+       }
+
+       /* add end of list marker for the returned list */
+       set_page_private(page, 0);
+       /* actual return value, and adjustment of head */
+       page = *head;
+       *head = tmp;
+       return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+       struct page *tmp;
+       int i = 1;
+       while ((tmp = page_chain_next(page)))
+               ++i, page = tmp;
+       if (len)
+               *len = i;
+       return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+       struct page *tmp;
+       int i = 0;
+       page_chain_for_each_safe(page, tmp) {
+               put_page(page);
+               ++i;
+       }
+       return i;
+}
+
+static void page_chain_add(struct page **head,
+               struct page *chain_first, struct page *chain_last)
+{
+#if 1
+       struct page *tmp;
+       tmp = page_chain_tail(chain_first, NULL);
+       BUG_ON(tmp != chain_last);
+#endif
+
+       /* add chain to head */
+       set_page_private(chain_last, (unsigned long)*head);
+       *head = chain_first;
+}
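+
+/*
+ * A minimal sketch of the same idiom, with a hypothetical node type
+ * standing in for struct page (illustration only): the link lives in
+ * the object itself, so building and splicing chains needs no extra
+ * allocations.
+ */
+#if 0	/* illustration only, never compiled */
+struct node { unsigned long private; };		/* 0 terminates a chain */
+#define node_next(n)	((struct node *)(n)->private)
+
+static void node_chain_add(struct node **head,
+		struct node *first, struct node *last)
+{
+	last->private = (unsigned long)*head;	/* old head becomes last's next */
+	*head = first;
+}
+#endif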
+
+static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
 {
        struct page *page = NULL;
+       struct page *tmp = NULL;
+       int i = 0;
 
        /* Yes, testing drbd_pp_vacant outside the lock is racy.
         * So what. It saves a spin_lock. */
-       if (drbd_pp_vacant > 0) {
+       if (drbd_pp_vacant >= number) {
                spin_lock(&drbd_pp_lock);
-               page = drbd_pp_pool;
-               if (page) {
-                       drbd_pp_pool = (struct page *)page_private(page);
-                       set_page_private(page, 0); /* just to be polite */
-                       drbd_pp_vacant--;
-               }
+               page = page_chain_del(&drbd_pp_pool, number);
+               if (page)
+                       drbd_pp_vacant -= number;
                spin_unlock(&drbd_pp_lock);
+               if (page)
+                       return page;
        }
+
        /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
         * "criss-cross" setup, that might cause write-out on some other DRBD,
         * which in turn might block on the other node at this very place.  */
-       if (!page)
-               page = alloc_page(GFP_TRY);
-       if (page)
-               atomic_inc(&mdev->pp_in_use);
-       return page;
+       for (i = 0; i < number; i++) {
+               tmp = alloc_page(GFP_TRY);
+               if (!tmp)
+                       break;
+               set_page_private(tmp, (unsigned long)page);
+               page = tmp;
+       }
+
+       if (i == number)
+               return page;
+
+       /* Not enough pages immediately available this time.
+        * No need to jump around here, drbd_pp_alloc will retry this
+        * function "soon". */
+       if (page) {
+               tmp = page_chain_tail(page, NULL);
+               spin_lock(&drbd_pp_lock);
+               page_chain_add(&drbd_pp_pool, page, tmp);
+               drbd_pp_vacant += i;
+               spin_unlock(&drbd_pp_lock);
+       }
+       return NULL;
 }
 
 /* kick lower level device, if we have more than (arbitrary number)
@@ -127,7 +225,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed
 
        list_for_each_safe(le, tle, &mdev->net_ee) {
                e = list_entry(le, struct drbd_epoch_entry, w.list);
-               if (drbd_bio_has_active_page(e->private_bio))
+               if (drbd_ee_has_active_page(e))
                        break;
                list_move(le, to_be_freed);
        }
@@ -148,32 +246,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 }
 
 /**
- * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
+ * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
  * @mdev:      DRBD device.
- * @retry:     whether or not to retry allocation forever (or until signalled)
+ * @number:    number of pages requested
+ * @retry:     whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel, unless this allocation would exceed the max_buffers setting.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
  *
- * Tries to allocate a page, first from our own page pool, then from the
- * kernel, unless this allocation would exceed the max_buffers setting.
- * If @retry is non-zero, retry until DRBD frees a page somewhere else.
+ * Returns a page chain linked via page->private.
  */
-static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
+static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
 {
        struct page *page = NULL;
        DEFINE_WAIT(wait);
 
-       if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
-               page = drbd_pp_first_page_or_try_alloc(mdev);
-               if (page)
-                       return page;
-       }
+       /* Yes, we may run up to @number over max_buffers. If we
+        * follow it strictly, the admin will get it wrong anyways. */
+       if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
+               page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 
-       for (;;) {
+       while (page == NULL) {
                prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 
                drbd_kick_lo_and_reclaim_net(mdev);
 
                if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
-                       page = drbd_pp_first_page_or_try_alloc(mdev);
+                       page = drbd_pp_first_pages_or_try_alloc(mdev, number);
                        if (page)
                                break;
                }
@@ -190,62 +290,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
        }
        finish_wait(&drbd_pp_wait, &wait);
 
+       if (page)
+               atomic_add(number, &mdev->pp_in_use);
        return page;
 }
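+
+/*
+ * Usage sketch (mirrors drbd_alloc_ee() below): size the request up
+ * front and take the whole chain in one call.
+ *
+ *	unsigned nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ *	struct page *page = drbd_pp_alloc(mdev, nr_pages,
+ *					  gfp_mask & __GFP_WAIT);
+ *	if (!page)
+ *		return NULL;	(only if !retry, or we were signalled)
+ */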
 
 /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
- * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
+ * Is also used from inside another spin_lock_irq(&mdev->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
 static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 {
-       int free_it;
-
-       spin_lock(&drbd_pp_lock);
-       if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
-               free_it = 1;
-       } else {
-               set_page_private(page, (unsigned long)drbd_pp_pool);
-               drbd_pp_pool = page;
-               drbd_pp_vacant++;
-               free_it = 0;
-       }
-       spin_unlock(&drbd_pp_lock);
-
-       atomic_dec(&mdev->pp_in_use);
-
-       if (free_it)
-               __free_page(page);
-
-       wake_up(&drbd_pp_wait);
-}
-
-static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
-{
-       struct page *p_to_be_freed = NULL;
-       struct page *page;
-       struct bio_vec *bvec;
        int i;
-
-       spin_lock(&drbd_pp_lock);
-       __bio_for_each_segment(bvec, bio, i, 0) {
-               if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
-                       set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
-                       p_to_be_freed = bvec->bv_page;
-               } else {
-                       set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
-                       drbd_pp_pool = bvec->bv_page;
-                       drbd_pp_vacant++;
-               }
-       }
-       spin_unlock(&drbd_pp_lock);
-       atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
-
-       while (p_to_be_freed) {
-               page = p_to_be_freed;
-               p_to_be_freed = (struct page *)page_private(page);
-               set_page_private(page, 0); /* just to be polite */
-               put_page(page);
+       if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
+               i = page_chain_free(page);
+       else {
+               struct page *tmp;
+               tmp = page_chain_tail(page, &i);
+               spin_lock(&drbd_pp_lock);
+               page_chain_add(&drbd_pp_pool, page, tmp);
+               drbd_pp_vacant += i;
+               spin_unlock(&drbd_pp_lock);
        }
-
+       atomic_sub(i, &mdev->pp_in_use);
+       i = atomic_read(&mdev->pp_in_use);
+       if (i < 0)
+               dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
        wake_up(&drbd_pp_wait);
 }
 
@@ -270,11 +340,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
                                     unsigned int data_size,
                                     gfp_t gfp_mask) __must_hold(local)
 {
-       struct request_queue *q;
        struct drbd_epoch_entry *e;
        struct page *page;
-       struct bio *bio;
-       unsigned int ds;
+       unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 
        if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
                return NULL;
@@ -286,84 +354,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
                return NULL;
        }
 
-       bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
-       if (!bio) {
-               if (!(gfp_mask & __GFP_NOWARN))
-                       dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
-               goto fail1;
-       }
-
-       bio->bi_bdev = mdev->ldev->backing_bdev;
-       bio->bi_sector = sector;
-
-       ds = data_size;
-       while (ds) {
-               page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
-               if (!page) {
-                       if (!(gfp_mask & __GFP_NOWARN))
-                               dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
-                       goto fail2;
-               }
-               if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
-                       drbd_pp_free(mdev, page);
-                       dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
-                           "data_size=%u,ds=%u) failed\n",
-                           (unsigned long long)sector, data_size, ds);
-
-                       q = bdev_get_queue(bio->bi_bdev);
-                       if (q->merge_bvec_fn) {
-                               struct bvec_merge_data bvm = {
-                                       .bi_bdev = bio->bi_bdev,
-                                       .bi_sector = bio->bi_sector,
-                                       .bi_size = bio->bi_size,
-                                       .bi_rw = bio->bi_rw,
-                               };
-                               int l = q->merge_bvec_fn(q, &bvm,
-                                               &bio->bi_io_vec[bio->bi_vcnt]);
-                               dev_err(DEV, "merge_bvec_fn() = %d\n", l);
-                       }
-
-                       /* dump more of the bio. */
-                       dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
-                       dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
-                       dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
-                       dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
-
-                       goto fail2;
-                       break;
-               }
-               ds -= min_t(int, ds, PAGE_SIZE);
-       }
-
-       D_ASSERT(data_size == bio->bi_size);
-
-       bio->bi_private = e;
-       e->mdev = mdev;
-       e->sector = sector;
-       e->size = bio->bi_size;
+       page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+       if (!page)
+               goto fail;
 
-       e->private_bio = bio;
-       e->block_id = id;
        INIT_HLIST_NODE(&e->colision);
        e->epoch = NULL;
+       e->mdev = mdev;
+       e->pages = page;
+       atomic_set(&e->pending_bios, 0);
+       e->size = data_size;
        e->flags = 0;
+       e->sector = sector;
+       e->block_id = id;
 
        return e;
 
- fail2:
-       drbd_pp_free_bio_pages(mdev, bio);
-       bio_put(bio);
- fail1:
+ fail:
        mempool_free(e, drbd_ee_mempool);
-
        return NULL;
 }
 
 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
-       struct bio *bio = e->private_bio;
-       drbd_pp_free_bio_pages(mdev, bio);
-       bio_put(bio);
+       drbd_pp_free(mdev, e->pages);
+       D_ASSERT(atomic_read(&e->pending_bios) == 0);
        D_ASSERT(hlist_unhashed(&e->colision));
        mempool_free(e, drbd_ee_mempool);
 }
@@ -902,7 +918,7 @@ retry:
        if (!drbd_send_protocol(mdev))
                return -1;
        drbd_send_sync_param(mdev, &mdev->sync_conf);
-       drbd_send_sizes(mdev, 0);
+       drbd_send_sizes(mdev, 0, 0);
        drbd_send_uuids(mdev);
        drbd_send_state(mdev);
        clear_bit(USE_DEGR_WFC_T, &mdev->flags);
@@ -946,7 +962,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
        int rv;
 
        if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
-               rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+               rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
+                                       NULL, BLKDEV_IFL_WAIT);
                if (rv) {
                        dev_err(DEV, "local disk flush failed with status %d\n", rv);
                        /* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -1119,6 +1136,101 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
                dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
 }
 
+/**
+ * drbd_submit_ee()
+ * @mdev:      DRBD device.
+ * @e:         epoch entry
+ * @rw:                flag field, see bio->bi_rw
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+               const unsigned rw, const int fault_type)
+{
+       struct bio *bios = NULL;
+       struct bio *bio;
+       struct page *page = e->pages;
+       sector_t sector = e->sector;
+       unsigned ds = e->size;
+       unsigned n_bios = 0;
+       unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+
+       if (atomic_read(&mdev->new_c_uuid)) {
+               if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
+                       drbd_uuid_new_current(mdev);
+                       drbd_md_sync(mdev);
+
+                       atomic_dec(&mdev->new_c_uuid);
+                       wake_up(&mdev->misc_wait);
+               }
+               wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
+       }
+
+       /* In most cases, we will only need one bio.  But in case the lower
+        * level restrictions happen to be different at this offset on this
+        * side than those of the sending peer, we may need to submit the
+        * request in more than one bio. */
+next_bio:
+       bio = bio_alloc(GFP_NOIO, nr_pages);
+       if (!bio) {
+               dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
+               goto fail;
+       }
+       /* > e->sector, unless this is the first bio */
+       bio->bi_sector = sector;
+       bio->bi_bdev = mdev->ldev->backing_bdev;
+       /* we special case some flags in the multi-bio case, see below
+        * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */
+       bio->bi_rw = rw;
+       bio->bi_private = e;
+       bio->bi_end_io = drbd_endio_sec;
+
+       bio->bi_next = bios;
+       bios = bio;
+       ++n_bios;
+
+       page_chain_for_each(page) {
+               unsigned len = min_t(unsigned, ds, PAGE_SIZE);
+               if (!bio_add_page(bio, page, len, 0)) {
+                       /* a single page must always be possible! */
+                       BUG_ON(bio->bi_vcnt == 0);
+                       goto next_bio;
+               }
+               ds -= len;
+               sector += len >> 9;
+               --nr_pages;
+       }
+       D_ASSERT(page == NULL);
+       D_ASSERT(ds == 0);
+
+       atomic_set(&e->pending_bios, n_bios);
+       do {
+               bio = bios;
+               bios = bios->bi_next;
+               bio->bi_next = NULL;
+
+               /* strip off BIO_RW_UNPLUG unless it is the last bio */
+               if (bios)
+                       bio->bi_rw &= ~(1<<BIO_RW_UNPLUG);
+
+               drbd_generic_make_request(mdev, fault_type, bio);
+
+               /* strip off BIO_RW_BARRIER,
+                * unless it is the first or last bio */
+               if (bios && bios->bi_next)
+                       bios->bi_rw &= ~(1<<BIO_RW_BARRIER);
+       } while (bios);
+       maybe_kick_lo(mdev);
+       return 0;
+
+fail:
+       while (bios) {
+               bio = bios;
+               bios = bios->bi_next;
+               bio_put(bio);
+       }
+       return -ENOMEM;
+}
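+
+/*
+ * The flag handling above, restated as a hypothetical helper (sketch
+ * only): for bio i of n in submission order, only the last bio keeps
+ * BIO_RW_UNPLUG, and only the first and last keep BIO_RW_BARRIER.
+ *
+ *	static unsigned long ee_bio_rw(unsigned long rw, unsigned i, unsigned n)
+ *	{
+ *		if (i != n - 1)
+ *			rw &= ~(1UL << BIO_RW_UNPLUG);
+ *		if (i != 0 && i != n - 1)
+ *			rw &= ~(1UL << BIO_RW_BARRIER);
+ *		return rw;
+ *	}
+ */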
+
 /**
  * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
  * @mdev:      DRBD device.
@@ -1128,8 +1240,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
 int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
 {
        struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-       struct bio *bio = e->private_bio;
-
        /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
           (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
           so that we can finish that epoch in drbd_may_finish_epoch().
@@ -1143,33 +1253,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
        if (previous_epoch(mdev, e->epoch))
                dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
 
-       /* prepare bio for re-submit,
-        * re-init volatile members */
        /* we still have a local reference,
         * get_ldev was done in receive_Data. */
-       bio->bi_bdev = mdev->ldev->backing_bdev;
-       bio->bi_sector = e->sector;
-       bio->bi_size = e->size;
-       bio->bi_idx = 0;
-
-       bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-       bio->bi_flags |= 1 << BIO_UPTODATE;
-
-       /* don't know whether this is necessary: */
-       bio->bi_phys_segments = 0;
-       bio->bi_next = NULL;
-
-       /* these should be unchanged: */
-       /* bio->bi_end_io = drbd_endio_write_sec; */
-       /* bio->bi_vcnt = whatever; */
 
        e->w.cb = e_end_block;
-
-       /* This is no longer a barrier request. */
-       bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
-
-       drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
-
+       if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
+               /* drbd_submit_ee fails for one reason only:
+                * it was not able to allocate sufficient bios.
+                * requeue, try again later. */
+               e->w.cb = w_e_reissue;
+               drbd_queue_work(&mdev->data.work, &e->w);
+       }
        return 1;
 }
 
@@ -1261,13 +1355,13 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
 static struct drbd_epoch_entry *
 read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
 {
+       const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
        struct drbd_epoch_entry *e;
-       struct bio_vec *bvec;
        struct page *page;
-       struct bio *bio;
-       int dgs, ds, i, rr;
+       int dgs, ds, rr;
        void *dig_in = mdev->int_dig_in;
        void *dig_vv = mdev->int_dig_vv;
+       unsigned long *data;
 
        dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
                crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
@@ -1286,29 +1380,44 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
        ERR_IF(data_size &  0x1ff) return NULL;
        ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
 
+       /* even though we trust our peer,
+        * we sometimes have to double check. */
+       if (sector + (data_size>>9) > capacity) {
+               dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
+                       (unsigned long long)capacity,
+                       (unsigned long long)sector, data_size);
+               return NULL;
+       }
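+       /* for illustration: an 8-sector (4 KiB) request starting at
+        * sector == capacity - 4 would end at capacity + 4 and is
+        * rejected by the check above */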
+
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
         * "criss-cross" setup, that might cause write-out on some other DRBD,
         * which in turn might block on the other node at this very place.  */
        e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
        if (!e)
                return NULL;
-       bio = e->private_bio;
+
        ds = data_size;
-       bio_for_each_segment(bvec, bio, i) {
-               page = bvec->bv_page;
-               rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
+       page = e->pages;
+       page_chain_for_each(page) {
+               unsigned len = min_t(int, ds, PAGE_SIZE);
+               data = kmap(page);
+               rr = drbd_recv(mdev, data, len);
+               if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
+                       dev_err(DEV, "Fault injection: Corrupting data on receive\n");
+                       data[0] = data[0] ^ (unsigned long)-1;
+               }
                kunmap(page);
-               if (rr != min_t(int, ds, PAGE_SIZE)) {
+               if (rr != len) {
                        drbd_free_ee(mdev, e);
                        dev_warn(DEV, "short read receiving data: read %d expected %d\n",
-                            rr, min_t(int, ds, PAGE_SIZE));
+                            rr, len);
                        return NULL;
                }
                ds -= rr;
        }
 
        if (dgs) {
-               drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+               drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
                if (memcmp(dig_in, dig_vv, dgs)) {
                        dev_err(DEV, "Digest integrity check FAILED.\n");
                        drbd_bcast_ee(mdev, "digest failed",
@@ -1330,7 +1439,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
        int rr, rv = 1;
        void *data;
 
-       page = drbd_pp_alloc(mdev, 1);
+       if (!data_size)
+               return TRUE;
+
+       page = drbd_pp_alloc(mdev, 1, 1);
 
        data = kmap(page);
        while (data_size) {
@@ -1394,7 +1506,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
        }
 
        if (dgs) {
-               drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+               drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
                if (memcmp(dig_in, dig_vv, dgs)) {
                        dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
                        return 0;
@@ -1415,7 +1527,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
 
        D_ASSERT(hlist_unhashed(&e->colision));
 
-       if (likely(drbd_bio_uptodate(e->private_bio))) {
+       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                drbd_set_in_sync(mdev, sector, e->size);
                ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
        } else {
@@ -1434,30 +1546,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
        struct drbd_epoch_entry *e;
 
        e = read_in_block(mdev, ID_SYNCER, sector, data_size);
-       if (!e) {
-               put_ldev(mdev);
-               return FALSE;
-       }
+       if (!e)
+               goto fail;
 
        dec_rs_pending(mdev);
 
-       e->private_bio->bi_end_io = drbd_endio_write_sec;
-       e->private_bio->bi_rw = WRITE;
-       e->w.cb = e_end_resync_block;
-
        inc_unacked(mdev);
        /* corresponding dec_unacked() in e_end_resync_block()
         * respective _drbd_clear_done_ee */
 
+       e->w.cb = e_end_resync_block;
+
        spin_lock_irq(&mdev->req_lock);
        list_add(&e->w.list, &mdev->sync_ee);
        spin_unlock_irq(&mdev->req_lock);
 
-       drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
-       /* accounting done in endio */
+       if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
+               return TRUE;
 
-       maybe_kick_lo(mdev);
-       return TRUE;
+       drbd_free_ee(mdev, e);
+fail:
+       put_ldev(mdev);
+       return FALSE;
 }
 
 static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
@@ -1552,7 +1662,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
        }
 
        if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
-               if (likely(drbd_bio_uptodate(e->private_bio))) {
+               if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                        pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
                                mdev->state.conn <= C_PAUSED_SYNC_T &&
                                e->flags & EE_MAY_SET_IN_SYNC) ?
@@ -1698,7 +1808,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
                return FALSE;
        }
 
-       e->private_bio->bi_end_io = drbd_endio_write_sec;
        e->w.cb = e_end_block;
 
        spin_lock(&mdev->epoch_lock);
@@ -1894,12 +2003,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
                drbd_al_begin_io(mdev, e->sector);
        }
 
-       e->private_bio->bi_rw = rw;
-       drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
-       /* accounting done in endio */
-
-       maybe_kick_lo(mdev);
-       return TRUE;
+       if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
+               return TRUE;
 
 out_interrupted:
        /* yes, the epoch_size now is imbalanced.
@@ -1945,7 +2050,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
                            "no local data.\n");
                drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
                                 P_NEG_RS_DREPLY , p);
-               return TRUE;
+               return drbd_drain_block(mdev, h->length - brps);
        }
 
        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -1957,9 +2062,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
                return FALSE;
        }
 
-       e->private_bio->bi_rw = READ;
-       e->private_bio->bi_end_io = drbd_endio_read_sec;
-
        switch (h->command) {
        case P_DATA_REQUEST:
                e->w.cb = w_e_end_data_req;
@@ -2053,10 +2155,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 
        inc_unacked(mdev);
 
-       drbd_generic_make_request(mdev, fault_type, e->private_bio);
-       maybe_kick_lo(mdev);
-
-       return TRUE;
+       if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
+               return TRUE;
 
 out_free_e:
        kfree(di);
@@ -2473,6 +2573,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
                     hg > 0 ? "source" : "target");
        }
 
+       if (abs(hg) == 100)
+               drbd_khelper(mdev, "initial-split-brain");
+
        if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
                int pcount = (mdev->state.role == R_PRIMARY)
                           + (peer_role == R_PRIMARY);
@@ -2518,7 +2621,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
                 * after an attempted attach on a diskless node.
                 * We just refuse to attach -- well, we drop the "connection"
                 * to that disk, in a way... */
-               dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
+               dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
                drbd_khelper(mdev, "split-brain");
                return C_MASK;
        }
@@ -2849,7 +2952,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
        unsigned int max_seg_s;
        sector_t p_size, p_usize, my_usize;
        int ldsc = 0; /* local disk size changed */
-       enum drbd_conns nconn;
+       enum dds_flags ddsf;
 
        ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
        if (drbd_recv(mdev, h->payload, h->length) != h->length)
@@ -2905,8 +3008,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
        }
 #undef min_not_zero
 
+       ddsf = be16_to_cpu(p->dds_flags);
        if (get_ldev(mdev)) {
-         dd = drbd_determin_dev_size(mdev, 0);
+               dd = drbd_determin_dev_size(mdev, ddsf);
                put_ldev(mdev);
                if (dd == dev_size_error)
                        return FALSE;
@@ -2916,33 +3020,21 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
                drbd_set_my_capacity(mdev, p_size);
        }
 
-       if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
-               nconn = drbd_sync_handshake(mdev,
-                               mdev->state.peer, mdev->state.pdsk);
-               put_ldev(mdev);
-
-               if (nconn == C_MASK) {
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-                       return FALSE;
-               }
-
-               if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
-                       drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-                       return FALSE;
-               }
-       }
-
        if (get_ldev(mdev)) {
                if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
                        mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
                        ldsc = 1;
                }
 
-               max_seg_s = be32_to_cpu(p->max_segment_size);
+               if (mdev->agreed_pro_version < 94)
+                       max_seg_s = be32_to_cpu(p->max_segment_size);
+               else /* drbd 8.3.8 onwards */
+                       max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+
                if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
                        drbd_setup_queue_param(mdev, max_seg_s);
 
-               drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
+               drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
                put_ldev(mdev);
        }
 
@@ -2951,14 +3043,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
                    drbd_get_capacity(mdev->this_bdev) || ldsc) {
                        /* we have different sizes, probably peer
                         * needs to know my new size... */
-                       drbd_send_sizes(mdev, 0);
+                       drbd_send_sizes(mdev, 0, ddsf);
                }
                if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
                    (dd == grew && mdev->state.conn == C_CONNECTED)) {
                        if (mdev->state.pdsk >= D_INCONSISTENT &&
-                           mdev->state.disk >= D_INCONSISTENT)
-                               resync_after_online_grow(mdev);
-                       else
+                           mdev->state.disk >= D_INCONSISTENT) {
+                               if (ddsf & DDSF_NO_RESYNC)
+                                       dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
+                               else
+                                       resync_after_online_grow(mdev);
+                       } else
                                set_bit(RESYNC_AFTER_NEG, &mdev->flags);
                }
        }
@@ -3490,6 +3585,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
        return TRUE;
 }
 
+static void timeval_sub_us(struct timeval* tv, unsigned int us)
+{
+       tv->tv_sec -= us / 1000000;
+       us = us % 1000000;
+       if (tv->tv_usec < us) {
+               tv->tv_usec += 1000000;
+               tv->tv_sec--;
+       }
+       tv->tv_usec -= us;
+}
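+
+/* Worked example of the borrow path: {tv_sec = 10, tv_usec = 300}
+ * minus 1500 us: since 300 < 1500, usec becomes 300 + 1000000 - 1500
+ * = 998800 and sec drops to 9, i.e. 10.000300 s - 0.001500 s =
+ * 9.998800 s. */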
+
+static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p)
+{
+       struct delay_probe *dp;
+       struct list_head *le;
+       struct timeval now;
+       int seq_num;
+       int offset;
+       int data_delay;
+
+       seq_num = be32_to_cpu(p->seq_num);
+       offset  = be32_to_cpu(p->offset);
+
+       spin_lock(&mdev->peer_seq_lock);
+       if (!list_empty(&mdev->delay_probes)) {
+               if (from == USE_DATA_SOCKET)
+                       le = mdev->delay_probes.next;
+               else
+                       le = mdev->delay_probes.prev;
+
+               dp = list_entry(le, struct delay_probe, list);
+
+               if (dp->seq_num == seq_num) {
+                       list_del(le);
+                       spin_unlock(&mdev->peer_seq_lock);
+                       do_gettimeofday(&now);
+                       timeval_sub_us(&now, offset);
+                       data_delay =
+                               now.tv_usec - dp->time.tv_usec +
+                               (now.tv_sec - dp->time.tv_sec) * 1000000;
+
+                       if (data_delay > 0)
+                               mdev->data_delay = data_delay;
+
+                       kfree(dp);
+                       return;
+               }
+
+               if (dp->seq_num > seq_num) {
+                       spin_unlock(&mdev->peer_seq_lock);
+                       dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n");
+                       return; /* Do not allocate a struct delay_probe. */
+               }
+       }
+       spin_unlock(&mdev->peer_seq_lock);
+
+       dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO);
+       if (!dp) {
+               dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n");
+               return;
+       }
+
+       dp->seq_num = seq_num;
+       do_gettimeofday(&dp->time);
+       timeval_sub_us(&dp->time, offset);
+
+       spin_lock(&mdev->peer_seq_lock);
+       if (from == USE_DATA_SOCKET)
+               list_add(&dp->list, &mdev->delay_probes);
+       else
+               list_add_tail(&dp->list, &mdev->delay_probes);
+       spin_unlock(&mdev->peer_seq_lock);
+}
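+
+/*
+ * Sketch of the arithmetic above: each probe's arrival time, minus
+ * the offset it carries, approximates the shared send instant
+ * dps_time plus that socket's transit time.  So when the data-socket
+ * probe arrives after its meta-socket twin,
+ *
+ *	data_delay ~= transit(data socket) - transit(meta socket),
+ *
+ * the extra queueing delay on the bulk data socket.
+ */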
+
+static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h)
+{
+       struct p_delay_probe *p = (struct p_delay_probe *)h;
+
+       ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+       if (drbd_recv(mdev, h->payload, h->length) != h->length)
+               return FALSE;
+
+       got_delay_probe(mdev, USE_DATA_SOCKET, p);
+       return TRUE;
+}
+
 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
 
 static drbd_cmd_handler_f drbd_default_handler[] = {
@@ -3513,6 +3694,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
        [P_OV_REQUEST]      = receive_DataRequest,
        [P_OV_REPLY]        = receive_DataRequest,
        [P_CSUM_RS_REQUEST]    = receive_DataRequest,
+       [P_DELAY_PROBE]     = receive_delay_probe,
        /* anything missing from this table is in
         * the asender_tbl, see get_asender_cmd */
        [P_MAX_CMD]         = NULL,
@@ -3739,7 +3921,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
                dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
        i = atomic_read(&mdev->pp_in_use);
        if (i)
-               dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
+               dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
 
        D_ASSERT(list_empty(&mdev->read_ee));
        D_ASSERT(list_empty(&mdev->active_ee));
@@ -4232,7 +4414,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
 
        sector = be64_to_cpu(p->sector);
        size = be32_to_cpu(p->blksize);
-       D_ASSERT(p->block_id == ID_SYNCER);
 
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
@@ -4290,6 +4471,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
        return TRUE;
 }
 
+static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h)
+{
+       struct p_delay_probe *p = (struct p_delay_probe *)h;
+
+       got_delay_probe(mdev, USE_META_SOCKET, p);
+       return TRUE;
+}
+
 struct asender_cmd {
        size_t pkt_size;
        int (*process)(struct drbd_conf *mdev, struct p_header *h);
@@ -4314,6 +4503,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
        [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
        [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
        [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+       [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_delay_probe_m },
        [P_MAX_CMD]         = { 0, NULL },
        };
        if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
index de81ab7..3397f11 100644 (file)
@@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
        struct drbd_request *req;
        int local, remote;
        int err = -EIO;
+       int ret = 0;
 
        /* allocate outside of all locks; */
        req = drbd_req_new(mdev, bio);
@@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
                            (mdev->state.pdsk == D_INCONSISTENT &&
                             mdev->state.conn >= C_CONNECTED));
 
-       if (!(local || remote)) {
+       if (!(local || remote) && !mdev->state.susp) {
                dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
                goto fail_free_complete;
        }
@@ -810,6 +811,16 @@ allocate_barrier:
        /* GOOD, everything prepared, grab the spin_lock */
        spin_lock_irq(&mdev->req_lock);
 
+       if (mdev->state.susp) {
+               /* If we got suspended, use the retry mechanism of
+                  generic_make_request() to restart processing of this
+                  bio. In the next call to drbd_make_request_26
+                  we sleep in inc_ap_bio() */
+               ret = 1;
+               spin_unlock_irq(&mdev->req_lock);
+               goto fail_free_complete;
+       }
+
        if (remote) {
                remote = (mdev->state.pdsk == D_UP_TO_DATE ||
                            (mdev->state.pdsk == D_INCONSISTENT &&
@@ -947,12 +958,14 @@ fail_and_free_req:
                req->private_bio = NULL;
                put_ldev(mdev);
        }
-       bio_endio(bio, err);
+       if (!ret)
+               bio_endio(bio, err);
+
        drbd_req_free(req);
        dec_ap_bio(mdev);
        kfree(b);
 
-       return 0;
+       return ret;
 }
 
 /* helper function for drbd_make_request
@@ -962,11 +975,6 @@ fail_and_free_req:
  */
 static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 {
-       /* Unconfigured */
-       if (mdev->state.conn == C_DISCONNECTING &&
-           mdev->state.disk == D_DISKLESS)
-               return 1;
-
        if (mdev->state.role != R_PRIMARY &&
                (!allow_oos || is_write)) {
                if (__ratelimit(&drbd_ratelimit_state)) {
@@ -1070,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 
                /* we need to get a "reference count" (ap_bio_cnt)
                 * to avoid races with the disconnect/reconnect/suspend code.
-                * In case we need to split the bio here, we need to get two references
+                * In case we need to split the bio here, we need to get three references
                 * atomically, otherwise we might deadlock when trying to submit the
                 * second one! */
-               inc_ap_bio(mdev, 2);
+               inc_ap_bio(mdev, 3);
 
                D_ASSERT(e_enr == s_enr + 1);
 
-               drbd_make_request_common(mdev, &bp->bio1);
-               drbd_make_request_common(mdev, &bp->bio2);
+               while (drbd_make_request_common(mdev, &bp->bio1))
+                       inc_ap_bio(mdev, 1);
+
+               while (drbd_make_request_common(mdev, &bp->bio2))
+                       inc_ap_bio(mdev, 1);
+
+               dec_ap_bio(mdev);
+
                bio_pair_release(bp);
        }
        return 0;
@@ -1115,7 +1129,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
        } else if (limit && get_ldev(mdev)) {
                struct request_queue * const b =
                        mdev->ldev->backing_bdev->bd_disk->queue;
-               if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
+               if (b->merge_bvec_fn) {
                        backing_limit = b->merge_bvec_fn(b, bvm, bvec);
                        limit = min(limit, backing_limit);
                }
index 76863e3..85179e1 100644 (file)
@@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = {
 
 static const char *drbd_state_sw_errors[] = {
        [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
-       [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
+       [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
        [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
        [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
        [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
index d48a1df..727ff63 100644 (file)
@@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
 
 /* defined here:
    drbd_md_io_complete
-   drbd_endio_write_sec
-   drbd_endio_read_sec
+   drbd_endio_sec
    drbd_endio_pri
 
  * more endio handlers:
@@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error)
 /* reads on behalf of the partner,
  * "submitted" by the receiver
  */
-void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
+void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
 {
        unsigned long flags = 0;
-       struct drbd_epoch_entry *e = NULL;
-       struct drbd_conf *mdev;
-       int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
-       e = bio->bi_private;
-       mdev = e->mdev;
-
-       if (error)
-               dev_warn(DEV, "read: error=%d s=%llus\n", error,
-                               (unsigned long long)e->sector);
-       if (!error && !uptodate) {
-               dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
-                               (unsigned long long)e->sector);
-               /* strange behavior of some lower level drivers...
-                * fail the request by clearing the uptodate flag,
-                * but do not return any error?! */
-               error = -EIO;
-       }
+       struct drbd_conf *mdev = e->mdev;
 
        D_ASSERT(e->block_id != ID_VACANT);
 
@@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
        list_del(&e->w.list);
        if (list_empty(&mdev->read_ee))
                wake_up(&mdev->ee_wait);
+       if (test_bit(__EE_WAS_ERROR, &e->flags))
+               __drbd_chk_io_error(mdev, FALSE);
        spin_unlock_irqrestore(&mdev->req_lock, flags);
 
-       drbd_chk_io_error(mdev, error, FALSE);
        drbd_queue_work(&mdev->data.work, &e->w);
        put_ldev(mdev);
 }
 
+static int is_failed_barrier(int ee_flags)
+{
+       return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
+                       == (EE_IS_BARRIER|EE_WAS_ERROR);
+}
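+
+/*
+ * The test above is the usual mask-and-compare idiom: AND with the
+ * union of all bits of interest, then compare against the subset that
+ * must be set; bits in the mask but absent from the comparand must be
+ * clear.  In general form (illustration):
+ *
+ *	(flags & (A | B | C)) == (A | B)
+ *
+ * means A and B set, C clear, all other bits ignored.
+ */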
+
 /* writes on behalf of the partner, or resync writes,
- * "submitted" by the receiver.
- */
-void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
+ * "submitted" by the receiver, final stage.  */
+static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
 {
        unsigned long flags = 0;
-       struct drbd_epoch_entry *e = NULL;
-       struct drbd_conf *mdev;
+       struct drbd_conf *mdev = e->mdev;
        sector_t e_sector;
        int do_wake;
        int is_syncer_req;
        int do_al_complete_io;
-       int uptodate = bio_flagged(bio, BIO_UPTODATE);
-       int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
-
-       e = bio->bi_private;
-       mdev = e->mdev;
 
-       if (error)
-               dev_warn(DEV, "write: error=%d s=%llus\n", error,
-                               (unsigned long long)e->sector);
-       if (!error && !uptodate) {
-               dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
-                               (unsigned long long)e->sector);
-               /* strange behavior of some lower level drivers...
-                * fail the request by clearing the uptodate flag,
-                * but do not return any error?! */
-               error = -EIO;
-       }
-
-       /* error == -ENOTSUPP would be a better test,
-        * alas it is not reliable */
-       if (error && is_barrier && e->flags & EE_IS_BARRIER) {
+       /* if this is a failed barrier request, disable use of barriers,
+        * and schedule for resubmission */
+       if (is_failed_barrier(e->flags)) {
                drbd_bump_write_ordering(mdev, WO_bdev_flush);
                spin_lock_irqsave(&mdev->req_lock, flags);
                list_del(&e->w.list);
+               e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
                e->w.cb = w_e_reissue;
                /* put_ldev actually happens below, once we come here again. */
                __release(local);
@@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
 
        D_ASSERT(e->block_id != ID_VACANT);
 
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       mdev->writ_cnt += e->size >> 9;
-       is_syncer_req = is_syncer_block_id(e->block_id);
-
        /* after we moved e to done_ee,
         * we may no longer access it,
         * it may be freed/reused already!
         * (as soon as we release the req_lock) */
        e_sector = e->sector;
        do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+       is_syncer_req = is_syncer_block_id(e->block_id);
 
+       spin_lock_irqsave(&mdev->req_lock, flags);
+       mdev->writ_cnt += e->size >> 9;
        list_del(&e->w.list); /* has been on active_ee or sync_ee */
        list_add_tail(&e->w.list, &mdev->done_ee);
 
@@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
                ? list_empty(&mdev->sync_ee)
                : list_empty(&mdev->active_ee);
 
-       if (error)
+       if (test_bit(__EE_WAS_ERROR, &e->flags))
                __drbd_chk_io_error(mdev, FALSE);
        spin_unlock_irqrestore(&mdev->req_lock, flags);
 
@@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
 
        wake_asender(mdev);
        put_ldev(mdev);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_endio_sec(struct bio *bio, int error)
+{
+       struct drbd_epoch_entry *e = bio->bi_private;
+       struct drbd_conf *mdev = e->mdev;
+       int uptodate = bio_flagged(bio, BIO_UPTODATE);
+       int is_write = bio_data_dir(bio) == WRITE;
+
+       if (error)
+               dev_warn(DEV, "%s: error=%d s=%llus\n",
+                               is_write ? "write" : "read", error,
+                               (unsigned long long)e->sector);
+       if (!error && !uptodate) {
+               dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
+                               is_write ? "write" : "read",
+                               (unsigned long long)e->sector);
+               /* strange behavior of some lower level drivers...
+                * fail the request by clearing the uptodate flag,
+                * but do not return any error?! */
+               error = -EIO;
+       }
+
+       if (error)
+               set_bit(__EE_WAS_ERROR, &e->flags);
 
+       bio_put(bio); /* no need for the bio anymore */
+       if (atomic_dec_and_test(&e->pending_bios)) {
+               if (is_write)
+                       drbd_endio_write_sec_final(e);
+               else
+                       drbd_endio_read_sec_final(e);
+       }
 }
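drbd_endio_sec() above fans out to the direction-specific *_final handlers only when the last of an EE's bios completes, via atomic_dec_and_test(&e->pending_bios). For readers unfamiliar with the idiom, a minimal hedged sketch of the same last-completion-wins pattern follows; the names are hypothetical and C11 atomics stand in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct multi_io {
	atomic_int pending;	/* one reference per in-flight sub-request */
	int was_error;		/* sticky error flag, like __EE_WAS_ERROR */
};

static void sub_request_done(struct multi_io *m, int error)
{
	if (error)
		m->was_error = 1;	/* record it, but don't finalize yet */
	/* atomic_fetch_sub() returns the old value; old == 1 means this
	 * call dropped the count to zero and is the last completion */
	if (atomic_fetch_sub(&m->pending, 1) == 1)
		printf("final handler runs exactly once, error=%d\n",
		       m->was_error);
}

int main(void)
{
	struct multi_io m = { .pending = 3 };

	sub_request_done(&m, 0);
	sub_request_done(&m, -5);	/* one sub-request fails */
	sub_request_done(&m, 0);	/* last one in: finalizes */
	return 0;
}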
 
 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
@@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
        return 1; /* Simply ignore this! */
 }
 
-void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+{
+       struct hash_desc desc;
+       struct scatterlist sg;
+       struct page *page = e->pages;
+       struct page *tmp;
+       unsigned len;
+
+       desc.tfm = tfm;
+       desc.flags = 0;
+
+       sg_init_table(&sg, 1);
+       crypto_hash_init(&desc);
+
+       while ((tmp = page_chain_next(page))) {
+               /* all but the last page will be fully used */
+               sg_set_page(&sg, page, PAGE_SIZE, 0);
+               crypto_hash_update(&desc, &sg, sg.length);
+               page = tmp;
+       }
+       /* and now the last, possibly only partially used page */
+       len = e->size & (PAGE_SIZE - 1);
+       sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+       crypto_hash_update(&desc, &sg, sg.length);
+       crypto_hash_final(&desc, digest);
+}
+
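The tail-length computation in drbd_csum_ee() is worth a worked example: e->size & (PAGE_SIZE - 1) is the remainder modulo the page size, and the GNU ?: operator falls back to a full page when it is zero. With PAGE_SIZE = 4096, e->size = 10240 gives 10240 & 4095 = 2048, so the last page contributes 2048 bytes; e->size = 8192 gives 0, and the full 4096 bytes of the final page are hashed instead.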
+void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
 {
        struct hash_desc desc;
        struct scatterlist sg;
@@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel
                return 1;
        }
 
-       if (likely(drbd_bio_uptodate(e->private_bio))) {
+       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                digest_size = crypto_hash_digestsize(mdev->csums_tfm);
                digest = kmalloc(digest_size, GFP_NOIO);
                if (digest) {
-                       drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+                       drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
 
                        inc_rs_pending(mdev);
                        ok = drbd_send_drequest_csum(mdev,
@@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
        /* GFP_TRY, because if there is no memory available right now, this may
         * be rescheduled for later. It is "only" background resync, after all. */
        e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
-       if (!e) {
-               put_ldev(mdev);
-               return 2;
-       }
+       if (!e)
+               goto fail;
 
        spin_lock_irq(&mdev->req_lock);
        list_add(&e->w.list, &mdev->read_ee);
        spin_unlock_irq(&mdev->req_lock);
 
-       e->private_bio->bi_end_io = drbd_endio_read_sec;
-       e->private_bio->bi_rw = READ;
        e->w.cb = w_e_send_csum;
+       if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+               return 1;
 
-       mdev->read_cnt += size >> 9;
-       drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
-
-       return 1;
+       drbd_free_ee(mdev, e);
+fail:
+       put_ldev(mdev);
+       return 2;
 }
 
 void resync_timer_fn(unsigned long data)
@@ -414,13 +444,25 @@ void resync_timer_fn(unsigned long data)
                drbd_queue_work(&mdev->data.work, &mdev->resync_work);
 }
 
+static int calc_resync_rate(struct drbd_conf *mdev)
+{
+       int d = mdev->data_delay / 1000; /* us -> ms */
+       int td = mdev->sync_conf.throttle_th * 100;  /* 0.1s -> ms */
+       int hd = mdev->sync_conf.hold_off_th * 100;  /* 0.1s -> ms */
+       int cr = mdev->sync_conf.rate;
+
+       return d <= td ? cr :
+               d >= hd ? 0 :
+               cr + (cr * (td - d) / (hd - td));
+}
+
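To make the interpolation in calc_resync_rate() concrete, take hypothetical tunables throttle_th = 2 (200 ms), hold_off_th = 10 (1000 ms) and a configured rate of 10000: a measured data_delay of 600 ms lies between the two thresholds, so the result is 10000 + 10000 * (200 - 600) / (1000 - 200) = 5000, i.e. half rate. At or below 200 ms the full configured rate is used, and at or above 1000 ms resync is held off entirely (rate 0).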
 int w_make_resync_request(struct drbd_conf *mdev,
                struct drbd_work *w, int cancel)
 {
        unsigned long bit;
        sector_t sector;
        const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
-       int max_segment_size = queue_max_segment_size(mdev->rq_queue);
+       int max_segment_size;
        int number, i, size, pe, mx;
        int align, queued, sndbuf;
 
@@ -446,7 +488,13 @@ int w_make_resync_request(struct drbd_conf *mdev,
                return 1;
        }
 
-       number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+       /* starting with drbd 8.3.8, we can handle multi-bio EEs,
+        * should it become necessary */
+       max_segment_size = mdev->agreed_pro_version < 94 ?
+               queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+
+       mdev->c_sync_rate = calc_resync_rate(mdev);
+       number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
        pe = atomic_read(&mdev->rs_pending_cnt);
 
        mutex_lock(&mdev->data.mutex);
@@ -509,12 +557,6 @@ next_sector:
                 *
                 * Additionally always align bigger requests, in order to
                 * be prepared for all stripe sizes of software RAIDs.
-                *
-                * we _do_ care about the agreed-upon q->max_segment_size
-                * here, as splitting up the requests on the other side is more
-                * difficult.  the consequence is, that on lvm and md and other
-                * "indirect" devices, this is dead code, since
-                * q->max_segment_size will be PAGE_SIZE.
                 */
                align = 1;
                for (;;) {
@@ -806,7 +848,7 @@ out:
 /* helper */
 static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
-       if (drbd_bio_has_active_page(e->private_bio)) {
+       if (drbd_ee_has_active_page(e)) {
                /* This might happen if sendpage() has not finished */
                spin_lock_irq(&mdev->req_lock);
                list_add_tail(&e->w.list, &mdev->net_ee);
@@ -832,7 +874,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
                return 1;
        }
 
-       if (likely(drbd_bio_uptodate(e->private_bio))) {
+       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                ok = drbd_send_block(mdev, P_DATA_REPLY, e);
        } else {
                if (__ratelimit(&drbd_ratelimit_state))
@@ -873,7 +915,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
                put_ldev(mdev);
        }
 
-       if (likely(drbd_bio_uptodate(e->private_bio))) {
+       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
                        inc_rs_pending(mdev);
                        ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -921,7 +963,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 
        di = (struct digest_info *)(unsigned long)e->block_id;
 
-       if (likely(drbd_bio_uptodate(e->private_bio))) {
+       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                /* quick hack to try to avoid a race against reconfiguration.
                 * a real fix would be much more involved,
                 * introducing more locking mechanisms */
@@ -931,7 +973,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
                        digest = kmalloc(digest_size, GFP_NOIO);
                }
                if (digest) {
-                       drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+                       drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
                        eq = !memcmp(digest, di->digest, digest_size);
                        kfree(digest);
                }
@@ -973,14 +1015,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
        if (unlikely(cancel))
                goto out;
 
-       if (unlikely(!drbd_bio_uptodate(e->private_bio)))
+       if (unlikely((e->flags & EE_WAS_ERROR) != 0))
                goto out;
 
        digest_size = crypto_hash_digestsize(mdev->verify_tfm);
        /* FIXME if this allocation fails, online verify will not terminate! */
        digest = kmalloc(digest_size, GFP_NOIO);
        if (digest) {
-               drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+               drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
                inc_rs_pending(mdev);
                ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
                                             digest, digest_size, P_OV_REPLY);
@@ -1029,11 +1071,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 
        di = (struct digest_info *)(unsigned long)e->block_id;
 
-       if (likely(drbd_bio_uptodate(e->private_bio))) {
+       if (likely((e->flags & EE_WAS_ERROR) == 0)) {
                digest_size = crypto_hash_digestsize(mdev->verify_tfm);
                digest = kmalloc(digest_size, GFP_NOIO);
                if (digest) {
-                       drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+                       drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
 
                        D_ASSERT(digest_size == di->digest_size);
                        eq = !memcmp(digest, di->digest, digest_size);
index f93fa11..defdb50 100644 (file)
@@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
 
 #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
 
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
-       struct bio_vec *bvec;
-       int i;
-
-       __bio_for_each_segment(bvec, bio, i, 0) {
-               if (page_count(bvec->bv_page) > 1)
-                       return 1;
-       }
-
-       return 0;
-}
-
 /* bi_end_io handlers */
 extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_endio_read_sec(struct bio *bio, int error);
-extern void drbd_endio_write_sec(struct bio *bio, int error);
+extern void drbd_endio_sec(struct bio *bio, int error);
 extern void drbd_endio_pri(struct bio *bio, int error);
 
 /*
index 3b128dc..33d6503 100644 (file)
@@ -407,32 +407,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive)
        return 0;
 }
 
-static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity)
+static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
 {
-       u64 set = min(capacity, drive->probed_capacity);
        u16 *id = drive->id;
        int lba48 = ata_id_lba48_enabled(id);
 
        if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
            ata_id_hpa_enabled(id) == 0)
-               goto out;
+               return;
 
        /*
         * according to the spec the SET MAX ADDRESS command shall be
         * immediately preceded by a READ NATIVE MAX ADDRESS command
         */
-       capacity = ide_disk_hpa_get_native_capacity(drive, lba48);
-       if (capacity == 0)
-               goto out;
-
-       set = ide_disk_hpa_set_capacity(drive, set, lba48);
-       if (set) {
-               /* needed for ->resume to disable HPA */
-               drive->dev_flags |= IDE_DFLAG_NOHPA;
-               return set;
-       }
-out:
-       return drive->capacity64;
+       if (!ide_disk_hpa_get_native_capacity(drive, lba48))
+               return;
+
+       if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
+               drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
 }
 
 static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
@@ -783,13 +775,13 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
 }
 
 const struct ide_disk_ops ide_ata_disk_ops = {
-       .check          = ide_disk_check,
-       .set_capacity   = ide_disk_set_capacity,
-       .get_capacity   = ide_disk_get_capacity,
-       .setup          = ide_disk_setup,
-       .flush          = ide_disk_flush,
-       .init_media     = ide_disk_init_media,
-       .set_doorlock   = ide_disk_set_doorlock,
-       .do_request     = ide_do_rw_disk,
-       .ioctl          = ide_disk_ioctl,
+       .check                  = ide_disk_check,
+       .unlock_native_capacity = ide_disk_unlock_native_capacity,
+       .get_capacity           = ide_disk_get_capacity,
+       .setup                  = ide_disk_setup,
+       .flush                  = ide_disk_flush,
+       .init_media             = ide_disk_init_media,
+       .set_doorlock           = ide_disk_set_doorlock,
+       .do_request             = ide_do_rw_disk,
+       .ioctl                  = ide_disk_ioctl,
 };
index c32d839..c102d23 100644 (file)
@@ -288,17 +288,14 @@ static int ide_gd_media_changed(struct gendisk *disk)
        return ret;
 }
 
-static unsigned long long ide_gd_set_capacity(struct gendisk *disk,
-                                             unsigned long long capacity)
+static void ide_gd_unlock_native_capacity(struct gendisk *disk)
 {
        struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
        ide_drive_t *drive = idkp->drive;
        const struct ide_disk_ops *disk_ops = drive->disk_ops;
 
-       if (disk_ops->set_capacity)
-               return disk_ops->set_capacity(drive, capacity);
-
-       return drive->capacity64;
+       if (disk_ops->unlock_native_capacity)
+               disk_ops->unlock_native_capacity(drive);
 }
 
 static int ide_gd_revalidate_disk(struct gendisk *disk)
@@ -329,7 +326,7 @@ static const struct block_device_operations ide_gd_ops = {
        .locked_ioctl           = ide_gd_ioctl,
        .getgeo                 = ide_gd_getgeo,
        .media_changed          = ide_gd_media_changed,
-       .set_capacity           = ide_gd_set_capacity,
+       .unlock_native_capacity = ide_gd_unlock_native_capacity,
        .revalidate_disk        = ide_gd_revalidate_disk
 };
 
index 6dcee88..55dcb78 100644 (file)
@@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
         */
        mutex_unlock(&bd_inode->i_mutex);
 
-       error = blkdev_issue_flush(bdev, NULL);
+       error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
        if (error == -EOPNOTSUPP)
                error = 0;
 
@@ -668,41 +668,209 @@ void bd_forget(struct inode *inode)
                iput(bdev->bd_inode);
 }
 
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whether @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+                        void *holder)
 {
-       int res;
-       spin_lock(&bdev_lock);
-
-       /* first decide result */
        if (bdev->bd_holder == holder)
-               res = 0;         /* already a holder */
+               return true;     /* already a holder */
        else if (bdev->bd_holder != NULL)
-               res = -EBUSY;    /* held by someone else */
+               return false;    /* held by someone else */
        else if (bdev->bd_contains == bdev)
-               res = 0;         /* is a whole device which isn't held */
+               return true;     /* is a whole device which isn't held */
 
-       else if (bdev->bd_contains->bd_holder == bd_claim)
-               res = 0;         /* is a partition of a device that is being partitioned */
-       else if (bdev->bd_contains->bd_holder != NULL)
-               res = -EBUSY;    /* is a partition of a held device */
+       else if (whole->bd_holder == bd_claim)
+               return true;     /* is a partition of a device that is being partitioned */
+       else if (whole->bd_holder != NULL)
+               return false;    /* is a partition of a held device */
        else
-               res = 0;         /* is a partition of an un-held device */
+               return true;     /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+                              struct block_device *whole, void *holder)
+{
+retry:
+       /* if someone else claimed, fail */
+       if (!bd_may_claim(bdev, whole, holder))
+               return -EBUSY;
+
+       /* if someone else is claiming, wait for it to finish */
+       if (whole->bd_claiming && whole->bd_claiming != holder) {
+               wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+               DEFINE_WAIT(wait);
+
+               prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+               spin_unlock(&bdev_lock);
+               schedule();
+               finish_wait(wq, &wait);
+               spin_lock(&bdev_lock);
+               goto retry;
+       }
+
+       /* yay, all mine */
+       return 0;
+}
+
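bd_prepare_to_claim() open-codes the classic drop-the-lock, sleep, retake, re-check-from-the-top wait loop; a bare bit waitqueue is used because struct block_device has no dedicated one. Below is a hedged userspace analog with pthreads, purely illustrative and not kernel API; unlike the kernel version it does not return to the caller with the lock still held:

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static void *claiming;			/* analog of whole->bd_claiming */

/* returns 0 on success, -1 if the device is already claimed (-EBUSY) */
static int prepare_to_claim(void *holder, int (*may_claim)(void *holder))
{
	pthread_mutex_lock(&lock);
	for (;;) {
		if (!may_claim(holder)) {	/* claimed by someone else */
			pthread_mutex_unlock(&lock);
			return -1;
		}
		if (claiming == NULL || claiming == holder)
			break;			/* no claim in flight: ours */
		/* another claim is in progress: sleep with the lock
		 * dropped, then re-evaluate everything from the top */
		pthread_cond_wait(&cond, &lock);
	}
	claiming = holder;
	pthread_mutex_unlock(&lock);
	return 0;
}

/* the finishing side clears the marker and wakes all waiters, much as
 * __bd_abort_claiming() does with wake_up_bit() */
static void finish_claiming(void)
{
	pthread_mutex_lock(&lock);
	claiming = NULL;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}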
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check whether @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming().  If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+                                             void *holder)
+{
+       struct gendisk *disk;
+       struct block_device *whole;
+       int partno, err;
+
+       might_sleep();
+
+       /*
+        * @bdev might not have been initialized properly yet; look up
+        * and grab the outer block device the hard way.
+        */
+       disk = get_gendisk(bdev->bd_dev, &partno);
+       if (!disk)
+               return ERR_PTR(-ENXIO);
+
+       whole = bdget_disk(disk, 0);
+       put_disk(disk);
+       if (!whole)
+               return ERR_PTR(-ENOMEM);
+
+       /* prepare to claim, if successful, mark claiming in progress */
+       spin_lock(&bdev_lock);
+
+       err = bd_prepare_to_claim(bdev, whole, holder);
+       if (err == 0) {
+               whole->bd_claiming = holder;
+               spin_unlock(&bdev_lock);
+               return whole;
+       } else {
+               spin_unlock(&bdev_lock);
+               bdput(whole);
+               return ERR_PTR(err);
+       }
+}
 
-       /* now impose change */
-       if (res==0) {
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+       BUG_ON(whole->bd_claiming != holder);
+       whole->bd_claiming = NULL;
+       wake_up_bit(&whole->bd_claiming, 0);
+
+       spin_unlock(&bdev_lock);
+       bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming().  Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+       spin_lock(&bdev_lock);
+       __bd_abort_claiming(whole, holder);             /* releases bdev_lock */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully.  This
+ * function may be called with or without preceding
+ * bd_start_claiming().  In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+       struct block_device *whole = bdev->bd_contains;
+       int res;
+
+       might_sleep();
+
+       spin_lock(&bdev_lock);
+
+       res = bd_prepare_to_claim(bdev, whole, holder);
+       if (res == 0) {
                /* note that for a whole device bd_holders
                 * will be incremented twice, and bd_holder will
                 * be set to bd_claim before being set to holder
                 */
-               bdev->bd_contains->bd_holders ++;
-               bdev->bd_contains->bd_holder = bd_claim;
+               whole->bd_holders++;
+               whole->bd_holder = bd_claim;
                bdev->bd_holders++;
                bdev->bd_holder = holder;
        }
-       spin_unlock(&bdev_lock);
+
+       if (whole->bd_claiming)
+               __bd_abort_claiming(whole, holder);     /* releases bdev_lock */
+       else
+               spin_unlock(&bdev_lock);
+
        return res;
 }
-
 EXPORT_SYMBOL(bd_claim);
 
 void bd_release(struct block_device *bdev)
@@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
+       struct block_device *whole = NULL;
        struct block_device *bdev;
        int res;
 
@@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
        if (bdev == NULL)
                return -ENOMEM;
 
+       if (filp->f_mode & FMODE_EXCL) {
+               whole = bd_start_claiming(bdev, filp);
+               if (IS_ERR(whole)) {
+                       bdput(bdev);
+                       return PTR_ERR(whole);
+               }
+       }
+
        filp->f_mapping = bdev->bd_inode->i_mapping;
 
        res = blkdev_get(bdev, filp->f_mode);
-       if (res)
-               return res;
 
-       if (filp->f_mode & FMODE_EXCL) {
-               res = bd_claim(bdev, filp);
-               if (res)
-                       goto out_blkdev_put;
+       if (whole) {
+               if (res == 0)
+                       BUG_ON(bd_claim(bdev, filp) != 0);
+               else
+                       bd_abort_claiming(whole, filp);
        }
 
-       return 0;
-
- out_blkdev_put:
-       blkdev_put(bdev, filp->f_mode);
        return res;
 }
 
@@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-       struct block_device *bdev;
-       int error = 0;
+       struct block_device *bdev, *whole;
+       int error;
 
        bdev = lookup_bdev(path);
        if (IS_ERR(bdev))
                return bdev;
 
+       whole = bd_start_claiming(bdev, holder);
+       if (IS_ERR(whole)) {
+               bdput(bdev);
+               return whole;
+       }
+
        error = blkdev_get(bdev, mode);
        if (error)
-               return ERR_PTR(error);
+               goto out_abort_claiming;
+
        error = -EACCES;
        if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-               goto blkdev_put;
-       error = bd_claim(bdev, holder);
-       if (error)
-               goto blkdev_put;
+               goto out_blkdev_put;
 
+       BUG_ON(bd_claim(bdev, holder) != 0);
        return bdev;
-       
-blkdev_put:
+
+out_blkdev_put:
        blkdev_put(bdev, mode);
+out_abort_claiming:
+       bd_abort_claiming(whole, holder);
        return ERR_PTR(error);
 }
 
index b34d32f..c6a4f45 100644 (file)
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
                                u64 start, u64 len)
 {
        blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-                            DISCARD_FL_BARRIER);
+                       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
index c9c266d..08e422d 100644 (file)
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
                return;
 
        invalidate_bh_lrus();
+       lru_add_drain_all();    /* make sure all lru add caches are flushed */
        invalidate_mapping_pages(mapping, 0, -1);
 }
 EXPORT_SYMBOL(invalidate_bdev);
index 26289e8..fcf7487 100644 (file)
@@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
         * storage
         */
        if (needs_barrier)
-               blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+               blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+                               BLKDEV_IFL_WAIT);
        return ret;
 }
index 0d0c323..ef3d980 100644 (file)
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
                if (ext4_should_writeback_data(inode) &&
                    (journal->j_fs_dev != journal->j_dev) &&
                    (journal->j_flags & JBD2_BARRIER))
-                       blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+                       blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+                                       NULL, BLKDEV_IFL_WAIT);
                jbd2_log_wait_commit(journal, commit_tid);
        } else if (journal->j_flags & JBD2_BARRIER)
-               blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+               blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+                       BLKDEV_IFL_WAIT);
        return ret;
 }
index 0a14074..f74d270 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/dnotify.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
        case F_NOTIFY:
                err = fcntl_dirnotify(fd, filp, arg);
                break;
+       case F_SETPIPE_SZ:
+       case F_GETPIPE_SZ:
+               err = pipe_fcntl(filp, cmd, arg);
+               break;
        default:
                break;
        }
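With pipe_fcntl() wired into do_fcntl() above, the two new commands become reachable from userspace. A hedged usage sketch follows; it assumes the constants exported by this series (F_LINUX_SPECIFIC_BASE + 7 and + 8, i.e. 1031 and 1032) and a kernel built with this patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SETPIPE_SZ
#define F_SETPIPE_SZ	1031	/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	1032	/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2];
	long sz;

	if (pipe(fds))
		return 1;
	/* ask for a 1 MiB buffer; the kernel may round the size up and
	 * can refuse over-large requests */
	if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)
		perror("F_SETPIPE_SZ");
	sz = fcntl(fds[1], F_GETPIPE_SZ);
	printf("pipe buffer is now %ld bytes\n", sz);
	return 0;
}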
index 4b37f7c..437a743 100644 (file)
@@ -45,6 +45,7 @@ struct wb_writeback_args {
        int for_kupdate:1;
        int range_cyclic:1;
        int for_background:1;
+       int sb_pinned:1;
 };
 
 /*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-                                struct wb_writeback_args *args)
+                                struct wb_writeback_args *args,
+                                int wait)
 {
        struct bdi_work *work;
 
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
        if (work) {
                bdi_work_init(work, args);
                bdi_queue_work(bdi, work);
+               if (wait)
+                       bdi_wait_on_work_clear(work);
        } else {
                struct bdi_writeback *wb = &bdi->wb;
 
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
                .sync_mode      = WB_SYNC_ALL,
                .nr_pages       = LONG_MAX,
                .range_cyclic   = 0,
+               /*
+                * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+                * let's make it explicit.
+                */
+               .sb_pinned      = 1,
        };
        struct bdi_work work;
 
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
 *   started when this function returns; we make no guarantees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-                        long nr_pages)
+                        long nr_pages, int sb_locked)
 {
        struct wb_writeback_args args = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .nr_pages       = nr_pages,
                .range_cyclic   = 1,
+               .sb_pinned      = sb_locked,
        };
 
        /*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
                args.for_background = 1;
        }
 
-       bdi_alloc_queue_work(bdi, &args);
+       bdi_alloc_queue_work(bdi, &args, sb_locked);
 }
 
 /*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
        BUG_ON(inode->i_state & I_SYNC);
 
-       /* Set I_SYNC, reset I_DIRTY */
-       dirty = inode->i_state & I_DIRTY;
+       /* Set I_SYNC, reset I_DIRTY_PAGES */
        inode->i_state |= I_SYNC;
-       inode->i_state &= ~I_DIRTY;
-
+       inode->i_state &= ~I_DIRTY_PAGES;
        spin_unlock(&inode_lock);
 
        ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                        ret = err;
        }
 
+       /*
+        * Some filesystems may redirty the inode during the writeback
+        * due to delalloc, clear dirty metadata flags right before
+        * write_inode()
+        */
+       spin_lock(&inode_lock);
+       dirty = inode->i_state & I_DIRTY;
+       inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+       spin_unlock(&inode_lock);
        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
        /*
         * Caller must already hold the ref for this
         */
-       if (wbc->sync_mode == WB_SYNC_ALL) {
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
                WARN_ON(!rwsem_is_locked(&sb->s_umount));
                return SB_NOT_PINNED;
        }
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                .for_kupdate            = args->for_kupdate,
                .for_background         = args->for_background,
                .range_cyclic           = args->range_cyclic,
+               .sb_pinned              = args->sb_pinned,
        };
        unsigned long oldest_jif;
        long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
        unsigned long expired;
        long nr_pages;
 
+       /*
+        * When set to zero, disable periodic writeback
+        */
+       if (!dirty_writeback_interval)
+               return 0;
+
        expired = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, expired))
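For scale: dirty_writeback_interval is kept in centiseconds, hence the multiplication by 10 before msecs_to_jiffies(); with the usual default of 500 the flusher wakes every 5000 ms, and the new zero check above disables that periodic wakeup altogether (the idle-wait branch added to bdi_writeback_task() below then parks the thread until work is queued).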
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
        while ((work = get_next_work_item(bdi, wb)) != NULL) {
                struct wb_writeback_args args = work->args;
+               int post_clear;
 
                /*
                 * Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
                if (force_wait)
                        work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
+               post_clear = args.sync_mode == WB_SYNC_ALL || args.sb_pinned;
+
                /*
                 * If this isn't a data integrity operation, just notify
                 * that we have seen this work and we are now starting it.
                 */
-               if (args.sync_mode == WB_SYNC_NONE)
+               if (!post_clear)
                        wb_clear_pending(wb, work);
 
                wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
                 * This is a data integrity writeback, so only do the
                 * notification when we have completed the work.
                 */
-               if (args.sync_mode == WB_SYNC_ALL)
+               if (post_clear)
                        wb_clear_pending(wb, work);
        }
 
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
                                break;
                }
 
-               wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-               schedule_timeout_interruptible(wait_jiffies);
+               if (dirty_writeback_interval) {
+                       wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+                       schedule_timeout_interruptible(wait_jiffies);
+               } else {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (list_empty_careful(&wb->bdi->work_list) &&
+                           !kthread_should_stop())
+                               schedule();
+                       __set_current_state(TASK_RUNNING);
+               }
+
                try_to_freeze();
        }
 
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
                if (!bdi_has_dirty_io(bdi))
                        continue;
 
-               bdi_alloc_queue_work(bdi, &args);
+               bdi_alloc_queue_work(bdi, &args, 0);
        }
 
        rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
        iput(old_inode);
 }
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+       long nr_to_write;
+
+       nr_to_write = nr_dirty + nr_unstable +
+                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+       bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
  * writeback_inodes_sb -       writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1194,17 +1243,22 @@ static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-       long nr_to_write;
-
-       nr_to_write = nr_dirty + nr_unstable +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-       bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+       __writeback_inodes_sb(sb, 0);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
+/**
+ * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+       __writeback_inodes_sb(sb, 1);
+}
+
 /**
  * writeback_inodes_sb_if_idle -       start writeback if none underway
  * @sb: the superblock
index 8bce73e..117fa41 100644 (file)
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
                                if ((start + nr_sects) != blk) {
                                        rv = blkdev_issue_discard(bdev, start,
                                                            nr_sects, GFP_NOFS,
-                                                           DISCARD_FL_BARRIER);
+                                                           BLKDEV_IFL_WAIT |
+                                                           BLKDEV_IFL_BARRIER);
                                        if (rv)
                                                goto fail;
                                        nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
        }
        if (nr_sects) {
                rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-                                        DISCARD_FL_BARRIER);
+                                        BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
                if (rv)
                        goto fail;
        }
index 30beb11..076d1cc 100644 (file)
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
         */
        if ((journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
-               blkdev_issue_flush(journal->j_fs_dev, NULL);
+               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+                       BLKDEV_IFL_WAIT);
        if (!(journal->j_flags & JBD2_ABORT))
                jbd2_journal_update_superblock(journal, 1);
        return 0;
index 671da7f..75716d3 100644 (file)
@@ -717,7 +717,8 @@ start_journal_io:
        if (commit_transaction->t_flushed_data_blocks &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
-               blkdev_issue_flush(journal->j_fs_dev, NULL);
+               blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+                       BLKDEV_IFL_WAIT);
 
        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
                if (err)
                        __jbd2_journal_abort_hard(journal);
                if (journal->j_flags & JBD2_BARRIER)
-                       blkdev_issue_flush(journal->j_dev, NULL);
+                       blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+                               BLKDEV_IFL_WAIT);
        }
 
        err = journal_finish_inode_data_buffers(journal, commit_transaction);
index a756168..8c10973 100644 (file)
@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
                                                   start * sects_per_block,
                                                   nblocks * sects_per_block,
                                                   GFP_NOFS,
-                                                  DISCARD_FL_BARRIER);
+                                                  BLKDEV_IFL_BARRIER);
                        if (ret < 0)
                                return ret;
                        nblocks = 0;
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
                ret = blkdev_issue_discard(nilfs->ns_bdev,
                                           start * sects_per_block,
                                           nblocks * sects_per_block,
-                                          GFP_NOFS, DISCARD_FL_BARRIER);
+                                          GFP_NOFS, BLKDEV_IFL_BARRIER);
        return ret;
 }
 
index a97b477..6921e78 100644 (file)
@@ -70,14 +70,14 @@ struct riscix_record {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
        defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-               unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+                           unsigned long first_sect, int slot,
+                           unsigned long nr_sects)
 {
        Sector sect;
        struct riscix_record *rr;
        
-       rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+       rr = read_part_sector(state, first_sect, &sect);
        if (!rr)
                return -1;
 
@@ -123,9 +123,9 @@ struct linux_part {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
        defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-               unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+                          unsigned long first_sect, int slot,
+                          unsigned long nr_sects)
 {
        Sector sect;
        struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 
        put_partition(state, slot++, first_sect, size);
 
-       linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+       linuxp = read_part_sector(state, first_sect, &sect);
        if (!linuxp)
                return -1;
 
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
        unsigned long first_sector = 0;
        unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
                struct adfs_discrecord *dr;
                unsigned int nr_sects;
 
-               data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+               data = read_part_sector(state, start_blk * 2 + 6, &sect);
                if (!data)
                        return -1;
 
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
                case PARTITION_RISCIX_SCSI:
                        /* RISCiX - we don't know how to find the next one. */
-                       slot = riscix_partition(state, bdev, first_sector,
-                                                slot, nr_sects);
+                       slot = riscix_partition(state, first_sector, slot,
+                                               nr_sects);
                        break;
 #endif
 
                case PARTITION_LINUX:
-                       slot = linux_partition(state, bdev, first_sector,
-                                               slot, nr_sects);
+                       slot = linux_partition(state, first_sector, slot,
+                                              nr_sects);
                        break;
                }
                put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  *         hda1 = ADFS partition on first drive.
  *         hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
        unsigned long start_sect, nr_sects, sectscyl, heads;
        Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
        unsigned char id;
        int slot = 1;
 
-       data = read_dev_sector(bdev, 6, &sect);
+       data = read_part_sector(state, 6, &sect);
        if (!data)
                return -1;
 
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
        /*
         * Work out start of non-adfs partition.
         */
-       nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+       nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
        if (start_sect) {
                switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
                case PARTITION_RISCIX_SCSI:
                case PARTITION_RISCIX_MFM:
-                       slot = riscix_partition(state, bdev, start_sect,
-                                                slot, nr_sects);
+                       slot = riscix_partition(state, start_sect, slot,
+                                               nr_sects);
                        break;
 #endif
 
                case PARTITION_LINUX:
-                       slot = linux_partition(state, bdev, start_sect,
-                                               slot, nr_sects);
+                       slot = linux_partition(state, start_sect, slot,
+                                              nr_sects);
                        break;
                }
        }
@@ -308,10 +306,11 @@ struct ics_part {
        __le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+                                  unsigned long block)
 {
        Sector sect;
-       unsigned char *data = read_dev_sector(bdev, block, &sect);
+       unsigned char *data = read_part_sector(state, block, &sect);
        int result = 0;
 
        if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  *         hda2 = ADFS partition 1 on first drive.
  *             ..etc..
  */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
        const unsigned char *data;
        const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
        /*
         * Try ICS style partitions - sector 0 contains partition info.
         */
-       data = read_dev_sector(bdev, 0, &sect);
+       data = read_part_sector(state, 0, &sect);
        if (!data)
                return -1;
 
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
                         * partition is.  We must not make this visible
                         * to the filesystem.
                         */
-                       if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+                       if (size > 1 && adfspart_check_ICSLinux(state, start)) {
                                start += 1;
                                size -= 1;
                        }
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  *         hda2 = ADFS partition 1 on first drive.
  *             ..etc..
  */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
        Sector sect;
        const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
        int slot = 1;
        int i;
 
-       data = read_dev_sector(bdev, 0, &sect);
+       data = read_part_sector(state, 0, &sect);
        if (!data)
                return -1;
 
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
  *  1. The individual ADFS boot block entries that are placed on the disk.
  *  2. The start address of the next entry.
  */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
        Sector sect;
        const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
        sector_t start = 0;
        int i, slot = 1;
 
-       data = read_dev_sector(bdev, 7, &sect);
+       data = read_part_sector(state, 7, &sect);
        if (!data)
                return -1;
 
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
        if (i != 0) {
                sector_t size;
 
-               size = get_capacity(bdev->bd_disk);
+               size = get_capacity(state->bdev->bd_disk);
                put_partition(state, slot++, start, size - start);
                printk("\n");
        }
index 81fd50e..ede8285 100644 (file)
@@ -7,8 +7,8 @@
  *  format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
index 9917a8c..ba443d4 100644 (file)
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
        return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
        Sector sect;
        unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
        for (blk = 0; ; blk++, put_dev_sector(sect)) {
                if (blk == RDB_ALLOCATION_LIMIT)
                        goto rdb_done;
-               data = read_dev_sector(bdev, blk, &sect);
+               data = read_part_sector(state, blk, &sect);
                if (!data) {
                        if (warn_no_part)
                                printk("Dev %s: unable to read RDB block %d\n",
-                                      bdevname(bdev, b), blk);
+                                      bdevname(state->bdev, b), blk);
                        res = -1;
                        goto rdb_done;
                }
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
                }
 
                printk("Dev %s: RDB in block %d has bad checksum\n",
-                              bdevname(bdev, b), blk);
+                      bdevname(state->bdev, b), blk);
        }
 
        /* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
        put_dev_sector(sect);
        for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
                blk *= blksize; /* Read in terms partition table understands */
-               data = read_dev_sector(bdev, blk, &sect);
+               data = read_part_sector(state, blk, &sect);
                if (!data) {
                        if (warn_no_part)
                                printk("Dev %s: unable to read partition block %d\n",
-                                      bdevname(bdev, b), blk);
+                                      bdevname(state->bdev, b), blk);
                        res = -1;
                        goto rdb_done;
                }
index 2f3e9ce..d094585 100644 (file)
@@ -2,5 +2,5 @@
  *  fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 
index 1f3572d..4439ff1 100644 (file)
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
                memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
        Sector sect;
        struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
        int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-       rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+       rs = read_part_sector(state, 0, &sect);
        if (!rs)
                return -1;
 
        /* Verify this is an Atari rootsector: */
-       hd_size = bdev->bd_inode->i_size >> 9;
+       hd_size = state->bdev->bd_inode->i_size >> 9;
        if (!VALID_PARTITION(&rs->part[0], hd_size) &&
            !VALID_PARTITION(&rs->part[1], hd_size) &&
            !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
                printk(" XGM<");
                partsect = extensect = be32_to_cpu(pi->st);
                while (1) {
-                       xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+                       xrs = read_part_sector(state, partsect, &sect2);
                        if (!xrs) {
                                printk (" block %ld read failed\n", partsect);
                                put_dev_sector(sect);
index 63186b0..fe2d32a 100644 (file)
@@ -31,4 +31,4 @@ struct rootsector
   u16 checksum;                        /* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
index e238ab2..5dcd4b0 100644 (file)
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
        /*
         * Probe partition formats with tables at disk address 0
         * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        struct parsed_partitions *state;
        int i, res, err;
 
-       state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+       state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
        if (!state)
                return NULL;
 
+       state->bdev = bdev;
        disk_name(hd, 0, state->name);
        printk(KERN_INFO " %s:", state->name);
        if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        i = res = err = 0;
        while (!res && check_part[i]) {
                memset(&state->parts, 0, sizeof(state->parts));
-               res = check_part[i++](state, bdev);
+               res = check_part[i++](state);
                if (res < 0) {
                        /* We have hit an I/O error which we don't report now.
                        * But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        }
        if (res > 0)
                return state;
+       if (state->access_beyond_eod)
+               err = -ENOSPC;
        if (err)
        /* The partition is unrecognized. So report I/O errors if there were any */
                res = err;
@@ -538,12 +541,33 @@ exit:
        disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+       const struct block_device_operations *bdops = disk->fops;
+
+       if (bdops->unlock_native_capacity &&
+           !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+               printk(KERN_CONT "enabling native capacity\n");
+               bdops->unlock_native_capacity(disk);
+               disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+               return true;
+       } else {
+               printk(KERN_CONT "truncated\n");
+               return false;
+       }
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+       struct parsed_partitions *state = NULL;
        struct disk_part_iter piter;
        struct hd_struct *part;
-       struct parsed_partitions *state;
        int p, highest, res;
+rescan:
+       if (state && !IS_ERR(state)) {
+               kfree(state);
+               state = NULL;
+       }
 
        if (bdev->bd_part_count)
                return -EBUSY;
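
With the rescan label and the state pointer hoisted to the top of rescan_partitions(), the hunks below turn the function into a retry loop: every indication that the partition table lies past the end of the device earns one attempt to unlock native capacity, then a full restart. A condensed, illustrative view of the resulting control flow (error paths trimmed):

	struct parsed_partitions *state = NULL;
rescan:
	if (state && !IS_ERR(state)) {
		kfree(state);		/* drop the previous pass's result */
		state = NULL;
	}
	state = check_partition(disk, bdev);
	if (IS_ERR(state)) {
		/* -ENOSPC: some detector read wholly beyond EOD */
		if (PTR_ERR(state) == -ENOSPC &&
		    disk_unlock_native_capacity(disk))
			goto rescan;
		return -EIO;
	}
	if (state->access_beyond_eod &&	/* table read, but truncated */
	    disk_unlock_native_capacity(disk))
		goto rescan;
	/* ... add partitions; a start or size past EOD also retries ... */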
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
        bdev->bd_invalidated = 0;
        if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
                return 0;
-       if (IS_ERR(state))      /* I/O error reading the partition table */
+       if (IS_ERR(state)) {
+               /*
+                * I/O error reading the partition table.  If any
+                * partition code tried to read beyond EOD, retry
+                * after unlocking native capacity.
+                */
+               if (PTR_ERR(state) == -ENOSPC) {
+                       printk(KERN_WARNING "%s: partition table beyond EOD, ",
+                              disk->disk_name);
+                       if (disk_unlock_native_capacity(disk))
+                               goto rescan;
+               }
                return -EIO;
+       }
+       /*
+        * If any partition code tried to read beyond EOD, try
+        * unlocking native capacity even if partition table is
+        * successfully read as we could be missing some partitions.
+        */
+       if (state->access_beyond_eod) {
+               printk(KERN_WARNING
+                      "%s: partition table partially beyond EOD, ",
+                      disk->disk_name);
+               if (disk_unlock_native_capacity(disk))
+                       goto rescan;
+       }
 
        /* tell userspace that the media / partition table may have changed */
        kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
        /* add partitions */
        for (p = 1; p < state->limit; p++) {
                sector_t size, from;
-try_scan:
+
                size = state->parts[p].size;
                if (!size)
                        continue;
@@ -589,30 +637,21 @@ try_scan:
                from = state->parts[p].from;
                if (from >= get_capacity(disk)) {
                        printk(KERN_WARNING
-                              "%s: p%d ignored, start %llu is behind the end of the disk\n",
+                              "%s: p%d start %llu is beyond EOD, ",
                               disk->disk_name, p, (unsigned long long) from);
+                       if (disk_unlock_native_capacity(disk))
+                               goto rescan;
                        continue;
                }
 
                if (from + size > get_capacity(disk)) {
-                       const struct block_device_operations *bdops = disk->fops;
-                       unsigned long long capacity;
-
                        printk(KERN_WARNING
-                              "%s: p%d size %llu exceeds device capacity, ",
+                              "%s: p%d size %llu extends beyond EOD, ",
                               disk->disk_name, p, (unsigned long long) size);
 
-                       if (bdops->set_capacity &&
-                           (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-                               printk(KERN_CONT "enabling native capacity\n");
-                               capacity = bdops->set_capacity(disk, ~0ULL);
-                               disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-                               if (capacity > get_capacity(disk)) {
-                                       set_capacity(disk, capacity);
-                                       check_disk_size_change(disk, bdev);
-                                       bdev->bd_invalidated = 0;
-                               }
-                               goto try_scan;
+                       if (disk_unlock_native_capacity(disk)) {
+                               /* free state and restart */
+                               goto rescan;
                        } else {
                                /*
                                 * we can not ignore partitions of broken tables
@@ -620,7 +659,6 @@ try_scan:
                                 * we limit them to the end of the disk to avoid
                                 * creating invalid block devices
                                 */
-                               printk(KERN_CONT "limited to end of disk\n");
                                size = get_capacity(disk) - from;
                        }
                }
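
The rewritten tail of check.c also changes the driver-side contract: the old ->set_capacity(disk, ~0ULL) hook, which handed a new capacity back for the local try_scan loop to apply, is replaced by ->unlock_native_capacity(), and the capacity update plus re-read now happen through the full rescan path instead. A hypothetical driver hook under the new contract (all mydrv_* names are invented for illustration; only the unlock_native_capacity member itself comes from this series):

struct mydrv_device;				/* opaque driver state */
static bool mydrv_disable_protected_area(struct mydrv_device *dev);
static sector_t mydrv_native_sectors(struct mydrv_device *dev);

static void mydrv_unlock_native_capacity(struct gendisk *disk)
{
	struct mydrv_device *dev = disk->private_data;

	/* Drop an HPA-style clip, then publish the real device size. */
	if (mydrv_disable_protected_area(dev))
		set_capacity(disk, mydrv_native_sectors(dev));
}

static const struct block_device_operations mydrv_fops = {
	.owner			= THIS_MODULE,
	.unlock_native_capacity	= mydrv_unlock_native_capacity,
};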
index 98dbe1a..52f8bd3 100644 (file)
@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+       struct block_device *bdev;
        char name[BDEVNAME_SIZE];
        struct {
                sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
        } parts[DISK_MAX_PARTS];
        int next;
        int limit;
+       bool access_beyond_eod;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+                                    sector_t n, Sector *p)
+{
+       if (n >= get_capacity(state->bdev->bd_disk)) {
+               state->access_beyond_eod = true;
+               return NULL;
+       }
+       return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
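
The read_part_sector() wrapper above is the single point where beyond-EOD access is caught: a NULL return now means either a genuine read failure or an out-of-range sector, and only the new access_beyond_eod flag tells the two apart. Illustrative caller code inside a detector (hypothetical, not part of the patch):

	Sector sect;
	unsigned char *data = read_part_sector(state, n, &sect);

	if (!data) {
		/*
		 * Either the read failed or n was past the capacity; in
		 * the latter case state->access_beyond_eod is now set,
		 * which check_partition() turns into -ENOSPC so that
		 * rescan_partitions() can try unlocking native capacity.
		 */
		return -1;
	}
	/* ... parse the sector ... */
	put_dev_sector(sect);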
index 91babda..9e346c1 100644 (file)
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  *  the part[0] entry for this disk, and is the number of
  *  physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
        if (!bdev || !bdev->bd_inode)
                return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description:  Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+                      u64 lba, u8 *buffer, size_t count)
 {
        size_t totalreadcount = 0;
+       struct block_device *bdev = state->bdev;
        sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-       if (!bdev || !buffer || lba > last_lba(bdev))
+       if (!buffer || lba > last_lba(bdev))
                 return 0;
 
        while (count) {
                int copied = 512;
                Sector sect;
-               unsigned char *data = read_dev_sector(bdev, n++, &sect);
+               unsigned char *data = read_part_sector(state, n++, &sect);
                if (!data)
                        break;
                if (copied > count)
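
read_lba() still takes its address in logical blocks but reads through the 512-byte sector interface, so the scaling at the top of the function is what keeps large-sector devices correct. A worked example (illustrative):

/*
 * On a device with 4096-byte logical blocks,
 *   read_lba(state, 6, buf, 4096)
 * computes n = 6 * (4096 / 512) = 48, then loops over
 * read_part_sector() for 512-byte sectors 48..55, copying 512
 * bytes per iteration until count reaches zero.
 */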
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
  * Notes: remember to free pte when you're done!
  */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+                                        gpt_header *gpt)
 {
        size_t count;
        gpt_entry *pte;
-       if (!bdev || !gpt)
+
+       if (!gpt)
                return NULL;
 
        count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
        if (!pte)
                return NULL;
 
-       if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+       if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
                      (u8 *) pte,
                     count) < count) {
                kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @lba from @state->bdev.
  * Note: remember to free gpt when finished with it.
  */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+                                        u64 lba)
 {
        gpt_header *gpt;
-       unsigned ssz = bdev_logical_block_size(bdev);
-
-       if (!bdev)
-               return NULL;
+       unsigned ssz = bdev_logical_block_size(state->bdev);
 
        gpt = kzalloc(ssz, GFP_KERNEL);
        if (!gpt)
                return NULL;
 
-       if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+       if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
                kfree(gpt);
                 gpt=NULL;
                return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-            gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+                       gpt_header **gpt, gpt_entry **ptes)
 {
        u32 crc, origcrc;
        u64 lastlba;
 
-       if (!bdev || !gpt || !ptes)
+       if (!ptes)
                return 0;
-       if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+       if (!(*gpt = alloc_read_gpt_header(state, lba)))
                return 0;
 
        /* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
        /* Check the first_usable_lba and last_usable_lba are
         * within the disk.
         */
-       lastlba = last_lba(bdev);
+       lastlba = last_lba(state->bdev);
        if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
                pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
                         (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
                goto fail;
        }
 
-       if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+       if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
                goto fail;
 
        /* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers