Merge branch 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block
author	Linus Torvalds <torvalds@linux-foundation.org>
Sat, 23 Oct 2010 00:00:32 +0000 (17:00 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Sat, 23 Oct 2010 00:00:32 +0000 (17:00 -0700)
* 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block: (39 commits)
  cfq-iosched: Fix a gcc 4.5 warning and put some comments
  block: Turn bvec_k{un,}map_irq() into static inline functions
  block: fix accounting bug on cross partition merges
  block: Make the integrity mapped property a bio flag
  block: Fix double free in blk_integrity_unregister
  block: Ensure physical block size is unsigned int
  blkio-throttle: Fix possible multiplication overflow in iops calculations
  blkio-throttle: limit max iops value to UINT_MAX
  blkio-throttle: There is no need to convert jiffies to milli seconds
  blkio-throttle: Fix link failure failure on i386
  blkio: Recalculate the throttled bio dispatch time upon throttle limit change
  blkio: Add root group to td->tg_list
  blkio: deletion of a cgroup was causes oops
  blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n
  block: set the bounce_pfn to the actual DMA limit rather than to max memory
  block: revert bad fix for memory hotplug causing bounces
  Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK
  block: set the bounce_pfn to the actual DMA limit rather than to max memory
  block: Prevent hang_check firing during long I/O
  cfq: improve fsync performance for small files
  ...

Fix up trivial conflicts due to __rcu sparse annotation in include/linux/genhd.h

15 files changed:
Documentation/cgroups/blkio-controller.txt
block/blk-cgroup.c
block/blk-core.c
block/blk-map.c
block/blk-merge.c
block/blk-sysfs.c
block/blk.h
block/cfq-iosched.c
drivers/scsi/scsi_lib.c
drivers/scsi/sg.c
include/linux/elevator.h
include/linux/genhd.h
include/linux/kernel.h
include/linux/sched.h
init/Kconfig

@@@ -8,12 -8,17 +8,17 @@@ both at leaf nodes as well as at interm
  Plan is to use the same cgroup based management interface for blkio controller
  and based on user options switch IO policies in the background.
  
- In the first phase, this patchset implements proportional weight time based
- division of disk policy. It is implemented in CFQ. Hence this policy takes
- effect only on leaf nodes when CFQ is being used.
+ Currently two IO control policies are implemented. The first one is
+ proportional weight time based division of disk policy. It is implemented
+ in CFQ. Hence this policy takes effect only on leaf nodes when CFQ is being
+ used. The second one is a throttling policy, which can be used to specify
+ upper IO rate limits on devices. This policy is implemented in the generic
+ block layer and can be used on leaf nodes as well as on higher level
+ logical devices like device mapper.
  
  HOWTO
  =====
+ Proportional Weight division of bandwidth
+ -----------------------------------------
  You can do a very simple testing of running two dd threads in two different
  cgroups. Here is what you can do.
  
    group dispatched to the disk. We provide fairness in terms of disk time, so
    ideally io.disk_time of cgroups should be in proportion to the weight.
  
+ Throttling/Upper Limit policy
+ -----------------------------
+ - Enable Block IO controller
+       CONFIG_BLK_CGROUP=y
+ - Enable throttling in block layer
+       CONFIG_BLK_DEV_THROTTLING=y
+ - Mount blkio controller
+         mount -t cgroup -o blkio none /cgroup/blkio
+ - Specify a bandwidth rate on a particular device for the root group. The
+   format of the policy is "<major>:<minor>  <bytes_per_second>".
+         echo "8:16  1048576" > /cgroup/blkio/blkio.throttle.read_bps_device
+   The above puts a limit of 1MB/second on reads for the root group on the
+   device with major/minor number 8:16.
+ - Run dd to read a file and see if the rate is throttled to 1MB/s or not.
+               # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 iflag=direct
+         1024+0 records in
+         1024+0 records out
+         4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+  Limits for writes can be set using the blkio.throttle.write_bps_device file.
  Various user visible config options
  ===================================
  CONFIG_BLK_CGROUP
@@@ -68,8 -102,13 +102,13 @@@ CONFIG_CFQ_GROUP_IOSCHE
        - Enables group scheduling in CFQ. Currently only 1 level of group
          creation is allowed.
  
+ CONFIG_BLK_DEV_THROTTLING
+       - Enable block device throttling support in block layer.
  Details of cgroup files
  =======================
+ Proportional weight policy files
+ --------------------------------
  - blkio.weight
        - Specifies per cgroup weight. This is default weight of the group
          on all the devices until and unless overridden by per device rule.
          and minor number of the device and third field specifies the number
          of times a group was dequeued from a particular device.
  
+ Throttling/Upper limit policy files
+ -----------------------------------
+ - blkio.throttle.read_bps_device
+       - Specifies upper limit on READ rate from the device. IO rate is
+         specified in bytes per second. Rules are per device. Following is
+         the format.
+   echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+ - blkio.throttle.write_bps_device
+       - Specifies upper limit on WRITE rate to the device. IO rate is
+         specified in bytes per second. Rules are per device. Following is
+         the format.
+   echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+ - blkio.throttle.read_iops_device
+       - Specifies upper limit on READ rate from the device. IO rate is
+         specified in IOs per second. Rules are per device. Following is
+         the format.
+   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+ - blkio.throttle.write_iops_device
+       - Specifies upper limit on WRITE rate to the device. IO rate is
+         specified in IOs per second. Rules are per device. Following is
+         the format.
+   echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+ Note: If both BW and IOPS rules are specified for a device, then IO is
+       subjected to both the constraints.
+ - blkio.throttle.io_serviced
+       - Number of IOs (bio) completed to/from the disk by the group (as
+         seen by throttling policy). These are further divided by the type
+         of operation - read or write, sync or async. First two fields specify
+         the major and minor number of the device, third field specifies the
+         operation type and the fourth field specifies the number of IOs.
+         blkio.io_serviced does accounting as seen by CFQ and counts are in
+         number of requests (struct request). On the other hand,
+         blkio.throttle.io_serviced counts the number of IOs in terms of
+         bios as seen by the throttling policy. These bios can later be
+         merged by the elevator, so the total number of completed requests
+         can be smaller.
+ - blkio.throttle.io_service_bytes
+       - Number of bytes transferred to/from the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of bytes.
+         These numbers should roughly be the same as blkio.io_service_bytes
+         as updated by CFQ. The difference between the two is that
+         blkio.io_service_bytes will not be updated if CFQ is not operating
+         on the request queue.
+ Common files among various policies
+ -----------------------------------
  - blkio.reset_stats
        - Writing an int to this file will result in resetting all the stats
          for that cgroup.
  CFQ sysfs tunable
  =================
  /sys/block/<disk>/queue/iosched/group_isolation
 +-----------------------------------------------
  
  If group_isolation=1, it provides stronger isolation between groups at the
  expense of throughput. By default group_isolation is 0. In general that
@@@ -244,33 -343,6 +344,33 @@@ By default one should run with group_is
  and one wants stronger isolation between groups, then set group_isolation=1
  but this will come at cost of reduced throughput.
  
 +/sys/block/<disk>/queue/iosched/slice_idle
 +------------------------------------------
 +On faster hardware CFQ can be slow, especially with sequential workloads.
 +This happens because CFQ idles on a single queue, and a single queue might
 +not drive deep enough request queue depths to keep the storage busy. In such
 +scenarios one can try setting slice_idle=0, which switches CFQ to IOPS
 +(IO operations per second) mode on NCQ-supporting hardware.
 +
 +That means CFQ will not idle between cfq queues of a cfq group and hence will
 +be able to drive higher queue depths and achieve better throughput. That also
 +means that cfq provides fairness among groups in terms of IOPS and not in
 +terms of disk time.
 +
 +/sys/block/<disk>/queue/iosched/group_idle
 +------------------------------------------
 +If one disables idling on individual cfq queues and cfq service trees by
 +setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
 +on the group in an attempt to provide fairness among groups.
 +
 +By default group_idle is the same as slice_idle and does not do anything if
 +slice_idle is enabled.
 +
 +You can experience an overall throughput drop if you have created multiple
 +groups and put applications in those groups which are not driving enough
 +IO to keep the disk busy. In that case set group_idle=0, and CFQ will not
 +idle on individual groups and throughput should improve.
 +
  What works
  ==========
  - Currently only sync IO queues are supported. All the buffered writes are
diff --combined block/blk-cgroup.c
@@@ -37,6 -37,12 +37,12 @@@ static void blkiocg_attach(struct cgrou
  static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
  static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
  
+ /* for encoding cft->private value on file */
+ #define BLKIOFILE_PRIVATE(x, val)     (((x) << 16) | (val))
+ /* What policy owns the file, proportional or throttle */
+ #define BLKIOFILE_POLICY(val)         (((val) >> 16) & 0xffff)
+ #define BLKIOFILE_ATTR(val)           ((val) & 0xffff)
  struct cgroup_subsys blkio_subsys = {
        .name = "blkio",
        .create = blkiocg_create,
@@@ -59,6 -65,27 +65,27 @@@ static inline void blkio_policy_insert_
        list_add(&pn->node, &blkcg->policy_list);
  }
  
+ static inline bool cftype_blkg_same_policy(struct cftype *cft,
+                       struct blkio_group *blkg)
+ {
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       if (blkg->plid == plid)
+               return 1;
+       return 0;
+ }
+ /* Determines if policy node matches cgroup file being accessed */
+ static inline bool pn_matches_cftype(struct cftype *cft,
+                       struct blkio_policy_node *pn)
+ {
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       int fileid = BLKIOFILE_ATTR(cft->private);
+       return (plid == pn->plid && fileid == pn->fileid);
+ }
  /* Must be called with blkcg->lock held */
  static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
  {
  
  /* Must be called with blkcg->lock held */
  static struct blkio_policy_node *
- blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+ blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
+               enum blkio_policy_id plid, int fileid)
  {
        struct blkio_policy_node *pn;
  
        list_for_each_entry(pn, &blkcg->policy_list, node) {
-               if (pn->dev == dev)
+               if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
                        return pn;
        }
  
@@@ -86,6 -114,67 +114,67 @@@ struct blkio_cgroup *cgroup_to_blkio_cg
  }
  EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
  
+ static inline void
+ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
+ {
+       struct blkio_policy_type *blkiop;
+       list_for_each_entry(blkiop, &blkio_list, list) {
+               /* If this policy does not own the blkg, do not send updates */
+               if (blkiop->plid != blkg->plid)
+                       continue;
+               if (blkiop->ops.blkio_update_group_weight_fn)
+                       blkiop->ops.blkio_update_group_weight_fn(blkg->key,
+                                                       blkg, weight);
+       }
+ }
+ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
+                               int fileid)
+ {
+       struct blkio_policy_type *blkiop;
+       list_for_each_entry(blkiop, &blkio_list, list) {
+               /* If this policy does not own the blkg, do not send updates */
+               if (blkiop->plid != blkg->plid)
+                       continue;
+               if (fileid == BLKIO_THROTL_read_bps_device
+                   && blkiop->ops.blkio_update_group_read_bps_fn)
+                       blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
+                                                               blkg, bps);
+               if (fileid == BLKIO_THROTL_write_bps_device
+                   && blkiop->ops.blkio_update_group_write_bps_fn)
+                       blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
+                                                               blkg, bps);
+       }
+ }
+ static inline void blkio_update_group_iops(struct blkio_group *blkg,
+                       unsigned int iops, int fileid)
+ {
+       struct blkio_policy_type *blkiop;
+       list_for_each_entry(blkiop, &blkio_list, list) {
+               /* If this policy does not own the blkg, do not send updates */
+               if (blkiop->plid != blkg->plid)
+                       continue;
+               if (fileid == BLKIO_THROTL_read_iops_device
+                   && blkiop->ops.blkio_update_group_read_iops_fn)
+                       blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
+                                                               blkg, iops);
+               if (fileid == BLKIO_THROTL_write_iops_device
+                   && blkiop->ops.blkio_update_group_write_iops_fn)
+                       blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
+                                                               blkg,iops);
+       }
+ }
  /*
   * Add to the appropriate stat variable depending on the request type.
   * This should be called with the blkg->stats_lock held.
@@@ -341,7 -430,8 +430,8 @@@ void blkiocg_update_io_merged_stats(str
  EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
  
  void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-                       struct blkio_group *blkg, void *key, dev_t dev)
+               struct blkio_group *blkg, void *key, dev_t dev,
+               enum blkio_policy_id plid)
  {
        unsigned long flags;
  
        rcu_assign_pointer(blkg->key, key);
        blkg->blkcg_id = css_id(&blkcg->css);
        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+       blkg->plid = plid;
        spin_unlock_irqrestore(&blkcg->lock, flags);
        /* Need to take css reference ? */
        cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@@ -408,51 -499,6 +499,6 @@@ struct blkio_group *blkiocg_lookup_grou
  }
  EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
  
- #define SHOW_FUNCTION(__VAR)                                          \
- static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,              \
-                                      struct cftype *cftype)           \
- {                                                                     \
-       struct blkio_cgroup *blkcg;                                     \
-                                                                       \
-       blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
-       return (u64)blkcg->__VAR;                                       \
- }
- SHOW_FUNCTION(weight);
- #undef SHOW_FUNCTION
- static int
- blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
- {
-       struct blkio_cgroup *blkcg;
-       struct blkio_group *blkg;
-       struct hlist_node *n;
-       struct blkio_policy_type *blkiop;
-       struct blkio_policy_node *pn;
-       if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
-               return -EINVAL;
-       blkcg = cgroup_to_blkio_cgroup(cgroup);
-       spin_lock(&blkio_list_lock);
-       spin_lock_irq(&blkcg->lock);
-       blkcg->weight = (unsigned int)val;
-       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-               pn = blkio_policy_search_node(blkcg, blkg->dev);
-               if (pn)
-                       continue;
-               list_for_each_entry(blkiop, &blkio_list, list)
-                       blkiop->ops.blkio_update_group_weight_fn(blkg,
-                                       blkcg->weight);
-       }
-       spin_unlock_irq(&blkcg->lock);
-       spin_unlock(&blkio_list_lock);
-       return 0;
- }
  static int
  blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
  {
@@@ -593,52 -639,6 +639,6 @@@ static uint64_t blkio_get_stat(struct b
        return disk_total;
  }
  
- #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)              \
- static int blkiocg_##__VAR##_read(struct cgroup *cgroup,              \
-               struct cftype *cftype, struct cgroup_map_cb *cb)        \
- {                                                                     \
-       struct blkio_cgroup *blkcg;                                     \
-       struct blkio_group *blkg;                                       \
-       struct hlist_node *n;                                           \
-       uint64_t cgroup_total = 0;                                      \
-                                                                       \
-       if (!cgroup_lock_live_group(cgroup))                            \
-               return -ENODEV;                                         \
-                                                                       \
-       blkcg = cgroup_to_blkio_cgroup(cgroup);                         \
-       rcu_read_lock();                                                \
-       hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
-               if (blkg->dev) {                                        \
-                       spin_lock_irq(&blkg->stats_lock);               \
-                       cgroup_total += blkio_get_stat(blkg, cb,        \
-                                               blkg->dev, type);       \
-                       spin_unlock_irq(&blkg->stats_lock);             \
-               }                                                       \
-       }                                                               \
-       if (show_total)                                                 \
-               cb->fill(cb, "Total", cgroup_total);                    \
-       rcu_read_unlock();                                              \
-       cgroup_unlock();                                                \
-       return 0;                                                       \
- }
- SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
- SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
- SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
- SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
- SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
- SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
- SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
- SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
- #ifdef CONFIG_DEBUG_BLK_CGROUP
- SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
- SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
- SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
- SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
- SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
- #endif
- #undef SHOW_FUNCTION_PER_GROUP
  static int blkio_check_dev_num(dev_t dev)
  {
        int part = 0;
  }
  
  static int blkio_policy_parse_and_set(char *buf,
-                                     struct blkio_policy_node *newpn)
+       struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
  {
        char *s[4], *p, *major_s = NULL, *minor_s = NULL;
        int ret;
        unsigned long major, minor, temp;
        int i = 0;
        dev_t dev;
+       u64 bps, iops;
  
        memset(s, 0, sizeof(s));
  
        if (s[1] == NULL)
                return -EINVAL;
  
-       ret = strict_strtoul(s[1], 10, &temp);
-       if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
-           temp > BLKIO_WEIGHT_MAX)
-               return -EINVAL;
+       switch (plid) {
+       case BLKIO_POLICY_PROP:
+               ret = strict_strtoul(s[1], 10, &temp);
+               if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+                       temp > BLKIO_WEIGHT_MAX)
+                       return -EINVAL;
  
-       newpn->weight =  temp;
+               newpn->plid = plid;
+               newpn->fileid = fileid;
+               newpn->val.weight = temp;
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       ret = strict_strtoull(s[1], 10, &bps);
+                       if (ret)
+                               return -EINVAL;
+                       newpn->plid = plid;
+                       newpn->fileid = fileid;
+                       newpn->val.bps = bps;
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       ret = strict_strtoull(s[1], 10, &iops);
+                       if (ret)
+                               return -EINVAL;
+                       if (iops > THROTL_IOPS_MAX)
+                               return -EINVAL;
+                       newpn->plid = plid;
+                       newpn->fileid = fileid;
+                       newpn->val.iops = (unsigned int)iops;
+                       break;
+               }
+               break;
+       default:
+               BUG();
+       }
  
        return 0;
  }
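
blkio_policy_parse_and_set() above consumes the same "<major>:<minor>  <value>"
strings that the documentation hunk earlier in this diff describes, and stores
the value as a weight, bps or iops figure depending on the policy and file id.
Below is a rough user-space sketch of just the string parsing, using the
standard strtoul()/strtoull() and makedev() rather than the kernel helpers;
parse_rule() is a made-up name for illustration, not part of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <sys/sysmacros.h>	/* makedev(), major(), minor() */
#include <sys/types.h>

/* Parse "<major>:<minor>  <value>" as written to the blkio cgroup files. */
static int parse_rule(const char *buf, dev_t *dev, unsigned long long *val)
{
	unsigned long maj, min;
	char *end;

	maj = strtoul(buf, &end, 10);
	if (*end != ':')
		return -1;
	min = strtoul(end + 1, &end, 10);
	if (*end != ' ' && *end != '\t')
		return -1;
	*val = strtoull(end, &end, 10);	/* strtoull() skips the whitespace */
	if (*end != '\0' && *end != '\n')
		return -1;
	*dev = makedev(maj, min);
	return 0;
}

int main(void)
{
	dev_t dev;
	unsigned long long bps;

	if (parse_rule("8:16  1048576", &dev, &bps) == 0)
		printf("dev %u:%u limited to %llu bytes/sec\n",
		       major(dev), minor(dev), bps);
	return 0;
}
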
@@@ -720,26 -756,180 +756,180 @@@ unsigned int blkcg_get_weight(struct bl
  {
        struct blkio_policy_node *pn;
  
-       pn = blkio_policy_search_node(blkcg, dev);
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
+                               BLKIO_PROP_weight_device);
        if (pn)
-               return pn->weight;
+               return pn->val.weight;
        else
                return blkcg->weight;
  }
  EXPORT_SYMBOL_GPL(blkcg_get_weight);
  
+ uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
+ {
+       struct blkio_policy_node *pn;
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_bps_device);
+       if (pn)
+               return pn->val.bps;
+       else
+               return -1;
+ }
+ uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
+ {
+       struct blkio_policy_node *pn;
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_bps_device);
+       if (pn)
+               return pn->val.bps;
+       else
+               return -1;
+ }
+ unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
+ {
+       struct blkio_policy_node *pn;
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_iops_device);
+       if (pn)
+               return pn->val.iops;
+       else
+               return -1;
+ }
+ unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
+ {
+       struct blkio_policy_node *pn;
+       pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_iops_device);
+       if (pn)
+               return pn->val.iops;
+       else
+               return -1;
+ }
+ /* Checks whether user asked for deleting a policy rule */
+ static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
+ {
+       switch(pn->plid) {
+       case BLKIO_POLICY_PROP:
+               if (pn->val.weight == 0)
+                       return 1;
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(pn->fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       if (pn->val.bps == 0)
+                               return 1;
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       if (pn->val.iops == 0)
+                               return 1;
+               }
+               break;
+       default:
+               BUG();
+       }
+       return 0;
+ }
+ static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
+                                       struct blkio_policy_node *newpn)
+ {
+       switch(oldpn->plid) {
+       case BLKIO_POLICY_PROP:
+               oldpn->val.weight = newpn->val.weight;
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(newpn->fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       oldpn->val.bps = newpn->val.bps;
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       oldpn->val.iops = newpn->val.iops;
+               }
+               break;
+       default:
+               BUG();
+       }
+ }
+ /*
+  * Some rules/values in blkg have changed. Propagate those to respective
+  * policies.
+  */
+ static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
+               struct blkio_group *blkg, struct blkio_policy_node *pn)
+ {
+       unsigned int weight, iops;
+       u64 bps;
+       switch(pn->plid) {
+       case BLKIO_POLICY_PROP:
+               weight = pn->val.weight ? pn->val.weight :
+                               blkcg->weight;
+               blkio_update_group_weight(blkg, weight);
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(pn->fileid) {
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+                       bps = pn->val.bps ? pn->val.bps : (-1);
+                       blkio_update_group_bps(blkg, bps, pn->fileid);
+                       break;
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       iops = pn->val.iops ? pn->val.iops : (-1);
+                       blkio_update_group_iops(blkg, iops, pn->fileid);
+                       break;
+               }
+               break;
+       default:
+               BUG();
+       }
+ }
+ /*
+  * A policy node rule has been updated. Propagate this update to all the
+  * block groups which might be affected by this update.
+  */
+ static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
+                               struct blkio_policy_node *pn)
+ {
+       struct blkio_group *blkg;
+       struct hlist_node *n;
+       spin_lock(&blkio_list_lock);
+       spin_lock_irq(&blkcg->lock);
+       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+               if (pn->dev != blkg->dev || pn->plid != blkg->plid)
+                       continue;
+               blkio_update_blkg_policy(blkcg, blkg, pn);
+       }
+       spin_unlock_irq(&blkcg->lock);
+       spin_unlock(&blkio_list_lock);
+ }
  
- static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
-                                      const char *buffer)
+ static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
  {
        int ret = 0;
        char *buf;
        struct blkio_policy_node *newpn, *pn;
        struct blkio_cgroup *blkcg;
-       struct blkio_group *blkg;
        int keep_newpn = 0;
-       struct hlist_node *n;
-       struct blkio_policy_type *blkiop;
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       int fileid = BLKIOFILE_ATTR(cft->private);
  
        buf = kstrdup(buffer, GFP_KERNEL);
        if (!buf)
                goto free_buf;
        }
  
-       ret = blkio_policy_parse_and_set(buf, newpn);
+       ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
        if (ret)
                goto free_newpn;
  
  
        spin_lock_irq(&blkcg->lock);
  
-       pn = blkio_policy_search_node(blkcg, newpn->dev);
+       pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
        if (!pn) {
-               if (newpn->weight != 0) {
+               if (!blkio_delete_rule_command(newpn)) {
                        blkio_policy_insert_node(blkcg, newpn);
                        keep_newpn = 1;
                }
                goto update_io_group;
        }
  
-       if (newpn->weight == 0) {
-               /* weight == 0 means deleteing a specific weight */
+       if (blkio_delete_rule_command(newpn)) {
                blkio_policy_delete_node(pn);
                spin_unlock_irq(&blkcg->lock);
                goto update_io_group;
        }
        spin_unlock_irq(&blkcg->lock);
  
-       pn->weight = newpn->weight;
+       blkio_update_policy_rule(pn, newpn);
  
  update_io_group:
-       /* update weight for each cfqg */
-       spin_lock(&blkio_list_lock);
-       spin_lock_irq(&blkcg->lock);
-       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-               if (newpn->dev == blkg->dev) {
-                       list_for_each_entry(blkiop, &blkio_list, list)
-                               blkiop->ops.blkio_update_group_weight_fn(blkg,
-                                                        newpn->weight ?
-                                                        newpn->weight :
-                                                        blkcg->weight);
-               }
-       }
-       spin_unlock_irq(&blkcg->lock);
-       spin_unlock(&blkio_list_lock);
+       blkio_update_policy_node_blkg(blkcg, newpn);
  
  free_newpn:
        if (!keep_newpn)
@@@ -805,23 -979,256 +979,256 @@@ free_buf
        return ret;
  }
  
- static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
-                                     struct seq_file *m)
+ static void
+ blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
  {
-       struct blkio_cgroup *blkcg;
-       struct blkio_policy_node *pn;
+       switch(pn->plid) {
+               case BLKIO_POLICY_PROP:
+                       if (pn->fileid == BLKIO_PROP_weight_device)
+                               seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+                                       MINOR(pn->dev), pn->val.weight);
+                       break;
+               case BLKIO_POLICY_THROTL:
+                       switch(pn->fileid) {
+                       case BLKIO_THROTL_read_bps_device:
+                       case BLKIO_THROTL_write_bps_device:
+                               seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
+                                       MINOR(pn->dev), pn->val.bps);
+                               break;
+                       case BLKIO_THROTL_read_iops_device:
+                       case BLKIO_THROTL_write_iops_device:
+                               seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+                                       MINOR(pn->dev), pn->val.iops);
+                               break;
+                       }
+                       break;
+               default:
+                       BUG();
+       }
+ }
  
-       seq_printf(m, "dev\tweight\n");
+ /* cgroup files which read their data from policy nodes end up here */
+ static void blkio_read_policy_node_files(struct cftype *cft,
+                       struct blkio_cgroup *blkcg, struct seq_file *m)
+ {
+       struct blkio_policy_node *pn;
  
-       blkcg = cgroup_to_blkio_cgroup(cgrp);
        if (!list_empty(&blkcg->policy_list)) {
                spin_lock_irq(&blkcg->lock);
                list_for_each_entry(pn, &blkcg->policy_list, node) {
-                       seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
-                                  MINOR(pn->dev), pn->weight);
+                       if (!pn_matches_cftype(cft, pn))
+                               continue;
+                       blkio_print_policy_node(m, pn);
                }
                spin_unlock_irq(&blkcg->lock);
        }
+ }
+ static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
+                               struct seq_file *m)
+ {
+       struct blkio_cgroup *blkcg;
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       int name = BLKIOFILE_ATTR(cft->private);
+       blkcg = cgroup_to_blkio_cgroup(cgrp);
+       switch(plid) {
+       case BLKIO_POLICY_PROP:
+               switch(name) {
+               case BLKIO_PROP_weight_device:
+                       blkio_read_policy_node_files(cft, blkcg, m);
+                       return 0;
+               default:
+                       BUG();
+               }
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(name){
+               case BLKIO_THROTL_read_bps_device:
+               case BLKIO_THROTL_write_bps_device:
+               case BLKIO_THROTL_read_iops_device:
+               case BLKIO_THROTL_write_iops_device:
+                       blkio_read_policy_node_files(cft, blkcg, m);
+                       return 0;
+               default:
+                       BUG();
+               }
+               break;
+       default:
+               BUG();
+       }
+       return 0;
+ }
+ static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
+               struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
+               bool show_total)
+ {
+       struct blkio_group *blkg;
+       struct hlist_node *n;
+       uint64_t cgroup_total = 0;
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+               if (blkg->dev) {
+                       if (!cftype_blkg_same_policy(cft, blkg))
+                               continue;
+                       spin_lock_irq(&blkg->stats_lock);
+                       cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
+                                               type);
+                       spin_unlock_irq(&blkg->stats_lock);
+               }
+       }
+       if (show_total)
+               cb->fill(cb, "Total", cgroup_total);
+       rcu_read_unlock();
+       return 0;
+ }
+ /* All map kind of cgroup file get serviced by this function */
+ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
+                               struct cgroup_map_cb *cb)
+ {
+       struct blkio_cgroup *blkcg;
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       int name = BLKIOFILE_ATTR(cft->private);
+       blkcg = cgroup_to_blkio_cgroup(cgrp);
+       switch(plid) {
+       case BLKIO_POLICY_PROP:
+               switch(name) {
+               case BLKIO_PROP_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_TIME, 0);
+               case BLKIO_PROP_sectors:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_SECTORS, 0);
+               case BLKIO_PROP_io_service_bytes:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_SERVICE_BYTES, 1);
+               case BLKIO_PROP_io_serviced:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_SERVICED, 1);
+               case BLKIO_PROP_io_service_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_SERVICE_TIME, 1);
+               case BLKIO_PROP_io_wait_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_WAIT_TIME, 1);
+               case BLKIO_PROP_io_merged:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_MERGED, 1);
+               case BLKIO_PROP_io_queued:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_QUEUED, 1);
+ #ifdef CONFIG_DEBUG_BLK_CGROUP
+               case BLKIO_PROP_dequeue:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_DEQUEUE, 0);
+               case BLKIO_PROP_avg_queue_size:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+               case BLKIO_PROP_group_wait_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_GROUP_WAIT_TIME, 0);
+               case BLKIO_PROP_idle_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_IDLE_TIME, 0);
+               case BLKIO_PROP_empty_time:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_EMPTY_TIME, 0);
+ #endif
+               default:
+                       BUG();
+               }
+               break;
+       case BLKIO_POLICY_THROTL:
+               switch(name){
+               case BLKIO_THROTL_io_service_bytes:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_SERVICE_BYTES, 1);
+               case BLKIO_THROTL_io_serviced:
+                       return blkio_read_blkg_stats(blkcg, cft, cb,
+                                               BLKIO_STAT_SERVICED, 1);
+               default:
+                       BUG();
+               }
+               break;
+       default:
+               BUG();
+       }
+       return 0;
+ }
+ static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
+ {
+       struct blkio_group *blkg;
+       struct hlist_node *n;
+       struct blkio_policy_node *pn;
+       if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+               return -EINVAL;
+       spin_lock(&blkio_list_lock);
+       spin_lock_irq(&blkcg->lock);
+       blkcg->weight = (unsigned int)val;
+       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+               pn = blkio_policy_search_node(blkcg, blkg->dev,
+                               BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
+               if (pn)
+                       continue;
+               blkio_update_group_weight(blkg, blkcg->weight);
+       }
+       spin_unlock_irq(&blkcg->lock);
+       spin_unlock(&blkio_list_lock);
+       return 0;
+ }
+ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
+       struct blkio_cgroup *blkcg;
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       int name = BLKIOFILE_ATTR(cft->private);
+       blkcg = cgroup_to_blkio_cgroup(cgrp);
+       switch(plid) {
+       case BLKIO_POLICY_PROP:
+               switch(name) {
+               case BLKIO_PROP_weight:
+                       return (u64)blkcg->weight;
+               }
+               break;
+       default:
+               BUG();
+       }
+       return 0;
+ }
+ static int
+ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+ {
+       struct blkio_cgroup *blkcg;
+       enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+       int name = BLKIOFILE_ATTR(cft->private);
+       blkcg = cgroup_to_blkio_cgroup(cgrp);
+       switch(plid) {
+       case BLKIO_POLICY_PROP:
+               switch(name) {
+               case BLKIO_PROP_weight:
+                       return blkio_weight_write(blkcg, val);
+               }
+               break;
+       default:
+               BUG();
+       }
  
        return 0;
  }
  struct cftype blkio_files[] = {
        {
                .name = "weight_device",
-               .read_seq_string = blkiocg_weight_device_read,
-               .write_string = blkiocg_weight_device_write,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_weight_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
                .max_write_len = 256,
        },
        {
                .name = "weight",
-               .read_u64 = blkiocg_weight_read,
-               .write_u64 = blkiocg_weight_write,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_weight),
+               .read_u64 = blkiocg_file_read_u64,
+               .write_u64 = blkiocg_file_write_u64,
        },
        {
                .name = "time",
-               .read_map = blkiocg_time_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_time),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "sectors",
-               .read_map = blkiocg_sectors_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_sectors),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_service_bytes",
-               .read_map = blkiocg_io_service_bytes_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_io_service_bytes),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_serviced",
-               .read_map = blkiocg_io_serviced_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_io_serviced),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_service_time",
-               .read_map = blkiocg_io_service_time_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_io_service_time),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_wait_time",
-               .read_map = blkiocg_io_wait_time_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_io_wait_time),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_merged",
-               .read_map = blkiocg_io_merged_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_io_merged),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "io_queued",
-               .read_map = blkiocg_io_queued_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_io_queued),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "reset_stats",
                .write_u64 = blkiocg_reset_stats,
        },
+ #ifdef CONFIG_BLK_DEV_THROTTLING
+       {
+               .name = "throttle.read_bps_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_bps_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+       {
+               .name = "throttle.write_bps_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_bps_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+       {
+               .name = "throttle.read_iops_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_read_iops_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+       {
+               .name = "throttle.write_iops_device",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_write_iops_device),
+               .read_seq_string = blkiocg_file_read,
+               .write_string = blkiocg_file_write,
+               .max_write_len = 256,
+       },
+       {
+               .name = "throttle.io_service_bytes",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_io_service_bytes),
+               .read_map = blkiocg_file_read_map,
+       },
+       {
+               .name = "throttle.io_serviced",
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
+                               BLKIO_THROTL_io_serviced),
+               .read_map = blkiocg_file_read_map,
+       },
+ #endif /* CONFIG_BLK_DEV_THROTTLING */
  #ifdef CONFIG_DEBUG_BLK_CGROUP
        {
                .name = "avg_queue_size",
-               .read_map = blkiocg_avg_queue_size_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_avg_queue_size),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "group_wait_time",
-               .read_map = blkiocg_group_wait_time_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_group_wait_time),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "idle_time",
-               .read_map = blkiocg_idle_time_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_idle_time),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "empty_time",
-               .read_map = blkiocg_empty_time_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_empty_time),
+               .read_map = blkiocg_file_read_map,
        },
        {
                .name = "dequeue",
-               .read_map = blkiocg_dequeue_read,
+               .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+                               BLKIO_PROP_dequeue),
+               .read_map = blkiocg_file_read_map,
        },
  #endif
  };
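
The blkio_files[] table above packs the owning policy and the per-file id into
cft->private using the BLKIOFILE_PRIVATE() macro added at the top of this
patch, and the shared read/write handlers decode it again with
BLKIOFILE_POLICY()/BLKIOFILE_ATTR(). A minimal stand-alone sketch of that
round trip follows; the macros are copied from the patch, while the enum
values here are illustrative stand-ins rather than the kernel's definitions.

#include <assert.h>
#include <stdio.h>

/* Same packing as the patch: policy id in the high 16 bits, file id in the low 16. */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

enum { BLKIO_POLICY_PROP, BLKIO_POLICY_THROTL };	/* illustrative */
enum { BLKIO_THROTL_read_bps_device = 100 };		/* illustrative value */

int main(void)
{
	unsigned int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
					      BLKIO_THROTL_read_bps_device);

	/* A handler such as blkiocg_file_write() recovers both halves: */
	assert(BLKIOFILE_POLICY(priv) == BLKIO_POLICY_THROTL);
	assert(BLKIOFILE_ATTR(priv) == BLKIO_THROTL_read_bps_device);
	printf("policy=%u attr=%u\n",
	       BLKIOFILE_POLICY(priv), BLKIOFILE_ATTR(priv));
	return 0;
}
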
@@@ -932,13 -1419,14 +1419,14 @@@ static void blkiocg_destroy(struct cgro
                /*
                 * This blkio_group is being unlinked as associated cgroup is
                 * going away. Let all the IO controlling policies know about
-                * this event. Currently this is static call to one io
-                * controlling policy. Once we have more policies in place, we
-                * need some dynamic registration of callback function.
+                * this event.
                 */
                spin_lock(&blkio_list_lock);
-               list_for_each_entry(blkiop, &blkio_list, list)
+               list_for_each_entry(blkiop, &blkio_list, list) {
+                       if (blkiop->plid != blkg->plid)
+                               continue;
                        blkiop->ops.blkio_unlink_group_fn(key, blkg);
+               }
                spin_unlock(&blkio_list_lock);
        } while (1);
  
@@@ -966,7 -1454,7 +1454,7 @@@ blkiocg_create(struct cgroup_subsys *su
  
        /* Currently we do not support hierarchy deeper than two level (0,1) */
        if (parent != cgroup->top_cgroup)
 -              return ERR_PTR(-EINVAL);
 +              return ERR_PTR(-EPERM);
  
        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
        if (!blkcg)
diff --combined block/blk-core.c
@@@ -64,13 -64,15 +64,15 @@@ static void drive_stat_acct(struct requ
                return;
  
        cpu = part_stat_lock();
-       part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
  
-       if (!new_io)
+       if (!new_io) {
+               part = rq->part;
                part_stat_inc(cpu, part, merges[rw]);
-       else {
+       } else {
+               part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
                part_round_stats(cpu, part);
                part_inc_in_flight(part, rw);
+               rq->part = part;
        }
  
        part_stat_unlock();
@@@ -128,6 -130,7 +130,7 @@@ void blk_rq_init(struct request_queue *
        rq->ref_count = 1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
+       rq->part = NULL;
  }
  EXPORT_SYMBOL(blk_rq_init);
  
@@@ -382,6 -385,7 +385,7 @@@ void blk_sync_queue(struct request_queu
        del_timer_sync(&q->unplug_timer);
        del_timer_sync(&q->timeout);
        cancel_work_sync(&q->unplug_work);
+       throtl_shutdown_timer_wq(q);
  }
  EXPORT_SYMBOL(blk_sync_queue);
  
@@@ -459,6 -463,8 +463,8 @@@ void blk_cleanup_queue(struct request_q
        if (q->elevator)
                elevator_exit(q->elevator);
  
+       blk_throtl_exit(q);
        blk_put_queue(q);
  }
  EXPORT_SYMBOL(blk_cleanup_queue);
@@@ -515,6 -521,11 +521,11 @@@ struct request_queue *blk_alloc_queue_n
                return NULL;
        }
  
+       if (blk_throtl_init(q)) {
+               kmem_cache_free(blk_requestq_cachep, q);
+               return NULL;
+       }
        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                    laptop_mode_timer_fn, (unsigned long) q);
        init_timer(&q->unplug_timer);
@@@ -796,11 -807,16 +807,16 @@@ static struct request *get_request(stru
        rl->starved[is_sync] = 0;
  
        priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-       if (priv)
+       if (priv) {
                rl->elvpriv++;
  
-       if (blk_queue_io_stat(q))
-               rw_flags |= REQ_IO_STAT;
+               /*
+                * Don't do stats for non-priv requests
+                */
+               if (blk_queue_io_stat(q))
+                       rw_flags |= REQ_IO_STAT;
+       }
        spin_unlock_irq(q->queue_lock);
  
        rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@@ -1198,9 -1214,9 +1214,9 @@@ static int __make_request(struct reques
        int el_ret;
        unsigned int bytes = bio->bi_size;
        const unsigned short prio = bio_prio(bio);
 -      const bool sync = (bio->bi_rw & REQ_SYNC);
 -      const bool unplug = (bio->bi_rw & REQ_UNPLUG);
 -      const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 +      const bool sync = !!(bio->bi_rw & REQ_SYNC);
 +      const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
 +      const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
        int rw_flags;
  
        if ((bio->bi_rw & REQ_HARDBARRIER) &&
@@@ -1522,6 -1538,15 +1538,15 @@@ static inline void __generic_make_reque
                        goto end_io;
                }
  
+               blk_throtl_bio(q, &bio);
+               /*
+                * If bio = NULL, bio has been throttled and will be submitted
+                * later.
+                */
+               if (!bio)
+                       break;
                trace_block_bio_queue(q, bio);
  
                ret = q->make_request_fn(q, bio);
@@@ -1612,11 -1637,12 +1637,12 @@@ void submit_bio(int rw, struct bio *bio
  
                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];
-                       printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+                       printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                        current->comm, task_pid_nr(current),
                                (rw & WRITE) ? "WRITE" : "READ",
                                (unsigned long long)bio->bi_sector,
-                               bdevname(bio->bi_bdev, b));
+                               bdevname(bio->bi_bdev, b),
+                               count);
                }
        }
  
@@@ -1759,7 -1785,7 +1785,7 @@@ static void blk_account_io_completion(s
                int cpu;
  
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                part_stat_unlock();
        }
@@@ -1779,7 -1805,7 +1805,7 @@@ static void blk_account_io_done(struct 
                int cpu;
  
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
  
                part_stat_inc(cpu, part, ios[rw]);
                part_stat_add(cpu, part, ticks[rw], duration);
@@@ -2579,6 -2605,13 +2605,13 @@@ int kblockd_schedule_work(struct reques
  }
  EXPORT_SYMBOL(kblockd_schedule_work);
  
+ int kblockd_schedule_delayed_work(struct request_queue *q,
+                       struct delayed_work *dwork, unsigned long delay)
+ {
+       return queue_delayed_work(kblockd_workqueue, dwork, delay);
+ }
+ EXPORT_SYMBOL(kblockd_schedule_delayed_work);
  int __init blk_dev_init(void)
  {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --combined block/blk-map.c
@@@ -54,7 -54,7 +54,7 @@@ static int __blk_rq_map_user(struct req
         * direct dma. else, set up kernel bounce buffers
         */
        uaddr = (unsigned long) ubuf;
-       if (blk_rq_aligned(q, ubuf, len) && !map_data)
+       if (blk_rq_aligned(q, uaddr, len) && !map_data)
                bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
        else
                bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@@ -288,6 -288,7 +288,7 @@@ int blk_rq_map_kern(struct request_queu
                    unsigned int len, gfp_t gfp_mask)
  {
        int reading = rq_data_dir(rq) == READ;
+       unsigned long addr = (unsigned long) kbuf;
        int do_copy = 0;
        struct bio *bio;
        int ret;
        if (!len || !kbuf)
                return -EINVAL;
  
-       do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf);
+       do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
        if (do_copy)
                bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
        else
                return PTR_ERR(bio);
  
        if (rq_data_dir(rq) == WRITE)
 -              bio->bi_rw |= (1 << REQ_WRITE);
 +              bio->bi_rw |= REQ_WRITE;
  
        if (do_copy)
                rq->cmd_flags |= REQ_COPY_USER;
diff --combined block/blk-merge.c
@@@ -205,12 -205,11 +205,11 @@@ static inline int ll_new_hw_segment(str
  {
        int nr_phys_segs = bio_phys_segments(q, bio);
  
-       if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
-               req->cmd_flags |= REQ_NOMERGE;
-               if (req == q->last_merge)
-                       q->last_merge = NULL;
-               return 0;
-       }
+       if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
+               goto no_merge;
+       if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
+               goto no_merge;
  
        /*
         * This will form the start of a new hw segment.  Bump both
         */
        req->nr_phys_segments += nr_phys_segs;
        return 1;
+ no_merge:
+       req->cmd_flags |= REQ_NOMERGE;
+       if (req == q->last_merge)
+               q->last_merge = NULL;
+       return 0;
  }
  
  int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@@ -301,6 -306,9 +306,9 @@@ static int ll_merge_requests_fn(struct 
        if (total_phys_segments > queue_max_segments(q))
                return 0;
  
+       if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
+               return 0;
        /* Merge is OK... */
        req->nr_phys_segments = total_phys_segments;
        return 1;
@@@ -343,7 -351,7 +351,7 @@@ static void blk_account_io_merge(struc
                int cpu;
  
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
  
                part_round_stats(cpu, part);
                part_dec_in_flight(part, rq_data_dir(req));
@@@ -361,18 -369,6 +369,18 @@@ static int attempt_merge(struct request
        if (!rq_mergeable(req) || !rq_mergeable(next))
                return 0;
  
 +      /*
 +       * Don't merge file system requests and discard requests
 +       */
 +      if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD))
 +              return 0;
 +
 +      /*
 +       * Don't merge discard requests and secure discard requests
 +       */
 +      if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE))
 +              return 0;
 +
        /*
         * not contiguous
         */
            || next->special)
                return 0;
  
-       if (blk_integrity_rq(req) != blk_integrity_rq(next))
-               return 0;
        /*
         * If we are allowed to merge, then append bio list
         * from next to rq and release next. merge_requests_fn
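
The two new checks in attempt_merge() keep file system, discard and secure
discard requests apart. The rule amounts to a flag-parity test; an
illustrative helper (not part of the patch) would look like:

        static bool merge_flags_compatible(struct request *a, struct request *b)
        {
                if ((a->cmd_flags & REQ_DISCARD) != (b->cmd_flags & REQ_DISCARD))
                        return false;   /* file system vs discard */
                if ((a->cmd_flags & REQ_SECURE) != (b->cmd_flags & REQ_SECURE))
                        return false;   /* discard vs secure discard */
                return true;
        }
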
diff --combined block/blk-sysfs.c
@@@ -112,6 -112,11 +112,11 @@@ static ssize_t queue_max_segments_show(
        return queue_var_show(queue_max_segments(q), (page));
  }
  
+ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
+ {
+       return queue_var_show(q->limits.max_integrity_segments, (page));
+ }
  static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
  {
        if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@@ -288,6 -293,11 +293,11 @@@ static struct queue_sysfs_entry queue_m
        .show = queue_max_segments_show,
  };
  
+ static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
+       .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
+       .show = queue_max_integrity_segments_show,
+ };
  static struct queue_sysfs_entry queue_max_segment_size_entry = {
        .attr = {.name = "max_segment_size", .mode = S_IRUGO },
        .show = queue_max_segment_size_show,
@@@ -375,6 -385,7 +385,7 @@@ static struct attribute *default_attrs[
        &queue_max_hw_sectors_entry.attr,
        &queue_max_sectors_entry.attr,
        &queue_max_segments_entry.attr,
+       &queue_max_integrity_segments_entry.attr,
        &queue_max_segment_size_entry.attr,
        &queue_iosched_entry.attr,
        &queue_hw_sector_size_entry.attr,
@@@ -511,7 -522,6 +522,7 @@@ int blk_register_queue(struct gendisk *
                kobject_uevent(&q->kobj, KOBJ_REMOVE);
                kobject_del(&q->kobj);
                blk_trace_remove_sysfs(disk_to_dev(disk));
 +              kobject_put(&dev->kobj);
                return ret;
        }
  
diff --combined block/blk.h
@@@ -110,10 -110,6 +110,6 @@@ void blk_queue_congestion_threshold(str
  
  int blk_dev_init(void);
  
- void elv_quiesce_start(struct request_queue *q);
- void elv_quiesce_end(struct request_queue *q);
  /*
   * Return the threshold (number of used requests) at which the queue is
   * considered to be congested.  It includes a little hysteresis to keep the
@@@ -132,28 -128,16 +128,20 @@@ static inline int queue_congestion_off_
        return q->nr_congestion_off;
  }
  
- #if defined(CONFIG_BLK_DEV_INTEGRITY)
- #define rq_for_each_integrity_segment(bvl, _rq, _iter)                \
-       __rq_for_each_bio(_iter.bio, _rq)                       \
-               bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
- #endif /* BLK_DEV_INTEGRITY */
  static inline int blk_cpu_to_group(int cpu)
  {
 +      int group = NR_CPUS;
  #ifdef CONFIG_SCHED_MC
        const struct cpumask *mask = cpu_coregroup_mask(cpu);
 -      return cpumask_first(mask);
 +      group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
 -      return cpumask_first(topology_thread_cpumask(cpu));
 +      group = cpumask_first(topology_thread_cpumask(cpu));
  #else
        return cpu;
  #endif
 +      if (likely(group < NR_CPUS))
 +              return group;
 +      return cpu;
  }
  
  /*
diff --combined block/cfq-iosched.c
@@@ -30,7 -30,6 +30,7 @@@ static const int cfq_slice_sync = HZ / 
  static int cfq_slice_async = HZ / 25;
  static const int cfq_slice_async_rq = 2;
  static int cfq_slice_idle = HZ / 125;
 +static int cfq_group_idle = HZ / 125;
  static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
  static const int cfq_hist_divisor = 4;
  
@@@ -148,8 -147,6 +148,8 @@@ struct cfq_queue 
        struct cfq_queue *new_cfqq;
        struct cfq_group *cfqg;
        struct cfq_group *orig_cfqg;
 +      /* Number of sectors dispatched from queue in single dispatch round */
 +      unsigned long nr_sectors;
  };
  
  /*
@@@ -160,6 -157,7 +160,7 @@@ enum wl_prio_t 
        BE_WORKLOAD = 0,
        RT_WORKLOAD = 1,
        IDLE_WORKLOAD = 2,
+       CFQ_PRIO_NR,
  };
  
  /*
@@@ -184,10 -182,19 +185,19 @@@ struct cfq_group 
        /* number of cfqq currently on this group */
        int nr_cfqq;
  
-       /* Per group busy queus average. Useful for workload slice calc. */
-       unsigned int busy_queues_avg[2];
        /*
-        * rr lists of queues with requests, onle rr for each priority class.
+        * Per group busy queues average. Useful for workload slice calc. We
+        * create the array for each prio class but at run time it is used
+        * only for RT and BE class and slot for IDLE class remains unused.
+        * This is primarily done to avoid confusion and a gcc warning.
+        */
+       unsigned int busy_queues_avg[CFQ_PRIO_NR];
+       /*
+        * rr lists of queues with requests. We maintain service trees for
+        * RT and BE classes. These trees are subdivided in subclasses
+        * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
+        * class there is no subclassification and all the cfq queues go on
+        * a single tree service_tree_idle.
         * Counts are embedded in the cfq_rb_root
         */
        struct cfq_rb_root service_trees[2][3];
        struct hlist_node cfqd_node;
        atomic_t ref;
  #endif
 +      /* number of requests that are on the dispatch list or inside driver */
 +      int dispatched;
  };
  
  /*
@@@ -221,7 -226,6 +231,6 @@@ struct cfq_data 
        enum wl_type_t serving_type;
        unsigned long workload_expires;
        struct cfq_group *serving_group;
-       bool noidle_tree_requires_idle;
  
        /*
         * Each priority tree is sorted by next_request position.  These
        unsigned int cfq_slice[2];
        unsigned int cfq_slice_async_rq;
        unsigned int cfq_slice_idle;
 +      unsigned int cfq_group_idle;
        unsigned int cfq_latency;
        unsigned int cfq_group_isolation;
  
@@@ -384,21 -387,6 +393,21 @@@ CFQ_CFQQ_FNS(wait_busy)
                        &cfqg->service_trees[i][j]: NULL) \
  
  
 +static inline bool iops_mode(struct cfq_data *cfqd)
 +{
 +      /*
 +       * If we are not idling on queues and it is a NCQ drive, parallel
 +       * execution of requests is on and measuring time is not possible
 +       * in most of the cases until and unless we drive shallower queue
 +       * depths and that becomes a performance bottleneck. In such cases
 +       * switch to start providing fairness in terms of number of IOs.
 +       */
 +      if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
 +              return true;
 +      else
 +              return false;
 +}
 +
  static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
  {
        if (cfq_class_idle(cfqq))
@@@ -927,6 -915,7 +936,6 @@@ static inline unsigned int cfq_cfqq_sli
                        slice_used = cfqq->allocated_slice;
        }
  
 -      cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
        return slice_used;
  }
  
@@@ -934,21 -923,19 +943,21 @@@ static void cfq_group_served(struct cfq
                                struct cfq_queue *cfqq)
  {
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
 -      unsigned int used_sl, charge_sl;
 +      unsigned int used_sl, charge;
        int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
                        - cfqg->service_tree_idle.count;
  
        BUG_ON(nr_sync < 0);
 -      used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
 +      used_sl = charge = cfq_cfqq_slice_usage(cfqq);
  
 -      if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 -              charge_sl = cfqq->allocated_slice;
 +      if (iops_mode(cfqd))
 +              charge = cfqq->slice_dispatch;
 +      else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 +              charge = cfqq->allocated_slice;
  
        /* Can't update vdisktime while group is on service tree */
        cfq_rb_erase(&cfqg->rb_node, st);
 -      cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
 +      cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
        __cfq_group_service_tree_add(st, cfqg);
  
        /* This group is being expired. Save the context */
  
        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
                                        st->min_vdisktime);
 +      cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
 +                      " sect=%u", used_sl, cfqq->slice_dispatch, charge,
 +                      iops_mode(cfqd), cfqq->nr_sectors);
        cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
        cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
  }
@@@ -977,8 -961,8 +986,8 @@@ static inline struct cfq_group *cfqg_of
        return NULL;
  }
  
- void
- cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
+ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
+                                       unsigned int weight)
  {
        cfqg_of_blkg(blkg)->weight = weight;
  }
@@@ -1019,20 -1003,10 +1028,20 @@@ cfq_find_alloc_cfqg(struct cfq_data *cf
         */
        atomic_set(&cfqg->ref, 1);
  
 -      /* Add group onto cgroup list */
 -      sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
 -      cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 +      /*
 +       * Add group onto cgroup list. It might happen that bdi->dev is
 +       * not initialized yet. Initialize this new group without major
 +       * and minor info and this info will be filled in once a new thread
 +       * comes for IO. See code above.
 +       */
 +      if (bdi->dev) {
 +              sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
 +              cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
                                        MKDEV(major, minor));
 +      } else
 +              cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 +                                      0);
 +
        cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
  
        /* Add group on cfqd list */
@@@ -1622,7 -1596,6 +1631,7 @@@ static void __cfq_set_active_queue(stru
                cfqq->allocated_slice = 0;
                cfqq->slice_end = 0;
                cfqq->slice_dispatch = 0;
 +              cfqq->nr_sectors = 0;
  
                cfq_clear_cfqq_wait_request(cfqq);
                cfq_clear_cfqq_must_dispatch(cfqq);
@@@ -1875,9 -1848,6 +1884,9 @@@ static bool cfq_should_idle(struct cfq_
        BUG_ON(!service_tree);
        BUG_ON(!service_tree->count);
  
 +      if (!cfqd->cfq_slice_idle)
 +              return false;
 +
        /* We never do for idle class queues. */
        if (prio == IDLE_WORKLOAD)
                return false;
@@@ -1902,7 -1872,7 +1911,7 @@@ static void cfq_arm_slice_timer(struct 
  {
        struct cfq_queue *cfqq = cfqd->active_queue;
        struct cfq_io_context *cic;
 -      unsigned long sl;
 +      unsigned long sl, group_idle = 0;
  
        /*
         * SSD device without seek penalty, disable idling. But only do so
        /*
         * idle is disabled, either manually or by past process history
         */
 -      if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
 -              return;
 +      if (!cfq_should_idle(cfqd, cfqq)) {
 +              /* no queue idling. Check for group idling */
 +              if (cfqd->cfq_group_idle)
 +                      group_idle = cfqd->cfq_group_idle;
 +              else
 +                      return;
 +      }
  
        /*
         * still active requests from this queue, don't idle
                return;
        }
  
 +      /* There are other queues in the group, don't do group idle */
 +      if (group_idle && cfqq->cfqg->nr_cfqq > 1)
 +              return;
 +
        cfq_mark_cfqq_wait_request(cfqq);
  
 -      sl = cfqd->cfq_slice_idle;
 +      if (group_idle)
 +              sl = cfqd->cfq_group_idle;
 +      else
 +              sl = cfqd->cfq_slice_idle;
  
        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
        cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
 -      cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
 +      cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
 +                      group_idle ? 1 : 0);
  }
  
  /*
@@@ -1981,11 -1938,9 +1990,11 @@@ static void cfq_dispatch_insert(struct 
        cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
        cfq_remove_request(rq);
        cfqq->dispatched++;
 +      (RQ_CFQG(rq))->dispatched++;
        elv_dispatch_sort(q, rq);
  
        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 +      cfqq->nr_sectors += blk_rq_sectors(rq);
        cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
                                        rq_data_dir(rq), rq_is_sync(rq));
  }
@@@ -2180,7 -2135,6 +2189,6 @@@ static void choose_service_tree(struct 
        slice = max_t(unsigned, slice, CFQ_MIN_TT);
        cfq_log(cfqd, "workload slice:%d", slice);
        cfqd->workload_expires = jiffies + slice;
-       cfqd->noidle_tree_requires_idle = false;
  }
  
  static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@@ -2252,7 -2206,7 +2260,7 @@@ static struct cfq_queue *cfq_select_que
                        cfqq = NULL;
                        goto keep_queue;
                } else
 -                      goto expire;
 +                      goto check_group_idle;
        }
  
        /*
         * flight or is idling for a new request, allow either of these
         * conditions to happen (or time out) before selecting a new queue.
         */
 -      if (timer_pending(&cfqd->idle_slice_timer) ||
 -          (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
 +      if (timer_pending(&cfqd->idle_slice_timer)) {
 +              cfqq = NULL;
 +              goto keep_queue;
 +      }
 +
 +      if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
 +              cfqq = NULL;
 +              goto keep_queue;
 +      }
 +
 +      /*
 +       * If group idle is enabled and there are requests dispatched from
 +       * this group, wait for requests to complete.
 +       */
 +check_group_idle:
 +      if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
 +          && cfqq->cfqg->dispatched) {
                cfqq = NULL;
                goto keep_queue;
        }
@@@ -3177,7 -3116,9 +3185,9 @@@ cfq_update_idle_window(struct cfq_data 
        if (cfqq->queued[0] + cfqq->queued[1] >= 4)
                cfq_mark_cfqq_deep(cfqq);
  
-       if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
+       if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
+               enable_idle = 0;
+       else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
            (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
                enable_idle = 0;
        else if (sample_valid(cic->ttime_samples)) {
@@@ -3444,7 -3385,6 +3454,7 @@@ static void cfq_completed_request(struc
        WARN_ON(!cfqq->dispatched);
        cfqd->rq_in_driver--;
        cfqq->dispatched--;
 +      (RQ_CFQG(rq))->dispatched--;
        cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
                        rq_start_time_ns(rq), rq_io_start_time_ns(rq),
                        rq_data_dir(rq), rq_is_sync(rq));
                 * the queue.
                 */
                if (cfq_should_wait_busy(cfqd, cfqq)) {
 -                      cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
 +                      unsigned long extend_sl = cfqd->cfq_slice_idle;
 +                      if (!cfqd->cfq_slice_idle)
 +                              extend_sl = cfqd->cfq_group_idle;
 +                      cfqq->slice_end = jiffies + extend_sl;
                        cfq_mark_cfqq_wait_busy(cfqq);
                        cfq_log_cfqq(cfqd, cfqq, "will busy wait");
                }
                        cfq_slice_expired(cfqd, 1);
                else if (sync && cfqq_empty &&
                         !cfq_close_cooperator(cfqd, cfqq)) {
-                       cfqd->noidle_tree_requires_idle |=
-                               !(rq->cmd_flags & REQ_NOIDLE);
-                       /*
-                        * Idling is enabled for SYNC_WORKLOAD.
-                        * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
-                        * only if we processed at least one !REQ_NOIDLE request
-                        */
-                       if (cfqd->serving_type == SYNC_WORKLOAD
-                           || cfqd->noidle_tree_requires_idle
-                           || cfqq->cfqg->nr_cfqq == 1)
-                               cfq_arm_slice_timer(cfqd);
+                       cfq_arm_slice_timer(cfqd);
                }
        }
  
@@@ -3923,7 -3850,6 +3923,7 @@@ static void *cfq_init_queue(struct requ
        cfqd->cfq_slice[1] = cfq_slice_sync;
        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
        cfqd->cfq_slice_idle = cfq_slice_idle;
 +      cfqd->cfq_group_idle = cfq_group_idle;
        cfqd->cfq_latency = 1;
        cfqd->cfq_group_isolation = 0;
        cfqd->hw_tag = -1;
@@@ -3996,7 -3922,6 +3996,7 @@@ SHOW_FUNCTION(cfq_fifo_expire_async_sho
  SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
  SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
  SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
  SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
  SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
  SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@@ -4029,7 -3954,6 +4029,7 @@@ STORE_FUNCTION(cfq_back_seek_max_store
  STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
                UINT_MAX, 0);
  STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
  STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
  STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
  STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@@ -4051,7 -3975,6 +4051,7 @@@ static struct elv_fs_entry cfq_attrs[] 
        CFQ_ATTR(slice_async),
        CFQ_ATTR(slice_async_rq),
        CFQ_ATTR(slice_idle),
 +      CFQ_ATTR(group_idle),
        CFQ_ATTR(low_latency),
        CFQ_ATTR(group_isolation),
        __ATTR_NULL
@@@ -4090,6 -4013,7 +4090,7 @@@ static struct blkio_policy_type blkio_p
                .blkio_unlink_group_fn =        cfq_unlink_blkio_group,
                .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
        },
+       .plid = BLKIO_POLICY_PROP,
  };
  #else
  static struct blkio_policy_type blkio_policy_cfq;
@@@ -4105,12 -4029,6 +4106,12 @@@ static int __init cfq_init(void
        if (!cfq_slice_idle)
                cfq_slice_idle = 1;
  
 +#ifdef CONFIG_CFQ_GROUP_IOSCHED
 +      if (!cfq_group_idle)
 +              cfq_group_idle = 1;
 +#else
 +              cfq_group_idle = 0;
 +#endif
        if (cfq_slab_setup())
                return -ENOMEM;
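
The accounting change in cfq_group_served() above can be condensed as follows;
this is an illustrative summary of the patched logic, not additional code:

        unsigned int charge = cfq_cfqq_slice_usage(cfqq); /* disk time used */

        if (iops_mode(cfqd))            /* slice_idle == 0 on a queueing drive */
                charge = cfqq->slice_dispatch;  /* charge by request count */
        else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
                charge = cfqq->allocated_slice; /* async-only group: whole slice */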
  
diff --combined drivers/scsi/scsi_lib.c
@@@ -968,11 -968,13 +968,13 @@@ static int scsi_init_sgtable(struct req
   */
  int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
  {
-       int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask);
+       struct request *rq = cmd->request;
+       int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask);
        if (error)
                goto err_exit;
  
-       if (blk_bidi_rq(cmd->request)) {
+       if (blk_bidi_rq(rq)) {
                struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
                        scsi_sdb_cache, GFP_ATOMIC);
                if (!bidi_sdb) {
                        goto err_exit;
                }
  
-               cmd->request->next_rq->special = bidi_sdb;
-               error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb,
-                                                                   GFP_ATOMIC);
+               rq->next_rq->special = bidi_sdb;
+               error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC);
                if (error)
                        goto err_exit;
        }
  
-       if (blk_integrity_rq(cmd->request)) {
+       if (blk_integrity_rq(rq)) {
                struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
                int ivecs, count;
  
                BUG_ON(prot_sdb == NULL);
-               ivecs = blk_rq_count_integrity_sg(cmd->request);
+               ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
  
                if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
                        error = BLKPREP_DEFER;
                        goto err_exit;
                }
  
-               count = blk_rq_map_integrity_sg(cmd->request,
+               count = blk_rq_map_integrity_sg(rq->q, rq->bio,
                                                prot_sdb->table.sgl);
                BUG_ON(unlikely(count > ivecs));
+               BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q)));
  
                cmd->prot_sdb = prot_sdb;
                cmd->prot_sdb->table.nents = count;
  
  err_exit:
        scsi_release_buffers(cmd);
 -      scsi_put_command(cmd);
        cmd->request->special = NULL;
 +      scsi_put_command(cmd);
        return error;
  }
  EXPORT_SYMBOL(scsi_init_io);
@@@ -1625,6 -1627,14 +1627,14 @@@ struct request_queue *__scsi_alloc_queu
        blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
                                        SCSI_MAX_SG_CHAIN_SEGMENTS));
  
+       if (scsi_host_prot_dma(shost)) {
+               shost->sg_prot_tablesize =
+                       min_not_zero(shost->sg_prot_tablesize,
+                                    (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
+               BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
+               blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
+       }
        blk_queue_max_hw_sectors(q, shost->max_sectors);
        blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
        blk_queue_segment_boundary(q, shost->dma_boundary);
diff --combined drivers/scsi/sg.c
@@@ -49,7 -49,7 +49,7 @@@ static int sg_version_num = 30534;    /* 
  #include <linux/blkdev.h>
  #include <linux/delay.h>
  #include <linux/blktrace_api.h>
 -#include <linux/smp_lock.h>
 +#include <linux/mutex.h>
  
  #include "scsi.h"
  #include <scsi/scsi_dbg.h>
@@@ -103,8 -103,6 +103,8 @@@ static int scatter_elem_sz_prev = SG_SC
  static int sg_add(struct device *, struct class_interface *);
  static void sg_remove(struct device *, struct class_interface *);
  
 +static DEFINE_MUTEX(sg_mutex);
 +
  static DEFINE_IDR(sg_index_idr);
  static DEFINE_RWLOCK(sg_index_lock);  /* Also used to lock
                                                           file descriptor list for device */
@@@ -231,7 -229,7 +231,7 @@@ sg_open(struct inode *inode, struct fil
        int res;
        int retval;
  
 -      lock_kernel();
 +      mutex_lock(&sg_mutex);
        nonseekable_open(inode, filp);
        SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
        sdp = sg_get_dev(dev);
@@@ -316,7 -314,7 +316,7 @@@ sdp_put
  sg_put:
        if (sdp)
                sg_put_dev(sdp);
 -      unlock_kernel();
 +      mutex_unlock(&sg_mutex);
        return retval;
  }
  
@@@ -1094,9 -1092,9 +1094,9 @@@ sg_unlocked_ioctl(struct file *filp, un
  {
        int ret;
  
 -      lock_kernel();
 +      mutex_lock(&sg_mutex);
        ret = sg_ioctl(filp, cmd_in, arg);
 -      unlock_kernel();
 +      mutex_unlock(&sg_mutex);
  
        return ret;
  }
@@@ -1353,7 -1351,6 +1353,7 @@@ static const struct file_operations sg_
        .mmap = sg_mmap,
        .release = sg_release,
        .fasync = sg_fasync,
 +      .llseek = no_llseek,
  };
  
  static struct class *sg_sysfs_class;
@@@ -1660,7 -1657,7 +1660,7 @@@ static int sg_start_req(Sg_request *srp
        if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO &&
            dxfer_dir != SG_DXFER_UNKNOWN && !iov_count &&
            !sfp->parentdp->device->host->unchecked_isa_dma &&
-           blk_rq_aligned(q, hp->dxferp, dxfer_len))
+           blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len))
                md = NULL;
        else
                md = &map_data;
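
The sg changes above replace the Big Kernel Lock with a driver-local mutex.
The same conversion pattern, sketched for a hypothetical driver:

        static DEFINE_MUTEX(example_mutex);     /* takes over from the BKL */

        static int example_open(struct inode *inode, struct file *filp)
        {
                int ret;

                mutex_lock(&example_mutex);     /* was: lock_kernel()   */
                ret = nonseekable_open(inode, filp);
                mutex_unlock(&example_mutex);   /* was: unlock_kernel() */
                return ret;
        }
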
diff --combined include/linux/elevator.h
@@@ -93,7 -93,6 +93,7 @@@ struct elevator_queu
        struct elevator_type *elevator_type;
        struct mutex sysfs_lock;
        struct hlist_head *hash;
 +      unsigned int registered:1;
  };
  
  /*
@@@ -122,6 -121,8 +122,8 @@@ extern void elv_completed_request(struc
  extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
  extern void elv_put_request(struct request_queue *, struct request *);
  extern void elv_drain_elevator(struct request_queue *);
+ extern void elv_quiesce_start(struct request_queue *);
+ extern void elv_quiesce_end(struct request_queue *);
  
  /*
   * io scheduler registration
@@@ -137,7 -138,6 +139,7 @@@ extern ssize_t elv_iosched_store(struc
  
  extern int elevator_init(struct request_queue *, char *);
  extern void elevator_exit(struct elevator_queue *);
 +extern int elevator_change(struct request_queue *, const char *);
  extern int elv_rq_merge_ok(struct request *, struct bio *);
  
  /*
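
With elevator_change() exported through elevator.h, a driver can switch its
queue's I/O scheduler by name. A hedged usage sketch (queue pointer assumed
to be in scope):

        if (elevator_change(q, "noop"))
                printk(KERN_WARNING "could not switch to the noop elevator\n");
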
diff --combined include/linux/genhd.h
@@@ -12,6 -12,7 +12,7 @@@
  #include <linux/types.h>
  #include <linux/kdev_t.h>
  #include <linux/rcupdate.h>
+ #include <linux/slab.h>
  
  #ifdef CONFIG_BLOCK
  
@@@ -86,7 -87,15 +87,15 @@@ struct disk_stats 
        unsigned long io_ticks;
        unsigned long time_in_queue;
  };
-       
+ #define PARTITION_META_INFO_VOLNAMELTH        64
+ #define PARTITION_META_INFO_UUIDLTH   16
+ struct partition_meta_info {
+       u8 uuid[PARTITION_META_INFO_UUIDLTH];   /* always big endian */
+       u8 volname[PARTITION_META_INFO_VOLNAMELTH];
+ };
  struct hd_struct {
        sector_t start_sect;
        sector_t nr_sects;
        struct device __dev;
        struct kobject *holder_dir;
        int policy, partno;
+       struct partition_meta_info *info;
  #ifdef CONFIG_FAIL_MAKE_REQUEST
        int make_it_fail;
  #endif
@@@ -129,8 -139,9 +139,9 @@@ struct blk_scsi_cmd_filter 
  struct disk_part_tbl {
        struct rcu_head rcu_head;
        int len;
 -      struct hd_struct *last_lookup;
 +      struct hd_struct __rcu *last_lookup;
+       struct gendisk *disk;
 -      struct hd_struct *part[];
 +      struct hd_struct __rcu *part[];
  };
  
  struct gendisk {
         * non-critical accesses use RCU.  Always access through
         * helpers.
         */
 -      struct disk_part_tbl *part_tbl;
 +      struct disk_part_tbl __rcu *part_tbl;
        struct hd_struct part0;
  
        const struct block_device_operations *fops;
@@@ -181,6 -192,30 +192,30 @@@ static inline struct gendisk *part_to_d
        return NULL;
  }
  
+ static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
+ {
+       int i;
+       for (i = 0; i < 16; ++i) {
+               *to++ = (hex_to_bin(*uuid_str) << 4) |
+                       (hex_to_bin(*(uuid_str + 1)));
+               uuid_str += 2;
+               switch (i) {
+               case 3:
+               case 5:
+               case 7:
+               case 9:
+                       uuid_str++;
+                       continue;
+               }
+       }
+ }
+ static inline char *part_unpack_uuid(const u8 *uuid, char *out)
+ {
+       sprintf(out, "%pU", uuid);
+       return out;
+ }
  static inline int disk_max_parts(struct gendisk *disk)
  {
        if (disk->flags & GENHD_FL_EXT_DEVT)
@@@ -342,6 -377,19 +377,19 @@@ static inline int part_in_flight(struc
        return part->in_flight[0] + part->in_flight[1];
  }
  
+ static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
+ {
+       if (disk)
+               return kzalloc_node(sizeof(struct partition_meta_info),
+                                   GFP_KERNEL, disk->node_id);
+       return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL);
+ }
+ static inline void free_part_info(struct hd_struct *part)
+ {
+       kfree(part->info);
+ }
  /* block/blk-core.c */
  extern void part_round_stats(int cpu, struct hd_struct *part);
  
@@@ -533,7 -581,9 +581,9 @@@ extern int disk_expand_part_tbl(struct 
  extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
  extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
                                                     int partno, sector_t start,
-                                                    sector_t len, int flags);
+                                                    sector_t len, int flags,
+                                                    struct partition_meta_info
+                                                      *info);
  extern void delete_partition(struct gendisk *, int);
  extern void printk_all_partitions(void);
  
diff --combined include/linux/kernel.h
@@@ -58,18 -58,7 +58,18 @@@ extern const char linux_proc_banner[]
  
  #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
  #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 -#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 +#define roundup(x, y) (                                       \
 +{                                                     \
 +      typeof(y) __y = y;                              \
 +      (((x) + (__y - 1)) / __y) * __y;                \
 +}                                                     \
 +)
 +#define rounddown(x, y) (                             \
 +{                                                     \
 +      typeof(x) __x = (x);                            \
 +      __x - (__x % (y));                              \
 +}                                                     \
 +)
  #define DIV_ROUND_CLOSEST(x, divisor)(                        \
  {                                                     \
        typeof(divisor) __divisor = divisor;            \
@@@ -651,6 -640,16 +651,16 @@@ static inline void ftrace_dump(enum ftr
        (void) (&_max1 == &_max2);              \
        _max1 > _max2 ? _max1 : _max2; })
  
+ /**
+  * min_not_zero - return the minimum that is _not_ zero, unless both are zero
+  * @x: value1
+  * @y: value2
+  */
+ #define min_not_zero(x, y) ({                 \
+       typeof(x) __x = (x);                    \
+       typeof(y) __y = (y);                    \
+       __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
  /**
   * clamp - return a value clamped to a given range with strict typechecking
   * @val: current value
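
The reworked roundup(), the new rounddown() and min_not_zero() behave as in
the worked values below (illustrative only):

        unsigned int a = roundup(10, 4);        /* 12: next multiple of 4      */
        unsigned int b = rounddown(10, 4);      /*  8: previous multiple of 4  */
        unsigned int c = min_not_zero(0U, 16U); /* 16: zero operands ignored   */
        unsigned int d = min_not_zero(8U, 16U); /*  8: plain minimum otherwise */
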
diff --combined include/linux/sched.h
@@@ -336,6 -336,9 +336,9 @@@ extern unsigned long sysctl_hung_task_w
  extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
                                         void __user *buffer,
                                         size_t *lenp, loff_t *ppos);
+ #else
+ /* Avoid need for ifdefs elsewhere in the code */
+ enum { sysctl_hung_task_timeout_secs = 0 };
  #endif
  
  /* Attach to any functions which should be ignored in wchan output. */
@@@ -875,7 -878,6 +878,7 @@@ enum sched_domain_level 
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
        SD_LV_MC,
 +      SD_LV_BOOK,
        SD_LV_CPU,
        SD_LV_NODE,
        SD_LV_ALLNODES,
@@@ -1161,13 -1163,6 +1164,13 @@@ struct sched_rt_entity 
  
  struct rcu_node;
  
 +enum perf_event_task_context {
 +      perf_invalid_context = -1,
 +      perf_hw_context = 0,
 +      perf_sw_context,
 +      perf_nr_task_contexts,
 +};
 +
  struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
        unsigned int policy;
        cpumask_t cpus_allowed;
  
 -#ifdef CONFIG_TREE_PREEMPT_RCU
 +#ifdef CONFIG_PREEMPT_RCU
        int rcu_read_lock_nesting;
        char rcu_read_unlock_special;
 -      struct rcu_node *rcu_blocked_node;
        struct list_head rcu_node_entry;
 +#endif /* #ifdef CONFIG_PREEMPT_RCU */
 +#ifdef CONFIG_TREE_PREEMPT_RCU
 +      struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct list_head cpu_timers[3];
  
  /* process credentials */
 -      const struct cred *real_cred;   /* objective and real subjective task
 +      const struct cred __rcu *real_cred; /* objective and real subjective task
                                         * credentials (COW) */
 -      const struct cred *cred;        /* effective (overridable) subjective task
 +      const struct cred __rcu *cred;  /* effective (overridable) subjective task
                                         * credentials (COW) */
        struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                         * credential calculations
  #endif
  #ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock */
 -      struct css_set *cgroups;
 +      struct css_set __rcu *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock */
        struct list_head cg_list;
  #endif
        struct futex_pi_state *pi_state_cache;
  #endif
  #ifdef CONFIG_PERF_EVENTS
 -      struct perf_event_context *perf_event_ctxp;
 +      struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
        struct mutex perf_event_mutex;
        struct list_head perf_event_list;
  #endif
@@@ -1691,7 -1684,8 +1694,7 @@@ extern void thread_group_times(struct t
  /*
   * Per process flags
   */
 -#define PF_ALIGNWARN  0x00000001      /* Print alignment warning msgs */
 -                                      /* Not implemented yet, only for 486*/
 +#define PF_KSOFTIRQD  0x00000001      /* I am ksoftirqd */
  #define PF_STARTING   0x00000002      /* being created */
  #define PF_EXITING    0x00000004      /* getting shut down */
  #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
  #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
  #define used_math() tsk_used_math(current)
  
 -#ifdef CONFIG_TREE_PREEMPT_RCU
 +#ifdef CONFIG_PREEMPT_RCU
  
  #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
  #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@@ -1758,9 -1752,7 +1761,9 @@@ static inline void rcu_copy_process(str
  {
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special = 0;
 +#ifdef CONFIG_TREE_PREEMPT_RCU
        p->rcu_blocked_node = NULL;
 +#endif
        INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
@@@ -1837,19 -1829,6 +1840,19 @@@ extern void sched_clock_idle_sleep_even
  extern void sched_clock_idle_wakeup_event(u64 delta_ns);
  #endif
  
 +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 +/*
 + * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
 + * The reason for this explicit opt-in is not to have perf penalty with
 + * slow sched_clocks.
 + */
 +extern void enable_sched_clock_irqtime(void);
 +extern void disable_sched_clock_irqtime(void);
 +#else
 +static inline void enable_sched_clock_irqtime(void) {}
 +static inline void disable_sched_clock_irqtime(void) {}
 +#endif
 +
  extern unsigned long long
  task_sched_runtime(struct task_struct *task);
  extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@@ -2391,9 -2370,9 +2394,9 @@@ extern int __cond_resched_lock(spinlock
  
  extern int __cond_resched_softirq(void);
  
 -#define cond_resched_softirq() ({                             \
 -      __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);      \
 -      __cond_resched_softirq();                               \
 +#define cond_resched_softirq() ({                                     \
 +      __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);      \
 +      __cond_resched_softirq();                                       \
  })
  
  /*
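
With the enum fallback above, callers can test sysctl_hung_task_timeout_secs
without an #ifdef even when hung-task detection is compiled out. A hedged
sketch of the hang-check-aware wait pattern ("wait" is assumed to be a
struct completion initialised elsewhere):

        unsigned long hang_check = sysctl_hung_task_timeout_secs;

        if (hang_check)
                /* wake periodically so the hung-task detector stays quiet */
                while (!wait_for_completion_timeout(&wait, hang_check * (HZ / 2)))
                        ;
        else
                wait_for_completion(&wait);
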
diff --combined init/Kconfig
@@@ -21,13 -21,6 +21,13 @@@ config CONSTRUCTOR
        depends on !UML
        default y
  
 +config HAVE_IRQ_WORK
 +      bool
 +
 +config IRQ_WORK
 +      bool
 +      depends on HAVE_IRQ_WORK
 +
  menu "General setup"
  
  config EXPERIMENTAL
@@@ -71,7 -64,7 +71,7 @@@ config BROKEN_ON_SM
  
  config LOCK_KERNEL
        bool
 -      depends on SMP || PREEMPT
 +      depends on (SMP || PREEMPT) && BKL
        default y
  
  config INIT_ENV_ARG_LIMIT
@@@ -339,8 -332,6 +339,8 @@@ config AUDIT_TRE
        depends on AUDITSYSCALL
        select FSNOTIFY
  
 +source "kernel/irq/Kconfig"
 +
  menu "RCU Subsystem"
  
  choice
  
  config TREE_RCU
        bool "Tree-based hierarchical RCU"
 +      depends on !PREEMPT && SMP
        help
          This option selects the RCU implementation that is
          designed for very large SMP system with hundreds or
          smaller systems.
  
  config TREE_PREEMPT_RCU
 -      bool "Preemptable tree-based hierarchical RCU"
 +      bool "Preemptible tree-based hierarchical RCU"
        depends on PREEMPT
        help
          This option selects the RCU implementation that is
@@@ -375,22 -365,8 +375,22 @@@ config TINY_RC
          is not required.  This option greatly reduces the
          memory footprint of RCU.
  
 +config TINY_PREEMPT_RCU
 +      bool "Preemptible UP-only small-memory-footprint RCU"
 +      depends on !SMP && PREEMPT
 +      help
 +        This option selects the RCU implementation that is designed
 +        for real-time UP systems.  This option greatly reduces the
 +        memory footprint of RCU.
 +
  endchoice
  
 +config PREEMPT_RCU
 +      def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
 +      help
 +        This option enables preemptible-RCU code that is common between
 +        the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
 +
  config RCU_TRACE
        bool "Enable tracing for RCU"
        depends on TREE_RCU || TREE_PREEMPT_RCU
@@@ -411,12 -387,9 +411,12 @@@ config RCU_FANOU
        help
          This option controls the fanout of hierarchical implementations
          of RCU, allowing RCU to work efficiently on machines with
 -        large numbers of CPUs.  This value must be at least the cube
 -        root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
 -        systems and up to 262,144 for 64-bit systems.
 +        large numbers of CPUs.  This value must be at least the fourth
 +        root of NR_CPUS, which allows NR_CPUS to be insanely large.
 +        The default value of RCU_FANOUT should be used for production
 +        systems, but if you are stress-testing the RCU implementation
 +        itself, small RCU_FANOUT values allow you to test large-system
 +        code paths on small(er) systems.
  
          Select a specific number if testing RCU itself.
          Take the default if unsure.
@@@ -661,11 -634,14 +661,14 @@@ config BLK_CGROU
  
        Currently, CFQ IO scheduler uses it to recognize task groups and
        control disk bandwidth allocation (proportional time slice allocation)
-       to such task groups.
+       to such task groups. It is also used by the bio throttling logic in
+       the block layer to implement upper limits on IO rates for a device.
  
        This option only enables generic Block IO controller infrastructure.
-       One needs to also enable actual IO controlling logic in CFQ for it
-       to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+       One needs to also enable actual IO controlling logic/policy. For
+       enabling proportional weight division of disk bandwidth in CFQ set
+       CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
+       CONFIG_BLK_DEV_THROTTLING=y.
  
        See Documentation/cgroups/blkio-controller.txt for more information.
  
@@@ -1014,7 -990,6 +1017,7 @@@ config PERF_EVENT
        default y if (PROFILING || PERF_COUNTERS)
        depends on HAVE_PERF_EVENTS
        select ANON_INODES
 +      select IRQ_WORK
        help
          Enable kernel support for various performance events provided
          by software and hardware.