Merge branch 'sh-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[pandora-kernel.git] / block / cfq-iosched.c
index ab7a9e6..f379943 100644 (file)
@@ -185,7 +185,7 @@ struct cfq_group {
        int nr_cfqq;
 
        /*
-        * Per group busy queus average. Useful for workload slice calc. We
+        * Per group busy queues average. Useful for workload slice calc. We
         * create the array for each prio class but at run time it is used
         * only for RT and BE class and slot for IDLE class remains unused.
         * This is primarily done to avoid confusion and a gcc warning.
@@ -300,7 +300,9 @@ struct cfq_data {
 
        /* List of cfq groups being managed on this device*/
        struct hlist_head cfqg_list;
-       struct rcu_head rcu;
+
+       /* Number of groups which are on blkcg->blkg_list */
+       unsigned int nr_blkcg_linked_grps;
 };
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -367,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy);
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
        blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
                        cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
-                       blkg_path(&(cfqq)->cfqg->blkg), ##args);
+                       blkg_path(&(cfqq)->cfqg->blkg), ##args)
 
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)                         \
        blk_add_trace_msg((cfqd)->queue, "%s " fmt,                     \
-                               blkg_path(&(cfqg)->blkg), ##args);      \
+                               blkg_path(&(cfqg)->blkg), ##args)       \
 
 #else
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
        blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)         do {} while (0);
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)         do {} while (0)
 #endif
 #define cfq_log(cfqd, fmt, args...)    \
        blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
@@ -665,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
        if (rq2 == NULL)
                return rq1;
 
-       if (rq_is_sync(rq1) && !rq_is_sync(rq2))
-               return rq1;
-       else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
-               return rq2;
-       if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
-               return rq1;
-       else if ((rq2->cmd_flags & REQ_META) &&
-                !(rq1->cmd_flags & REQ_META))
-               return rq2;
+       if (rq_is_sync(rq1) != rq_is_sync(rq2))
+               return rq_is_sync(rq1) ? rq1 : rq2;
+
+       if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
+               return rq1->cmd_flags & REQ_META ? rq1 : rq2;
 
        s1 = blk_rq_pos(rq1);
        s2 = blk_rq_pos(rq2);
@@ -990,9 +988,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
        cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
                                        st->min_vdisktime);
-       cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
-                       " sect=%u", used_sl, cfqq->slice_dispatch, charge,
-                       iops_mode(cfqd), cfqq->nr_sectors);
+       cfq_log_cfqq(cfqq->cfqd, cfqq,
+                    "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
+                    used_sl, cfqq->slice_dispatch, charge,
+                    iops_mode(cfqd), cfqq->nr_sectors);
        cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
                                          unaccounted_sl);
        cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
@@ -1014,28 +1013,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
        cfqg->needs_update = true;
 }
 
-static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
-               struct blkio_cgroup *blkcg, int create)
+static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
+                       struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
 {
-       struct cfq_group *cfqg = NULL;
-       void *key = cfqd;
-       int i, j;
-       struct cfq_rb_root *st;
        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
        unsigned int major, minor;
 
-       cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+       /*
+        * Add group onto cgroup list. It might happen that bdi->dev is
+        * not initialized yet. Initialize this new group without major
+        * and minor info and this info will be filled in once a new thread
+        * comes for IO.
+        */
+       if (bdi->dev) {
                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-               cfqg->blkg.dev = MKDEV(major, minor);
-               goto done;
-       }
-       if (cfqg || !create)
-               goto done;
+               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                       (void *)cfqd, MKDEV(major, minor));
+       } else
+               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                       (void *)cfqd, 0);
+
+       cfqd->nr_blkcg_linked_grps++;
+       cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+
+       /* Add group on cfqd list */
+       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+/*
+ * Should be called from sleepable context, without the request queue lock
+ * held: per-cpu stats are allocated dynamically and alloc_percpu() needs
+ * to be called from sleepable context.
+ */
+static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+{
+       struct cfq_group *cfqg = NULL;
+       int i, j, ret;
+       struct cfq_rb_root *st;
 
        cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
        if (!cfqg)
-               goto done;
+               return NULL;
 
        for_each_cfqg_st(cfqg, i, j, st)
                *st = CFQ_RB_ROOT;
@@ -1049,43 +1067,94 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
         */
        cfqg->ref = 1;
 
+       ret = blkio_alloc_blkg_stats(&cfqg->blkg);
+       if (ret) {
+               kfree(cfqg);
+               return NULL;
+       }
+
+       return cfqg;
+}
+
+static struct cfq_group *
+cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+{
+       struct cfq_group *cfqg = NULL;
+       void *key = cfqd;
+       struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+       unsigned int major, minor;
+
        /*
-        * Add group onto cgroup list. It might happen that bdi->dev is
-        * not initialized yet. Initialize this new group without major
-        * and minor info and this info will be filled in once a new thread
-        * comes for IO. See code above.
+        * This is the common case when there are no blkio cgroups.
+        * Avoid lookup in this case
         */
-       if (bdi->dev) {
-               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                       MKDEV(major, minor));
-       } else
-               cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                       0);
-
-       cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+       if (blkcg == &blkio_root_cgroup)
+               cfqg = &cfqd->root_group;
+       else
+               cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
 
-       /* Add group on cfqd list */
-       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+       if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+               sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+               cfqg->blkg.dev = MKDEV(major, minor);
+       }
 
-done:
        return cfqg;
 }
 
 /*
- * Search for the cfq group current task belongs to. If create = 1, then also
- * create the cfq group if it does not exist. request_queue lock must be held.
+ * Search for the cfq group current task belongs to. request_queue lock must
+ * be held.
  */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
        struct blkio_cgroup *blkcg;
-       struct cfq_group *cfqg = NULL;
+       struct cfq_group *cfqg = NULL, *__cfqg = NULL;
+       struct request_queue *q = cfqd->queue;
+
+       rcu_read_lock();
+       blkcg = task_blkio_cgroup(current);
+       cfqg = cfq_find_cfqg(cfqd, blkcg);
+       if (cfqg) {
+               rcu_read_unlock();
+               return cfqg;
+       }
+
+       /*
+        * Need to allocate a group. Allocating a group also allocates its
+        * per-cpu stats, which in turn takes a mutex and can block. Hence
+        * we need to drop the rcu lock and queue_lock before we call alloc.
+        *
+        * Not taking any queue reference here and assuming that queue is
+        * around by the time we return. CFQ queue allocation code does
+        * the same. It might be racy though.
+        */
+
+       rcu_read_unlock();
+       spin_unlock_irq(q->queue_lock);
+
+       cfqg = cfq_alloc_cfqg(cfqd);
+
+       spin_lock_irq(q->queue_lock);
 
        rcu_read_lock();
        blkcg = task_blkio_cgroup(current);
-       cfqg = cfq_find_alloc_cfqg(cfqd, blkcg, create);
-       if (!cfqg && create)
+
+       /*
+        * If some other thread already allocated the group while we were
+        * not holding queue lock, free up the group
+        */
+       __cfqg = cfq_find_cfqg(cfqd, blkcg);
+
+       if (__cfqg) {
+               kfree(cfqg);
+               rcu_read_unlock();
+               return __cfqg;
+       }
+
+       if (!cfqg)
                cfqg = &cfqd->root_group;
+
+       cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
        rcu_read_unlock();
        return cfqg;
 }
@@ -1118,6 +1187,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
                return;
        for_each_cfqg_st(cfqg, i, j, st)
                BUG_ON(!RB_EMPTY_ROOT(&st->rb));
+       free_percpu(cfqg->blkg.stats_cpu);
        kfree(cfqg);
 }
 
@@ -1176,7 +1246,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
 }
 
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
        return &cfqd->root_group;
 }
@@ -1210,7 +1280,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        struct cfq_rb_root *service_tree;
        int left;
        int new_cfqq = 1;
-       int group_changed = 0;
 
        service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
                                                cfqq_type(cfqq));
@@ -1281,7 +1350,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        rb_link_node(&cfqq->rb_node, parent, p);
        rb_insert_color(&cfqq->rb_node, &service_tree->rb);
        service_tree->count++;
-       if ((add_front || !new_cfqq) && !group_changed)
+       if (add_front || !new_cfqq)
                return;
        cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
 }
@@ -1955,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
         */
        if (sample_valid(cic->ttime_samples) &&
            (cfqq->slice_end - jiffies < cic->ttime_mean)) {
-               cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
-                               cic->ttime_mean);
+               cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
+                            cic->ttime_mean);
                return;
        }
 
@@ -2029,7 +2098,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
        WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
 
-       return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
+       return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
 }
 
 /*
@@ -2704,8 +2773,11 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
        smp_wmb();
        cic->key = cfqd_dead_key(cfqd);
 
-       if (ioc->ioc_data == cic)
+       if (rcu_dereference(ioc->ioc_data) == cic) {
+               spin_lock(&ioc->lock);
                rcu_assign_pointer(ioc->ioc_data, NULL);
+               spin_unlock(&ioc->lock);
+       }
 
        if (cic->cfqq[BLK_RW_ASYNC]) {
                cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2911,7 +2983,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
        struct cfq_group *cfqg;
 
 retry:
-       cfqg = cfq_get_cfqg(cfqd, 1);
+       cfqg = cfq_get_cfqg(cfqd);
        cic = cfq_cic_lookup(cfqd, ioc);
        /* cic always exists here */
        cfqq = cic_to_cfqq(cic, is_sync);
@@ -3718,9 +3790,6 @@ new_queue:
        return 0;
 
 queue_fail:
-       if (cic)
-               put_io_context(cic->ioc);
-
        cfq_schedule_dispatch(cfqd);
        spin_unlock_irqrestore(q->queue_lock, flags);
        cfq_log(cfqd, "set_request fail");
@@ -3815,15 +3884,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
                cfq_put_queue(cfqd->async_idle_cfqq);
 }
 
-static void cfq_cfqd_free(struct rcu_head *head)
-{
-       kfree(container_of(head, struct cfq_data, rcu));
-}
-
 static void cfq_exit_queue(struct elevator_queue *e)
 {
        struct cfq_data *cfqd = e->elevator_data;
        struct request_queue *q = cfqd->queue;
+       bool wait = false;
 
        cfq_shutdown_timer_wq(cfqd);
 
@@ -3842,7 +3907,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
 
        cfq_put_async_queues(cfqd);
        cfq_release_cfq_groups(cfqd);
-       cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg);
+
+       /*
+        * If there are groups which we could not unlink from the blkcg list,
+        * wait for an RCU grace period for them to be freed.
+        */
+       if (cfqd->nr_blkcg_linked_grps)
+               wait = true;
 
        spin_unlock_irq(q->queue_lock);
 
@@ -3852,8 +3923,25 @@ static void cfq_exit_queue(struct elevator_queue *e)
        ida_remove(&cic_index_ida, cfqd->cic_index);
        spin_unlock(&cic_index_lock);
 
-       /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
-       call_rcu(&cfqd->rcu, cfq_cfqd_free);
+       /*
+        * Wait for cfqg->blkg->key accessors to exit their grace periods.
+        * Do this wait only if there are other unlinked groups out
+        * there. This can happen if the cgroup deletion path claimed the
+        * responsibility of cleaning up a group before the queue cleanup
+        * code got to the group.
+        *
+        * Do not call synchronize_rcu() unconditionally as there are drivers
+        * which create/delete request queue hundreds of times during scan/boot
+        * and synchronize_rcu() can take significant time and slow down boot.
+        */
+       if (wait)
+               synchronize_rcu();
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+       /* Free up per cpu stats for root group */
+       free_percpu(cfqd->root_group.blkg.stats_cpu);
+#endif
+       kfree(cfqd);
 }
 
 static int cfq_alloc_cic_index(void)
@@ -3886,8 +3974,12 @@ static void *cfq_init_queue(struct request_queue *q)
                return NULL;
 
        cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
-       if (!cfqd)
+       if (!cfqd) {
+               spin_lock(&cic_index_lock);
+               ida_remove(&cic_index_ida, i);
+               spin_unlock(&cic_index_lock);
                return NULL;
+       }
 
        /*
         * Don't need take queue_lock in the routine, since we are
@@ -3909,14 +4001,29 @@ static void *cfq_init_queue(struct request_queue *q)
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
        /*
-        * Take a reference to root group which we never drop. This is just
-        * to make sure that cfq_put_cfqg() does not try to kfree root group
+        * Set root group reference to 2. One reference will be dropped when
+        * all groups on cfqd->cfqg_list are being deleted during queue exit.
+        * Other reference will remain there as we don't want to delete this
+        * group as it is statically allocated and gets destroyed when
+        * cfq_data goes away.
         */
-       cfqg->ref = 1;
+       cfqg->ref = 2;
+
+       if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+               kfree(cfqg);
+               kfree(cfqd);
+               return NULL;
+       }
+
        rcu_read_lock();
+
        cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
                                        (void *)cfqd, 0);
        rcu_read_unlock();
+       cfqd->nr_blkcg_linked_grps++;
+
+       /* Add group on cfqd->cfqg_list */
+       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 #endif
        /*
         * Not strictly needed (since RB_ROOT just clears the node and we