/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
+
+ /* Some throttle limits got updated for the group */
+ bool limits_changed;
};
struct throtl_data
unsigned int nr_queued[2];
/*
- * number of total undestroyed groups (excluding root group)
+ * number of total undestroyed groups
*/
unsigned int nr_undestroyed_grps;
/* Work for dispatching throttled bios */
struct delayed_work throtl_work;
+
+ atomic_t limits_changed;
};
enum tg_state_flags {
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
- unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
+ unsigned long nr_slices, time_elapsed, io_trim;
+ u64 bytes_trim, tmp;
BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
if (!nr_slices)
return;
+ tmp = tg->bps[rw] * throtl_slice * nr_slices;
+ do_div(tmp, HZ);
+ bytes_trim = tmp;
- bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
if (!bytes_trim && !io_trim)
tg->slice_start[rw] += nr_slices * throtl_slice;
- throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
+ throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
" start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
tg->slice_start[rw], tg->slice_end[rw], jiffies);
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+ u64 tmp;
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
- io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
- / MSEC_PER_SEC;
+ /*
+ * jiffy_elapsed_rnd should not be a big value as minimum iops can be
+ * 1 then at max jiffy elapsed should be equivalent of 1 second as we
+ * will allow dispatch after 1 second and after that slice should
+ * have been trimmed.
+ */
+
+ tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+ do_div(tmp, HZ);
+
+ if (tmp > UINT_MAX)
+ io_allowed = UINT_MAX;
+ else
+ io_allowed = tmp;
if (tg->io_disp[rw] + 1 <= io_allowed) {
if (wait)
struct bio *bio, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
- u64 bytes_allowed, extra_bytes;
+ u64 bytes_allowed, extra_bytes, tmp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
- bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
- / MSEC_PER_SEC;
+ tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+ do_div(tmp, HZ);
+ bytes_allowed = tmp;
if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
if (wait)
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
- /*
- * If group is already on active tree, then update dispatch time
- * only if it is lesser than existing dispatch time. Otherwise
- * always update the dispatch time
- */
-
- if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
- return;
-
/* Update dispatch time */
throtl_dequeue_tg(td, tg);
tg->disptime = disptime;
return nr_disp;
}
+static void throtl_process_limit_change(struct throtl_data *td)
+{
+ struct throtl_grp *tg;
+ struct hlist_node *pos, *n;
+
+ /*
+ * Make sure atomic_inc() effects from
+ * throtl_update_blkio_group_read_bps(), group of functions are
+ * visible.
+ * Is this required or smp_mb__after_atomic_inc() was suffcient
+ * after the atomic_inc().
+ */
+ smp_rmb();
+ if (!atomic_read(&td->limits_changed))
+ return;
+
+ throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
+
+ hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
+ /*
+ * Do I need an smp_rmb() here to make sure tg->limits_changed
+ * update is visible. I am relying on smp_rmb() at the
+ * beginning of function and not putting a new one here.
+ */
+
+ if (throtl_tg_on_rr(tg) && tg->limits_changed) {
+ throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
+ " riops=%u wiops=%u", tg->bps[READ],
+ tg->bps[WRITE], tg->iops[READ],
+ tg->iops[WRITE]);
+ tg_update_disptime(td, tg);
+ tg->limits_changed = false;
+ }
+ }
+
+ smp_mb__before_atomic_dec();
+ atomic_dec(&td->limits_changed);
+ smp_mb__after_atomic_dec();
+}
+
/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
spin_lock_irq(q->queue_lock);
+ throtl_process_limit_change(td);
+
if (!total_nr_queued(td))
goto out;
spin_unlock_irqrestore(td->queue->queue_lock, flags);
}
-static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
- u64 read_bps)
+/*
+ * For all update functions, key should be a valid pointer because these
+ * update functions are called under blkcg_lock, that means, blkg is
+ * valid and in turn key is valid. queue exit path can not race becuase
+ * of blkcg_lock
+ *
+ * Can not take queue lock in update functions as queue lock under blkcg_lock
+ * is not allowed. Under other paths we take blkcg_lock under queue_lock.
+ */
+static void throtl_update_blkio_group_read_bps(void *key,
+ struct blkio_group *blkg, u64 read_bps)
{
+ struct throtl_data *td = key;
+
tg_of_blkg(blkg)->bps[READ] = read_bps;
+ /* Make sure read_bps is updated before setting limits_changed */
+ smp_wmb();
+ tg_of_blkg(blkg)->limits_changed = true;
+
+ /* Make sure tg->limits_changed is updated before td->limits_changed */
+ smp_mb__before_atomic_inc();
+ atomic_inc(&td->limits_changed);
+ smp_mb__after_atomic_inc();
+
+ /* Schedule a work now to process the limit change */
+ throtl_schedule_delayed_work(td->queue, 0);
}
-static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
- u64 write_bps)
+static void throtl_update_blkio_group_write_bps(void *key,
+ struct blkio_group *blkg, u64 write_bps)
{
+ struct throtl_data *td = key;
+
tg_of_blkg(blkg)->bps[WRITE] = write_bps;
+ smp_wmb();
+ tg_of_blkg(blkg)->limits_changed = true;
+ smp_mb__before_atomic_inc();
+ atomic_inc(&td->limits_changed);
+ smp_mb__after_atomic_inc();
+ throtl_schedule_delayed_work(td->queue, 0);
}
-static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
- unsigned int read_iops)
+static void throtl_update_blkio_group_read_iops(void *key,
+ struct blkio_group *blkg, unsigned int read_iops)
{
+ struct throtl_data *td = key;
+
tg_of_blkg(blkg)->iops[READ] = read_iops;
+ smp_wmb();
+ tg_of_blkg(blkg)->limits_changed = true;
+ smp_mb__before_atomic_inc();
+ atomic_inc(&td->limits_changed);
+ smp_mb__after_atomic_inc();
+ throtl_schedule_delayed_work(td->queue, 0);
}
-static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
- unsigned int write_iops)
+static void throtl_update_blkio_group_write_iops(void *key,
+ struct blkio_group *blkg, unsigned int write_iops)
{
+ struct throtl_data *td = key;
+
tg_of_blkg(blkg)->iops[WRITE] = write_iops;
+ smp_wmb();
+ tg_of_blkg(blkg)->limits_changed = true;
+ smp_mb__before_atomic_inc();
+ atomic_inc(&td->limits_changed);
+ smp_mb__after_atomic_inc();
+ throtl_schedule_delayed_work(td->queue, 0);
}
void throtl_shutdown_timer_wq(struct request_queue *q)
/*
* There is already another bio queued in same dir. No
* need to update dispatch time.
+ * Still update the disptime if rate limits on this group
+ * were changed.
*/
- update_disptime = false;
+ if (!tg->limits_changed)
+ update_disptime = false;
+ else
+ tg->limits_changed = false;
+
goto queue_bio;
}
INIT_HLIST_HEAD(&td->tg_list);
td->tg_service_tree = THROTL_RB_ROOT;
+ atomic_set(&td->limits_changed, 0);
/* Init root group */
tg = &td->root_tg;
/* Practically unlimited BW */
tg->bps[0] = tg->bps[1] = -1;
tg->iops[0] = tg->iops[1] = -1;
- atomic_set(&tg->ref, 1);
+
+ /*
+ * Set root group reference to 2. One reference will be dropped when
+ * all groups on tg_list are being deleted during queue exit. Other
+ * reference will remain there as we don't want to delete this group
+ * as it is statically allocated and gets destroyed when throtl_data
+ * goes away.
+ */
+ atomic_set(&tg->ref, 2);
+ hlist_add_head(&tg->tg_node, &td->tg_list);
+ td->nr_undestroyed_grps++;
INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
spin_lock_irq(q->queue_lock);
throtl_release_tgs(td);
- blkiocg_del_blkio_group(&td->root_tg.blkg);
/* If there are other groups */
- if (td->nr_undestroyed_grps >= 1)
+ if (td->nr_undestroyed_grps > 0)
wait = true;
spin_unlock_irq(q->queue_lock);
*/
if (wait)
synchronize_rcu();
+
+ /*
+ * Just being safe to make sure after previous flush if some body did
+ * update limits through cgroup and another work got queued, cancel
+ * it.
+ */
+ throtl_shutdown_timer_wq(q);
throtl_td_free(td);
}