sched/tune: Fix improper accounting of tasks

cgroup_migrate_execute() calls can_attach() and css_set_move_task()
separately without holding rq->lock.
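
For context, the ordering inside cgroup_migrate_execute() is roughly the
following (a heavily condensed sketch, not the literal kernel code):

  cgroup_migrate_execute():
    for_each_subsys(ss):
      ss->can_attach(tset)          /* schedtune_can_attach() runs here */
    css_set_move_task(task, ...)    /* task's cgroup membership is committed */
    for_each_subsys(ss):
      ss->attach(tset)              /* schedtune_attach() runs last */

  /* rq->lock is not held across this whole sequence */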

The schedtune implementation breaks here, since can_attach() accounts
for the task move well before the group move is committed. If the task
sleeps right after can_attach(), the sleep is accounted towards the
previous group. This ends up in a disparity of counts between groups.

Consider this race:

TaskA is being moved from root_grp to topapp_grp by TaskB. Right before
the move, root_grp.tasks = 1 and topapp_grp.tasks = 0.

On cpu X
TaskB runs
* cgroup_migrate_execute()
  schedtune_can_attach()
   root_grp.tasks--; topapp_grp.tasks++;
   (root_grp.tasks =  0 and topapp_grp.tasks = 1)

*right at this moment the context switches and TaskA runs.

*TaskA sleeps
 dequeue_task()
   schedtune_dequeue_task()
    schedtune_task_update()
     root_grp.tasks--; // TaskA has not really "switched" groups yet, so
     it decrements from root_grp; however, can_attach() has already
     accounted for the move, and this leaves us with
     root_grp.tasks = 0 (it is protected from going negative)
     topapp_grp.tasks = 1

Now even when cpu X is idle (TaskA is long gone, sleeping), its
topapp_grp.tasks stays positive and the cpu is subject to topapp's
boost unnecessarily.

An easy way to fix this is to move the group-change accounting to the
attach() callback, which gets called _after_ css_set_move_task(). Also
maintain the task's current index in struct task_struct as it moves
between groups. The task's enqueue/dequeue is accounted against this
cached index. If the task dequeues just before the group changes, it
gets subtracted from the old group, which is correct because the task
would have bumped up the old group's count at enqueue. If the task
changes groups while it is runnable, the attach() callback has to
decrement the old group's count and increment the new group's count so
that the next dequeue subtracts from the new group. IOW, the attach()
callback has to account only for runnable tasks, but has to update the
cached index for both runnable and sleeping tasks.
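
In pseudocode, the accounting rule amounts to the following (a simplified
sketch; the actual implementation is the schedtune_attach() hunk in the
diff below):

  schedtune_attach(tset):
    for each task in tset:
      src_idx = task->stune_idx            /* cached, old group index */
      dst_idx = task_schedtune(task)->idx  /* new group, already committed */
      task->stune_idx = dst_idx            /* always refresh the cache */
      if task_on_rq_queued(task):
        /* only a queued task contributed to the old group's count,
         * so move that contribution to the new group */
        bg->group[src_idx].tasks--
        bg->group[dst_idx].tasks++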

The current code uses a task->on_rq != 0 check to determine whether a
task is queued on the runqueue. This check is incorrect because
task->on_rq is set to TASK_ON_RQ_MIGRATING (value 2) during migration.
Fix this by using task_on_rq_queued() to check whether a task is
queued.
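
For reference, the relevant definitions (paraphrased from mainline's
kernel/sched/sched.h; exact placement may differ per kernel version)
look like:

  /* task_struct::on_rq states: */
  #define TASK_ON_RQ_QUEUED      1
  #define TASK_ON_RQ_MIGRATING   2

  static inline int task_on_rq_queued(struct task_struct *p)
  {
          return p->on_rq == TASK_ON_RQ_QUEUED;
  }

so a bare p->on_rq != 0 test also treats a migrating task as queued.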

Change-Id: If412da5a239c18d9122cfad2be59b355c14c068f
Signed-off-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
Co-developed-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>

include/linux/sched.h

@@ -885,6 +885,9 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
#ifdef CONFIG_SCHED_TUNE
int stune_idx;
#endif
struct sched_dl_entity dl;

init/init_task.c

@@ -93,6 +93,9 @@ struct task_struct init_task
#endif
#ifdef CONFIG_CGROUP_SCHED
.sched_task_group = &root_task_group,
#endif
#ifdef CONFIG_SCHED_TUNE
.stune_idx = 0,
#endif
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),

kernel/sched/tune.c

@@ -420,7 +420,6 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
unsigned long irq_flags;
struct schedtune *st;
int idx;
if (unlikely(!schedtune_initialized))
@@ -432,90 +431,16 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
* do_exit()::cgroup_exit() and task migration.
*/
raw_spin_lock_irqsave(&bg->lock, irq_flags);
rcu_read_lock();
st = task_schedtune(p);
idx = st->idx;
idx = p->stune_idx;
schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
rcu_read_unlock();
raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}
int schedtune_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
struct boost_groups *bg;
struct rq_flags rq_flags;
unsigned int cpu;
struct rq *rq;
int src_bg; /* Source boost group index */
int dst_bg; /* Destination boost group index */
int tasks;
u64 now;
if (unlikely(!schedtune_initialized))
return 0;
cgroup_taskset_for_each(task, css, tset) {
/*
* Lock the CPU's RQ the task is enqueued to avoid race
* conditions with migration code while the task is being
* accounted
*/
rq = task_rq_lock(task, &rq_flags);
if (!task->on_rq) {
task_rq_unlock(rq, task, &rq_flags);
continue;
}
/*
* Boost group accouting is protected by a per-cpu lock and requires
* interrupt to be disabled to avoid race conditions on...
*/
cpu = cpu_of(rq);
bg = &per_cpu(cpu_boost_groups, cpu);
raw_spin_lock(&bg->lock);
dst_bg = css_st(css)->idx;
src_bg = task_schedtune(task)->idx;
/*
* Current task is not changing boostgroup, which can
* happen when the new hierarchy is in use.
*/
if (unlikely(dst_bg == src_bg)) {
raw_spin_unlock(&bg->lock);
task_rq_unlock(rq, task, &rq_flags);
continue;
}
/*
* This is the case of a RUNNABLE task which is switching its
* current boost group.
*/
/* Move task from src to dst boost group */
tasks = bg->group[src_bg].tasks - 1;
bg->group[src_bg].tasks = max(0, tasks);
bg->group[dst_bg].tasks += 1;
/* Update boost hold start for this group */
now = sched_clock_cpu(cpu);
bg->group[dst_bg].ts = now;
/* Force boost group re-evaluation at next boost check */
bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
raw_spin_unlock(&bg->lock);
task_rq_unlock(rq, task, &rq_flags);
}
return 0;
}
@@ -580,7 +505,6 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
unsigned long irq_flags;
struct schedtune *st;
int idx;
if (unlikely(!schedtune_initialized))
@@ -591,14 +515,11 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu)
* interrupt to be disabled to avoid race conditions on...
*/
raw_spin_lock_irqsave(&bg->lock, irq_flags);
rcu_read_lock();
st = task_schedtune(p);
idx = st->idx;
idx = p->stune_idx;
schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
rcu_read_unlock();
raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}
@@ -677,11 +598,19 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
return st->boost;
}
#ifdef CONFIG_SCHED_WALT
static void schedtune_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
struct boost_groups *bg;
struct rq_flags rq_flags;
unsigned int cpu;
struct rq *rq;
int src_idx; /* Source boost group index */
int dst_idx; /* Destination boost group index */
int tasks;
u64 now;
#ifdef CONFIG_SCHED_WALT
struct schedtune *st;
bool colocate;
@@ -692,13 +621,68 @@ static void schedtune_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sync_cgroup_colocation(task, colocate);
}
#else
static void schedtune_attach(struct cgroup_taskset *tset)
{
}
#endif
cgroup_taskset_for_each(task, css, tset) {
/*
* Lock the CPU's RQ the task is enqueued to avoid race
* conditions with migration code while the task is being
* accounted
*/
rq = task_rq_lock(task, &rq_flags);
/*
* Boost group accouting is protected by a per-cpu lock and
* requires interrupt to be disabled to avoid race conditions
* on...
*/
cpu = cpu_of(rq);
bg = &per_cpu(cpu_boost_groups, cpu);
raw_spin_lock(&bg->lock);
dst_idx = task_schedtune(task)->idx;
src_idx = task->stune_idx;
/*
* Current task is not changing boostgroup, which can
* happen when the new hierarchy is in use.
*/
if (unlikely(dst_idx == src_idx)) {
raw_spin_unlock(&bg->lock);
task_rq_unlock(rq, task, &rq_flags);
continue;
}
task->stune_idx = dst_idx;
if (!task_on_rq_queued(task)) {
raw_spin_unlock(&bg->lock);
task_rq_unlock(rq, task, &rq_flags);
continue;
}
/*
* This is the case of a RUNNABLE task which is switching its
* current boost group.
*/
/* Move task from src to dst boost group */
tasks = bg->group[src_idx].tasks - 1;
bg->group[src_idx].tasks = max(0, tasks);
bg->group[dst_idx].tasks += 1;
/* Update boost hold start for this group */
now = sched_clock_cpu(cpu);
bg->group[dst_idx].ts = now;
/* Force boost group re-evaluation at next boost check */
bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
raw_spin_unlock(&bg->lock);
task_rq_unlock(rq, task, &rq_flags);
}
}
static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
s64 boost)
@@ -831,8 +815,8 @@ struct cgroup_subsys schedtune_cgrp_subsys = {
.css_alloc = schedtune_css_alloc,
.css_free = schedtune_css_free,
.attach = schedtune_attach,
.can_attach = schedtune_can_attach,
.cancel_attach = schedtune_cancel_attach,
.can_attach = schedtune_can_attach,
.cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,
.early_init = 1,
};