[PATCH 0/2] sched/fair: Skip sched avgs update for new group task before wake_up_new_task()


Yuyang Du
Hi Peter,

Vincent reported this problem, which has been present for a while. The first
patch is a cleanup that also paves the way for the fix.

Thanks,
Yuyang

---
Yuyang Du (2):
  sched/fair: Clean up attach_entity_load_avg()
  sched/fair: Skip detach and attach load avgs for new group task

 kernel/sched/auto_group.c |    2 +-
 kernel/sched/core.c       |    8 +++---
 kernel/sched/fair.c       |   62 ++++++++++++++++++++++++---------------------
 kernel/sched/sched.h      |    4 +--
 4 files changed, 40 insertions(+), 36 deletions(-)

--
1.7.9.5

[PATCH 1/2] sched/fair: Clean up attach_entity_load_avg()

Yuyang Du
attach_entity_load_avg() is called (indirectly) from:

 - switched_to_fair(): a task switches from another class to the fair class
 - task_move_group_fair(): a task moves between task groups
 - enqueue_entity_load_avg(): an entity is enqueued

Of these, only in switched_to_fair() can the task's last_update_time be
non-zero, so only that path needs to update (age) the task's sched avgs
before attaching them. Move the aging into switched_to_fair() accordingly.
In addition, clarify the code comments.

No functional change intended.
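
For reference, a minimal sketch of the aging step this patch relocates,
distilled from the hunks below (the helper name age_task_load_avg() is made
up for illustration; the body mirrors the code being moved):

	/*
	 * Decay the entity's stale averages up to the cfs_rq clock before
	 * they are attached.  Of the three callers above, only
	 * switched_to_fair() can see a non-zero last_update_time, because
	 * migration between CPUs or cgroups clears it beforehand.
	 */
	static void age_task_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
	{
		if (se->avg.last_update_time)
			__update_load_avg(cfs_rq->avg.last_update_time,
					  cpu_of(rq_of(cfs_rq)),
					  &se->avg, 0, 0, NULL);
	}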

Signed-off-by: Yuyang Du <[hidden email]>
---
 kernel/sched/fair.c |   39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e8..e89c39b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2961,24 +2961,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-  &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
  se->avg.last_update_time = cfs_rq->avg.last_update_time;
  cfs_rq->avg.load_avg += se->avg.load_avg;
  cfs_rq->avg.load_sum += se->avg.load_sum;
@@ -8418,6 +8400,27 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+ struct sched_entity *se = &p->se;
+
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ goto skip_aging;
+
+ /*
+ * If we change between classes, age the averages before attaching them.
+ */
+ if (se->avg.last_update_time) {
+ __update_load_avg(cfs_rq_of(se)->avg.last_update_time, cpu_of(rq),
+  &se->avg, 0, 0, NULL);
+
+ /*
+ * XXX: we could have just aged the entire load away if we've been
+ * absent from the fair class for too long.
+ */
+ }
+
+skip_aging:
+#endif
  attach_task_cfs_rq(p);
 
  if (task_on_rq_queued(p)) {
--
1.7.9.5

[PATCH 2/2] sched/fair: Skip detach and attach load avgs for new group task

Yuyang Du
Vincent reported that the first task added to a new task group's cfs_rq
will be attached in attach_task_cfs_rq() and then once more when it is
enqueued (see https://lkml.org/lkml/2016/5/25/388).

Actually, it is worse: attach_task_cfs_rq() is called for a new task even
before init_entity_runnable_average().

Solve this by skipping both the detach and the attach of a new task's
sched avgs in task_move_group_fair(). To do so, we need to know whether
the task has just been forked, so pass this information all the way from
sched_move_task() down to task_move_group_fair().
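
For context, the fork-time sequence being fixed looks roughly like this (an
abbreviated sketch, not a literal trace; cpu_cgroup_fork() is reached from
copy_process() via the cgroup fork callbacks rather than called directly):

	copy_process()
	  cpu_cgroup_fork()
	    sched_move_task(p, /* fork = */ 1)
	      task_move_group_fair(p, 1)
	        attach_task_cfs_rq(p)    /* attach #1 (pre-patch, via
	                                    attach_entity_load_avg()) */
	...
	init_entity_runnable_average(&p->se)
	...
	wake_up_new_task(p)
	  enqueue_entity_load_avg()      /* attach #2 */

With fork == 1, task_move_group_fair() now skips the early detach/attach of
the sched avgs, leaving the attach at enqueue time as the only one.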

Reported-by: Vincent Guittot <[hidden email]>
Signed-off-by: Yuyang Du <[hidden email]>
---
 kernel/sched/auto_group.c |    2 +-
 kernel/sched/core.c       |    8 ++++----
 kernel/sched/fair.c       |   23 ++++++++++++-----------
 kernel/sched/sched.h      |    4 ++--
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966c..e5f0be2 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -143,7 +143,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
  goto out;
 
  for_each_thread(p, t)
- sched_move_task(t);
+ sched_move_task(t, 0);
 out:
  unlock_task_sighand(p, &flags);
  autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4..8585032 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7724,7 +7724,7 @@ void sched_offline_group(struct task_group *tg)
  * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
  * reflect its new group.
  */
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, int fork)
 {
  struct task_group *tg;
  int queued, running;
@@ -7753,7 +7753,7 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
  if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ tsk->sched_class->task_move_group(tsk, fork);
  else
 #endif
  set_task_rq(tsk, task_cpu(tsk));
@@ -8186,7 +8186,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 
 static void cpu_cgroup_fork(struct task_struct *task)
 {
- sched_move_task(task);
+ sched_move_task(task, 1);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -8213,7 +8213,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
  struct cgroup_subsys_state *css;
 
  cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, 0);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e89c39b..e5a61b1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2970,6 +2970,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  cfs_rq_util_change(cfs_rq);
 }
 
+/* Catch up with the cfs_rq and then remove our sched avgs from it */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
  __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -8368,9 +8369,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
  place_entity(cfs_rq, se, 0);
  se->vruntime -= cfs_rq->min_vruntime;
  }
-
- /* Catch up with the cfs_rq and remove our load when we leave */
- detach_entity_load_avg(cfs_rq, se);
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
@@ -8386,9 +8384,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
  se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
 
- /* Synchronize task with its cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
-
  if (!vruntime_normalized(p))
  se->vruntime += cfs_rq->min_vruntime;
 }
@@ -8396,6 +8391,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
  detach_task_cfs_rq(p);
+ detach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
@@ -8422,6 +8418,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 skip_aging:
 #endif
  attach_task_cfs_rq(p);
+ attach_entity_load_avg(cfs_rq_of(se), se);
 
  if (task_on_rq_queued(p)) {
  /*
@@ -8468,16 +8465,20 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p)
+static void task_move_group_fair(struct task_struct *p, int fork)
 {
  detach_task_cfs_rq(p);
+ /*
+ * New task does not need detach or attach load (see below)
+ */
+ if (!fork)
+ detach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
+
  set_task_rq(p, task_cpu(p));
 
-#ifdef CONFIG_SMP
- /* Tell se's cfs_rq has been changed -- migrated */
- p->se.avg.last_update_time = 0;
-#endif
  attach_task_cfs_rq(p);
+ if (!fork)
+ attach_entity_load_avg(cfs_rq_of(&p->se), &p->se);
 }
 
 void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f30..58b1259 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -343,7 +343,7 @@ extern void sched_online_group(struct task_group *tg,
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_offline_group(struct task_group *tg);
 
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, int fork);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
@@ -1247,7 +1247,7 @@ struct sched_class {
  void (*update_curr) (struct rq *rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_move_group) (struct task_struct *p, int fork);
 #endif
 };
 
--
1.7.9.5

Re: [PATCH 2/2] sched/fair: Skip detach and attach load avgs for new group task

Vincent Guittot
On 26 May 2016 at 03:14, Yuyang Du <[hidden email]> wrote:

> Vincent reported that the first task added to a new task group's cfs_rq
> will be attached in attach_task_cfs_rq() and then once more when it is
> enqueued (see https://lkml.org/lkml/2016/5/25/388).
>
> Actually, it is worse: attach_task_cfs_rq() is called for a new task even
> before init_entity_runnable_average().
>
> Solve this by skipping both the detach and the attach of a new task's
> sched avgs in task_move_group_fair(). To do so, we need to know whether
> the task has just been forked, so pass this information all the way from
> sched_move_task() down to task_move_group_fair().

Not sure that this is the right way to solve this problem, because you
continue to attach the task twice without detaching it in the meantime:
- once during the copy of the process in cpu_cgroup_fork() (you skip the
attach of the load average, but the task is still attached to the local
cpu); in the meantime, the sched_entity is initialized and
last_update_time is reset
- once more when the task is enqueued, because last_update_time has been
reset (this time you don't skip the attach of load_avg)

Wouldn't it be better to detach the sched_entity with a copy of its
parent's metrics before initializing it and attaching it to the new cpu?
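
(For illustration, a hypothetical sketch of that alternative; the helper
name and placement are assumptions, not a tested patch. On fork, subtract a
copy of the parent's inherited metrics from the cfs_rq, mirroring the
arithmetic in detach_entity_load_avg(), so that the attach at enqueue time
is the only one that counts.)

	/*
	 * Hypothetical sketch only: remove the averages the child
	 * inherited from its parent before they are (re)initialized.
	 */
	static void detach_forked_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
	{
		struct sched_avg copy = se->avg;	/* copied from the parent */

		cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - copy.load_avg, 0);
		cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - copy.load_sum, 0);
		cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - copy.util_avg, 0);
		cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - copy.util_sum, 0);
	}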
