sched: Ensure cpu_power periodic update

With a lot of small tasks, the sched softirq is almost never run when no_hz is enabled. In this case load_balance() is mainly called in newly_idle mode, which doesn't update the cpu_power. Add a next_update field which ensures a maximum update period when there is only short activity. Having stale cpu_power information can skew the load-balancing decisions; this is cured by the guaranteed update. Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1323717668-2143-1-git-send-email-vincent.guittot@linaro.org
2024-11-29 15:41:36 +00:00 · 2011-12-12 20:21:08 +01:00 · 2011-12-12 20:21:08 +01:00 · 4ec4412e1e
commit 4ec4412e1e
parent 39be350127
2 changed files with 17 additions and 8 deletions
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -905,6 +905,7 @@ struct sched_group_power {
 	 * single CPU.
 	 */
 	unsigned int power, power_orig;
+	unsigned long next_update;
 	/*
 	 * Number of busy cpus in this group.
 	 */
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@ -215,6 +215,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,

 const struct sched_class fair_sched_class;

+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /**************************************************************
 * CFS operations on generic schedulable entities:
 */
@ -3776,6 +3778,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgp->next_update = jiffies + interval;

 	if (!child) {
 		update_cpu_power(sd, cpu);
@ -3883,12 +3890,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group) {
-		if (balance_cpu != this_cpu) {
-			*balance = 0;
-			return;
-		}
-		update_group_power(sd, this_cpu);
+	if (local_group) {
+		if (idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != this_cpu) {
+				*balance = 0;
+				return;
+			}
+			update_group_power(sd, this_cpu);
+		} else if (time_after_eq(jiffies, group->sgp->next_update))
+			update_group_power(sd, this_cpu);
 	}

 	/* Adjust by relative CPU power of the group */
@ -4945,8 +4955,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,

 static DEFINE_SPINLOCK(balancing);

-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.