sched/numa: Implement NUMA node level wake_affine()
Since select_idle_sibling() can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way the
load balancer and the numa balancing code do.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jhladky@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
		
							parent
							
								
									7d894e6e34
								
							
						
					
					
						commit
						3fed382b46
					
				| @ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Can a task be moved from prev_cpu to this_cpu without causing a load | ||||
|  * imbalance that would trigger the load balancer? | ||||
|  */ | ||||
| static inline bool numa_wake_affine(struct sched_domain *sd, | ||||
| 				    struct task_struct *p, int this_cpu, | ||||
| 				    int prev_cpu, int sync) | ||||
| { | ||||
| 	struct numa_stats prev_load, this_load; | ||||
| 	s64 this_eff_load, prev_eff_load; | ||||
| 
 | ||||
| 	update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); | ||||
| 	update_numa_stats(&this_load, cpu_to_node(this_cpu)); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If sync wakeup then subtract the (maximum possible) | ||||
| 	 * effect of the currently running task from the load | ||||
| 	 * of the current CPU: | ||||
| 	 */ | ||||
| 	if (sync) { | ||||
| 		unsigned long current_load = task_h_load(current); | ||||
| 
 | ||||
| 		if (this_load.load > current_load) | ||||
| 			this_load.load -= current_load; | ||||
| 		else | ||||
| 			this_load.load = 0; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * In low-load situations, where this_cpu's node is idle due to the | ||||
| 	 * sync cause above having dropped this_load.load to 0, move the task. | ||||
| 	 * Moving to an idle socket will not create a bad imbalance. | ||||
| 	 * | ||||
| 	 * Otherwise check if the nodes are near enough in load to allow this | ||||
| 	 * task to be woken on this_cpu's node. | ||||
| 	 */ | ||||
| 	if (this_load.load > 0) { | ||||
| 		unsigned long task_load = task_h_load(p); | ||||
| 
 | ||||
| 		this_eff_load = 100; | ||||
| 		this_eff_load *= prev_load.compute_capacity; | ||||
| 
 | ||||
| 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||||
| 		prev_eff_load *= this_load.compute_capacity; | ||||
| 
 | ||||
| 		this_eff_load *= this_load.load + task_load; | ||||
| 		prev_eff_load *= prev_load.load - task_load; | ||||
| 
 | ||||
| 		return this_eff_load <= prev_eff_load; | ||||
| 	} | ||||
| 
 | ||||
| 	return true; | ||||
| } | ||||
| #else | ||||
| static void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||||
| { | ||||
| @ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||||
| static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||||
| { | ||||
| } | ||||
| 
 | ||||
/*
 * Fallback used when CONFIG_NUMA_BALANCING is not set: no node load
 * comparison is performed and the affine wakeup is always approved.
 *
 * NOTE(review): callers relying on this for the cross-cache decision get
 * an unconditional "yes" in this configuration - confirm that is intended.
 */
static inline bool
numa_wake_affine(struct sched_domain *sd, struct task_struct *p,
		 int this_cpu, int prev_cpu, int sync)
{
	return true;
}
| #endif /* CONFIG_NUMA_BALANCING */ | ||||
| 
 | ||||
| static void | ||||
| @ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p) | ||||
| static int wake_affine(struct sched_domain *sd, struct task_struct *p, | ||||
| 		       int prev_cpu, int sync) | ||||
| { | ||||
| 	s64 this_load, load; | ||||
| 	s64 this_eff_load, prev_eff_load; | ||||
| 	int idx, this_cpu; | ||||
| 	struct task_group *tg; | ||||
| 	unsigned long weight; | ||||
| 	int balanced; | ||||
| 
 | ||||
| 	idx	  = sd->wake_idx; | ||||
| 	this_cpu  = smp_processor_id(); | ||||
| 	load	  = source_load(prev_cpu, idx); | ||||
| 	this_load = target_load(this_cpu, idx); | ||||
| 	int this_cpu = smp_processor_id(); | ||||
| 	bool affine = false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Common case: CPUs are in the same socket, and select_idle_sibling() | ||||
| 	 * will do its thing regardless of what we return: | ||||
| 	 */ | ||||
| 	if (cpus_share_cache(prev_cpu, this_cpu)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If sync wakeup then subtract the (maximum possible) | ||||
| 	 * effect of the currently running task from the load | ||||
| 	 * of the current CPU: | ||||
| 	 */ | ||||
| 	if (sync) { | ||||
| 		tg = task_group(current); | ||||
| 		weight = current->se.avg.load_avg; | ||||
| 
 | ||||
| 		this_load += effective_load(tg, this_cpu, -weight, -weight); | ||||
| 		load += effective_load(tg, prev_cpu, 0, -weight); | ||||
| 	} | ||||
| 
 | ||||
| 	tg = task_group(p); | ||||
| 	weight = p->se.avg.load_avg; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle | ||||
| 	 * due to the sync cause above having dropped this_load to 0, we'll | ||||
| 	 * always have an imbalance, but there's really nothing you can do | ||||
| 	 * about that, so that's good too. | ||||
| 	 * | ||||
| 	 * Otherwise check if either cpus are near enough in load to allow this | ||||
| 	 * task to be woken on this_cpu. | ||||
| 	 */ | ||||
| 	this_eff_load = 100; | ||||
| 	this_eff_load *= capacity_of(prev_cpu); | ||||
| 
 | ||||
| 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||||
| 	prev_eff_load *= capacity_of(this_cpu); | ||||
| 
 | ||||
| 	if (this_load > 0) { | ||||
| 		this_eff_load *= this_load + | ||||
| 			effective_load(tg, this_cpu, weight, weight); | ||||
| 
 | ||||
| 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | ||||
| 	} | ||||
| 
 | ||||
| 	balanced = this_eff_load <= prev_eff_load; | ||||
| 		affine = true; | ||||
| 	else | ||||
| 		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); | ||||
| 
 | ||||
| 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); | ||||
| 	if (affine) { | ||||
| 		schedstat_inc(sd->ttwu_move_affine); | ||||
| 		schedstat_inc(p->se.statistics.nr_wakeups_affine); | ||||
| 	} | ||||
| 
 | ||||
| 	if (!balanced) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	schedstat_inc(sd->ttwu_move_affine); | ||||
| 	schedstat_inc(p->se.statistics.nr_wakeups_affine); | ||||
| 
 | ||||
| 	return 1; | ||||
| 	return affine; | ||||
| } | ||||
| 
 | ||||
| static inline int task_util(struct task_struct *p); | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user