From 38f83090f515b4b5d59382dfada1e7457f19aa47 Mon Sep 17 00:00:00 2001
From: Christian Loehle
Date: Thu, 5 Sep 2024 10:26:38 +0100
Subject: [PATCH 1/4] cpuidle: menu: Remove iowait influence

Remove CPU iowaiters influence on idle state selection. Remove the menu
governor's notion of a performance multiplier, which increased with the
number of tasks that went to iowait sleep on this CPU and haven't woken
up yet.

Relying on iowait for cpuidle is problematic for a few reasons:
1. There is no guarantee that an iowaiting task will wake up on the
   same CPU.
2. The task being in iowait says nothing about the idle duration; we
   could be selecting shallower states for a long time.
3. The task being in iowait doesn't always imply a performance hit
   with increased latency.
4. If there is such a performance hit, the number of iowaiting tasks
   doesn't directly correlate.
5. The definition of iowait altogether is vague at best; it is
   sprinkled across kernel code.

Signed-off-by: Christian Loehle
Link: https://patch.msgid.link/20240905092645.2885200-2-christian.loehle@arm.com
[ rjw: Minor edits in the changelog ]
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpuidle/governors/menu.c | 76 ++++----------------------------
 1 file changed, 9 insertions(+), 67 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f3c9d49f0f2a..28363bfa3e4c 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,7 @@
 #include "gov.h"
 
-#define BUCKETS 12
+#define BUCKETS 6
 #define INTERVAL_SHIFT 3
 #define INTERVALS (1UL << INTERVAL_SHIFT)
 #define RESOLUTION 1024
@@ -29,12 +29,11 @@
 /*
  * Concepts and ideas behind the menu governor
  *
- * For the menu governor, there are 3 decision factors for picking a C
+ * For the menu governor, there are 2 decision factors for picking a C
  * state:
  * 1) Energy break even point
- * 2) Performance impact
- * 3) Latency tolerance (from pmqos infrastructure)
- * These three factors are treated independently.
+ * 2) Latency tolerance (from pmqos infrastructure)
+ * These two factors are treated independently.
  *
  * Energy break even point
  * -----------------------
@@ -75,30 +74,6 @@
  * intervals and if the standard deviation of these 8 intervals is below a
  * threshold value, we use the average of these intervals as prediction.
  *
- * Limiting Performance Impact
- * ---------------------------
- * C states, especially those with large exit latencies, can have a real
- * noticeable impact on workloads, which is not acceptable for most sysadmins,
- * and in addition, less performance has a power price of its own.
- *
- * As a general rule of thumb, menu assumes that the following heuristic
- * holds:
- *     The busier the system, the less impact of C states is acceptable
- *
- * This rule-of-thumb is implemented using a performance-multiplier:
- * If the exit latency times the performance multiplier is longer than
- * the predicted duration, the C state is not considered a candidate
- * for selection due to a too high performance impact. So the higher
- * this multiplier is, the longer we need to be idle to pick a deep C
- * state, and thus the less likely a busy CPU will hit such a deep
- * C state.
- *
- * Currently there is only one value determining the factor:
- * 10 points are added for each process that is waiting for IO on this CPU.
- * (This value was experimentally determined.)
- * Utilization is no longer a factor as it was shown that it never contributed
- * significantly to the performance multiplier in the first place.
- *
  */
 
 struct menu_device {
@@ -112,19 +87,10 @@ struct menu_device {
 	int interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns)
 {
 	int bucket = 0;
 
-	/*
-	 * We keep two groups of stats; one with no
-	 * IO pending, one without.
-	 * This allows us to calculate
-	 * E(duration)|iowait
-	 */
-	if (nr_iowaiters)
-		bucket = BUCKETS/2;
-
 	if (duration_ns < 10ULL * NSEC_PER_USEC)
 		return bucket;
 	if (duration_ns < 100ULL * NSEC_PER_USEC)
@@ -138,19 +104,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 	return bucket + 5;
 }
 
-/*
- * Return a multiplier for the exit latency that is intended
- * to take performance requirements into account.
- * The more performance critical we estimate the system
- * to be, the higher this multiplier, and thus the higher
- * the barrier to go to an expensive C state.
- */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
-{
-	/* for IO wait tasks (per cpu!) we add 10x each */
-	return 1 + 10 * nr_iowaiters;
-}
-
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
 static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 	u64 predicted_ns;
-	u64 interactivity_req;
-	unsigned int nr_iowaiters;
 	ktime_t delta, delta_tick;
 	int i, idx;
 
@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
-	nr_iowaiters = nr_iowait_cpu(dev->cpu);
-
 	/* Find the shortest expected idle interval. */
 	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
 	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		}
 
 		data->next_timer_ns = delta;
-		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+		data->bucket = which_bucket(data->next_timer_ns);
 
 		/* Round up the result for half microseconds. */
 		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		data->next_timer_ns = KTIME_MAX;
 		delta_tick = TICK_NSEC / 2;
-		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+		data->bucket = which_bucket(KTIME_MAX);
 	}
 
 	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		if (predicted_ns < TICK_NSEC)
 			predicted_ns = data->next_timer_ns;
-	} else {
-		/*
-		 * Use the performance multiplier and the user-configurable
-		 * latency_req to determine the maximum exit latency.
-		 */
-		interactivity_req = div64_u64(predicted_ns,
-					      performance_multiplier(nr_iowaiters));
-		if (latency_req > interactivity_req)
-			latency_req = interactivity_req;
+	} else if (latency_req > predicted_ns) {
+		latency_req = predicted_ns;
 	}
 
 	/*

From ee702fdaf1563078758065aca7beb5e5d89367e5 Mon Sep 17 00:00:00 2001
From: Shen Lichuan
Date: Fri, 27 Sep 2024 16:10:18 +0800
Subject: [PATCH 2/4] cpuidle: Correct some typos in comments

Fix some confusing typos in code comments that were identified with
codespell. The details are as follows:

drivers/cpuidle/cpuidle-arm.c:142: registeration ==> registration
drivers/cpuidle/cpuidle-qcom-spm.c:51: accidently ==> accidentally
drivers/cpuidle/cpuidle.c:409: dependant ==> dependent
drivers/cpuidle/driver.c:264: occuring ==> occurring
drivers/cpuidle/driver.c:299: occuring ==> occurring

Signed-off-by: Shen Lichuan
Link: https://patch.msgid.link/20240927081018.8608-1-shenlichuan@vivo.com
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpuidle/cpuidle-arm.c      | 2 +-
 drivers/cpuidle/cpuidle-qcom-spm.c | 2 +-
 drivers/cpuidle/cpuidle.c          | 2 +-
 drivers/cpuidle/driver.c           | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index 7cfb980a357d..caba6f4bb1b7 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -139,7 +139,7 @@ out_kfree_drv:
  *
  * Initializes arm cpuidle driver for all CPUs, if any CPU fails
  * to register cpuidle driver then rollback to cancel all CPUs
- * registeration.
+ * registration.
  */
 static int __init arm_idle_init(void)
 {
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
index 1fc9968eae19..3ab240e0e122 100644
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -48,7 +48,7 @@ static int qcom_cpu_spc(struct spm_driver_data *drv)
 	ret = cpu_suspend(0, qcom_pm_collapse);
 	/*
 	 * ARM common code executes WFI without calling into our driver and
-	 * if the SPM mode is not reset, then we may accidently power down the
+	 * if the SPM mode is not reset, then we may accidentally power down the
 	 * cpu when we intended only to gate the cpu clock.
 	 * Ensure the state is set to standby before returning.
 	 */
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 9e418aec1755..06ace16f9e71 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -406,7 +406,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
  * Min polling interval of 10usec is a guess. It is assuming that
  * for most users, the time for a single ping-pong workload like
  * perf bench pipe would generally complete within 10usec but
- * this is hardware dependant. Actual time can be estimated with
+ * this is hardware dependent. Actual time can be estimated with
  *
  * perf bench sched pipe -l 10000
  *
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index cf5873cc45dc..9bbfa594c442 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -261,7 +261,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
  * @drv: a pointer to a valid struct cpuidle_driver
  *
  * Register the driver under a lock to prevent concurrent attempts to
- * [un]register the driver from occuring at the same time.
+ * [un]register the driver from occurring at the same time.
  *
  * Returns 0 on success, a negative error code (returned by
 * __cpuidle_register_driver()) otherwise.
@@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver);
  * @drv: a pointer to a valid struct cpuidle_driver
  *
  * Unregisters the cpuidle driver under a lock to prevent concurrent attempts
- * to [un]register the driver from occuring at the same time. @drv has to
+ * to [un]register the driver from occurring at the same time. @drv has to
  * match the currently registered driver.
  */
 void cpuidle_unregister_driver(struct cpuidle_driver *drv)

From 5609296750afd6462a4d994b6803ccc5e8bf1d4e Mon Sep 17 00:00:00 2001
From: Lukasz Luba
Date: Wed, 30 Oct 2024 16:39:51 +0000
Subject: [PATCH 3/4] PM: EM: Add min/max available performance state limits

On some devices there are HW dependencies for shared frequency and
voltage between devices. This impacts Energy Aware Scheduler (EAS)
decisions, where CPUs share the voltage and frequency domain with
other CPUs or devices, e.g.:
- Mid CPUs + Big CPU
- Little CPU + L3 cache in DSU
- some other device + Little CPUs

Detailed explanation of one example: when the L3 cache frequency is
increased, the affected Little CPUs might run at a higher voltage and
frequency. That higher voltage causes higher CPU power and thus more
energy is used for running the tasks. This is important for background
tasks, which try to run on energy-efficient CPUs.

Therefore, add performance state limits which are applied to the device
(in this case a CPU). This is important on SoCs with the HW dependencies
mentioned above, so that the Energy Aware Scheduler (EAS) does not use
performance states outside the valid min-max range for energy
calculation.

Signed-off-by: Lukasz Luba
Link: https://patch.msgid.link/20241030164126.1263793-2-lukasz.luba@arm.com
Signed-off-by: Rafael J. Wysocki
---
 include/linux/energy_model.h | 29 ++++++++++++++------
 kernel/power/energy_model.c  | 52 ++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1ff52020cf75..752e0b297582 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -55,6 +55,8 @@ struct em_perf_table {
  * struct em_perf_domain - Performance domain
  * @em_table: Pointer to the runtime modifiable em_perf_table
  * @nr_perf_states: Number of performance states
+ * @min_perf_state: Minimum allowed Performance State index
+ * @max_perf_state: Maximum allowed Performance State index
  * @flags: See "em_perf_domain flags"
  * @cpus: Cpumask covering the CPUs of the domain. It's here
  *	for performance reasons to avoid potential cache
@@ -70,6 +72,8 @@ struct em_perf_table {
 struct em_perf_domain {
 	struct em_perf_table __rcu *em_table;
 	int nr_perf_states;
+	int min_perf_state;
+	int max_perf_state;
 	unsigned long flags;
 	unsigned long cpus[];
 };
@@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
 			 int nr_states);
 int em_dev_update_chip_binning(struct device *dev);
+int em_update_performance_limits(struct em_perf_domain *pd,
+				 unsigned long freq_min_khz, unsigned long freq_max_khz);
 
 /**
  * em_pd_get_efficient_state() - Get an efficient performance state from the EM
  * @table: List of performance states, in ascending order
- * @nr_perf_states: Number of performance states
+ * @pd: performance domain for which this must be done
  * @max_util: Max utilization to map with the EM
- * @pd_flags: Performance Domain flags
  *
  * It is called from the scheduler code quite frequently and as a consequence
 * doesn't implement any check.
@@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
  * requirement.
  */
 static inline int
-em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
-			  unsigned long max_util, unsigned long pd_flags)
+em_pd_get_efficient_state(struct em_perf_state *table,
+			  struct em_perf_domain *pd, unsigned long max_util)
 {
+	unsigned long pd_flags = pd->flags;
+	int min_ps = pd->min_perf_state;
+	int max_ps = pd->max_perf_state;
 	struct em_perf_state *ps;
 	int i;
 
-	for (i = 0; i < nr_perf_states; i++) {
+	for (i = min_ps; i <= max_ps; i++) {
 		ps = &table[i];
 		if (ps->performance >= max_util) {
 			if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
 		}
 	}
 
-	return nr_perf_states - 1;
+	return max_ps;
 }
 
 /**
@@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * requested performance.
 	 */
 	em_table = rcu_dereference(pd->em_table);
-	i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
-				      max_util, pd->flags);
+	i = em_pd_get_efficient_state(em_table->state, pd, max_util);
 	ps = &em_table->state[i];
 
 	/*
@@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
 {
 	return -EINVAL;
 }
+static inline
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55ba0b3..d07faf42eace 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		goto unlock;
 
 	dev->em_pd->flags |= flags;
+	dev->em_pd->min_perf_state = 0;
+	dev->em_pd->max_perf_state = nr_states - 1;
 
 	em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
 
@@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
 	return em_recalc_and_update(dev, pd, em_table);
 }
 EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ *				limits information.
+ * @pd: Performance Domain with EM that has to be updated.
+ * @freq_min_khz: New minimum allowed frequency for this device.
+ * @freq_max_khz: New maximum allowed frequency for this device.
+ *
+ * This function allows the EM to be updated with information about the
+ * available performance levels. It takes the minimum and maximum frequency
+ * in kHz and does the internal translation to performance levels.
+ * Returns 0 on success or -EINVAL on failure.
+ */ +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz) +{ + struct em_perf_state *table; + int min_ps = -1; + int max_ps = -1; + int i; + + if (!pd) + return -EINVAL; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + if (freq_min_khz == table[i].frequency) + min_ps = i; + if (freq_max_khz == table[i].frequency) + max_ps = i; + } + rcu_read_unlock(); + + /* Only update when both are found and sane */ + if (min_ps < 0 || max_ps < 0 || max_ps < min_ps) + return -EINVAL; + + + /* Guard simultaneous updates and make them atomic */ + mutex_lock(&em_pd_mutex); + pd->min_perf_state = min_ps; + pd->max_perf_state = max_ps; + mutex_unlock(&em_pd_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(em_update_performance_limits); From f557e0d1c2e6eb6af6d4468ed2c0ee91829370e2 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 7 Nov 2024 13:56:07 +0200 Subject: [PATCH 4/4] intel_idle: add Granite Rapids Xeon D support Add Granite Rapids Xeon D C-states support: C1, C1E, C6, and C6P. The C-states are basically the same as in Granite Rapids Xeon SP/AP, but characteristics (latency, target residency) are a bit different. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20241107115608.52233-1-artem.bityutskiy@linux.intel.com [ rjw: Changelog edit ] Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 67aebfe0fed6..ac4d8faa3886 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1069,6 +1069,47 @@ static struct cpuidle_state gnr_cstates[] __initdata = { .enter = NULL } }; +static struct cpuidle_state gnrd_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 4, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | + CPUIDLE_FLAG_INIT_XSTATE | + CPUIDLE_FLAG_PARTIAL_HINT_MATCH, + .exit_latency = 220, + .target_residency = 650, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6P", + .desc = "MWAIT 0x21", + .flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED | + CPUIDLE_FLAG_INIT_XSTATE | + CPUIDLE_FLAG_PARTIAL_HINT_MATCH, + .exit_latency = 240, + .target_residency = 750, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static struct cpuidle_state atom_cstates[] __initdata = { { .name = "C1E", @@ -1508,6 +1549,12 @@ static const struct idle_cpu idle_cpu_gnr __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_gnrd __initconst = { + .state_table = gnrd_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct idle_cpu idle_cpu_avn __initconst = { .state_table = avn_cstates, .disable_promotion_to_c1e = true, @@ -1593,6 +1640,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &idle_cpu_gnr), + 
+	X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &idle_cpu_gnrd),
 	X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &idle_cpu_knl),
 	X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &idle_cpu_knl),
 	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &idle_cpu_bxt),
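To make the effect of patch 1 concrete, here is a minimal user-space sketch of the exit-latency cap before and after the change. This is not kernel code: the helper names and the sample numbers are made up for illustration, and only the two formulas come from the patch.

/*
 * Sketch of menu's exit-latency cap, before and after patch 1.
 * Compile with: cc -o cap cap.c
 */
#include <stdint.h>
#include <stdio.h>

/* Before: the predicted duration was shrunk by the iowait-based multiplier. */
static int64_t cap_before(int64_t latency_req, uint64_t predicted_ns,
			  unsigned int nr_iowaiters)
{
	uint64_t interactivity_req = predicted_ns / (1 + 10 * nr_iowaiters);

	return latency_req > (int64_t)interactivity_req ?
	       (int64_t)interactivity_req : latency_req;
}

/* After: the predicted idle duration itself caps the exit latency. */
static int64_t cap_after(int64_t latency_req, uint64_t predicted_ns)
{
	return latency_req > (int64_t)predicted_ns ?
	       (int64_t)predicted_ns : latency_req;
}

int main(void)
{
	/* With 3 iowaiters, the old cap was 31x tighter than the new one. */
	printf("before: %lld ns\n", (long long)cap_before(INT64_MAX, 620000, 3));
	printf("after:  %lld ns\n", (long long)cap_after(INT64_MAX, 620000));
	return 0;
}

The removed heuristic would have rejected any C-state with an exit latency above 20000 ns in this scenario; after the patch the governor tolerates up to the full predicted 620000 ns, subject only to the PM QoS limit.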
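Similarly, a hedged sketch of how a platform driver might consume em_update_performance_limits() from patch 3. The caller name and frequency values are hypothetical; em_cpu_get() is the existing EM lookup helper, and em_update_performance_limits() is the interface the patch adds.

/* Hypothetical caller of the new EM limits interface (illustrative only). */
#include <linux/energy_model.h>
#include <linux/errno.h>

static int constrain_cpu_perf(int cpu)
{
	struct em_perf_domain *pd = em_cpu_get(cpu);

	if (!pd)
		return -ENODEV;

	/*
	 * Both values must exactly match em_perf_state frequencies (in kHz),
	 * otherwise em_update_performance_limits() returns -EINVAL.
	 * 500 MHz and 1.5 GHz are made-up example OPPs.
	 */
	return em_update_performance_limits(pd, 500000, 1500000);
}

Note that the matching is deliberately exact: the function scans the EM table for equal frequencies and rejects anything else, which keeps the resulting min/max performance state indices unambiguous.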