diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index 7cfb980a357d..caba6f4bb1b7 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -139,7 +139,7 @@ out_kfree_drv:
  *
  * Initializes arm cpuidle driver for all CPUs, if any CPU fails
  * to register cpuidle driver then rollback to cancel all CPUs
- * registeration.
+ * registration.
  */
 static int __init arm_idle_init(void)
 {
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
index 1fc9968eae19..3ab240e0e122 100644
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -48,7 +48,7 @@ static int qcom_cpu_spc(struct spm_driver_data *drv)
 	ret = cpu_suspend(0, qcom_pm_collapse);
 	/*
 	 * ARM common code executes WFI without calling into our driver and
-	 * if the SPM mode is not reset, then we may accidently power down the
+	 * if the SPM mode is not reset, then we may accidentally power down the
 	 * cpu when we intended only to gate the cpu clock.
 	 * Ensure the state is set to standby before returning.
 	 */
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 9e418aec1755..06ace16f9e71 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -406,7 +406,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
  * Min polling interval of 10usec is a guess. It is assuming that
  * for most users, the time for a single ping-pong workload like
  * perf bench pipe would generally complete within 10usec but
- * this is hardware dependant. Actual time can be estimated with
+ * this is hardware dependent. Actual time can be estimated with
  *
  * perf bench sched pipe -l 10000
  *
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index cf5873cc45dc..9bbfa594c442 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -261,7 +261,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
  * @drv: a pointer to a valid struct cpuidle_driver
  *
  * Register the driver under a lock to prevent concurrent attempts to
- * [un]register the driver from occuring at the same time.
+ * [un]register the driver from occurring at the same time.
  *
  * Returns 0 on success, a negative error code (returned by
  * __cpuidle_register_driver()) otherwise.
@@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver);
  * @drv: a pointer to a valid struct cpuidle_driver
  *
 * Unregisters the cpuidle driver under a lock to prevent concurrent attempts
- * to [un]register the driver from occuring at the same time. @drv has to
+ * to [un]register the driver from occurring at the same time. @drv has to
 * match the currently registered driver.
 */
 void cpuidle_unregister_driver(struct cpuidle_driver *drv)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f3c9d49f0f2a..28363bfa3e4c 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,7 @@
 
 #include "gov.h"
 
-#define BUCKETS 12
+#define BUCKETS 6
 #define INTERVAL_SHIFT 3
 #define INTERVALS (1UL << INTERVAL_SHIFT)
 #define RESOLUTION 1024
@@ -29,12 +29,11 @@
 /*
  * Concepts and ideas behind the menu governor
  *
- * For the menu governor, there are 3 decision factors for picking a C
+ * For the menu governor, there are 2 decision factors for picking a C
  * state:
  * 1) Energy break even point
- * 2) Performance impact
- * 3) Latency tolerance (from pmqos infrastructure)
- * These three factors are treated independently.
+ * 2) Latency tolerance (from pmqos infrastructure)
+ * These two factors are treated independently.
  *
  * Energy break even point
  * -----------------------
@@ -75,30 +74,6 @@
  * intervals and if the stand deviation of these 8 intervals is below a
  * threshold value, we use the average of these intervals as prediction.
  *
- * Limiting Performance Impact
- * ---------------------------
- * C states, especially those with large exit latencies, can have a real
- * noticeable impact on workloads, which is not acceptable for most sysadmins,
- * and in addition, less performance has a power price of its own.
- *
- * As a general rule of thumb, menu assumes that the following heuristic
- * holds:
- *     The busier the system, the less impact of C states is acceptable
- *
- * This rule-of-thumb is implemented using a performance-multiplier:
- *     If the exit latency times the performance multiplier is longer than
- *     the predicted duration, the C state is not considered a candidate
- *     for selection due to a too high performance impact. So the higher
- *     this multiplier is, the longer we need to be idle to pick a deep C
- *     state, and thus the less likely a busy CPU will hit such a deep
- *     C state.
- *
- * Currently there is only one value determining the factor:
- *   10 points are added for each process that is waiting for IO on this CPU.
- *   (This value was experimentally determined.)
- * Utilization is no longer a factor as it was shown that it never contributed
- * significantly to the performance multiplier in the first place.
- *
  */
 
 struct menu_device {
@@ -112,19 +87,10 @@ struct menu_device {
 	int interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns)
 {
 	int bucket = 0;
 
-	/*
-	 * We keep two groups of stats; one with no
-	 * IO pending, one without.
-	 * This allows us to calculate
-	 * E(duration)|iowait
-	 */
-	if (nr_iowaiters)
-		bucket = BUCKETS/2;
-
 	if (duration_ns < 10ULL * NSEC_PER_USEC)
 		return bucket;
 	if (duration_ns < 100ULL * NSEC_PER_USEC)
@@ -138,19 +104,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 	return bucket + 5;
 }
 
-/*
- * Return a multiplier for the exit latency that is intended
- * to take performance requirements into account.
- * The more performance critical we estimate the system
- * to be, the higher this multiplier, and thus the higher
- * the barrier to go to an expensive C state.
- */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
-{
-	/* for IO wait tasks (per cpu!) we add 10x each */
-	return 1 + 10 * nr_iowaiters;
-}
-
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
 static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 	u64 predicted_ns;
-	u64 interactivity_req;
-	unsigned int nr_iowaiters;
 	ktime_t delta, delta_tick;
 	int i, idx;
 
@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
-	nr_iowaiters = nr_iowait_cpu(dev->cpu);
-
 	/* Find the shortest expected idle interval. */
 	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
 	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		}
 
 		data->next_timer_ns = delta;
-		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+		data->bucket = which_bucket(data->next_timer_ns);
 
 		/* Round up the result for half microseconds. */
 		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		data->next_timer_ns = KTIME_MAX;
 		delta_tick = TICK_NSEC / 2;
-		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+		data->bucket = which_bucket(KTIME_MAX);
 	}
 
 	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		if (predicted_ns < TICK_NSEC)
 			predicted_ns = data->next_timer_ns;
-	} else {
-		/*
-		 * Use the performance multiplier and the user-configurable
-		 * latency_req to determine the maximum exit latency.
-		 */
-		interactivity_req = div64_u64(predicted_ns,
-					      performance_multiplier(nr_iowaiters));
-		if (latency_req > interactivity_req)
-			latency_req = interactivity_req;
+	} else if (latency_req > predicted_ns) {
+		latency_req = predicted_ns;
 	}
 
 	/*
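Note on the menu.c hunks above: with the iowait bucket split and the performance multiplier gone, the governor is left with a single rule: the PM QoS exit-latency limit is clamped to the predicted idle duration, and the deepest state whose target residency and exit latency both fit is chosen. The standalone sketch below (plain C with illustrative state names and made-up latency/residency numbers, not the kernel's menu_select()) demonstrates that simplified rule.

#include <stdint.h>
#include <stdio.h>

/* Illustrative idle-state description; it mirrors only the two fields the
 * selection rule consults (exit latency and target residency, both in ns). */
struct idle_state {
	const char *name;
	uint64_t exit_latency_ns;
	uint64_t target_residency_ns;
};

/* Pick the deepest state whose target residency fits the predicted idle
 * duration and whose exit latency respects the (clamped) latency limit.
 * States are assumed to be ordered from shallowest to deepest. */
static int pick_state(const struct idle_state *states, int count,
		      uint64_t predicted_ns, uint64_t latency_req_ns)
{
	int i, idx = 0;

	/* The simplification made by the patch: the latency limit is at most
	 * the predicted idle duration, with no iowait-based multiplier. */
	if (latency_req_ns > predicted_ns)
		latency_req_ns = predicted_ns;

	for (i = 0; i < count; i++) {
		if (states[i].target_residency_ns > predicted_ns)
			break;
		if (states[i].exit_latency_ns > latency_req_ns)
			break;
		idx = i;
	}
	return idx;
}

int main(void)
{
	/* Made-up numbers, loosely modeled on a C1/C1E/C6 ladder. */
	const struct idle_state states[] = {
		{ "C1",    1000,   1000 },
		{ "C1E",   4000,   4000 },
		{ "C6",  220000, 650000 },
	};

	printf("predicted 5us -> %s\n",
	       states[pick_state(states, 3, 5000, 2000000)].name);
	printf("predicted 1ms -> %s\n",
	       states[pick_state(states, 3, 1000000, 2000000)].name);
	return 0;
}

A busy CPU with many tasks in iowait is therefore no longer steered away from deep C-states by an artificial multiplier; only the predicted idle length and the latency constraint matter.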
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 67aebfe0fed6..ac4d8faa3886 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1069,6 +1069,47 @@ static struct cpuidle_state gnr_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+static struct cpuidle_state gnrd_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 1,
+		.target_residency = 1,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C1E",
+		.desc = "MWAIT 0x01",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+		.exit_latency = 4,
+		.target_residency = 4,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6",
+		.desc = "MWAIT 0x20",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED |
+					   CPUIDLE_FLAG_INIT_XSTATE |
+					   CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
+		.exit_latency = 220,
+		.target_residency = 650,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6P",
+		.desc = "MWAIT 0x21",
+		.flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED |
+					   CPUIDLE_FLAG_INIT_XSTATE |
+					   CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
+		.exit_latency = 240,
+		.target_residency = 750,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.enter = NULL }
+};
+
 static struct cpuidle_state atom_cstates[] __initdata = {
 	{
 		.name = "C1E",
@@ -1508,6 +1549,12 @@ static const struct idle_cpu idle_cpu_gnr __initconst = {
 	.use_acpi = true,
 };
 
+static const struct idle_cpu idle_cpu_gnrd __initconst = {
+	.state_table = gnrd_cstates,
+	.disable_promotion_to_c1e = true,
+	.use_acpi = true,
+};
+
 static const struct idle_cpu idle_cpu_avn __initconst = {
 	.state_table = avn_cstates,
 	.disable_promotion_to_c1e = true,
@@ -1593,6 +1640,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X,	&idle_cpu_spr),
 	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X,	&idle_cpu_spr),
 	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X,	&idle_cpu_gnr),
+	X86_MATCH_VFM(INTEL_GRANITERAPIDS_D,	&idle_cpu_gnrd),
 	X86_MATCH_VFM(INTEL_XEON_PHI_KNL,	&idle_cpu_knl),
 	X86_MATCH_VFM(INTEL_XEON_PHI_KNM,	&idle_cpu_knl),
 	X86_MATCH_VFM(INTEL_ATOM_GOLDMONT,	&idle_cpu_bxt),
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1ff52020cf75..752e0b297582 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -55,6 +55,8 @@ struct em_perf_table {
  * struct em_perf_domain - Performance domain
  * @em_table:		Pointer to the runtime modifiable em_perf_table
  * @nr_perf_states:	Number of performance states
+ * @min_perf_state:	Minimum allowed Performance State index
+ * @max_perf_state:	Maximum allowed Performance State index
  * @flags:		See "em_perf_domain flags"
  * @cpus:		Cpumask covering the CPUs of the domain. It's here
  *			for performance reasons to avoid potential cache
@@ -70,6 +72,8 @@ struct em_perf_table {
 struct em_perf_domain {
 	struct em_perf_table __rcu *em_table;
 	int nr_perf_states;
+	int min_perf_state;
+	int max_perf_state;
 	unsigned long flags;
 	unsigned long cpus[];
 };
@@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
 			 int nr_states);
 int em_dev_update_chip_binning(struct device *dev);
+int em_update_performance_limits(struct em_perf_domain *pd,
+				 unsigned long freq_min_khz, unsigned long freq_max_khz);
 
 /**
  * em_pd_get_efficient_state() - Get an efficient performance state from the EM
  * @table:		List of performance states, in ascending order
- * @nr_perf_states:	Number of performance states
+ * @pd:			performance domain for which this must be done
  * @max_util:		Max utilization to map with the EM
- * @pd_flags:		Performance Domain flags
  *
  * It is called from the scheduler code quite frequently and as a consequence
  * doesn't implement any check.
@@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
  * requirement.
  */
 static inline int
-em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
-			  unsigned long max_util, unsigned long pd_flags)
+em_pd_get_efficient_state(struct em_perf_state *table,
+			  struct em_perf_domain *pd, unsigned long max_util)
 {
+	unsigned long pd_flags = pd->flags;
+	int min_ps = pd->min_perf_state;
+	int max_ps = pd->max_perf_state;
 	struct em_perf_state *ps;
 	int i;
 
-	for (i = 0; i < nr_perf_states; i++) {
+	for (i = min_ps; i <= max_ps; i++) {
 		ps = &table[i];
 		if (ps->performance >= max_util) {
 			if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
 		}
 	}
 
-	return nr_perf_states - 1;
+	return max_ps;
 }
 
 /**
@@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * requested performance.
 	 */
 	em_table = rcu_dereference(pd->em_table);
-	i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
-				      max_util, pd->flags);
+	i = em_pd_get_efficient_state(em_table->state, pd, max_util);
 	ps = &em_table->state[i];
 
 	/*
@@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
 {
 	return -EINVAL;
 }
+static inline
+int em_update_performance_limits(struct em_perf_domain *pd,
+				 unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
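With min_perf_state and max_perf_state added to struct em_perf_domain, em_pd_get_efficient_state() now walks only the allowed index window and falls back to max_perf_state instead of nr_perf_states - 1. A minimal standalone sketch of that bounded walk follows (mock structures and made-up numbers, not the real header; the real function additionally skips inefficient states when EM_PERF_DOMAIN_SKIP_INEFFICIENCIES is set).

#include <stdio.h>

/* Reduced mock-ups of the EM structures; only the fields needed for the
 * bounded lookup are included. */
struct em_perf_state { unsigned long performance; unsigned long frequency; };
struct em_perf_domain { int nr_perf_states; int min_perf_state; int max_perf_state; };

/* Same walk as the patched em_pd_get_efficient_state(), minus the
 * inefficiency handling: scan [min_perf_state, max_perf_state] only and
 * fall back to max_perf_state when nothing satisfies the request. */
static int efficient_state(const struct em_perf_state *table,
			   const struct em_perf_domain *pd,
			   unsigned long max_util)
{
	int i;

	for (i = pd->min_perf_state; i <= pd->max_perf_state; i++) {
		if (table[i].performance >= max_util)
			return i;
	}
	return pd->max_perf_state;
}

int main(void)
{
	/* Made-up five-state table; performance ascends with frequency (kHz). */
	const struct em_perf_state table[] = {
		{ 256,  600000 }, { 410,  900000 }, { 512, 1200000 },
		{ 768, 1800000 }, { 1024, 2400000 },
	};
	/* Limits exclude states 0 and 4, as if set via em_update_performance_limits(). */
	const struct em_perf_domain pd = { 5, 1, 3 };

	printf("util 100  -> state %d\n", efficient_state(table, &pd, 100));
	printf("util 1000 -> state %d\n", efficient_state(table, &pd, 1000));
	return 0;
}

Even a utilization request above the capped range resolves to state 3 here, so em_cpu_energy() never accounts energy for a performance state the current limits forbid.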
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55ba0b3..d07faf42eace 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		goto unlock;
 
 	dev->em_pd->flags |= flags;
+	dev->em_pd->min_perf_state = 0;
+	dev->em_pd->max_perf_state = nr_states - 1;
 
 	em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
 
@@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
 	return em_recalc_and_update(dev, pd, em_table);
 }
 EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ *				limits information.
+ * @pd			: Performance Domain with EM that has to be updated.
+ * @freq_min_khz	: New minimum allowed frequency for this device.
+ * @freq_max_khz	: New maximum allowed frequency for this device.
+ *
+ * This function allows to update the EM with information about available
+ * performance levels. It takes the minimum and maximum frequency in kHz
+ * and does internal translation to performance levels.
+ * Returns 0 on success or -EINVAL when failed.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+		unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+	struct em_perf_state *table;
+	int min_ps = -1;
+	int max_ps = -1;
+	int i;
+
+	if (!pd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	table = em_perf_state_from_pd(pd);
+
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		if (freq_min_khz == table[i].frequency)
+			min_ps = i;
+		if (freq_max_khz == table[i].frequency)
+			max_ps = i;
+	}
+	rcu_read_unlock();
+
+	/* Only update when both are found and sane */
+	if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+		return -EINVAL;
+
+
+	/* Guard simultaneous updates and make them atomic */
+	mutex_lock(&em_pd_mutex);
+	pd->min_perf_state = min_ps;
+	pd->max_perf_state = max_ps;
+	mutex_unlock(&em_pd_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);
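em_update_performance_limits() is intended to be called by a cpufreq or platform driver whenever the usable frequency range of a performance domain changes, so that EM-based decisions stop considering states outside that range. The sketch below shows one plausible caller; the helper name and the way the new limits are obtained are hypothetical, while em_cpu_get(), pr_warn() and the error codes are the usual kernel facilities.

#include <linux/energy_model.h>
#include <linux/errno.h>
#include <linux/printk.h>

/* Hypothetical driver hook: called when firmware or driver policy narrows
 * the usable frequency range of the CPUs in @cpu's performance domain.
 * Frequencies are in kHz, matching the frequencies stored in the EM table. */
static int example_apply_freq_limits(unsigned int cpu,
				     unsigned long new_min_khz,
				     unsigned long new_max_khz)
{
	struct em_perf_domain *pd;
	int ret;

	pd = em_cpu_get(cpu);
	if (!pd)
		return -ENODEV;

	/*
	 * Both frequencies must match EM table entries exactly; otherwise
	 * em_update_performance_limits() returns -EINVAL and the previously
	 * set limits stay in effect.
	 */
	ret = em_update_performance_limits(pd, new_min_khz, new_max_khz);
	if (ret)
		pr_warn("EM: limits %lu..%lu kHz not found for CPU%u\n",
			new_min_khz, new_max_khz, cpu);

	return ret;
}

Callers do not need locking of their own: the table scan runs under rcu_read_lock() and the publication of min_perf_state/max_perf_state is serialized by em_pd_mutex, as the new function in the patch above shows.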