From aa1a43262ad5df010768f69530fa179ff81651d3 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:22 +0100 Subject: [PATCH 01/15] PM: EM: Fix inefficient states detection Currently, a debug message is printed if an inefficient state is detected in the Energy Model. Unfortunately, it won't detect if the first state is inefficient or if two successive states are. Fix this behavior. Fixes: 27871f7a8a34 (PM: Introduce an Energy Model management framework) Signed-off-by: Vincent Donnefort Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Reviewed-by: Matthias Kaehlcke Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index a332ccd829e2..97e62469a6b3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -107,8 +107,7 @@ static void em_debug_remove_pd(struct device *dev) {} static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, int nr_states, struct em_data_callback *cb) { - unsigned long opp_eff, prev_opp_eff = ULONG_MAX; - unsigned long power, freq, prev_freq = 0; + unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; int i, ret; u64 fmax; @@ -153,27 +152,21 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, table[i].power = power; table[i].frequency = prev_freq = freq; - - /* - * The hertz/watts efficiency ratio should decrease as the - * frequency grows on sane platforms. But this isn't always - * true in practice so warn the user if a higher OPP is more - * power efficient than a lower one. - */ - opp_eff = freq / power; - if (opp_eff >= prev_opp_eff) - dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n", - i, i - 1); - prev_opp_eff = opp_eff; } /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; - for (i = 0; i < nr_states; i++) { + for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res = em_scale_power(table[i].power); table[i].cost = div64_u64(fmax * power_res, table[i].frequency); + if (table[i].cost >= prev_cost) { + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } } pd->table = table; From c8ed99533dbc0fcc1142671ec80acb33045d2999 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:23 +0100 Subject: [PATCH 02/15] PM: EM: Mark inefficient states Some SoCs, such as the sd855 have OPPs within the same performance domain, whose cost is higher than others with a higher frequency. Even though those OPPs are interesting from a cooling perspective, it makes no sense to use them when the device can run at full capacity. Those OPPs handicap the performance domain, when choosing the most energy-efficient CPU and are wasting energy. They are inefficient. Hence, add support for such OPPs to the Energy Model. The table can now be read skipping inefficient performance states (and by extension, inefficient OPPs). Signed-off-by: Vincent Donnefort Reviewed-by: Matthias Kaehlcke Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 12 ++++++++++++ kernel/power/energy_model.c | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 39dcadd492b5..3641ca4acf04 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -17,13 +17,25 @@ * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency + * @flags: see "em_perf_state flags" description below. */ struct em_perf_state { unsigned long frequency; unsigned long power; unsigned long cost; + unsigned long flags; }; +/* + * em_perf_state flags: + * + * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. There is + * in this em_perf_domain, another performance state with a higher frequency + * but a lower or equal power cost. Such inefficient states are ignored when + * using em_pd_get_efficient_*() functions. + */ +#define EM_PERF_STATE_INEFFICIENT BIT(0) + /** * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 97e62469a6b3..6d8438347535 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -2,7 +2,7 @@ /* * Energy Model of devices * - * Copyright (c) 2018-2020, Arm ltd. + * Copyright (c) 2018-2021, Arm ltd. * Written by: Quentin Perret, Arm ltd. * Improvements provided by: Lukasz Luba, Arm ltd. */ @@ -42,6 +42,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) debugfs_create_ulong("frequency", 0444, d, &ps->frequency); debugfs_create_ulong("power", 0444, d, &ps->power); debugfs_create_ulong("cost", 0444, d, &ps->cost); + debugfs_create_ulong("inefficient", 0444, d, &ps->flags); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -162,6 +163,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, table[i].cost = div64_u64(fmax * power_res, table[i].frequency); if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", table[i].frequency); } else { From 88f7a89560f6d0fc7803a8933637488f14e0a098 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:24 +0100 Subject: [PATCH 03/15] PM: EM: Extend em_perf_domain with a flag field Merge the current "milliwatts" option into a "flag" field. This intends to prepare the extension of this structure for inefficient states support in the Energy Model. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 13 ++++++++++--- kernel/power/energy_model.c | 6 ++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3641ca4acf04..671440371a95 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -40,8 +40,7 @@ struct em_perf_state { * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @milliwatts: Flag indicating the power values are in milli-Watts - * or some other scale. + * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. It's here * for performance reasons to avoid potential cache * misses during energy calculations in the scheduler @@ -56,10 +55,18 @@ struct em_perf_state { struct em_perf_domain { struct em_perf_state *table; int nr_perf_states; - int milliwatts; + unsigned long flags; unsigned long cpus[]; }; +/* + * em_perf_domain flags: + * + * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some + * other scale. + */ +#define EM_PERF_DOMAIN_MILLIWATTS BIT(0) + #define em_span_cpus(em) (to_cpumask((em)->cpus)) #ifdef CONFIG_ENERGY_MODEL diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 6d8438347535..3a7d1573b214 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -56,7 +56,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); static int em_debug_units_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; + char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? + "milliWatts" : "bogoWatts"; seq_printf(s, "%s\n", units); @@ -330,7 +331,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; - dev->em_pd->milliwatts = milliwatts; + if (milliwatts) + dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); From 8354eb9eb3ddb4a8d0857648a470beffcc9d8639 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:25 +0100 Subject: [PATCH 04/15] PM: EM: Allow skipping inefficient states The new performance domain flag EM_PERF_DOMAIN_SKIP_INEFFICIENCIES allows to not take into account inefficient states when estimating energy consumption. This intends to let the Energy Model know that CPUFreq itself will skip inefficiencies and such states don't need to be part of the estimation anymore. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 43 +++++++++++++++++++++++++++++++----- kernel/power/energy_model.c | 13 +++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 671440371a95..6377adc3b78d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -64,8 +64,12 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some * other scale. + * + * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating + * energy consumption. */ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) +#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) #define em_span_cpus(em) (to_cpumask((em)->cpus)) @@ -120,6 +124,37 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts); void em_dev_unregister_perf_domain(struct device *dev); +/** + * em_pd_get_efficient_state() - Get an efficient performance state from the EM + * @pd : Performance domain for which we want an efficient frequency + * @freq : Frequency to map with the EM + * + * It is called from the scheduler code quite frequently and as a consequence + * doesn't implement any check. + * + * Return: An efficient performance state, high enough to meet @freq + * requirement. + */ +static inline +struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd, + unsigned long freq) +{ + struct em_perf_state *ps; + int i; + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &pd->table[i]; + if (ps->frequency >= freq) { + if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && + ps->flags & EM_PERF_STATE_INEFFICIENT) + continue; + break; + } + } + + return ps; +} + /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a * performance domain @@ -142,7 +177,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { unsigned long freq, scale_cpu; struct em_perf_state *ps; - int i, cpu; + int cpu; if (!sum_util) return 0; @@ -167,11 +202,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - for (i = 0; i < pd->nr_perf_states; i++) { - ps = &pd->table[i]; - if (ps->frequency >= freq) - break; - } + ps = em_pd_get_efficient_state(pd, freq); /* * The capacity of a CPU in the domain at the performance state (ps) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3a7d1573b214..d353ef29e37f 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -65,6 +65,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_units); +static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; + + seq_printf(s, "%d\n", enabled); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -78,6 +89,8 @@ static void em_debug_create_pd(struct device *dev) &em_debug_cpus_fops); debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, + &em_debug_skip_inefficiencies_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) From 15171769069408789a72f9aa9a52cc931b839b56 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:26 +0100 Subject: [PATCH 05/15] cpufreq: Make policy min/max hard requirements When applying the policy min/max limits, the requested frequency is simply clamped to not be out of range. It means, however, if one of the boundaries isn't an available frequency, the frequency resolution can return a value out of those limits, depending on the relation used. e.g. freq{0,1,2} being available frequencies. freq0 policy->min freq1 policy->max freq2 | | | | | 17kHz 18kHz 19kHz 20kHz 21kHz __resolve_freq(21kHz, CPUFREQ_RELATION_L) -> 21kHz (out of bounds) __resolve_freq(17kHz, CPUFREQ_RELATION_H) -> 17kHz (out of bounds) If, during the policy init, we resolve the requested min/max to existing frequencies, we ensure that any CPUFREQ_RELATION_* would resolve to a frequency which is inside the policy min/max range. Making the policy limits rigid helps to introduce the inefficient frequencies support. Resolving an inefficient frequency to an efficient one should not transgress policy->max (which can be set for thermal reason) and having a value we can trust simplify this comparison. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5782b15a8caa..284e940084c6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2523,8 +2523,15 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (ret) return ret; + /* + * Resolve policy min/max to available frequencies. It ensures + * no frequency resolution will neither overshoot the requested maximum + * nor undershoot the requested minimum. + */ policy->min = new_data.min; policy->max = new_data.max; + policy->min = __resolve_freq(policy, policy->min, CPUFREQ_RELATION_L); + policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H); trace_cpu_frequency_limits(policy); policy->cached_target_freq = UINT_MAX; From 442d24a5c49a8ed1008dcb9c06dc463d12421e7f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:27 +0100 Subject: [PATCH 06/15] cpufreq: Add an interface to mark inefficient frequencies Some SoCs such as the sd855 have OPPs within the same policy whose cost is higher than others with a higher frequency. Those OPPs are inefficients and it might be interesting for a governor to not use them. cpufreq_table_set_inefficient() allows the caller to identify a specified frequency as being inefficient. Inefficient frequencies are only supported on sorted tables. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 44 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ff88bb3e44fc..16997e1ff3fe 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -660,10 +660,11 @@ struct governor_attr { *********************************************************************/ /* Special Values of .frequency field */ -#define CPUFREQ_ENTRY_INVALID ~0u -#define CPUFREQ_TABLE_END ~1u +#define CPUFREQ_ENTRY_INVALID ~0u +#define CPUFREQ_TABLE_END ~1u /* Special Values of .flags field */ -#define CPUFREQ_BOOST_FREQ (1 << 0) +#define CPUFREQ_BOOST_FREQ (1 << 0) +#define CPUFREQ_INEFFICIENT_FREQ (1 << 1) struct cpufreq_frequency_table { unsigned int flags; @@ -1003,6 +1004,36 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy return count; } +/** + * cpufreq_table_set_inefficient() - Mark a frequency as inefficient + * @policy: the &struct cpufreq_policy containing the inefficient frequency + * @frequency: the inefficient frequency + * + * The &struct cpufreq_policy must use a sorted frequency table + * + * Return: %0 on success or a negative errno code + */ + +static inline int +cpufreq_table_set_inefficient(struct cpufreq_policy *policy, + unsigned int frequency) +{ + struct cpufreq_frequency_table *pos; + + /* Not supported */ + if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED) + return -EINVAL; + + cpufreq_for_each_valid_entry(pos, policy->freq_table) { + if (pos->frequency == frequency) { + pos->flags |= CPUFREQ_INEFFICIENT_FREQ; + return 0; + } + } + + return -EINVAL; +} + static inline int parse_perf_domain(int cpu, const char *list_name, const char *cell_name) { @@ -1071,6 +1102,13 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } +static inline int +cpufreq_table_set_inefficient(struct cpufreq_policy *policy, + unsigned int frequency) +{ + return -EINVAL; +} + static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, const char *cell_name, struct cpumask *cpumask) { From 1f39fa0dccff71d4788089b5e617229b19166867 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:28 +0100 Subject: [PATCH 07/15] cpufreq: Introducing CPUFREQ_RELATION_E This newly introduced flag can be applied by a governor to a CPUFreq relation, when looking for a frequency within the policy table. The resolution would then only walk through efficient frequencies. Even with the flag set, the policy max limit will still be honoured. If no efficient frequencies can be found within the limits of the policy, an inefficient one would be returned. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 3 +- drivers/cpufreq/amd_freq_sensitivity.c | 3 +- drivers/cpufreq/cpufreq.c | 10 ++- drivers/cpufreq/cpufreq_ondemand.c | 6 +- drivers/cpufreq/powernv-cpufreq.c | 4 +- drivers/cpufreq/s5pv210-cpufreq.c | 2 +- include/linux/cpufreq.h | 111 +++++++++++++++++++------ 7 files changed, 106 insertions(+), 33 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 28467d83c745..3d514b82d055 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -470,7 +470,8 @@ static unsigned int acpi_cpufreq_fast_switch(struct cpufreq_policy *policy, if (policy->cached_target_freq == target_freq) index = policy->cached_resolved_idx; else - index = cpufreq_table_find_index_dl(policy, target_freq); + index = cpufreq_table_find_index_dl(policy, target_freq, + false); entry = &policy->freq_table[index]; next_freq = entry->frequency; diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index d0b10baf039a..6448e03bcf48 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -91,7 +91,8 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy, unsigned int index; index = cpufreq_table_find_index_h(policy, - policy->cur - 1); + policy->cur - 1, + relation & CPUFREQ_RELATION_E); freq_next = policy->freq_table[index].frequency; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 284e940084c6..8069a7bac9ef 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2260,8 +2260,16 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, !(cpufreq_driver->flags & CPUFREQ_NEED_UPDATE_LIMITS)) return 0; - if (cpufreq_driver->target) + if (cpufreq_driver->target) { + /* + * If the driver hasn't setup a single inefficient frequency, + * it's unlikely it knows how to decode CPUFREQ_RELATION_E. + */ + if (!policy->efficiencies_available) + relation &= ~CPUFREQ_RELATION_E; + return cpufreq_driver->target(policy, target_freq, relation); + } if (!cpufreq_driver->target_index) return -EINVAL; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index eb4320b619c9..f68cad9abd11 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -83,9 +83,11 @@ static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy, freq_avg = freq_req - freq_reduc; /* Find freq bounds for freq_avg in freq_table */ - index = cpufreq_table_find_index_h(policy, freq_avg); + index = cpufreq_table_find_index_h(policy, freq_avg, + relation & CPUFREQ_RELATION_E); freq_lo = freq_table[index].frequency; - index = cpufreq_table_find_index_l(policy, freq_avg); + index = cpufreq_table_find_index_l(policy, freq_avg, + relation & CPUFREQ_RELATION_E); freq_hi = freq_table[index].frequency; /* Find out how long we have to be in hi and lo freqs */ diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 5a2cf5f91ccb..fddbd1ea1635 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -934,7 +934,7 @@ static void powernv_cpufreq_work_fn(struct work_struct *work) policy = cpufreq_cpu_get(cpu); if (!policy) continue; - index = cpufreq_table_find_index_c(policy, policy->cur); + index = cpufreq_table_find_index_c(policy, policy->cur, false); powernv_cpufreq_target_index(policy, index); cpumask_andnot(&mask, &mask, policy->cpus); cpufreq_cpu_put(policy); @@ -1022,7 +1022,7 @@ static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, int index; struct powernv_smp_call_data freq_data; - index = cpufreq_table_find_index_dl(policy, target_freq); + index = cpufreq_table_find_index_dl(policy, target_freq, false); freq_data.pstate_id = powernv_freqs[index].driver_data; freq_data.gpstate_id = powernv_freqs[index].driver_data; set_pstate(&freq_data); diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index ad7d4f272ddc..76c888ed8d16 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -243,7 +243,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, unsigned int index) new_freq = s5pv210_freq_table[index].frequency; /* Finding current running level index */ - priv_index = cpufreq_table_find_index_h(policy, old_freq); + priv_index = cpufreq_table_find_index_h(policy, old_freq, false); arm_volt = dvs_conf[index].arm_volt; int_volt = dvs_conf[index].int_volt; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 16997e1ff3fe..6d96bd452d55 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -118,6 +118,13 @@ struct cpufreq_policy { */ bool strict_target; + /* + * Set if inefficient frequencies were found in the frequency table. + * This indicates if the relation flag CPUFREQ_RELATION_E can be + * honored. + */ + bool efficiencies_available; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. To be set by the @@ -273,6 +280,8 @@ static inline void cpufreq_stats_record_transition(struct cpufreq_policy *policy #define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */ #define CPUFREQ_RELATION_C 2 /* closest frequency to target */ +/* relation flags */ +#define CPUFREQ_RELATION_E BIT(2) /* Get if possible an efficient frequency */ struct freq_attr { struct attribute attr; @@ -741,6 +750,22 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, continue; \ else +/** + * cpufreq_for_each_efficient_entry_idx - iterate with index over a cpufreq + * frequency_table excluding CPUFREQ_ENTRY_INVALID and + * CPUFREQ_INEFFICIENT_FREQ frequencies. + * @pos: the &struct cpufreq_frequency_table to use as a loop cursor. + * @table: the &struct cpufreq_frequency_table to iterate over. + * @idx: the table entry currently being processed. + * @efficiencies: set to true to only iterate over efficient frequencies. + */ + +#define cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) \ + cpufreq_for_each_valid_entry_idx(pos, table, idx) \ + if (efficiencies && (pos->flags & CPUFREQ_INEFFICIENT_FREQ)) \ + continue; \ + else + int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table); @@ -765,14 +790,15 @@ bool policy_has_boost_freq(struct cpufreq_policy *policy); /* Find lowest freq at or above target in a table in ascending order */ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq >= target_freq) @@ -786,14 +812,15 @@ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, /* Find lowest freq at or above target in a table in descending order */ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -816,26 +843,30 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, /* Works only on sorted freq-tables */ static inline int cpufreq_table_find_index_l(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { target_freq = clamp_val(target_freq, policy->min, policy->max); if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) - return cpufreq_table_find_index_al(policy, target_freq); + return cpufreq_table_find_index_al(policy, target_freq, + efficiencies); else - return cpufreq_table_find_index_dl(policy, target_freq); + return cpufreq_table_find_index_dl(policy, target_freq, + efficiencies); } /* Find highest freq at or below target in a table in ascending order */ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -858,14 +889,15 @@ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy, /* Find highest freq at or below target in a table in descending order */ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq <= target_freq) @@ -879,26 +911,30 @@ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy, /* Works only on sorted freq-tables */ static inline int cpufreq_table_find_index_h(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { target_freq = clamp_val(target_freq, policy->min, policy->max); if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) - return cpufreq_table_find_index_ah(policy, target_freq); + return cpufreq_table_find_index_ah(policy, target_freq, + efficiencies); else - return cpufreq_table_find_index_dh(policy, target_freq); + return cpufreq_table_find_index_dh(policy, target_freq, + efficiencies); } /* Find closest freq to target in a table in ascending order */ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -925,14 +961,15 @@ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy, /* Find closest freq to target in a table in descending order */ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -959,35 +996,58 @@ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy, /* Works only on sorted freq-tables */ static inline int cpufreq_table_find_index_c(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { target_freq = clamp_val(target_freq, policy->min, policy->max); if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) - return cpufreq_table_find_index_ac(policy, target_freq); + return cpufreq_table_find_index_ac(policy, target_freq, + efficiencies); else - return cpufreq_table_find_index_dc(policy, target_freq); + return cpufreq_table_find_index_dc(policy, target_freq, + efficiencies); } static inline int cpufreq_frequency_table_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { + bool efficiencies = policy->efficiencies_available && + (relation & CPUFREQ_RELATION_E); + int idx; + + /* cpufreq_table_index_unsorted() has no use for this flag anyway */ + relation &= ~CPUFREQ_RELATION_E; + if (unlikely(policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED)) return cpufreq_table_index_unsorted(policy, target_freq, relation); - +retry: switch (relation) { case CPUFREQ_RELATION_L: - return cpufreq_table_find_index_l(policy, target_freq); + idx = cpufreq_table_find_index_l(policy, target_freq, + efficiencies); + break; case CPUFREQ_RELATION_H: - return cpufreq_table_find_index_h(policy, target_freq); + idx = cpufreq_table_find_index_h(policy, target_freq, + efficiencies); + break; case CPUFREQ_RELATION_C: - return cpufreq_table_find_index_c(policy, target_freq); + idx = cpufreq_table_find_index_c(policy, target_freq, + efficiencies); + break; default: WARN_ON_ONCE(1); return 0; } + + if (idx < 0 && efficiencies) { + efficiencies = false; + goto retry; + } + + return idx; } static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy *policy) @@ -1027,6 +1087,7 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy, cpufreq_for_each_valid_entry(pos, policy->freq_table) { if (pos->frequency == frequency) { pos->flags |= CPUFREQ_INEFFICIENT_FREQ; + policy->efficiencies_available = true; return 0; } } From b894d20e6867f04827c7817fbc155460ff108f6f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:29 +0100 Subject: [PATCH 08/15] cpufreq: Use CPUFREQ_RELATION_E in DVFS governors Let the governors schedutil, conservative and ondemand to work, if possible on efficient frequencies only. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/cpufreq_conservative.c | 6 ++++-- drivers/cpufreq/cpufreq_ondemand.c | 10 +++++----- include/linux/cpufreq.h | 10 ++++++++-- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8069a7bac9ef..e338d2f010fe 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -554,7 +554,7 @@ static unsigned int __resolve_freq(struct cpufreq_policy *policy, unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq) { - return __resolve_freq(policy, target_freq, CPUFREQ_RELATION_L); + return __resolve_freq(policy, target_freq, CPUFREQ_RELATION_LE); } EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index aa39ff31ec9f..0879ec3c170c 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -111,7 +111,8 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy) if (requested_freq > policy->max) requested_freq = policy->max; - __cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_H); + __cpufreq_driver_target(policy, requested_freq, + CPUFREQ_RELATION_HE); dbs_info->requested_freq = requested_freq; goto out; } @@ -134,7 +135,8 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy) else requested_freq = policy->min; - __cpufreq_driver_target(policy, requested_freq, CPUFREQ_RELATION_L); + __cpufreq_driver_target(policy, requested_freq, + CPUFREQ_RELATION_LE); dbs_info->requested_freq = requested_freq; } diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index f68cad9abd11..3b8f924771b4 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -120,12 +120,12 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) if (od_tuners->powersave_bias) freq = od_ops.powersave_bias_target(policy, freq, - CPUFREQ_RELATION_H); + CPUFREQ_RELATION_HE); else if (policy->cur == policy->max) return; __cpufreq_driver_target(policy, freq, od_tuners->powersave_bias ? - CPUFREQ_RELATION_L : CPUFREQ_RELATION_H); + CPUFREQ_RELATION_LE : CPUFREQ_RELATION_HE); } /* @@ -163,9 +163,9 @@ static void od_update(struct cpufreq_policy *policy) if (od_tuners->powersave_bias) freq_next = od_ops.powersave_bias_target(policy, freq_next, - CPUFREQ_RELATION_L); + CPUFREQ_RELATION_LE); - __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C); + __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_CE); } } @@ -184,7 +184,7 @@ static unsigned int od_dbs_update(struct cpufreq_policy *policy) */ if (sample_type == OD_SUB_SAMPLE && policy_dbs->sample_delay_ns > 0) { __cpufreq_driver_target(policy, dbs_info->freq_lo, - CPUFREQ_RELATION_H); + CPUFREQ_RELATION_HE); return dbs_info->freq_lo_delay_us; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6d96bd452d55..7ce71c7371fb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -283,6 +283,10 @@ static inline void cpufreq_stats_record_transition(struct cpufreq_policy *policy /* relation flags */ #define CPUFREQ_RELATION_E BIT(2) /* Get if possible an efficient frequency */ +#define CPUFREQ_RELATION_LE (CPUFREQ_RELATION_L | CPUFREQ_RELATION_E) +#define CPUFREQ_RELATION_HE (CPUFREQ_RELATION_H | CPUFREQ_RELATION_E) +#define CPUFREQ_RELATION_CE (CPUFREQ_RELATION_C | CPUFREQ_RELATION_E) + struct freq_attr { struct attribute attr; ssize_t (*show)(struct cpufreq_policy *, char *); @@ -636,9 +640,11 @@ struct cpufreq_governor *cpufreq_fallback_governor(void); static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy) { if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + __cpufreq_driver_target(policy, policy->max, + CPUFREQ_RELATION_HE); else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + __cpufreq_driver_target(policy, policy->min, + CPUFREQ_RELATION_LE); } /* Governor attribute set */ From e458716a92b57f854deb89bb40aa3554c2b6205e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:30 +0100 Subject: [PATCH 09/15] PM: EM: Mark inefficiencies in CPUFreq The Energy Model has a 1:1 mapping between OPPs and performance states (em_perf_state). If a CPUFreq driver registers an Energy Model, inefficiencies found by the latter can be applied to CPUFreq. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d353ef29e37f..0153b0ca7b23 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "energy_model: " fmt #include +#include #include #include #include @@ -231,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states, return 0; } +static void em_cpufreq_update_efficiencies(struct device *dev) +{ + struct em_perf_domain *pd = dev->em_pd; + struct em_perf_state *table; + struct cpufreq_policy *policy; + int found = 0; + int i; + + if (!_is_cpu_device(dev) || !pd) + return; + + policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd))); + if (!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed"); + return; + } + + table = pd->table; + + for (i = 0; i < pd->nr_perf_states; i++) { + if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) + continue; + + if (!cpufreq_table_set_inefficient(policy, table[i].frequency)) + found++; + } + + if (!found) + return; + + /* + * Efficiencies have been installed in CPUFreq, inefficient frequencies + * will be skipped. The EM can do the same. + */ + pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES; +} + /** * em_pd_get() - Return the performance domain for a device * @dev : Device to find the performance domain for @@ -347,6 +385,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (milliwatts) dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + em_cpufreq_update_efficiencies(dev); + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); From 6215a5de9e9138fda60f4f1d72f86dd52e4be02b Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 7 Oct 2021 17:42:18 +0100 Subject: [PATCH 10/15] cpufreq: mediatek-hw: Fix cpufreq_table_find_index_dl() call The new cpufreq table flag RELATION_E introduced a new "efficient" parameter for the cpufreq_table_find*() functions. Fixes: 1f39fa0dccff (cpufreq: Introducing CPUFREQ_RELATION_E) Signed-off-by: Vincent Donnefort Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/mediatek-cpufreq-hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index 0cf18dd46b92..8ddbd0c5ce37 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -109,7 +109,7 @@ static unsigned int mtk_cpufreq_hw_fast_switch(struct cpufreq_policy *policy, struct mtk_cpufreq_data *data = policy->driver_data; unsigned int index; - index = cpufreq_table_find_index_dl(policy, target_freq); + index = cpufreq_table_find_index_dl(policy, target_freq, false); writel_relaxed(index, data->reg_bases[REG_FREQ_PERF_STATE]); From 4570ddda43387e5a130dd85e71a1947b0c11da77 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:07 +0100 Subject: [PATCH 11/15] powercap/drivers/dtpm: Encapsulate even more the code In order to increase the self-encapsulation of the dtpm generic code, the following changes are adding a power update ops to the dtpm ops. That allows the generic code to call directly the dtpm backend function to update the power values. The power update function does compute the power characteristics when the function is invoked. In the case of the CPUs, the power consumption depends on the number of online CPUs. The online CPUs mask is not up to date at CPUHP_AP_ONLINE_DYN state in the tear down callback. That is the reason why the online / offline are at separate state. As there is already an existing state for DTPM, this one is only moved to the DEAD state, so there is no addition of new state with these changes. The dtpm node is not removed when the cpu is unplugged. That simplifies the code for the next changes and results in a more self-encapsulated code. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-1-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 54 ++++++------- drivers/powercap/dtpm_cpu.c | 148 ++++++++++++++++-------------------- include/linux/cpuhotplug.h | 2 +- include/linux/dtpm.h | 3 +- 4 files changed, 97 insertions(+), 110 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index c2185ec5f887..58433b8ef9a1 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -116,8 +116,6 @@ static void __dtpm_sub_power(struct dtpm *dtpm) parent->power_limit -= dtpm->power_limit; parent = parent->parent; } - - __dtpm_rebalance_weight(root); } static void __dtpm_add_power(struct dtpm *dtpm) @@ -130,45 +128,45 @@ static void __dtpm_add_power(struct dtpm *dtpm) parent->power_limit += dtpm->power_limit; parent = parent->parent; } +} - __dtpm_rebalance_weight(root); +static int __dtpm_update_power(struct dtpm *dtpm) +{ + int ret; + + __dtpm_sub_power(dtpm); + + ret = dtpm->ops->update_power_uw(dtpm); + if (ret) + pr_err("Failed to update power for '%s': %d\n", + dtpm->zone.name, ret); + + if (!test_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags)) + dtpm->power_limit = dtpm->power_max; + + __dtpm_add_power(dtpm); + + if (root) + __dtpm_rebalance_weight(root); + + return ret; } /** * dtpm_update_power - Update the power on the dtpm * @dtpm: a pointer to a dtpm structure to update - * @power_min: a u64 representing the new power_min value - * @power_max: a u64 representing the new power_max value * * Function to update the power values of the dtpm node specified in * parameter. These new values will be propagated to the tree. * * Return: zero on success, -EINVAL if the values are inconsistent */ -int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max) +int dtpm_update_power(struct dtpm *dtpm) { - int ret = 0; + int ret; mutex_lock(&dtpm_lock); - - if (power_min == dtpm->power_min && power_max == dtpm->power_max) - goto unlock; - - if (power_max < power_min) { - ret = -EINVAL; - goto unlock; - } - - __dtpm_sub_power(dtpm); - - dtpm->power_min = power_min; - dtpm->power_max = power_max; - if (!test_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags)) - dtpm->power_limit = power_max; - - __dtpm_add_power(dtpm); - -unlock: + ret = __dtpm_update_power(dtpm); mutex_unlock(&dtpm_lock); return ret; @@ -436,6 +434,7 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) if (dtpm->ops && !(dtpm->ops->set_power_uw && dtpm->ops->get_power_uw && + dtpm->ops->update_power_uw && dtpm->ops->release)) return -EINVAL; @@ -455,7 +454,8 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) root = dtpm; } - __dtpm_add_power(dtpm); + if (dtpm->ops && !dtpm->ops->update_power_uw(dtpm)) + __dtpm_add_power(dtpm); pr_info("Registered dtpm node '%s' / %llu-%llu uW, \n", dtpm->zone.name, dtpm->power_min, dtpm->power_max); diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 51c366938acd..f6076de39540 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -14,6 +14,8 @@ * The CPU hotplug is supported and the power numbers will be updated * if a CPU is hot plugged / unplugged. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -23,8 +25,6 @@ #include #include -static struct dtpm *__parent; - static DEFINE_PER_CPU(struct dtpm *, dtpm_per_cpu); struct dtpm_cpu { @@ -32,57 +32,16 @@ struct dtpm_cpu { int cpu; }; -/* - * When a new CPU is inserted at hotplug or boot time, add the power - * contribution and update the dtpm tree. - */ -static int power_add(struct dtpm *dtpm, struct em_perf_domain *em) -{ - u64 power_min, power_max; - - power_min = em->table[0].power; - power_min *= MICROWATT_PER_MILLIWATT; - power_min += dtpm->power_min; - - power_max = em->table[em->nr_perf_states - 1].power; - power_max *= MICROWATT_PER_MILLIWATT; - power_max += dtpm->power_max; - - return dtpm_update_power(dtpm, power_min, power_max); -} - -/* - * When a CPU is unplugged, remove its power contribution from the - * dtpm tree. - */ -static int power_sub(struct dtpm *dtpm, struct em_perf_domain *em) -{ - u64 power_min, power_max; - - power_min = em->table[0].power; - power_min *= MICROWATT_PER_MILLIWATT; - power_min = dtpm->power_min - power_min; - - power_max = em->table[em->nr_perf_states - 1].power; - power_max *= MICROWATT_PER_MILLIWATT; - power_max = dtpm->power_max - power_max; - - return dtpm_update_power(dtpm, power_min, power_max); -} - static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) { struct dtpm_cpu *dtpm_cpu = dtpm->private; - struct em_perf_domain *pd; + struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu); struct cpumask cpus; unsigned long freq; u64 power; int i, nr_cpus; - pd = em_cpu_get(dtpm_cpu->cpu); - cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); - nr_cpus = cpumask_weight(&cpus); for (i = 0; i < pd->nr_perf_states; i++) { @@ -113,6 +72,7 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) pd = em_cpu_get(dtpm_cpu->cpu); freq = cpufreq_quick_get(dtpm_cpu->cpu); + cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); nr_cpus = cpumask_weight(&cpus); @@ -128,6 +88,27 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) return 0; } +static int update_pd_power_uw(struct dtpm *dtpm) +{ + struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu); + struct cpumask cpus; + int nr_cpus; + + cpumask_and(&cpus, cpu_online_mask, to_cpumask(em->cpus)); + nr_cpus = cpumask_weight(&cpus); + + dtpm->power_min = em->table[0].power; + dtpm->power_min *= MICROWATT_PER_MILLIWATT; + dtpm->power_min *= nr_cpus; + + dtpm->power_max = em->table[em->nr_perf_states - 1].power; + dtpm->power_max *= MICROWATT_PER_MILLIWATT; + dtpm->power_max *= nr_cpus; + + return 0; +} + static void pd_release(struct dtpm *dtpm) { struct dtpm_cpu *dtpm_cpu = dtpm->private; @@ -139,39 +120,24 @@ static void pd_release(struct dtpm *dtpm) } static struct dtpm_ops dtpm_ops = { - .set_power_uw = set_pd_power_limit, - .get_power_uw = get_pd_power_uw, - .release = pd_release, + .set_power_uw = set_pd_power_limit, + .get_power_uw = get_pd_power_uw, + .update_power_uw = update_pd_power_uw, + .release = pd_release, }; static int cpuhp_dtpm_cpu_offline(unsigned int cpu) { - struct cpufreq_policy *policy; struct em_perf_domain *pd; struct dtpm *dtpm; - policy = cpufreq_cpu_get(cpu); - - if (!policy) - return 0; - pd = em_cpu_get(cpu); if (!pd) return -EINVAL; dtpm = per_cpu(dtpm_per_cpu, cpu); - power_sub(dtpm, pd); - - if (cpumask_weight(policy->cpus) != 1) - return 0; - - for_each_cpu(cpu, policy->related_cpus) - per_cpu(dtpm_per_cpu, cpu) = NULL; - - dtpm_unregister(dtpm); - - return 0; + return dtpm_update_power(dtpm); } static int cpuhp_dtpm_cpu_online(unsigned int cpu) @@ -184,7 +150,6 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) int ret = -ENOMEM; policy = cpufreq_cpu_get(cpu); - if (!policy) return 0; @@ -194,7 +159,7 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) dtpm = per_cpu(dtpm_per_cpu, cpu); if (dtpm) - return power_add(dtpm, pd); + return dtpm_update_power(dtpm); dtpm = dtpm_alloc(&dtpm_ops); if (!dtpm) @@ -210,27 +175,20 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) for_each_cpu(cpu, policy->related_cpus) per_cpu(dtpm_per_cpu, cpu) = dtpm; - sprintf(name, "cpu%d", dtpm_cpu->cpu); + snprintf(name, sizeof(name), "cpu%d-cpufreq", dtpm_cpu->cpu); - ret = dtpm_register(name, dtpm, __parent); + ret = dtpm_register(name, dtpm, NULL); if (ret) goto out_kfree_dtpm_cpu; - ret = power_add(dtpm, pd); - if (ret) - goto out_dtpm_unregister; - ret = freq_qos_add_request(&policy->constraints, &dtpm_cpu->qos_req, FREQ_QOS_MAX, pd->table[pd->nr_perf_states - 1].frequency); if (ret) - goto out_power_sub; + goto out_dtpm_unregister; return 0; -out_power_sub: - power_sub(dtpm, pd); - out_dtpm_unregister: dtpm_unregister(dtpm); dtpm_cpu = NULL; @@ -248,10 +206,38 @@ out_kfree_dtpm: int dtpm_register_cpu(struct dtpm *parent) { - __parent = parent; + int ret; - return cpuhp_setup_state(CPUHP_AP_DTPM_CPU_ONLINE, - "dtpm_cpu:online", - cpuhp_dtpm_cpu_online, - cpuhp_dtpm_cpu_offline); + /* + * The callbacks at CPU hotplug time are calling + * dtpm_update_power() which in turns calls update_pd_power(). + * + * The function update_pd_power() uses the online mask to + * figure out the power consumption limits. + * + * At CPUHP_AP_ONLINE_DYN, the CPU is present in the CPU + * online mask when the cpuhp_dtpm_cpu_online function is + * called, but the CPU is still in the online mask for the + * tear down callback. So the power can not be updated when + * the CPU is unplugged. + * + * At CPUHP_AP_DTPM_CPU_DEAD, the situation is the opposite as + * above. The CPU online mask is not up to date when the CPU + * is plugged in. + * + * For this reason, we need to call the online and offline + * callbacks at different moments when the CPU online mask is + * consistent with the power numbers we want to update. + */ + ret = cpuhp_setup_state(CPUHP_AP_DTPM_CPU_DEAD, "dtpm_cpu:offline", + NULL, cpuhp_dtpm_cpu_offline); + if (ret < 0) + return ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "dtpm_cpu:online", + cpuhp_dtpm_cpu_online, NULL); + if (ret < 0) + return ret; + + return 0; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 832d8a74fa59..ad9a34a80440 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -97,6 +97,7 @@ enum cpuhp_state { CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, + CPUHP_AP_DTPM_CPU_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, @@ -242,7 +243,6 @@ enum cpuhp_state { CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, - CPUHP_AP_DTPM_CPU_ONLINE, CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index e80a332e3d8a..acf8d3638988 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -29,6 +29,7 @@ struct dtpm { struct dtpm_ops { u64 (*set_power_uw)(struct dtpm *, u64); u64 (*get_power_uw)(struct dtpm *); + int (*update_power_uw)(struct dtpm *); void (*release)(struct dtpm *); }; @@ -62,7 +63,7 @@ static inline struct dtpm *to_dtpm(struct powercap_zone *zone) return container_of(zone, struct dtpm, zone); } -int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max); +int dtpm_update_power(struct dtpm *dtpm); int dtpm_release_zone(struct powercap_zone *pcz); From 7a89d7eacf8e84f2afb94db5ae9d9f9faa93f01c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:09 +0100 Subject: [PATCH 12/15] powercap/drivers/dtpm: Simplify the dtpm table The dtpm table is an array of pointers, that forces the user of the table to define initdata along with the declaration of the table entry. It is more efficient to create an array of dtpm structure, so the declaration of the table entry can be done by initializing the different fields. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-3-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 4 ++-- drivers/powercap/dtpm_cpu.c | 4 +++- include/linux/dtpm.h | 20 +++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 58433b8ef9a1..8c032398f6ce 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -467,7 +467,7 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) static int __init dtpm_init(void) { - struct dtpm_descr **dtpm_descr; + struct dtpm_descr *dtpm_descr; pct = powercap_register_control_type(NULL, "dtpm", NULL); if (IS_ERR(pct)) { @@ -476,7 +476,7 @@ static int __init dtpm_init(void) } for_each_dtpm_table(dtpm_descr) - (*dtpm_descr)->init(*dtpm_descr); + dtpm_descr->init(); return 0; } diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index f6076de39540..98841524a782 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -204,7 +204,7 @@ out_kfree_dtpm: return ret; } -int dtpm_register_cpu(struct dtpm *parent) +static int __init dtpm_cpu_init(void) { int ret; @@ -241,3 +241,5 @@ int dtpm_register_cpu(struct dtpm *parent) return 0; } + +DTPM_DECLARE(dtpm_cpu, dtpm_cpu_init); diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index acf8d3638988..1e53db6bd5f9 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -33,25 +33,23 @@ struct dtpm_ops { void (*release)(struct dtpm *); }; -struct dtpm_descr; - -typedef int (*dtpm_init_t)(struct dtpm_descr *); +typedef int (*dtpm_init_t)(void); struct dtpm_descr { - struct dtpm *parent; - const char *name; dtpm_init_t init; }; /* Init section thermal table */ -extern struct dtpm_descr *__dtpm_table[]; -extern struct dtpm_descr *__dtpm_table_end[]; +extern struct dtpm_descr __dtpm_table[]; +extern struct dtpm_descr __dtpm_table_end[]; -#define DTPM_TABLE_ENTRY(name) \ - static typeof(name) *__dtpm_table_entry_##name \ - __used __section("__dtpm_table") = &name +#define DTPM_TABLE_ENTRY(name, __init) \ + static struct dtpm_descr __dtpm_table_entry_##name \ + __used __section("__dtpm_table") = { \ + .init = __init, \ + } -#define DTPM_DECLARE(name) DTPM_TABLE_ENTRY(name) +#define DTPM_DECLARE(name, init) DTPM_TABLE_ENTRY(name, init) #define for_each_dtpm_table(__dtpm) \ for (__dtpm = __dtpm_table; \ From d2cdc6adc30879d81160199fc7c6ab890fc4bd4c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:10 +0100 Subject: [PATCH 13/15] powercap/drivers/dtpm: Use container_of instead of a private data field The dtpm framework provides an API to allocate a dtpm node. However when a backend dtpm driver needs to allocate a dtpm node it must define its own structure and store the pointer of this structure in the private field of the dtpm structure. It is more elegant to use the container_of macro and add the dtpm structure inside the dtpm backend specific structure. The code will be able to deal properly with the dtpm structure as a generic entity, making all this even more self-encapsulated. The dtpm_alloc() function does no longer make sense as the dtpm structure will be allocated when allocating the device specific dtpm structure. The dtpm_init() is provided instead. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-4-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 18 +++++--------- drivers/powercap/dtpm_cpu.c | 48 ++++++++++++++++++------------------- include/linux/dtpm.h | 3 +-- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 8c032398f6ce..c7c5529b61eb 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -357,24 +357,18 @@ static struct powercap_zone_ops zone_ops = { }; /** - * dtpm_alloc - Allocate and initialize a dtpm struct - * @name: a string specifying the name of the node - * - * Return: a struct dtpm pointer, NULL in case of error + * dtpm_init - Allocate and initialize a dtpm struct + * @dtpm: The dtpm struct pointer to be initialized + * @ops: The dtpm device specific ops, NULL for a virtual node */ -struct dtpm *dtpm_alloc(struct dtpm_ops *ops) +void dtpm_init(struct dtpm *dtpm, struct dtpm_ops *ops) { - struct dtpm *dtpm; - - dtpm = kzalloc(sizeof(*dtpm), GFP_KERNEL); if (dtpm) { INIT_LIST_HEAD(&dtpm->children); INIT_LIST_HEAD(&dtpm->sibling); dtpm->weight = 1024; dtpm->ops = ops; } - - return dtpm; } /** @@ -465,7 +459,7 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) return 0; } -static int __init dtpm_init(void) +static int __init init_dtpm(void) { struct dtpm_descr *dtpm_descr; @@ -480,4 +474,4 @@ static int __init dtpm_init(void) return 0; } -late_initcall(dtpm_init); +late_initcall(init_dtpm); diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 98841524a782..2e21e4e2b01f 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -25,16 +25,22 @@ #include #include -static DEFINE_PER_CPU(struct dtpm *, dtpm_per_cpu); - struct dtpm_cpu { + struct dtpm dtpm; struct freq_qos_request qos_req; int cpu; }; +static DEFINE_PER_CPU(struct dtpm_cpu *, dtpm_per_cpu); + +static struct dtpm_cpu *to_dtpm_cpu(struct dtpm *dtpm) +{ + return container_of(dtpm, struct dtpm_cpu, dtpm); +} + static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *pd = em_cpu_get(dtpm_cpu->cpu); struct cpumask cpus; unsigned long freq; @@ -64,7 +70,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) static u64 get_pd_power_uw(struct dtpm *dtpm) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *pd; struct cpumask cpus; unsigned long freq; @@ -90,7 +96,7 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) static int update_pd_power_uw(struct dtpm *dtpm) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *em = em_cpu_get(dtpm_cpu->cpu); struct cpumask cpus; int nr_cpus; @@ -111,7 +117,7 @@ static int update_pd_power_uw(struct dtpm *dtpm) static void pd_release(struct dtpm *dtpm) { - struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); if (freq_qos_request_active(&dtpm_cpu->qos_req)) freq_qos_remove_request(&dtpm_cpu->qos_req); @@ -129,20 +135,19 @@ static struct dtpm_ops dtpm_ops = { static int cpuhp_dtpm_cpu_offline(unsigned int cpu) { struct em_perf_domain *pd; - struct dtpm *dtpm; + struct dtpm_cpu *dtpm_cpu; pd = em_cpu_get(cpu); if (!pd) return -EINVAL; - dtpm = per_cpu(dtpm_per_cpu, cpu); + dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); - return dtpm_update_power(dtpm); + return dtpm_update_power(&dtpm_cpu->dtpm); } static int cpuhp_dtpm_cpu_online(unsigned int cpu) { - struct dtpm *dtpm; struct dtpm_cpu *dtpm_cpu; struct cpufreq_policy *policy; struct em_perf_domain *pd; @@ -157,27 +162,23 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) if (!pd) return -EINVAL; - dtpm = per_cpu(dtpm_per_cpu, cpu); - if (dtpm) - return dtpm_update_power(dtpm); - - dtpm = dtpm_alloc(&dtpm_ops); - if (!dtpm) - return -EINVAL; + dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); + if (dtpm_cpu) + return dtpm_update_power(&dtpm_cpu->dtpm); dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL); if (!dtpm_cpu) - goto out_kfree_dtpm; + return -ENOMEM; - dtpm->private = dtpm_cpu; + dtpm_init(&dtpm_cpu->dtpm, &dtpm_ops); dtpm_cpu->cpu = cpu; for_each_cpu(cpu, policy->related_cpus) - per_cpu(dtpm_per_cpu, cpu) = dtpm; + per_cpu(dtpm_per_cpu, cpu) = dtpm_cpu; snprintf(name, sizeof(name), "cpu%d-cpufreq", dtpm_cpu->cpu); - ret = dtpm_register(name, dtpm, NULL); + ret = dtpm_register(name, &dtpm_cpu->dtpm, NULL); if (ret) goto out_kfree_dtpm_cpu; @@ -190,17 +191,14 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) return 0; out_dtpm_unregister: - dtpm_unregister(dtpm); + dtpm_unregister(&dtpm_cpu->dtpm); dtpm_cpu = NULL; - dtpm = NULL; out_kfree_dtpm_cpu: for_each_cpu(cpu, policy->related_cpus) per_cpu(dtpm_per_cpu, cpu) = NULL; kfree(dtpm_cpu); -out_kfree_dtpm: - kfree(dtpm); return ret; } diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index 1e53db6bd5f9..2890f6370eb9 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -23,7 +23,6 @@ struct dtpm { u64 power_max; u64 power_min; int weight; - void *private; }; struct dtpm_ops { @@ -65,7 +64,7 @@ int dtpm_update_power(struct dtpm *dtpm); int dtpm_release_zone(struct powercap_zone *pcz); -struct dtpm *dtpm_alloc(struct dtpm_ops *ops); +void dtpm_init(struct dtpm *dtpm, struct dtpm_ops *ops); void dtpm_unregister(struct dtpm *dtpm); From eb82bace893169b319c563b7f813c58a0a5a9f76 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:11 +0100 Subject: [PATCH 14/15] powercap/drivers/dtpm: Scale the power with the load Currently the power consumption is based on the current OPP power assuming the entire performance domain is fully loaded. That gives very gross power estimation and we can do much better by using the load to scale the power consumption. Use the utilization to normalize and scale the power usage over the max possible power. Tested on a rock960 with 2 big CPUS, the power consumption estimation conforms with the expected one. Before this change: ~$ ~/dhrystone -t 1 -l 10000& ~$ cat /sys/devices/virtual/powercap/dtpm/dtpm:0/dtpm:0:1/constraint_0_max_power_uw 2260000 After this change: ~$ ~/dhrystone -t 1 -l 10000& ~$ cat /sys/devices/virtual/powercap/dtpm/dtpm:0/dtpm:0:1/constraint_0_max_power_uw 1130000 ~$ ~/dhrystone -t 2 -l 10000& ~$ cat /sys/devices/virtual/powercap/dtpm/dtpm:0/dtpm:0:1/constraint_0_max_power_uw 2260000 Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-5-daniel.lezcano@linaro.org --- drivers/powercap/dtpm_cpu.c | 46 +++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 2e21e4e2b01f..44faa3a74db6 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -68,27 +68,59 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) return power_limit; } +static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power) +{ + unsigned long max = 0, sum_util = 0; + int cpu; + + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + + /* + * The capacity is the same for all CPUs belonging to + * the same perf domain, so a single call to + * arch_scale_cpu_capacity() is enough. However, we + * need the CPU parameter to be initialized by the + * loop, so the call ends up in this block. + * + * We can initialize 'max' with a cpumask_first() call + * before the loop but the bits computation is not + * worth given the arch_scale_cpu_capacity() just + * returns a value where the resulting assembly code + * will be optimized by the compiler. + */ + max = arch_scale_cpu_capacity(cpu); + sum_util += sched_cpu_util(cpu, max); + } + + /* + * In the improbable case where all the CPUs of the perf + * domain are offline, 'max' will be zero and will lead to an + * illegal operation with a zero division. + */ + return max ? (power * ((sum_util << 10) / max)) >> 10 : 0; +} + static u64 get_pd_power_uw(struct dtpm *dtpm) { struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); struct em_perf_domain *pd; - struct cpumask cpus; + struct cpumask *pd_mask; unsigned long freq; - int i, nr_cpus; + int i; pd = em_cpu_get(dtpm_cpu->cpu); - freq = cpufreq_quick_get(dtpm_cpu->cpu); - cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); - nr_cpus = cpumask_weight(&cpus); + pd_mask = em_span_cpus(pd); + + freq = cpufreq_quick_get(dtpm_cpu->cpu); for (i = 0; i < pd->nr_perf_states; i++) { if (pd->table[i].frequency < freq) continue; - return pd->table[i].power * - MICROWATT_PER_MILLIWATT * nr_cpus; + return scale_pd_power_uw(pd_mask, pd->table[i].power * + MICROWATT_PER_MILLIWATT); } return 0; From 5d8cb8db9f791252ef5b7951b6d8cc6281de60a6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 18 Mar 2021 21:52:38 +0100 Subject: [PATCH 15/15] powercap/drivers/dtpm: Fix power limit initialization When a DTPM node is registered its power limit must be initialized to the power max. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210318205238.21937-1-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index c7c5529b61eb..b9fac786246a 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -448,8 +448,10 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) root = dtpm; } - if (dtpm->ops && !dtpm->ops->update_power_uw(dtpm)) + if (dtpm->ops && !dtpm->ops->update_power_uw(dtpm)) { __dtpm_add_power(dtpm); + dtpm->power_limit = dtpm->power_max; + } pr_info("Registered dtpm node '%s' / %llu-%llu uW, \n", dtpm->zone.name, dtpm->power_min, dtpm->power_max);