From c214f124161d446b340597e7c968e0a2dc149142 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:10 +0000 Subject: [PATCH 01/32] arch_topology: Introduce thermal pressure update function The thermal pressure is a mechanism which is used for providing information about reduced CPU performance to the scheduler. Usually code has to convert the value from frequency units into capacity units, which are understandable by the scheduler. Create a common conversion code which can be just used via a handy API. Internally, the topology_update_thermal_pressure() operates on frequency in MHz and max CPU frequency is taken from 'freq_factor' (per-cpu). Signed-off-by: Lukasz Luba Reviewed-by: Thara Gopinath Signed-off-by: Viresh Kumar --- arch/arm/include/asm/topology.h | 1 + arch/arm64/include/asm/topology.h | 1 + drivers/base/arch_topology.c | 43 ++++++++++++++++++++++++++++++- include/linux/arch_topology.h | 3 +++ include/linux/sched/topology.h | 7 +++++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 470299ee2fba..f1eafacc9a30 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,7 @@ /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index ec2db3419c41..7a421cbc99ed 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -33,6 +33,7 @@ void update_freq_counters_refs(void); /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index ff16a36a908b..a3ae0b4eea09 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,6 +22,7 @@ static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; +static DEFINE_PER_CPU(u32, freq_factor) = 1; static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -165,6 +166,47 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, } EXPORT_SYMBOL_GPL(topology_set_thermal_pressure); +/** + * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * @cpus : The related CPUs for which capacity has been reduced + * @capped_freq : The maximum allowed frequency that CPUs can run at + * + * Update the value of thermal pressure for all @cpus in the mask. The + * cpumask should include all (online+offline) affected CPUs, to avoid + * operating on stale data when hot-plug is used for some CPUs. The + * @capped_freq reflects the currently allowed max CPUs frequency due to + * thermal capping. It might be also a boost frequency value, which is bigger + * than the internal 'freq_factor' max frequency. In such case the pressure + * value should simply be removed, since this is an indication that there is + * no thermal throttling. The @capped_freq must be provided in kHz. + */ +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq) +{ + unsigned long max_capacity, capacity; + u32 max_freq; + int cpu; + + cpu = cpumask_first(cpus); + max_capacity = arch_scale_cpu_capacity(cpu); + max_freq = per_cpu(freq_factor, cpu); + + /* Convert to MHz scale which is used in 'freq_factor' */ + capped_freq /= 1000; + + /* + * Handle properly the boost frequencies, which should simply clean + * the thermal pressure value. + */ + if (max_freq <= capped_freq) + capacity = max_capacity; + else + capacity = mult_frac(max_capacity, capped_freq, max_freq); + + arch_set_thermal_pressure(cpus, max_capacity - capacity); +} +EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); + static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -217,7 +259,6 @@ static void update_topology_flags_workfn(struct work_struct *work) update_topology = 0; } -static DEFINE_PER_CPU(u32, freq_factor) = 1; static u32 *raw_capacity; static int free_raw_capacity(void) diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index b97cea83b25e..ace1e5dcf773 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -59,6 +59,9 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) void topology_set_thermal_pressure(const struct cpumask *cpus, unsigned long th_pressure); +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq); + struct cpu_topology { int thread_id; int core_id; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c07bfa2d80f2..6e89a8e43aa7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -273,6 +273,13 @@ void arch_set_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_update_thermal_pressure +static __always_inline +void arch_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) +{ } +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); From 5168b1be09055436cc9ff34509bda1719023baa8 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:11 +0000 Subject: [PATCH 02/32] thermal: cpufreq_cooling: Use new thermal pressure update function Thermal pressure provides a new API, which allows to use CPU frequency as an argument. That removes the need of local conversion to capacity. Use this new function and remove old conversion code. Signed-off-by: Lukasz Luba Signed-off-by: Viresh Kumar --- drivers/thermal/cpufreq_cooling.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 43b1ae8a7789..0bfb8eebd126 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -462,7 +462,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; struct cpumask *cpus; unsigned int frequency; - unsigned long max_capacity, capacity; int ret; /* Request state should be less than max_level */ @@ -479,10 +478,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, if (ret >= 0) { cpufreq_cdev->cpufreq_state = state; cpus = cpufreq_cdev->policy->related_cpus; - max_capacity = arch_scale_cpu_capacity(cpumask_first(cpus)); - capacity = frequency * max_capacity; - capacity /= cpufreq_cdev->policy->cpuinfo.max_freq; - arch_set_thermal_pressure(cpus, max_capacity - capacity); + arch_update_thermal_pressure(cpus, frequency); ret = 0; } From 93d9e6f93e1586fcc97498c764be2e8c8401f4bd Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:12 +0000 Subject: [PATCH 03/32] cpufreq: qcom-cpufreq-hw: Update offline CPUs per-cpu thermal pressure The thermal pressure signal gives information to the scheduler about reduced CPU capacity due to thermal. It is based on a value stored in a per-cpu 'thermal_pressure' variable. The online CPUs will get the new value there, while the offline won't. Unfortunately, when the CPU is back online, the value read from per-cpu variable might be wrong (stale data). This might affect the scheduler decisions, since it sees the CPU capacity differently than what is actually available. Fix it by making sure that all online+offline CPUs would get the proper value in their per-cpu variable when there is throttling or throttling is removed. Fixes: 275157b367f479 ("cpufreq: qcom-cpufreq-hw: Add dcvs interrupt support") Reviewed-by: Thara Gopinath Signed-off-by: Lukasz Luba Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index a2be0df7e174..0138b2ec406d 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -304,7 +304,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) if (capacity > max_capacity) capacity = max_capacity; - arch_set_thermal_pressure(policy->cpus, max_capacity - capacity); + arch_set_thermal_pressure(policy->related_cpus, + max_capacity - capacity); /* * In the unlikely case policy is unregistered do not enable From 0258cb19c77deb755747b97f690931193ced0c55 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:13 +0000 Subject: [PATCH 04/32] cpufreq: qcom-cpufreq-hw: Use new thermal pressure update function Thermal pressure provides a new API, which allows to use CPU frequency as an argument. That removes the need of local conversion to capacity. Use this new API and remove old local conversion code. The new arch_update_thermal_pressure() also accepts boost frequencies, which solves issue in the driver code with wrong reduced capacity calculation. The reduced capacity was calculated wrongly due to 'policy->cpuinfo.max_freq' used as a divider. The value present there was actually the boost frequency. Thus, even a normal maximum frequency value which corresponds to max CPU capacity (arch_scale_cpu_capacity(cpu_id)) is not able to remove the capping. The second side effect which is solved is that the reduced frequency wasn't properly translated into the right reduced capacity, e.g. boost frequency = 3000MHz (stored in policy->cpuinfo.max_freq) max normal frequency = 2500MHz (which is 1024 capacity) 2nd highest frequency = 2000MHz (which translates to 819 capacity) Then in a scenario when the 'throttled_freq' max allowed frequency was 2000MHz the driver translated it into 682 capacity: capacity = 1024 * 2000 / 3000 = 682 Then set the pressure value bigger than actually applied by the HW: max_capacity - capacity => 1024 - 682 = 342 (<- thermal pressure) Which was causing higher throttling and misleading task scheduler about available CPU capacity. A proper calculation in such case should be: capacity = 1024 * 2000 / 2500 = 819 1024 - 819 = 205 (<- thermal pressure) This patch relies on the new arch_update_thermal_pressure() handling correctly such use case (with boost frequencies). Signed-off-by: Lukasz Luba Reviewed-by: Thara Gopinath Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 0138b2ec406d..248135e5087e 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -275,10 +275,10 @@ static unsigned int qcom_lmh_get_throttle_freq(struct qcom_cpufreq_data *data) static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) { - unsigned long max_capacity, capacity, freq_hz, throttled_freq; struct cpufreq_policy *policy = data->policy; int cpu = cpumask_first(policy->cpus); struct device *dev = get_cpu_device(cpu); + unsigned long freq_hz, throttled_freq; struct dev_pm_opp *opp; unsigned int freq; @@ -295,17 +295,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data) throttled_freq = freq_hz / HZ_PER_KHZ; - /* Update thermal pressure */ - - max_capacity = arch_scale_cpu_capacity(cpu); - capacity = mult_frac(max_capacity, throttled_freq, policy->cpuinfo.max_freq); - - /* Don't pass boost capacity to scheduler */ - if (capacity > max_capacity) - capacity = max_capacity; - - arch_set_thermal_pressure(policy->related_cpus, - max_capacity - capacity); + /* Update thermal pressure (the boost frequencies are accepted) */ + arch_update_thermal_pressure(policy->related_cpus, throttled_freq); /* * In the unlikely case policy is unregistered do not enable From 7e97b3dc2556743dd02612c92a8de7026e8d7dc9 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:14 +0000 Subject: [PATCH 05/32] arch_topology: Remove unused topology_set_thermal_pressure() and related There is no need of this function (and related) since code has been converted to use the new arch_update_thermal_pressure() API. The old code can be removed. Signed-off-by: Lukasz Luba Signed-off-by: Viresh Kumar --- arch/arm/include/asm/topology.h | 1 - arch/arm64/include/asm/topology.h | 1 - drivers/base/arch_topology.c | 17 +++++------------ include/linux/arch_topology.h | 3 --- include/linux/sched/topology.h | 7 ------- init/Kconfig | 2 +- 6 files changed, 6 insertions(+), 25 deletions(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index f1eafacc9a30..c7d2510e5a78 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -23,7 +23,6 @@ /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_set_thermal_pressure topology_set_thermal_pressure #define arch_update_thermal_pressure topology_update_thermal_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 7a421cbc99ed..f386b90a79c8 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -32,7 +32,6 @@ void update_freq_counters_refs(void); /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure -#define arch_set_thermal_pressure topology_set_thermal_pressure #define arch_update_thermal_pressure topology_update_thermal_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index a3ae0b4eea09..976154140f0b 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -156,16 +156,6 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity) DEFINE_PER_CPU(unsigned long, thermal_pressure); -void topology_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} -EXPORT_SYMBOL_GPL(topology_set_thermal_pressure); - /** * topology_update_thermal_pressure() - Update thermal pressure for CPUs * @cpus : The related CPUs for which capacity has been reduced @@ -183,7 +173,7 @@ EXPORT_SYMBOL_GPL(topology_set_thermal_pressure); void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq) { - unsigned long max_capacity, capacity; + unsigned long max_capacity, capacity, th_pressure; u32 max_freq; int cpu; @@ -203,7 +193,10 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, else capacity = mult_frac(max_capacity, capped_freq, max_freq); - arch_set_thermal_pressure(cpus, max_capacity - capacity); + th_pressure = max_capacity - capacity; + + for_each_cpu(cpu, cpus) + WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); } EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index ace1e5dcf773..cce6136b300a 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -56,9 +56,6 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) return per_cpu(thermal_pressure, cpu); } -void topology_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure); - void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 6e89a8e43aa7..8054641c0a7b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -266,13 +266,6 @@ unsigned long arch_scale_thermal_pressure(int cpu) } #endif -#ifndef arch_set_thermal_pressure -static __always_inline -void arch_set_thermal_pressure(const struct cpumask *cpus, - unsigned long th_pressure) -{ } -#endif - #ifndef arch_update_thermal_pressure static __always_inline void arch_update_thermal_pressure(const struct cpumask *cpus, diff --git a/init/Kconfig b/init/Kconfig index 036b750e8d8a..086fd11348e0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -550,7 +550,7 @@ config SCHED_THERMAL_PRESSURE i.e. put less load on throttled CPUs than on non/less throttled ones. This requires the architecture to implement - arch_set_thermal_pressure() and arch_scale_thermal_pressure(). + arch_update_thermal_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT bool "BSD Process Accounting" From be6592ed56a7cca8a001ff339cb9325bfa3c6e3f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 11 Nov 2021 17:48:06 +0200 Subject: [PATCH 06/32] cpufreq: qcom-cpufreq-hw: Avoid stack buffer for IRQ name Registering an IRQ requires the string buffer containing the name to remain allocated, as the name is not copied into another buffer. So let's add a irq_name field to the data struct instead, which is guaranteed to have the appropriate lifetime. Cc: Thara Gopinath Cc: Bjorn Andersson Cc: Andy Gross Signed-off-by: Ard Biesheuvel Reviewed-by: Thara Gopinath Tested-by: Steev Klimaszewski Signed-off-by: Vladimir Zapolskiy Reviewed-by: Matthias Kaehlcke Reviewed-by: Bjorn Andersson Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 248135e5087e..63c11e7dfba0 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -46,6 +46,7 @@ struct qcom_cpufreq_data { */ struct mutex throttle_lock; int throttle_irq; + char irq_name[15]; bool cancel_throttle; struct delayed_work throttle_work; struct cpufreq_policy *policy; @@ -367,7 +368,6 @@ static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index) { struct qcom_cpufreq_data *data = policy->driver_data; struct platform_device *pdev = cpufreq_get_driver_data(); - char irq_name[15]; int ret; /* @@ -384,11 +384,11 @@ static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index) mutex_init(&data->throttle_lock); INIT_DEFERRABLE_WORK(&data->throttle_work, qcom_lmh_dcvs_poll); - snprintf(irq_name, sizeof(irq_name), "dcvsh-irq-%u", policy->cpu); + snprintf(data->irq_name, sizeof(data->irq_name), "dcvsh-irq-%u", policy->cpu); ret = request_threaded_irq(data->throttle_irq, NULL, qcom_lmh_dcvs_handle_irq, - IRQF_ONESHOT, irq_name, data); + IRQF_ONESHOT, data->irq_name, data); if (ret) { - dev_err(&pdev->dev, "Error registering %s: %d\n", irq_name, ret); + dev_err(&pdev->dev, "Error registering %s: %d\n", data->irq_name, ret); return 0; } From e0e27c3d4e20dab861566f1c348ae44e4b498630 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 11 Nov 2021 17:48:07 +0200 Subject: [PATCH 07/32] cpufreq: qcom-hw: Fix probable nested interrupt handling Re-enabling an interrupt from its own interrupt handler may cause an interrupt storm, if there is a pending interrupt and because its handling is disabled due to already done entrance into the handler above in the stack. Also, apparently it is improper to lock a mutex in an interrupt contex. Fixes: 275157b367f4 ("cpufreq: qcom-cpufreq-hw: Add dcvs interrupt support") Signed-off-by: Vladimir Zapolskiy Reviewed-by: Matthias Kaehlcke Reviewed-by: Bjorn Andersson Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 63c11e7dfba0..5b609e205435 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -335,9 +335,9 @@ static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data) /* Disable interrupt and enable polling */ disable_irq_nosync(c_data->throttle_irq); - qcom_lmh_dcvs_notify(c_data); + schedule_delayed_work(&c_data->throttle_work, 0); - return 0; + return IRQ_HANDLED; } static const struct qcom_cpufreq_soc_data qcom_soc_data = { From 3ed6dfbd3bb987b3d2de86304ae45972ebff5870 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Thu, 11 Nov 2021 17:48:08 +0200 Subject: [PATCH 08/32] cpufreq: qcom-hw: Set CPU affinity of dcvsh interrupts In runtime CPU cluster specific dcvsh interrupts may be handled on unrelated CPU cores, it leads to an issue of too excessive number of received and handled interrupts, but this is not observed, if CPU affinity of the interrupt handler is set in accordance to CPU clusters. The change reduces a number of received interrupts in about 10-100 times. Signed-off-by: Vladimir Zapolskiy Reviewed-by: Matthias Kaehlcke Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 5b609e205435..5b0acf5448c3 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -392,6 +392,11 @@ static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index) return 0; } + ret = irq_set_affinity_hint(data->throttle_irq, policy->cpus); + if (ret) + dev_err(&pdev->dev, "Failed to set CPU affinity of %s[%d]\n", + data->irq_name, data->throttle_irq); + return 0; } From 8f5783ad9eb83747471f61f94dbe209fb9fb8a7d Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 16 Nov 2021 18:03:46 -0800 Subject: [PATCH 09/32] cpufreq: qcom-hw: Use optional irq API Use platform_get_irq_optional() to avoid a noisy error message when the irq isn't specified. The irq is definitely optional given that we only care about errors that are -EPROBE_DEFER here. Cc: Thara Gopinath Signed-off-by: Stephen Boyd Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 5b0acf5448c3..05f3d7876e44 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -374,9 +374,11 @@ static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index) * Look for LMh interrupt. If no interrupt line is specified / * if there is an error, allow cpufreq to be enabled as usual. */ - data->throttle_irq = platform_get_irq(pdev, index); - if (data->throttle_irq <= 0) - return data->throttle_irq == -EPROBE_DEFER ? -EPROBE_DEFER : 0; + data->throttle_irq = platform_get_irq_optional(pdev, index); + if (data->throttle_irq == -ENXIO) + return 0; + if (data->throttle_irq < 0) + return data->throttle_irq; data->cancel_throttle = false; data->policy = policy; From 458b03f81afbb27143c45d47c2d8f418b2ba2407 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 10 Dec 2021 17:12:18 +0100 Subject: [PATCH 10/32] cpufreq: intel_pstate: Drop redundant intel_pstate_get_hwp_cap() call It is not necessary to call intel_pstate_get_hwp_cap() from intel_pstate_update_perf_limits(), because it gets called from intel_pstate_verify_cpu_policy() which is either invoked directly right before intel_pstate_update_perf_limits(), in intel_cpufreq_verify_policy() in the passive mode, or called from driver callbacks in a sequence that causes it to be followed by an immediate intel_pstate_update_perf_limits(). Namely, in the active mode intel_cpufreq_verify_policy() is called by intel_pstate_verify_policy() which is the ->verify() callback routine of intel_pstate and gets called by the cpufreq core right before intel_pstate_set_policy(), which is the driver's ->setoplicy() callback routine, where intel_pstate_update_perf_limits() is called. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index dec2a5649ac1..87902060e1c6 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2486,18 +2486,14 @@ static void intel_pstate_update_perf_limits(struct cpudata *cpu, * HWP needs some special consideration, because HWP_REQUEST uses * abstract values to represent performance rather than pure ratios. */ - if (hwp_active) { - intel_pstate_get_hwp_cap(cpu); + if (hwp_active && cpu->pstate.scaling != perf_ctl_scaling) { + int scaling = cpu->pstate.scaling; + int freq; - if (cpu->pstate.scaling != perf_ctl_scaling) { - int scaling = cpu->pstate.scaling; - int freq; - - freq = max_policy_perf * perf_ctl_scaling; - max_policy_perf = DIV_ROUND_UP(freq, scaling); - freq = min_policy_perf * perf_ctl_scaling; - min_policy_perf = DIV_ROUND_UP(freq, scaling); - } + freq = max_policy_perf * perf_ctl_scaling; + max_policy_perf = DIV_ROUND_UP(freq, scaling); + freq = min_policy_perf * perf_ctl_scaling; + min_policy_perf = DIV_ROUND_UP(freq, scaling); } pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n", From b6e6f8beec98ba7541213c506fe908517fdc52b8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 16 Dec 2021 10:33:05 -0800 Subject: [PATCH 11/32] cpufreq: intel_pstate: Update EPP for AlderLake mobile There is an expectation from users that they can get frequency specified by cpufreq/cpuinfo_max_freq when conditions permit. But with AlderLake mobile it may not be possible. This is possible that frequency is clipped based on the system power-up EPP value. In this case users can update cpufreq/energy_performance_preference to some performance oriented EPP to limit clipping of frequencies. To get out of box behavior as the prior generations of CPUs, update EPP for AlderLake mobile CPUs on boot. On prior generations of CPUs EPP = 128 was enough to get maximum frequency, but with AlderLake mobile the equivalent EPP is 102. Since EPP is model specific, this is possible that they have different meaning on each generation of CPU. The current EPP string "balance_performance" corresponds to EPP = 128. Change the EPP corresponding to "balance_performance" to 102 for only AlderLake mobile CPUs and update this on each CPU during boot. To implement reuse epp_values[] array and update the modified EPP at the index for BALANCE_PERFORMANCE. Add a dummy EPP_INDEX_DEFAULT to epp_values[] to match indexes in the energy_perf_strings[]. After HWP PM is enabled also update EPP when "balance_performance" is redefined for the very first time after the boot on each CPU. On subsequent suspend/resume or offline/online the old EPP is restored, so no specific action is needed. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 79 ++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 87902060e1c6..38cabc38261a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -664,19 +664,29 @@ static int intel_pstate_set_epb(int cpu, s16 pref) * 3 balance_power * 4 power */ + +enum energy_perf_value_index { + EPP_INDEX_DEFAULT = 0, + EPP_INDEX_PERFORMANCE, + EPP_INDEX_BALANCE_PERFORMANCE, + EPP_INDEX_BALANCE_POWERSAVE, + EPP_INDEX_POWERSAVE, +}; + static const char * const energy_perf_strings[] = { - "default", - "performance", - "balance_performance", - "balance_power", - "power", + [EPP_INDEX_DEFAULT] = "default", + [EPP_INDEX_PERFORMANCE] = "performance", + [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", + [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", + [EPP_INDEX_POWERSAVE] = "power", NULL }; -static const unsigned int epp_values[] = { - HWP_EPP_PERFORMANCE, - HWP_EPP_BALANCE_PERFORMANCE, - HWP_EPP_BALANCE_POWERSAVE, - HWP_EPP_POWERSAVE +static unsigned int epp_values[] = { + [EPP_INDEX_DEFAULT] = 0, /* Unused index */ + [EPP_INDEX_PERFORMANCE] = HWP_EPP_PERFORMANCE, + [EPP_INDEX_BALANCE_PERFORMANCE] = HWP_EPP_BALANCE_PERFORMANCE, + [EPP_INDEX_BALANCE_POWERSAVE] = HWP_EPP_BALANCE_POWERSAVE, + [EPP_INDEX_POWERSAVE] = HWP_EPP_POWERSAVE, }; static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp) @@ -690,14 +700,14 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw return epp; if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { - if (epp == HWP_EPP_PERFORMANCE) - return 1; - if (epp == HWP_EPP_BALANCE_PERFORMANCE) - return 2; - if (epp == HWP_EPP_BALANCE_POWERSAVE) - return 3; - if (epp == HWP_EPP_POWERSAVE) - return 4; + if (epp == epp_values[EPP_INDEX_PERFORMANCE]) + return EPP_INDEX_PERFORMANCE; + if (epp == epp_values[EPP_INDEX_BALANCE_PERFORMANCE]) + return EPP_INDEX_BALANCE_PERFORMANCE; + if (epp == epp_values[EPP_INDEX_BALANCE_POWERSAVE]) + return EPP_INDEX_BALANCE_POWERSAVE; + if (epp == epp_values[EPP_INDEX_POWERSAVE]) + return EPP_INDEX_POWERSAVE; *raw_epp = epp; return 0; } else if (boot_cpu_has(X86_FEATURE_EPB)) { @@ -757,7 +767,7 @@ static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, if (use_raw) epp = raw_epp; else if (epp == -EINVAL) - epp = epp_values[pref_index - 1]; + epp = epp_values[pref_index]; /* * To avoid confusion, refuse to set EPP to any values different @@ -843,7 +853,7 @@ static ssize_t store_energy_performance_preference( * upfront. */ if (!raw) - epp = ret ? epp_values[ret - 1] : cpu->epp_default; + epp = ret ? epp_values[ret] : cpu->epp_default; if (cpu->epp_cached != epp) { int err; @@ -1679,10 +1689,18 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); - if (cpudata->epp_default == -EINVAL) - cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); intel_pstate_enable_hwp_interrupt(cpudata); + + if (cpudata->epp_default >= 0) + return; + + if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) { + cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); + } else { + cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE]; + intel_pstate_set_epp(cpudata, cpudata->epp_default); + } } static int atom_get_min_pstate(void) @@ -3345,6 +3363,16 @@ static bool intel_pstate_hwp_is_enabled(void) return !!(value & 0x1); } +static const struct x86_cpu_id intel_epp_balance_perf[] = { + /* + * Set EPP value as 102, this is the max suggested EPP + * which can result in one core turbo frequency for + * AlderLake Mobile CPUs. + */ + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 102), + {} +}; + static int __init intel_pstate_init(void) { static struct cpudata **_all_cpu_data; @@ -3434,6 +3462,13 @@ hwp_cpu_matched: intel_pstate_sysfs_expose_params(); + if (hwp_active) { + const struct x86_cpu_id *id = x86_match_cpu(intel_epp_balance_perf); + + if (id) + epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = id->driver_data; + } + mutex_lock(&intel_pstate_driver_lock); rc = intel_pstate_register_driver(default_driver); mutex_unlock(&intel_pstate_driver_lock); From 521223d8b3ec078f670c7c35a1a04b1b2af07966 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 16 Dec 2021 20:32:15 +0100 Subject: [PATCH 12/32] cpufreq: Fix initialization of min and max frequency QoS requests The min and max frequency QoS requests in the cpufreq core are initialized to whatever the current min and max frequency values are at the init time, but if any of these values change later (for example, cpuinfo.max_freq is updated by the driver), these initial request values will be limiting the CPU frequency unnecessarily unless they are changed by user space via sysfs. To address this, initialize min_freq_req and max_freq_req to FREQ_QOS_MIN_DEFAULT_VALUE and FREQ_QOS_MAX_DEFAULT_VALUE, respectively, so they don't really limit anything until user space updates them. Reported-by: Srinivas Pandruvada Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 096c3848fa41..76ffdaf8c8b5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1403,7 +1403,7 @@ static int cpufreq_online(unsigned int cpu) ret = freq_qos_add_request(&policy->constraints, policy->min_freq_req, FREQ_QOS_MIN, - policy->min); + FREQ_QOS_MIN_DEFAULT_VALUE); if (ret < 0) { /* * So we don't call freq_qos_remove_request() for an @@ -1423,7 +1423,7 @@ static int cpufreq_online(unsigned int cpu) ret = freq_qos_add_request(&policy->constraints, policy->max_freq_req, FREQ_QOS_MAX, - policy->max); + FREQ_QOS_MAX_DEFAULT_VALUE); if (ret < 0) { policy->max_freq_req = NULL; goto out_destroy_policy; From dfeeedc1bf5772226bddf51ed3f853e5a6707bf1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Dec 2021 20:06:08 +0100 Subject: [PATCH 13/32] cpufreq: intel_pstate: Update cpuinfo.max_freq on HWP_CAP changes With HWP enabled, when the turbo range of performance levels is disabled by the platform firmware, the CPU capacity is given by the "guaranteed performance" field in MSR_HWP_CAPABILITIES which is generally dynamic. When it changes, the kernel receives an HWP notification interrupt handled by notify_hwp_interrupt(). When the "guaranteed performance" value changes in the above configuration, the CPU performance scaling needs to be adjusted so as to use the new CPU capacity in computations, which means that the cpuinfo.max_freq value needs to be updated for that CPU. Accordingly, modify intel_pstate_notify_work() to read MSR_HWP_CAPABILITIES and update cpuinfo.max_freq to reflect the new configuration (this update can be carried out even if the configuration doesn't actually change, because it simply doesn't matter then and it takes less time to update it than to do extra checks to decide whether or not a change has really occurred). Reported-by: Srinivas Pandruvada Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 38cabc38261a..bc7f7e6759bd 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1134,19 +1134,22 @@ static void intel_pstate_update_policies(void) cpufreq_update_policy(cpu); } +static void __intel_pstate_update_max_freq(struct cpudata *cpudata, + struct cpufreq_policy *policy) +{ + policy->cpuinfo.max_freq = global.turbo_disabled_mf ? + cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; + refresh_frequency_limits(policy); +} + static void intel_pstate_update_max_freq(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); - struct cpudata *cpudata; if (!policy) return; - cpudata = all_cpu_data[cpu]; - policy->cpuinfo.max_freq = global.turbo_disabled_mf ? - cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; - - refresh_frequency_limits(policy); + __intel_pstate_update_max_freq(all_cpu_data[cpu], policy); cpufreq_cpu_release(policy); } @@ -1594,8 +1597,15 @@ static void intel_pstate_notify_work(struct work_struct *work) { struct cpudata *cpudata = container_of(to_delayed_work(work), struct cpudata, hwp_notify_work); + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); + + if (policy) { + intel_pstate_get_hwp_cap(cpudata); + __intel_pstate_update_max_freq(cpudata, policy); + + cpufreq_cpu_release(policy); + } - cpufreq_update_policy(cpudata->cpu); wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0); } From d776790a55367b9313d03c364c04ff47e7f5ea83 Mon Sep 17 00:00:00 2001 From: "Hector.Yuan" Date: Fri, 29 Oct 2021 19:42:23 +0800 Subject: [PATCH 14/32] cpufreq: mediatek-hw: Fix double devm_remap in hotplug case When hotpluging policy cpu, cpu policy init will be called multiple times. Unplug CPU7 -> CPU6 -> CPU5 -> CPU4, then plug CPU4 again. In this case, devm_remap will double remap and resource allocate fail. So replace devm_remap to ioremap and release resources in cpu policy exit. Signed-off-by: Hector.Yuan Signed-off-by: Viresh Kumar --- drivers/cpufreq/mediatek-cpufreq-hw.c | 33 ++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index 8ddbd0c5ce37..0a94c56ddad2 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -36,6 +36,8 @@ enum { struct mtk_cpufreq_data { struct cpufreq_frequency_table *table; void __iomem *reg_bases[REG_ARRAY_SIZE]; + struct resource *res; + void __iomem *base; int nr_opp; }; @@ -156,6 +158,7 @@ static int mtk_cpu_resources_init(struct platform_device *pdev, { struct mtk_cpufreq_data *data; struct device *dev = &pdev->dev; + struct resource *res; void __iomem *base; int ret, i; int index; @@ -170,9 +173,26 @@ static int mtk_cpu_resources_init(struct platform_device *pdev, if (index < 0) return index; - base = devm_platform_ioremap_resource(pdev, index); - if (IS_ERR(base)) - return PTR_ERR(base); + res = platform_get_resource(pdev, IORESOURCE_MEM, index); + if (!res) { + dev_err(dev, "failed to get mem resource %d\n", index); + return -ENODEV; + } + + if (!request_mem_region(res->start, resource_size(res), res->name)) { + dev_err(dev, "failed to request resource %pR\n", res); + return -EBUSY; + } + + base = ioremap(res->start, resource_size(res)); + if (!base) { + dev_err(dev, "failed to map resource %pR\n", res); + ret = -ENOMEM; + goto release_region; + } + + data->base = base; + data->res = res; for (i = REG_FREQ_LUT_TABLE; i < REG_ARRAY_SIZE; i++) data->reg_bases[i] = base + offsets[i]; @@ -187,6 +207,9 @@ static int mtk_cpu_resources_init(struct platform_device *pdev, policy->driver_data = data; return 0; +release_region: + release_mem_region(res->start, resource_size(res)); + return ret; } static int mtk_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) @@ -233,9 +256,13 @@ static int mtk_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) static int mtk_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy) { struct mtk_cpufreq_data *data = policy->driver_data; + struct resource *res = data->res; + void __iomem *base = data->base; /* HW should be in paused state now */ writel_relaxed(0x0, data->reg_bases[REG_FREQ_ENABLE]); + iounmap(base); + release_mem_region(res->start, resource_size(res)); return 0; } From fe262d5c1fc54fac8703d3d82a14aaad9627a168 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Dec 2021 14:19:12 +0100 Subject: [PATCH 15/32] cpufreq: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the cpufreq code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 5 +++-- drivers/cpufreq/cpufreq_conservative.c | 5 +++-- drivers/cpufreq/cpufreq_ondemand.c | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 76ffdaf8c8b5..b8d95536ee22 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -924,7 +924,7 @@ cpufreq_freq_attr_rw(scaling_max_freq); cpufreq_freq_attr_rw(scaling_governor); cpufreq_freq_attr_rw(scaling_setspeed); -static struct attribute *default_attrs[] = { +static struct attribute *cpufreq_attrs[] = { &cpuinfo_min_freq.attr, &cpuinfo_max_freq.attr, &cpuinfo_transition_latency.attr, @@ -938,6 +938,7 @@ static struct attribute *default_attrs[] = { &scaling_setspeed.attr, NULL }; +ATTRIBUTE_GROUPS(cpufreq); #define to_policy(k) container_of(k, struct cpufreq_policy, kobj) #define to_attr(a) container_of(a, struct freq_attr, attr) @@ -1000,7 +1001,7 @@ static const struct sysfs_ops sysfs_ops = { static struct kobj_type ktype_cpufreq = { .sysfs_ops = &sysfs_ops, - .default_attrs = default_attrs, + .default_groups = cpufreq_groups, .release = cpufreq_sysfs_release, }; diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 0879ec3c170c..08515f7e515f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -257,7 +257,7 @@ gov_attr_rw(ignore_nice_load); gov_attr_rw(down_threshold); gov_attr_rw(freq_step); -static struct attribute *cs_attributes[] = { +static struct attribute *cs_attrs[] = { &sampling_rate.attr, &sampling_down_factor.attr, &up_threshold.attr, @@ -266,6 +266,7 @@ static struct attribute *cs_attributes[] = { &freq_step.attr, NULL }; +ATTRIBUTE_GROUPS(cs); /************************** sysfs end ************************/ @@ -315,7 +316,7 @@ static void cs_start(struct cpufreq_policy *policy) static struct dbs_governor cs_governor = { .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"), - .kobj_type = { .default_attrs = cs_attributes }, + .kobj_type = { .default_groups = cs_groups }, .gov_dbs_update = cs_dbs_update, .alloc = cs_alloc, .free = cs_free, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 3b8f924771b4..6a41ea4729b8 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -328,7 +328,7 @@ gov_attr_rw(sampling_down_factor); gov_attr_rw(ignore_nice_load); gov_attr_rw(powersave_bias); -static struct attribute *od_attributes[] = { +static struct attribute *od_attrs[] = { &sampling_rate.attr, &up_threshold.attr, &sampling_down_factor.attr, @@ -337,6 +337,7 @@ static struct attribute *od_attributes[] = { &io_is_busy.attr, NULL }; +ATTRIBUTE_GROUPS(od); /************************** sysfs end ************************/ @@ -401,7 +402,7 @@ static struct od_ops od_ops = { static struct dbs_governor od_dbs_gov = { .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"), - .kobj_type = { .default_attrs = od_attributes }, + .kobj_type = { .default_groups = od_groups }, .gov_dbs_update = od_dbs_update, .alloc = od_alloc, .free = od_free, From d341db8f48ea43314f489921962c7f8f4ec27239 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:04:55 +0800 Subject: [PATCH 16/32] x86/cpufeatures: Add AMD Collaborative Processor Performance Control feature flag Add Collaborative Processor Performance Control feature flag for AMD processors. This feature flag will be used on the following AMD P-State driver. The AMD P-State driver has two approaches to implement the frequency control behavior. That depends on the CPU hardware implementation. One is "Full MSR Support" and another is "Shared Memory Support". The feature flag indicates the current processors with "Full MSR Support". Acked-by: Borislav Petkov Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d5b5f2ab87a0..18de5f76f198 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -315,6 +315,7 @@ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ From 89aa94b4a218339b08f052a28c55322d5a13fc9e Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:04:56 +0800 Subject: [PATCH 17/32] x86/msr: Add AMD CPPC MSR definitions AMD CPPC (Collaborative Processor Performance Control) function uses MSR registers to manage the performance hints. So add the MSR register macro here. Signed-off-by: Huang Rui Acked-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 01e2650b9585..3faf0f97edb1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -486,6 +486,23 @@ #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +/* AMD Collaborative Processor Performance Control MSRs */ +#define MSR_AMD_CPPC_CAP1 0xc00102b0 +#define MSR_AMD_CPPC_ENABLE 0xc00102b1 +#define MSR_AMD_CPPC_CAP2 0xc00102b2 +#define MSR_AMD_CPPC_REQ 0xc00102b3 +#define MSR_AMD_CPPC_STATUS 0xc00102b4 + +#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff) +#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff) +#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff) +#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff) + +#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0) +#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8) +#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) +#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 From a2c8f92bea5f8f1a87fc3caf063d67876dbf5d47 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Fri, 24 Dec 2021 09:04:57 +0800 Subject: [PATCH 18/32] ACPI: CPPC: Implement support for SystemIO registers According to the ACPI v6.2 (and later) specification, SystemIO can be used for _CPC registers. This teaches cppc_acpi how to handle such registers. This patch was tested using the amd_pstate driver on my Zephyrus G15 (model GA503QS) using the current version 410 BIOS, which uses a SystemIO register for the HighestPerformance element in _CPC. Signed-off-by: Steven Noonan Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 52 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index b62c87b8ce4a..69e074647003 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -118,6 +118,8 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr); */ #define NUM_RETRIES 500ULL +#define OVER_16BTS_MASK ~0xFFFFULL + #define define_one_cppc_ro(_name) \ static struct kobj_attribute _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -746,9 +748,26 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) goto out_free; cpc_ptr->cpc_regs[i-2].sys_mem_vaddr = addr; } + } else if (gas_t->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + if (gas_t->access_width < 1 || gas_t->access_width > 3) { + /* + * 1 = 8-bit, 2 = 16-bit, and 3 = 32-bit. + * SystemIO doesn't implement 64-bit + * registers. + */ + pr_debug("Invalid access width %d for SystemIO register\n", + gas_t->access_width); + goto out_free; + } + if (gas_t->address & OVER_16BTS_MASK) { + /* SystemIO registers use 16-bit integer addresses */ + pr_debug("Invalid IO port %llu for SystemIO register\n", + gas_t->address); + goto out_free; + } } else { if (gas_t->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE || !cpc_ffh_supported()) { - /* Support only PCC ,SYS MEM and FFH type regs */ + /* Support only PCC, SystemMemory, SystemIO, and FFH type regs. */ pr_debug("Unsupported register type: %d\n", gas_t->space_id); goto out_free; } @@ -923,7 +942,21 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) } *val = 0; - if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0) + + if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + u32 width = 8 << (reg->access_width - 1); + acpi_status status; + + status = acpi_os_read_port((acpi_io_address)reg->address, + (u32 *)val, width); + if (ACPI_FAILURE(status)) { + pr_debug("Error: Failed to read SystemIO port %llx\n", + reg->address); + return -EFAULT; + } + + return 0; + } else if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0) vaddr = GET_PCC_VADDR(reg->address, pcc_ss_id); else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) vaddr = reg_res->sys_mem_vaddr; @@ -962,7 +995,20 @@ static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val) int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cpc_reg *reg = ®_res->cpc_entry.reg; - if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0) + if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + u32 width = 8 << (reg->access_width - 1); + acpi_status status; + + status = acpi_os_write_port((acpi_io_address)reg->address, + (u32)val, width); + if (ACPI_FAILURE(status)) { + pr_debug("Error: Failed to write SystemIO port %llx\n", + reg->address); + return -EFAULT; + } + + return 0; + } else if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0) vaddr = GET_PCC_VADDR(reg->address, pcc_ss_id); else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) vaddr = reg_res->sys_mem_vaddr; From 2aeca6bd02776d7f56a49a32be0dd184f204d888 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 24 Dec 2021 09:04:58 +0800 Subject: [PATCH 19/32] ACPI: CPPC: Check present CPUs for determining _CPC is valid As this is a static check, it should be based upon what is currently present on the system. This makes probeing more deterministic. While local APIC flags field (lapic_flags) of cpu core in MADT table is 0, then the cpu core won't be enabled. In this case, _CPC won't be found in this core, and return back to _CPC invalid with walking through possible cpus (include disable cpus). This is not expected, so switch to check present CPUs instead. Reported-by: Jinzhou Su Signed-off-by: Mario Limonciello Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 69e074647003..cc790037d643 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -413,7 +413,7 @@ bool acpi_cpc_valid(void) struct cpc_desc *cpc_ptr; int cpu; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { cpc_ptr = per_cpu(cpc_desc_ptr, cpu); if (!cpc_ptr) return false; From fb0b00af04d083770d6e2762b2838357519f7d2d Mon Sep 17 00:00:00 2001 From: Jinzhou Su Date: Fri, 24 Dec 2021 09:04:59 +0800 Subject: [PATCH 20/32] ACPI: CPPC: Add CPPC enable register function Add a new function to enable CPPC feature. This function will write Continuous Performance Control package EnableRegister field on the processor. CPPC EnableRegister register described in section 8.4.7.1 of ACPI 6.4: This element is optional. If supported, contains a resource descriptor with a single Register() descriptor that describes a register to which OSPM writes a One to enable CPPC on this processor. Before this register is set, the processor will be controlled by legacy mechanisms (ACPI Pstates, firmware, etc.). This register will be used for AMD processors to enable AMD P-State function instead of legacy ACPI P-States. Signed-off-by: Jinzhou Su Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 45 ++++++++++++++++++++++++++++++++++++++++ include/acpi/cppc_acpi.h | 5 +++++ 2 files changed, 50 insertions(+) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index cc790037d643..e4fd5a8237e2 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1275,6 +1275,51 @@ out_err: } EXPORT_SYMBOL_GPL(cppc_get_perf_ctrs); +/** + * cppc_set_enable - Set to enable CPPC on the processor by writing the + * Continuous Performance Control package EnableRegister field. + * @cpu: CPU for which to enable CPPC register. + * @enable: 0 - disable, 1 - enable CPPC feature on the processor. + * + * Return: 0 for success, -ERRNO or -EIO otherwise. + */ +int cppc_set_enable(int cpu, bool enable) +{ + int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); + struct cpc_register_resource *enable_reg; + struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu); + struct cppc_pcc_data *pcc_ss_data = NULL; + int ret = -EINVAL; + + if (!cpc_desc) { + pr_debug("No CPC descriptor for CPU:%d\n", cpu); + return -EINVAL; + } + + enable_reg = &cpc_desc->cpc_regs[ENABLE]; + + if (CPC_IN_PCC(enable_reg)) { + + if (pcc_ss_id < 0) + return -EIO; + + ret = cpc_write(cpu, enable_reg, enable); + if (ret) + return ret; + + pcc_ss_data = pcc_data[pcc_ss_id]; + + down_write(&pcc_ss_data->pcc_lock); + /* after writing CPC, transfer the ownership of PCC to platfrom */ + ret = send_pcc_cmd(pcc_ss_id, CMD_WRITE); + up_write(&pcc_ss_data->pcc_lock); + return ret; + } + + return cpc_write(cpu, enable_reg, enable); +} +EXPORT_SYMBOL_GPL(cppc_set_enable); + /** * cppc_set_perf - Set a CPU's performance controls. * @cpu: CPU for which to set performance controls. diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index bc159a9b4a73..92b7ea8d8f5e 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -138,6 +138,7 @@ extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); +extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); extern bool acpi_cpc_valid(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); @@ -162,6 +163,10 @@ static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls) { return -ENOTSUPP; } +static inline int cppc_set_enable(int cpu, bool enable) +{ + return -ENOTSUPP; +} static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps) { return -ENOTSUPP; From ec437d71db77a181227bf6d0ac9d4a80e58ecf0f Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:00 +0800 Subject: [PATCH 21/32] cpufreq: amd-pstate: Introduce a new AMD P-State driver to support future processors AMD P-State is the AMD CPU performance scaling driver that introduces a new CPU frequency control mechanism on AMD Zen based CPU series in Linux kernel. The new mechanism is based on Collaborative processor performance control (CPPC) which is finer grain frequency management than legacy ACPI hardware P-States. Current AMD CPU platforms are using the ACPI P-states driver to manage CPU frequency and clocks with switching only in 3 P-states. AMD P-State is to replace the ACPI P-states controls, allows a flexible, low-latency interface for the Linux kernel to directly communicate the performance hints to hardware. AMD P-State leverages the Linux kernel governors such as *schedutil*, *ondemand*, etc. to manage the performance hints which are provided by CPPC hardware functionality. The first version for AMD P-State is to support one of the Zen3 processors, and we will support more in future after we verify the hardware and SBIOS functionalities. There are two types of hardware implementations for AMD P-State: one is full MSR support and another is shared memory support. It can use X86_FEATURE_CPPC feature flag to distinguish the different types. Using the new AMD P-State method + kernel governors (*schedutil*, *ondemand*, ...) to manage the frequency update is the most appropriate bridge between AMD Zen based hardware processor and Linux kernel, the processor is able to adjust to the most efficiency frequency according to the kernel scheduler loading. Please check the detailed CPU feature and MSR register description in Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors: https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.x86 | 17 ++ drivers/cpufreq/Makefile | 1 + drivers/cpufreq/amd-pstate.c | 386 +++++++++++++++++++++++++++++++++++ 3 files changed, 404 insertions(+) create mode 100644 drivers/cpufreq/amd-pstate.c diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index 92701a18bdd9..a951768c3ebb 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -34,6 +34,23 @@ config X86_PCC_CPUFREQ If in doubt, say N. +config X86_AMD_PSTATE + tristate "AMD Processor P-State driver" + depends on X86 + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI + select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver adds a CPUFreq driver which utilizes a fine grain + processor performance frequency control range instead of legacy + performance levels. _CPC needs to be present in the ACPI tables + of the system. + + For details, take a look at: + . + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" depends on ACPI_PROCESSOR diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 48ee5859030c..c8d307010922 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o # speedstep-* is preferred over p4-clockmod. obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_AMD_PSTATE) += amd-pstate.o obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c new file mode 100644 index 000000000000..2698ed5ec6d7 --- /dev/null +++ b/drivers/cpufreq/amd-pstate.c @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * amd-pstate.c - AMD Processor P-state Frequency Driver + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Author: Huang Rui + * + * AMD P-State introduces a new CPU performance scaling design for AMD + * processors using the ACPI Collaborative Performance and Power Control (CPPC) + * feature which works with the AMD SMU firmware providing a finer grained + * frequency control range. It is to replace the legacy ACPI P-States control, + * allows a flexible, low-latency interface for the Linux kernel to directly + * communicate the performance hints to hardware. + * + * AMD P-State is supported on recent AMD Zen base CPU series include some of + * Zen2 and Zen3 processors. _CPC needs to be present in the ACPI tables of AMD + * P-State supported system. And there are two types of hardware implementations + * for AMD P-State: 1) Full MSR Solution and 2) Shared Memory Solution. + * X86_FEATURE_CPPC CPU feature flag is used to distinguish the different types. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#define AMD_PSTATE_TRANSITION_LATENCY 0x20000 +#define AMD_PSTATE_TRANSITION_DELAY 500 + +static struct cpufreq_driver amd_pstate_driver; + +/** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor + * @max_freq: the frequency that mapped to highest_perf + * @min_freq: the frequency that mapped to lowest_perf + * @nominal_freq: the frequency that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. + */ +struct amd_cpudata { + int cpu; + + u64 cppc_req_cached; + + u32 highest_perf; + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; + + u32 max_freq; + u32 min_freq; + u32 nominal_freq; + u32 lowest_nonlinear_freq; +}; + +static inline int amd_pstate_enable(bool enable) +{ + return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable); +} + +static int amd_pstate_init_perf(struct amd_cpudata *cpudata) +{ + u64 cap1; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + + /* + * TODO: Introduce AMD specific power feature. + * + * CPPC entry doesn't indicate the highest performance in some ASICs. + */ + WRITE_ONCE(cpudata->highest_perf, amd_get_highest_perf()); + + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); + + return 0; +} + +static void amd_pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) +{ + if (fast_switch) + wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); + else + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, + READ_ONCE(cpudata->cppc_req_cached)); +} + +static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) +{ + u64 prev = READ_ONCE(cpudata->cppc_req_cached); + u64 value = prev; + + value &= ~AMD_CPPC_MIN_PERF(~0L); + value |= AMD_CPPC_MIN_PERF(min_perf); + + value &= ~AMD_CPPC_DES_PERF(~0L); + value |= AMD_CPPC_DES_PERF(des_perf); + + value &= ~AMD_CPPC_MAX_PERF(~0L); + value |= AMD_CPPC_MAX_PERF(max_perf); + + if (value == prev) + return; + + WRITE_ONCE(cpudata->cppc_req_cached, value); + + amd_pstate_update_perf(cpudata, min_perf, des_perf, + max_perf, fast_switch); +} + +static int amd_pstate_verify(struct cpufreq_policy_data *policy) +{ + cpufreq_verify_within_cpu_limits(policy); + + return 0; +} + +static int amd_pstate_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpufreq_freqs freqs; + struct amd_cpudata *cpudata = policy->driver_data; + unsigned long max_perf, min_perf, des_perf, cap_perf; + + if (!cpudata->max_freq) + return -ENODEV; + + cap_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + max_perf = cap_perf; + + freqs.old = policy->cur; + freqs.new = target_freq; + + des_perf = DIV_ROUND_CLOSEST(target_freq * cap_perf, + cpudata->max_freq); + + cpufreq_freq_transition_begin(policy, &freqs); + amd_pstate_update(cpudata, min_perf, des_perf, + max_perf, false); + cpufreq_freq_transition_end(policy, &freqs, false); + + return 0; +} + +static int amd_get_min_freq(struct amd_cpudata *cpudata) +{ + struct cppc_perf_caps cppc_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + /* Switch to khz */ + return cppc_perf.lowest_freq * 1000; +} + +static int amd_get_max_freq(struct amd_cpudata *cpudata) +{ + struct cppc_perf_caps cppc_perf; + u32 max_perf, max_freq, nominal_freq, nominal_perf; + u64 boost_ratio; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); + max_perf = READ_ONCE(cpudata->highest_perf); + + boost_ratio = div_u64(max_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); + + max_freq = nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT; + + /* Switch to khz */ + return max_freq * 1000; +} + +static int amd_get_nominal_freq(struct amd_cpudata *cpudata) +{ + struct cppc_perf_caps cppc_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + /* Switch to khz */ + return cppc_perf.nominal_freq * 1000; +} + +static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) +{ + struct cppc_perf_caps cppc_perf; + u32 lowest_nonlinear_freq, lowest_nonlinear_perf, + nominal_freq, nominal_perf; + u64 lowest_nonlinear_ratio; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + nominal_freq = cppc_perf.nominal_freq; + nominal_perf = READ_ONCE(cpudata->nominal_perf); + + lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; + + lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); + + lowest_nonlinear_freq = nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT; + + /* Switch to khz */ + return lowest_nonlinear_freq * 1000; +} + +static int amd_pstate_cpu_init(struct cpufreq_policy *policy) +{ + int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; + struct device *dev; + struct amd_cpudata *cpudata; + + dev = get_cpu_device(policy->cpu); + if (!dev) + return -ENODEV; + + cpudata = kzalloc(sizeof(*cpudata), GFP_KERNEL); + if (!cpudata) + return -ENOMEM; + + cpudata->cpu = policy->cpu; + + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata; + + min_freq = amd_get_min_freq(cpudata); + max_freq = amd_get_max_freq(cpudata); + nominal_freq = amd_get_nominal_freq(cpudata); + lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); + + if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { + dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", + min_freq, max_freq); + ret = -EINVAL; + goto free_cpudata; + } + + policy->cpuinfo.transition_latency = AMD_PSTATE_TRANSITION_LATENCY; + policy->transition_delay_us = AMD_PSTATE_TRANSITION_DELAY; + + policy->min = min_freq; + policy->max = max_freq; + + policy->cpuinfo.min_freq = min_freq; + policy->cpuinfo.max_freq = max_freq; + + /* It will be updated by governor */ + policy->cur = policy->cpuinfo.min_freq; + + /* Initial processor data capability frequencies */ + cpudata->max_freq = max_freq; + cpudata->min_freq = min_freq; + cpudata->nominal_freq = nominal_freq; + cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; + + policy->driver_data = cpudata; + + return 0; + +free_cpudata: + kfree(cpudata); + return ret; +} + +static int amd_pstate_cpu_exit(struct cpufreq_policy *policy) +{ + struct amd_cpudata *cpudata; + + cpudata = policy->driver_data; + + kfree(cpudata); + + return 0; +} + +static struct cpufreq_driver amd_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, + .verify = amd_pstate_verify, + .target = amd_pstate_target, + .init = amd_pstate_cpu_init, + .exit = amd_pstate_cpu_exit, + .name = "amd-pstate", +}; + +static int __init amd_pstate_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return -ENODEV; + + if (!acpi_cpc_valid()) { + pr_debug("the _CPC object is not present in SBIOS\n"); + return -ENODEV; + } + + /* don't keep reloading if cpufreq_driver exists */ + if (cpufreq_get_current_driver()) + return -EEXIST; + + /* capability check */ + if (!boot_cpu_has(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is not supported\n"); + return -ENODEV; + } + + /* enable amd pstate feature */ + ret = amd_pstate_enable(true); + if (ret) { + pr_err("failed to enable amd-pstate with return %d\n", ret); + return ret; + } + + ret = cpufreq_register_driver(&amd_pstate_driver); + if (ret) + pr_err("failed to register amd_pstate_driver with return %d\n", + ret); + + return ret; +} + +static void __exit amd_pstate_exit(void) +{ + cpufreq_unregister_driver(&amd_pstate_driver); + + amd_pstate_enable(false); +} + +module_init(amd_pstate_init); +module_exit(amd_pstate_exit); + +MODULE_AUTHOR("Huang Rui "); +MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); +MODULE_LICENSE("GPL"); From 1d215f0319c20662b701692a2fafc7b3b8a58ae1 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:01 +0800 Subject: [PATCH 22/32] cpufreq: amd-pstate: Add fast switch function for AMD P-State Introduce the fast switch function for AMD P-State on the AMD processors which support the full MSR register control. It's able to decrease the latency on interrupt context. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 2698ed5ec6d7..8c9c199b560e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -183,6 +183,39 @@ static int amd_pstate_target(struct cpufreq_policy *policy, return 0; } +static void amd_pstate_adjust_perf(unsigned int cpu, + unsigned long _min_perf, + unsigned long target_perf, + unsigned long capacity) +{ + unsigned long max_perf, min_perf, des_perf, + cap_perf, lowest_nonlinear_perf; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; + + cap_perf = READ_ONCE(cpudata->highest_perf); + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + + des_perf = cap_perf; + if (target_perf < capacity) + des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); + + min_perf = READ_ONCE(cpudata->highest_perf); + if (_min_perf < capacity) + min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); + + if (min_perf < lowest_nonlinear_perf) + min_perf = lowest_nonlinear_perf; + + max_perf = cap_perf; + if (max_perf < min_perf) + max_perf = min_perf; + + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + + amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true); +} + static int amd_get_min_freq(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; @@ -299,6 +332,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; + policy->fast_switch_possible = true; + /* Initial processor data capability frequencies */ cpudata->max_freq = max_freq; cpudata->min_freq = min_freq; @@ -329,6 +364,7 @@ static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, .target = amd_pstate_target, + .adjust_perf = amd_pstate_adjust_perf, .init = amd_pstate_cpu_init, .exit = amd_pstate_cpu_exit, .name = "amd-pstate", From e059c184da47e92c6236f26b0fdaf9e92f0d55b5 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:02 +0800 Subject: [PATCH 23/32] cpufreq: amd-pstate: Introduce the support for the processors with shared memory solution In some of Zen2 and Zen3 based processors, they are using the shared memory that exposed from ACPI SBIOS. In this kind of the processors, there is no MSR support, so we add acpi cppc function as the backend for them. It is using a module param (shared_mem) to enable related processors manually. We will enable this by default once we address performance issue on this solution. Signed-off-by: Jinzhou Su Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 105 ++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 8c9c199b560e..cc62f7484007 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -48,6 +48,20 @@ #define AMD_PSTATE_TRANSITION_LATENCY 0x20000 #define AMD_PSTATE_TRANSITION_DELAY 500 +/* + * TODO: We need more time to fine tune processors with shared memory solution + * with community together. + * + * There are some performance drops on the CPU benchmarks which reports from + * Suse. We are co-working with them to fine tune the shared memory solution. So + * we disable it by default to go acpi-cpufreq on these processors and add a + * module parameter to be able to enable it manually for debugging. + */ +static bool shared_mem = false; +module_param(shared_mem, bool, 0444); +MODULE_PARM_DESC(shared_mem, + "enable amd-pstate on processors with shared memory solution (false = disabled (default), true = enabled)"); + static struct cpufreq_driver amd_pstate_driver; /** @@ -85,12 +99,32 @@ struct amd_cpudata { u32 lowest_nonlinear_freq; }; -static inline int amd_pstate_enable(bool enable) +static inline int pstate_enable(bool enable) { return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable); } -static int amd_pstate_init_perf(struct amd_cpudata *cpudata) +static int cppc_enable(bool enable) +{ + int cpu, ret = 0; + + for_each_present_cpu(cpu) { + ret = cppc_set_enable(cpu, enable); + if (ret) + return ret; + } + + return ret; +} + +DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable); + +static inline int amd_pstate_enable(bool enable) +{ + return static_call(amd_pstate_enable)(enable); +} + +static int pstate_init_perf(struct amd_cpudata *cpudata) { u64 cap1; @@ -113,8 +147,33 @@ static int amd_pstate_init_perf(struct amd_cpudata *cpudata) return 0; } -static void amd_pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, bool fast_switch) +static int cppc_init_perf(struct amd_cpudata *cpudata) +{ + struct cppc_perf_caps cppc_perf; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + + WRITE_ONCE(cpudata->highest_perf, amd_get_highest_perf()); + + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); + WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); + + return 0; +} + +DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); + +static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) +{ + return static_call(amd_pstate_init_perf)(cpudata); +} + +static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) { if (fast_switch) wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); @@ -123,6 +182,29 @@ static void amd_pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, READ_ONCE(cpudata->cppc_req_cached)); } +static void cppc_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) +{ + struct cppc_perf_ctrls perf_ctrls; + + perf_ctrls.max_perf = max_perf; + perf_ctrls.min_perf = min_perf; + perf_ctrls.desired_perf = des_perf; + + cppc_set_perf(cpudata->cpu, &perf_ctrls); +} + +DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); + +static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) +{ + static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, + max_perf, fast_switch); +} + static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch) { @@ -332,7 +414,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; - policy->fast_switch_possible = true; + if (boot_cpu_has(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; /* Initial processor data capability frequencies */ cpudata->max_freq = max_freq; @@ -364,7 +447,6 @@ static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, .target = amd_pstate_target, - .adjust_perf = amd_pstate_adjust_perf, .init = amd_pstate_cpu_init, .exit = amd_pstate_cpu_exit, .name = "amd-pstate", @@ -387,8 +469,15 @@ static int __init amd_pstate_init(void) return -EEXIST; /* capability check */ - if (!boot_cpu_has(X86_FEATURE_CPPC)) { - pr_debug("AMD CPPC MSR based functionality is not supported\n"); + if (boot_cpu_has(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); + amd_pstate_driver.adjust_perf = amd_pstate_adjust_perf; + } else if (shared_mem) { + static_call_update(amd_pstate_enable, cppc_enable); + static_call_update(amd_pstate_init_perf, cppc_init_perf); + static_call_update(amd_pstate_update_perf, cppc_update_perf); + } else { + pr_info("This processor supports shared memory solution, you can enable it with amd_pstate.shared_mem=1\n"); return -ENODEV; } From 60e10f896dbf6d78f912e4972081bd4119131376 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:03 +0800 Subject: [PATCH 24/32] cpufreq: amd-pstate: Add trace for AMD P-State module Add trace event to monitor the performance value changes which is controlled by cpu governors. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Makefile | 6 ++- drivers/cpufreq/amd-pstate-trace.c | 2 + drivers/cpufreq/amd-pstate-trace.h | 77 ++++++++++++++++++++++++++++++ drivers/cpufreq/amd-pstate.c | 4 ++ 4 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 drivers/cpufreq/amd-pstate-trace.c create mode 100644 drivers/cpufreq/amd-pstate-trace.h diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index c8d307010922..285de70af877 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -17,6 +17,10 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET) += cpufreq_governor_attr_set.o obj-$(CONFIG_CPUFREQ_DT) += cpufreq-dt.o obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o +# Traces +CFLAGS_amd-pstate-trace.o := -I$(src) +amd_pstate-y := amd-pstate.o amd-pstate-trace.o + ################################################################################## # x86 drivers. # Link order matters. K8 is preferred to ACPI because of firmware bugs in early @@ -25,7 +29,7 @@ obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o # speedstep-* is preferred over p4-clockmod. obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o -obj-$(CONFIG_X86_AMD_PSTATE) += amd-pstate.o +obj-$(CONFIG_X86_AMD_PSTATE) += amd_pstate.o obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o diff --git a/drivers/cpufreq/amd-pstate-trace.c b/drivers/cpufreq/amd-pstate-trace.c new file mode 100644 index 000000000000..891b696dcd69 --- /dev/null +++ b/drivers/cpufreq/amd-pstate-trace.c @@ -0,0 +1,2 @@ +#define CREATE_TRACE_POINTS +#include "amd-pstate-trace.h" diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h new file mode 100644 index 000000000000..647505957d4f --- /dev/null +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * amd-pstate-trace.h - AMD Processor P-state Frequency Driver Tracer + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Author: Huang Rui + */ + +#if !defined(_AMD_PSTATE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _AMD_PSTATE_TRACE_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM amd_cpu + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE amd-pstate-trace + +#define TPS(x) tracepoint_string(x) + +TRACE_EVENT(amd_pstate_perf, + + TP_PROTO(unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity, + unsigned int cpu_id, + bool changed, + bool fast_switch + ), + + TP_ARGS(min_perf, + target_perf, + capacity, + cpu_id, + changed, + fast_switch + ), + + TP_STRUCT__entry( + __field(unsigned long, min_perf) + __field(unsigned long, target_perf) + __field(unsigned long, capacity) + __field(unsigned int, cpu_id) + __field(bool, changed) + __field(bool, fast_switch) + ), + + TP_fast_assign( + __entry->min_perf = min_perf; + __entry->target_perf = target_perf; + __entry->capacity = capacity; + __entry->cpu_id = cpu_id; + __entry->changed = changed; + __entry->fast_switch = fast_switch; + ), + + TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu cpu_id=%u changed=%s fast_switch=%s", + (unsigned long)__entry->min_perf, + (unsigned long)__entry->target_perf, + (unsigned long)__entry->capacity, + (unsigned int)__entry->cpu_id, + (__entry->changed) ? "true" : "false", + (__entry->fast_switch) ? "true" : "false" + ) +); + +#endif /* _AMD_PSTATE_TRACE_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . + +#include diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index cc62f7484007..63efd5de98a2 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -44,6 +44,7 @@ #include #include #include +#include "amd-pstate-trace.h" #define AMD_PSTATE_TRANSITION_LATENCY 0x20000 #define AMD_PSTATE_TRANSITION_DELAY 500 @@ -220,6 +221,9 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, value &= ~AMD_CPPC_MAX_PERF(~0L); value |= AMD_CPPC_MAX_PERF(max_perf); + trace_amd_pstate_perf(min_perf, des_perf, max_perf, + cpudata->cpu, (value != prev), fast_switch); + if (value == prev) return; From 41271016dfa4a08487bf2862f817edc9070489d1 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:04 +0800 Subject: [PATCH 25/32] cpufreq: amd-pstate: Add boost mode support for AMD P-State If the sbios supports the boost mode of AMD P-State, let's switch to boost enabled by default. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 69 ++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 63efd5de98a2..9e23efc7b9eb 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -87,6 +87,7 @@ static struct cpufreq_driver amd_pstate_driver; struct amd_cpudata { int cpu; + struct freq_qos_request req[2]; u64 cppc_req_cached; u32 highest_perf; @@ -98,6 +99,8 @@ struct amd_cpudata { u32 min_freq; u32 nominal_freq; u32 lowest_nonlinear_freq; + + bool boost_supported; }; static inline int pstate_enable(bool enable) @@ -374,6 +377,45 @@ static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) return lowest_nonlinear_freq * 1000; } +static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) +{ + struct amd_cpudata *cpudata = policy->driver_data; + int ret; + + if (!cpudata->boost_supported) { + pr_err("Boost mode is not supported by this processor or SBIOS\n"); + return -EINVAL; + } + + if (state) + policy->cpuinfo.max_freq = cpudata->max_freq; + else + policy->cpuinfo.max_freq = cpudata->nominal_freq; + + policy->max = policy->cpuinfo.max_freq; + + ret = freq_qos_update_request(&cpudata->req[1], + policy->cpuinfo.max_freq); + if (ret < 0) + return ret; + + return 0; +} + +static void amd_pstate_boost_init(struct amd_cpudata *cpudata) +{ + u32 highest_perf, nominal_perf; + + highest_perf = READ_ONCE(cpudata->highest_perf); + nominal_perf = READ_ONCE(cpudata->nominal_perf); + + if (highest_perf <= nominal_perf) + return; + + cpudata->boost_supported = true; + amd_pstate_driver.boost_enabled = true; +} + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; @@ -392,7 +434,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) ret = amd_pstate_init_perf(cpudata); if (ret) - goto free_cpudata; + goto free_cpudata1; min_freq = amd_get_min_freq(cpudata); max_freq = amd_get_max_freq(cpudata); @@ -403,7 +445,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", min_freq, max_freq); ret = -EINVAL; - goto free_cpudata; + goto free_cpudata1; } policy->cpuinfo.transition_latency = AMD_PSTATE_TRANSITION_LATENCY; @@ -421,6 +463,20 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (boot_cpu_has(X86_FEATURE_CPPC)) policy->fast_switch_possible = true; + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], + FREQ_QOS_MIN, policy->cpuinfo.min_freq); + if (ret < 0) { + dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); + goto free_cpudata1; + } + + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[1], + FREQ_QOS_MAX, policy->cpuinfo.max_freq); + if (ret < 0) { + dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret); + goto free_cpudata2; + } + /* Initial processor data capability frequencies */ cpudata->max_freq = max_freq; cpudata->min_freq = min_freq; @@ -429,9 +485,13 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; + amd_pstate_boost_init(cpudata); + return 0; -free_cpudata: +free_cpudata2: + freq_qos_remove_request(&cpudata->req[0]); +free_cpudata1: kfree(cpudata); return ret; } @@ -442,6 +502,8 @@ static int amd_pstate_cpu_exit(struct cpufreq_policy *policy) cpudata = policy->driver_data; + freq_qos_remove_request(&cpudata->req[1]); + freq_qos_remove_request(&cpudata->req[0]); kfree(cpudata); return 0; @@ -453,6 +515,7 @@ static struct cpufreq_driver amd_pstate_driver = { .target = amd_pstate_target, .init = amd_pstate_cpu_init, .exit = amd_pstate_cpu_exit, + .set_boost = amd_pstate_set_boost, .name = "amd-pstate", }; From ec4e3326a95507ea96fbad21b9472f5ba25500a7 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:05 +0800 Subject: [PATCH 26/32] cpufreq: amd-pstate: Add AMD P-State frequencies attributes Introduce sysfs attributes to get the different level processor frequencies. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 9e23efc7b9eb..dbb7eee11170 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -509,6 +509,52 @@ static int amd_pstate_cpu_exit(struct cpufreq_policy *policy) return 0; } +/* Sysfs attributes */ + +/* + * This frequency is to indicate the maximum hardware frequency. + * If boost is not active but supported, the frequency will be larger than the + * one in cpuinfo. + */ +static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, + char *buf) +{ + int max_freq; + struct amd_cpudata *cpudata; + + cpudata = policy->driver_data; + + max_freq = amd_get_max_freq(cpudata); + if (max_freq < 0) + return max_freq; + + return sprintf(&buf[0], "%u\n", max_freq); +} + +static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, + char *buf) +{ + int freq; + struct amd_cpudata *cpudata; + + cpudata = policy->driver_data; + + freq = amd_get_lowest_nonlinear_freq(cpudata); + if (freq < 0) + return freq; + + return sprintf(&buf[0], "%u\n", freq); +} + +cpufreq_freq_attr_ro(amd_pstate_max_freq); +cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + +static struct freq_attr *amd_pstate_attr[] = { + &amd_pstate_max_freq, + &amd_pstate_lowest_nonlinear_freq, + NULL, +}; + static struct cpufreq_driver amd_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = amd_pstate_verify, @@ -517,6 +563,7 @@ static struct cpufreq_driver amd_pstate_driver = { .exit = amd_pstate_cpu_exit, .set_boost = amd_pstate_set_boost, .name = "amd-pstate", + .attr = amd_pstate_attr, }; static int __init amd_pstate_init(void) From 3ad7fde16a04aa70df8a59cba99e225ef9adf42f Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:06 +0800 Subject: [PATCH 27/32] cpufreq: amd-pstate: Add AMD P-State performance attributes Introduce sysfs attributes to get the different level AMD P-State performances. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index dbb7eee11170..40ceb031abf5 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -546,12 +546,30 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli return sprintf(&buf[0], "%u\n", freq); } +/* + * In some of ASICs, the highest_perf is not the one in the _CPC table, so we + * need to expose it to sysfs. + */ +static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, + char *buf) +{ + u32 perf; + struct amd_cpudata *cpudata = policy->driver_data; + + perf = READ_ONCE(cpudata->highest_perf); + + return sprintf(&buf[0], "%u\n", perf); +} + cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); +cpufreq_freq_attr_ro(amd_pstate_highest_perf); + static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, &amd_pstate_lowest_nonlinear_freq, + &amd_pstate_highest_perf, NULL, }; From c22760885fd6f7161fabde8e7db63d7f085b125a Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:07 +0800 Subject: [PATCH 28/32] Documentation: amd-pstate: Add AMD P-State driver introduction Introduce the AMD P-State driver design and implementation. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/acpi/cppc_sysfs.rst | 2 + Documentation/admin-guide/pm/amd-pstate.rst | 382 ++++++++++++++++++ .../admin-guide/pm/working-state.rst | 1 + 3 files changed, 385 insertions(+) create mode 100644 Documentation/admin-guide/pm/amd-pstate.rst diff --git a/Documentation/admin-guide/acpi/cppc_sysfs.rst b/Documentation/admin-guide/acpi/cppc_sysfs.rst index fccf22114e85..e53d76365aa7 100644 --- a/Documentation/admin-guide/acpi/cppc_sysfs.rst +++ b/Documentation/admin-guide/acpi/cppc_sysfs.rst @@ -4,6 +4,8 @@ Collaborative Processor Performance Control (CPPC) ================================================== +.. _cppc_sysfs: + CPPC ==== diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst new file mode 100644 index 000000000000..2f066df4ee9c --- /dev/null +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -0,0 +1,382 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=============================================== +``amd-pstate`` CPU Performance Scaling Driver +=============================================== + +:Copyright: |copy| 2021 Advanced Micro Devices, Inc. + +:Author: Huang Rui + + +Introduction +=================== + +``amd-pstate`` is the AMD CPU performance scaling driver that introduces a +new CPU frequency control mechanism on modern AMD APU and CPU series in +Linux kernel. The new mechanism is based on Collaborative Processor +Performance Control (CPPC) which provides finer grain frequency management +than legacy ACPI hardware P-States. Current AMD CPU/APU platforms are using +the ACPI P-states driver to manage CPU frequency and clocks with switching +only in 3 P-states. CPPC replaces the ACPI P-states controls, allows a +flexible, low-latency interface for the Linux kernel to directly +communicate the performance hints to hardware. + +``amd-pstate`` leverages the Linux kernel governors such as ``schedutil``, +``ondemand``, etc. to manage the performance hints which are provided by +CPPC hardware functionality that internally follows the hardware +specification (for details refer to AMD64 Architecture Programmer's Manual +Volume 2: System Programming [1]_). Currently ``amd-pstate`` supports basic +frequency control function according to kernel governors on some of the +Zen2 and Zen3 processors, and we will implement more AMD specific functions +in future after we verify them on the hardware and SBIOS. + + +AMD CPPC Overview +======================= + +Collaborative Processor Performance Control (CPPC) interface enumerates a +continuous, abstract, and unit-less performance value in a scale that is +not tied to a specific performance state / frequency. This is an ACPI +standard [2]_ which software can specify application performance goals and +hints as a relative target to the infrastructure limits. AMD processors +provides the low latency register model (MSR) instead of AML code +interpreter for performance adjustments. ``amd-pstate`` will initialize a +``struct cpufreq_driver`` instance ``amd_pstate_driver`` with the callbacks +to manage each performance update behavior. :: + + Highest Perf ------>+-----------------------+ +-----------------------+ + | | | | + | | | | + | | Max Perf ---->| | + | | | | + | | | | + Nominal Perf ------>+-----------------------+ +-----------------------+ + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + | | Desired Perf ---->| | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + Lowest non- | | | | + linear perf ------>+-----------------------+ +-----------------------+ + | | | | + | | Lowest perf ---->| | + | | | | + Lowest perf ------>+-----------------------+ +-----------------------+ + | | | | + | | | | + | | | | + 0 ------>+-----------------------+ +-----------------------+ + + AMD P-States Performance Scale + + +.. _perf_cap: + +AMD CPPC Performance Capability +-------------------------------- + +Highest Performance (RO) +......................... + +It is the absolute maximum performance an individual processor may reach, +assuming ideal conditions. This performance level may not be sustainable +for long durations and may only be achievable if other platform components +are in a specific state; for example, it may require other processors be in +an idle state. This would be equivalent to the highest frequencies +supported by the processor. + +Nominal (Guaranteed) Performance (RO) +...................................... + +It is the maximum sustained performance level of the processor, assuming +ideal operating conditions. In absence of an external constraint (power, +thermal, etc.) this is the performance level the processor is expected to +be able to maintain continuously. All cores/processors are expected to be +able to sustain their nominal performance state simultaneously. + +Lowest non-linear Performance (RO) +................................... + +It is the lowest performance level at which nonlinear power savings are +achieved, for example, due to the combined effects of voltage and frequency +scaling. Above this threshold, lower performance levels should be generally +more energy efficient than higher performance levels. This register +effectively conveys the most efficient performance level to ``amd-pstate``. + +Lowest Performance (RO) +........................ + +It is the absolute lowest performance level of the processor. Selecting a +performance level lower than the lowest nonlinear performance level may +cause an efficiency penalty but should reduce the instantaneous power +consumption of the processor. + +AMD CPPC Performance Control +------------------------------ + +``amd-pstate`` passes performance goals through these registers. The +register drives the behavior of the desired performance target. + +Minimum requested performance (RW) +................................... + +``amd-pstate`` specifies the minimum allowed performance level. + +Maximum requested performance (RW) +................................... + +``amd-pstate`` specifies a limit the maximum performance that is expected +to be supplied by the hardware. + +Desired performance target (RW) +................................... + +``amd-pstate`` specifies a desired target in the CPPC performance scale as +a relative number. This can be expressed as percentage of nominal +performance (infrastructure max). Below the nominal sustained performance +level, desired performance expresses the average performance level of the +processor subject to hardware. Above the nominal performance level, +processor must provide at least nominal performance requested and go higher +if current operating conditions allow. + +Energy Performance Preference (EPP) (RW) +......................................... + +Provides a hint to the hardware if software wants to bias toward performance +(0x0) or energy efficiency (0xff). + + +Key Governors Support +======================= + +``amd-pstate`` can be used with all the (generic) scaling governors listed +by the ``scaling_available_governors`` policy attribute in ``sysfs``. Then, +it is responsible for the configuration of policy objects corresponding to +CPUs and provides the ``CPUFreq`` core (and the scaling governors attached +to the policy objects) with accurate information on the maximum and minimum +operating frequencies supported by the hardware. Users can check the +``scaling_cur_freq`` information comes from the ``CPUFreq`` core. + +``amd-pstate`` mainly supports ``schedutil`` and ``ondemand`` for dynamic +frequency control. It is to fine tune the processor configuration on +``amd-pstate`` to the ``schedutil`` with CPU CFS scheduler. ``amd-pstate`` +registers adjust_perf callback to implement the CPPC similar performance +update behavior. It is initialized by ``sugov_start`` and then populate the +CPU's update_util_data pointer to assign ``sugov_update_single_perf`` as +the utilization update callback function in CPU scheduler. CPU scheduler +will call ``cpufreq_update_util`` and assign the target performance +according to the ``struct sugov_cpu`` that utilization update belongs to. +Then ``amd-pstate`` updates the desired performance according to the CPU +scheduler assigned. + + +Processor Support +======================= + +The ``amd-pstate`` initialization will fail if the _CPC in ACPI SBIOS is +not existed at the detected processor, and it uses ``acpi_cpc_valid`` to +check the _CPC existence. All Zen based processors support legacy ACPI +hardware P-States function, so while the ``amd-pstate`` fails to be +initialized, the kernel will fall back to initialize ``acpi-cpufreq`` +driver. + +There are two types of hardware implementations for ``amd-pstate``: one is +`Full MSR Support `_ and another is `Shared Memory Support +`_. It can use :c:macro:`X86_FEATURE_CPPC` feature flag (for +details refer to Processor Programming Reference (PPR) for AMD Family +19h Model 51h, Revision A1 Processors [3]_) to indicate the different +types. ``amd-pstate`` is to register different ``static_call`` instances +for different hardware implementations. + +Currently, some of Zen2 and Zen3 processors support ``amd-pstate``. In the +future, it will be supported on more and more AMD processors. + +Full MSR Support +----------------- + +Some new Zen3 processors such as Cezanne provide the MSR registers directly +while the :c:macro:`X86_FEATURE_CPPC` CPU feature flag is set. +``amd-pstate`` can handle the MSR register to implement the fast switch +function in ``CPUFreq`` that can shrink latency of frequency control on the +interrupt context. The functions with ``pstate_xxx`` prefix represent the +operations of MSR registers. + +Shared Memory Support +---------------------- + +If :c:macro:`X86_FEATURE_CPPC` CPU feature flag is not set, that means the +processor supports shared memory solution. In this case, ``amd-pstate`` +uses the ``cppc_acpi`` helper methods to implement the callback functions +that defined on ``static_call``. The functions with ``cppc_xxx`` prefix +represent the operations of acpi cppc helpers for shared memory solution. + + +AMD P-States and ACPI hardware P-States always can be supported in one +processor. But AMD P-States has the higher priority and if it is enabled +with :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond +to the request from AMD P-States. + + +User Space Interface in ``sysfs`` +================================== + +``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to +control its functionality at the system level. They located in the +``/sys/devices/system/cpu/cpufreq/policyX/`` directory and affect all CPUs. :: + + root@hr-test1:/home/ray# ls /sys/devices/system/cpu/cpufreq/policy0/*amd* + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_highest_perf + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq + /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq + + +``amd_pstate_highest_perf / amd_pstate_max_freq`` + +Maximum CPPC performance and CPU frequency that the driver is allowed to +set in percent of the maximum supported CPPC performance level (the highest +performance supported in `AMD CPPC Performance Capability `_). +In some of ASICs, the highest CPPC performance is not the one in the _CPC +table, so we need to expose it to sysfs. If boost is not active but +supported, this maximum frequency will be larger than the one in +``cpuinfo``. +This attribute is read-only. + +``amd_pstate_lowest_nonlinear_freq`` + +The lowest non-linear CPPC CPU frequency that the driver is allowed to set +in percent of the maximum supported CPPC performance level (Please see the +lowest non-linear performance in `AMD CPPC Performance Capability +`_). +This attribute is read-only. + +For other performance and frequency values, we can read them back from +``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. + + +``amd-pstate`` vs ``acpi-cpufreq`` +====================================== + +On majority of AMD platforms supported by ``acpi-cpufreq``, the ACPI tables +provided by the platform firmware used for CPU performance scaling, but +only provides 3 P-states on AMD processors. +However, on modern AMD APU and CPU series, it provides the collaborative +processor performance control according to ACPI protocol and customize this +for AMD platforms. That is fine-grain and continuous frequency range +instead of the legacy hardware P-states. ``amd-pstate`` is the kernel +module which supports the new AMD P-States mechanism on most of future AMD +platforms. The AMD P-States mechanism will be the more performance and energy +efficiency frequency management method on AMD processors. + +Kernel Module Options for ``amd-pstate`` +========================================= + +``shared_mem`` +Use a module param (shared_mem) to enable related processors manually with +**amd_pstate.shared_mem=1**. +Due to the performance issue on the processors with `Shared Memory Support +`_, so we disable it for the moment and will enable this by default +once we address performance issue on this solution. + +The way to check whether current processor is `Full MSR Support `_ +or `Shared Memory Support `_ : :: + + ray@hr-test1:~$ lscpu | grep cppc + Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm + +If CPU Flags have cppc, then this processor supports `Full MSR Support +`_. Otherwise it supports `Shared Memory Support `_. + + +``cpupower`` tool support for ``amd-pstate`` +=============================================== + +``amd-pstate`` is supported on ``cpupower`` tool that can be used to dump the frequency +information. And it is in progress to support more and more operations for new +``amd-pstate`` module with this tool. :: + + root@hr-test1:/home/ray# cpupower frequency-info + analyzing CPU 0: + driver: amd-pstate + CPUs which run at the same hardware frequency: 0 + CPUs which need to have their frequency coordinated by software: 0 + maximum transition latency: 131 us + hardware limits: 400 MHz - 4.68 GHz + available cpufreq governors: ondemand conservative powersave userspace performance schedutil + current policy: frequency should be within 400 MHz and 4.68 GHz. + The governor "schedutil" may decide which speed to use + within this range. + current CPU frequency: Unable to call hardware + current CPU frequency: 4.02 GHz (asserted by call to kernel) + boost state support: + Supported: yes + Active: yes + AMD PSTATE Highest Performance: 166. Maximum Frequency: 4.68 GHz. + AMD PSTATE Nominal Performance: 117. Nominal Frequency: 3.30 GHz. + AMD PSTATE Lowest Non-linear Performance: 39. Lowest Non-linear Frequency: 1.10 GHz. + AMD PSTATE Lowest Performance: 15. Lowest Frequency: 400 MHz. + + +Diagnostics and Tuning +======================= + +Trace Events +-------------- + +There are two static trace events that can be used for ``amd-pstate`` +diagnostics. One of them is the cpu_frequency trace event generally used +by ``CPUFreq``, and the other one is the ``amd_pstate_perf`` trace event +specific to ``amd-pstate``. The following sequence of shell commands can +be used to enable them and see their output (if the kernel is generally +configured to support event tracing). :: + + root@hr-test1:/home/ray# cd /sys/kernel/tracing/ + root@hr-test1:/sys/kernel/tracing# echo 1 > events/amd_cpu/enable + root@hr-test1:/sys/kernel/tracing# cat trace + # tracer: nop + # + # entries-in-buffer/entries-written: 47827/42233061 #P:2 + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / delay + # TASK-PID CPU# |||| TIMESTAMP FUNCTION + # | | | |||| | | + -0 [015] dN... 4995.979886: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=15 changed=false fast_switch=true + -0 [007] d.h.. 4995.979893: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=7 changed=false fast_switch=true + cat-2161 [000] d.... 4995.980841: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=0 changed=false fast_switch=true + sshd-2125 [004] d.s.. 4995.980968: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=4 changed=false fast_switch=true + -0 [007] d.s.. 4995.980968: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=7 changed=false fast_switch=true + -0 [003] d.s.. 4995.980971: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=3 changed=false fast_switch=true + -0 [011] d.s.. 4995.980996: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=11 changed=false fast_switch=true + +The cpu_frequency trace event will be triggered either by the ``schedutil`` scaling +governor (for the policies it is attached to), or by the ``CPUFreq`` core (for the +policies with other scaling governors). + + +Reference +=========== + +.. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming, + https://www.amd.com/system/files/TechDocs/24593.pdf + +.. [2] Advanced Configuration and Power Interface Specification, + https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf + +.. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors + https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index f40994c422dc..5d2757e2de65 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -11,6 +11,7 @@ Working-State Power Management intel_idle cpufreq intel_pstate + amd-pstate cpufreq_drivers intel_epb intel-speed-select From 38fec059bb69793f38cfa7a671d4bdbfe2a647aa Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 24 Dec 2021 09:05:08 +0800 Subject: [PATCH 29/32] MAINTAINERS: Add AMD P-State driver maintainer entry I will continue to add new feature and processor support, optimize the performance, and handle the issues for AMD P-State driver. Signed-off-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 13f9a84a617e..93b8492d9a2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -993,6 +993,13 @@ S: Supported T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/pm/ +AMD PSTATE DRIVER +M: Huang Rui +L: linux-pm@vger.kernel.org +S: Supported +F: Documentation/admin-guide/pm/amd-pstate.rst +F: drivers/cpufreq/amd-pstate* + AMD PTDMA DRIVER M: Sanjay R Mehta L: dmaengine@vger.kernel.org From bdc4fd3d48e7e97dd7efc14affe384280e197071 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 6 Jan 2022 10:16:05 +0800 Subject: [PATCH 30/32] cpufreq: amd-pstate: Fix struct amd_cpudata kernel-doc comment Add the description of @req and @boost_supported in struct amd_cpudata kernel-doc comment to remove warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. drivers/cpufreq/amd-pstate.c:104: warning: Function parameter or member 'req' not described in 'amd_cpudata' drivers/cpufreq/amd-pstate.c:104: warning: Function parameter or member 'boost_supported' not described in 'amd_cpudata' Reported-by: Abaci Robot Signed-off-by: Yang Li Acked-by: Huang Rui Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 40ceb031abf5..9ce75ed11f8e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -68,6 +68,7 @@ static struct cpufreq_driver amd_pstate_driver; /** * struct amd_cpudata - private CPU data for AMD P-State * @cpu: CPU number + * @req: constraint request to apply * @cppc_req_cached: cached performance request hints * @highest_perf: the maximum performance an individual processor may reach, * assuming ideal conditions @@ -80,6 +81,7 @@ static struct cpufreq_driver amd_pstate_driver; * @min_freq: the frequency that mapped to lowest_perf * @nominal_freq: the frequency that mapped to nominal_perf * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf + * @boost_supported: check whether the Processor or SBIOS supports boost mode * * The amd_cpudata is key private data for each CPU thread in AMD P-State, and * represents all the attributes and goals that AMD P-State requests at runtime. From a2e6840b37b45d04e095c47f961211b7697cb063 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 6 Jan 2022 15:43:05 +0800 Subject: [PATCH 31/32] cpufreq: amd-pstate: Fix Kconfig dependencies for AMD P-State MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AMD P-State driver is based on ACPI CPPC function, so ACPI should be dependence of this driver in the kernel config. In file included from ../drivers/cpufreq/amd-pstate.c:40:0: ../include/acpi/processor.h:226:2: error: unknown type name ‘phys_cpuid_t’ phys_cpuid_t phys_id; /* CPU hardware ID such as APIC ID for x86 */ ^~~~~~~~~~~~ ../include/acpi/processor.h:355:1: error: unknown type name ‘phys_cpuid_t’; did you mean ‘phys_addr_t’? phys_cpuid_t acpi_get_phys_id(acpi_handle, int type, u32 acpi_id); ^~~~~~~~~~~~ phys_addr_t CC drivers/rtc/rtc-rv3029c2.o ../include/acpi/processor.h:356:1: error: unknown type name ‘phys_cpuid_t’; did you mean ‘phys_addr_t’? phys_cpuid_t acpi_map_madt_entry(u32 acpi_id); ^~~~~~~~~~~~ phys_addr_t ../include/acpi/processor.h:357:20: error: unknown type name ‘phys_cpuid_t’; did you mean ‘phys_addr_t’? int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id); ^~~~~~~~~~~~ phys_addr_t See https://lore.kernel.org/lkml/20e286d4-25d7-fb6e-31a1-4349c805aae3@infradead.org/. Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: Huang Rui [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.x86 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index a951768c3ebb..55516043b656 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -36,9 +36,9 @@ config X86_PCC_CPUFREQ config X86_AMD_PSTATE tristate "AMD Processor P-State driver" - depends on X86 - select ACPI_PROCESSOR if ACPI - select ACPI_CPPC_LIB if X86_64 && ACPI + depends on X86 && ACPI + select ACPI_PROCESSOR + select ACPI_CPPC_LIB if X86_64 select CPU_FREQ_GOV_SCHEDUTIL if SMP help This driver adds a CPUFreq driver which utilizes a fine grain From 6c4ab1b86dac3954d15c00c1a6396d60a1023fab Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 6 Jan 2022 15:43:06 +0800 Subject: [PATCH 32/32] x86, sched: Fix undefined reference to init_freq_invariance_cppc() build error The init_freq_invariance_cppc function is implemented in smpboot and depends on CONFIG_SMP. MODPOST vmlinux.symvers MODINFO modules.builtin.modinfo GEN modules.builtin LD .tmp_vmlinux.kallsyms1 ld: drivers/acpi/cppc_acpi.o: in function `acpi_cppc_processor_probe': /home/ray/brahma3/linux/drivers/acpi/cppc_acpi.c:819: undefined reference to `init_freq_invariance_cppc' make: *** [Makefile:1161: vmlinux] Error 1 See https://lore.kernel.org/lkml/484af487-7511-647e-5c5b-33d4429acdec@infradead.org/. Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") Reported-by: kernel test robot Reported-by: Randy Dunlap Reported-by: Stephen Rothwell Signed-off-by: Huang Rui [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index cc164777e661..2f0b6be8eaab 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -221,7 +221,7 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled) } #endif -#ifdef CONFIG_ACPI_CPPC_LIB +#if defined(CONFIG_ACPI_CPPC_LIB) && defined(CONFIG_SMP) void init_freq_invariance_cppc(void); #define init_freq_invariance_cppc init_freq_invariance_cppc #endif