From 9d7c48506518684847bc17d4e3f0a103d83aa5c8 Mon Sep 17 00:00:00 2001 From: Andreas Rammhold Date: Fri, 16 Jul 2021 22:00:34 +0200 Subject: [PATCH 01/58] tools: cpupower: fix typo in cpupower-idle-set(1) manpage The tools name was wrong in the SYNTAX section of the manpage it should read "idle-set" instead of "idle-info". Signed-off-by: Andreas Rammhold Signed-off-by: Shuah Khan --- tools/power/cpupower/man/cpupower-idle-set.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/man/cpupower-idle-set.1 b/tools/power/cpupower/man/cpupower-idle-set.1 index 21916cff7516..8cef3c71e19e 100644 --- a/tools/power/cpupower/man/cpupower-idle-set.1 +++ b/tools/power/cpupower/man/cpupower-idle-set.1 @@ -4,7 +4,7 @@ cpupower\-idle\-set \- Utility to set cpu idle state specific kernel options .SH "SYNTAX" .LP -cpupower [ \-c cpulist ] idle\-info [\fIoptions\fP] +cpupower [ \-c cpulist ] idle\-set [\fIoptions\fP] .SH "DESCRIPTION" .LP The cpupower idle\-set subcommand allows to set cpu idle, also called cpu From 101025ff8e47d3c938ad2ae646a1794b9a8aa730 Mon Sep 17 00:00:00 2001 From: ozkanonur Date: Thu, 13 Jan 2022 00:04:21 +0300 Subject: [PATCH 02/58] tools/power/cpupower/{ToDo => TODO}: Rename the todo file Renamed the to-do file to 'TODO' instead of 'ToDo' to comply with the naming standard. Signed-off-by: ozkanonur Signed-off-by: Shuah Khan --- tools/power/cpupower/{ToDo => TODO} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tools/power/cpupower/{ToDo => TODO} (100%) diff --git a/tools/power/cpupower/ToDo b/tools/power/cpupower/TODO similarity index 100% rename from tools/power/cpupower/ToDo rename to tools/power/cpupower/TODO From b9794a822281944ef3de5b1812a94cbdb8134320 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 28 Jan 2022 17:35:33 +0100 Subject: [PATCH 03/58] powercap/drivers/dtpm: Convert the init table section to a simple array The init table section is freed after the system booted. However the next changes will make per module the DTPM description, so the table won't be accessible when the module is loaded. In order to fix that, we should move the table to the data section where there are very few entries and that makes strange to add it there. The main goal of the table was to keep self-encapsulated code and we can keep it almost as it by using an array instead. Suggested-by: Ulf Hansson Reviewed-by: Ulf Hansson Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220128163537.212248-2-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 2 ++ drivers/powercap/dtpm_cpu.c | 5 ++++- drivers/powercap/dtpm_subsys.h | 18 ++++++++++++++++++ include/asm-generic/vmlinux.lds.h | 11 ----------- include/linux/dtpm.h | 24 +++--------------------- 5 files changed, 27 insertions(+), 33 deletions(-) create mode 100644 drivers/powercap/dtpm_subsys.h diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 8cb45f2d3d78..0e5c93443c70 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -24,6 +24,8 @@ #include #include +#include "dtpm_subsys.h" + #define DTPM_POWER_LIMIT_FLAG 0 static const char *constraint_name[] = { diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index b740866b228d..5763e0ce2af5 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -269,4 +269,7 @@ static int __init dtpm_cpu_init(void) return 0; } -DTPM_DECLARE(dtpm_cpu, dtpm_cpu_init); +struct dtpm_subsys_ops dtpm_cpu_ops = { + .name = KBUILD_MODNAME, + .init = dtpm_cpu_init, +}; diff --git a/drivers/powercap/dtpm_subsys.h b/drivers/powercap/dtpm_subsys.h new file mode 100644 index 000000000000..2a3a2055f60e --- /dev/null +++ b/drivers/powercap/dtpm_subsys.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Linaro Ltd + * + * Author: Daniel Lezcano + */ +#ifndef ___DTPM_SUBSYS_H__ +#define ___DTPM_SUBSYS_H__ + +extern struct dtpm_subsys_ops dtpm_cpu_ops; + +struct dtpm_subsys_ops *dtpm_subsys[] = { +#ifdef CONFIG_DTPM_CPU + &dtpm_cpu_ops, +#endif +}; + +#endif diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 42f3866bca69..2a10db2f0bc5 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -321,16 +321,6 @@ #define THERMAL_TABLE(name) #endif -#ifdef CONFIG_DTPM -#define DTPM_TABLE() \ - . = ALIGN(8); \ - __dtpm_table = .; \ - KEEP(*(__dtpm_table)) \ - __dtpm_table_end = .; -#else -#define DTPM_TABLE() -#endif - #define KERNEL_DTB() \ STRUCT_ALIGN(); \ __dtb_start = .; \ @@ -723,7 +713,6 @@ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(timer) \ THERMAL_TABLE(governor) \ - DTPM_TABLE() \ EARLYCON_TABLE() \ LSM_TABLE() \ EARLY_LSM_TABLE() \ diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index d37e5d06a357..506048158a50 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -32,29 +32,11 @@ struct dtpm_ops { void (*release)(struct dtpm *); }; -typedef int (*dtpm_init_t)(void); - -struct dtpm_descr { - dtpm_init_t init; +struct dtpm_subsys_ops { + const char *name; + int (*init)(void); }; -/* Init section thermal table */ -extern struct dtpm_descr __dtpm_table[]; -extern struct dtpm_descr __dtpm_table_end[]; - -#define DTPM_TABLE_ENTRY(name, __init) \ - static struct dtpm_descr __dtpm_table_entry_##name \ - __used __section("__dtpm_table") = { \ - .init = __init, \ - } - -#define DTPM_DECLARE(name, init) DTPM_TABLE_ENTRY(name, init) - -#define for_each_dtpm_table(__dtpm) \ - for (__dtpm = __dtpm_table; \ - __dtpm < __dtpm_table_end; \ - __dtpm++) - static inline struct dtpm *to_dtpm(struct powercap_zone *zone) { return container_of(zone, struct dtpm, zone); From 3759ec678e8944dc2ea70cab77a300408f78ae27 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 28 Jan 2022 17:35:34 +0100 Subject: [PATCH 04/58] powercap/drivers/dtpm: Add hierarchy creation The DTPM framework is available but without a way to configure it. This change provides a way to create a hierarchy of DTPM node where the power consumption reflects the sum of the children's power consumption. It is up to the platform to specify an array of dtpm nodes where each element has a pointer to its parent, except the top most one. The type of the node gives the indication of which initialization callback to call. At this time, we can create a virtual node, where its purpose is to be a parent in the hierarchy, and a DT node where the name describes its path. In order to ensure a nice self-encapsulation, the DTPM subsys array contains a couple of initialization functions, one to setup the DTPM backend and one to initialize it up. With this approach, the DTPM framework has a very few material to export. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220128163537.212248-3-daniel.lezcano@linaro.org --- drivers/powercap/Kconfig | 1 + drivers/powercap/dtpm.c | 196 +++++++++++++++++++++++++++++++++++++-- include/linux/dtpm.h | 15 +++ 3 files changed, 206 insertions(+), 6 deletions(-) diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index 8242e8c5ed77..b1ca339957e3 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -46,6 +46,7 @@ config IDLE_INJECT config DTPM bool "Power capping for Dynamic Thermal Power Management (EXPERIMENTAL)" + depends on OF help This enables support for the power capping for the dynamic thermal power management userspace engine. diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 0e5c93443c70..414826a1509b 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "dtpm_subsys.h" @@ -463,14 +464,197 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) return 0; } -static int __init init_dtpm(void) +static struct dtpm *dtpm_setup_virtual(const struct dtpm_node *hierarchy, + struct dtpm *parent) { - pct = powercap_register_control_type(NULL, "dtpm", NULL); - if (IS_ERR(pct)) { - pr_err("Failed to register control type\n"); - return PTR_ERR(pct); + struct dtpm *dtpm; + int ret; + + dtpm = kzalloc(sizeof(*dtpm), GFP_KERNEL); + if (!dtpm) + return ERR_PTR(-ENOMEM); + dtpm_init(dtpm, NULL); + + ret = dtpm_register(hierarchy->name, dtpm, parent); + if (ret) { + pr_err("Failed to register dtpm node '%s': %d\n", + hierarchy->name, ret); + kfree(dtpm); + return ERR_PTR(ret); + } + + return dtpm; +} + +static struct dtpm *dtpm_setup_dt(const struct dtpm_node *hierarchy, + struct dtpm *parent) +{ + struct device_node *np; + int i, ret; + + np = of_find_node_by_path(hierarchy->name); + if (!np) { + pr_err("Failed to find '%s'\n", hierarchy->name); + return ERR_PTR(-ENXIO); + } + + for (i = 0; i < ARRAY_SIZE(dtpm_subsys); i++) { + + if (!dtpm_subsys[i]->setup) + continue; + + ret = dtpm_subsys[i]->setup(parent, np); + if (ret) { + pr_err("Failed to setup '%s': %d\n", dtpm_subsys[i]->name, ret); + of_node_put(np); + return ERR_PTR(ret); + } + } + + of_node_put(np); + + /* + * By returning a NULL pointer, we let know the caller there + * is no child for us as we are a leaf of the tree + */ + return NULL; +} + +typedef struct dtpm * (*dtpm_node_callback_t)(const struct dtpm_node *, struct dtpm *); + +dtpm_node_callback_t dtpm_node_callback[] = { + [DTPM_NODE_VIRTUAL] = dtpm_setup_virtual, + [DTPM_NODE_DT] = dtpm_setup_dt, +}; + +static int dtpm_for_each_child(const struct dtpm_node *hierarchy, + const struct dtpm_node *it, struct dtpm *parent) +{ + struct dtpm *dtpm; + int i, ret; + + for (i = 0; hierarchy[i].name; i++) { + + if (hierarchy[i].parent != it) + continue; + + dtpm = dtpm_node_callback[hierarchy[i].type](&hierarchy[i], parent); + + /* + * A NULL pointer means there is no children, hence we + * continue without going deeper in the recursivity. + */ + if (!dtpm) + continue; + + /* + * There are multiple reasons why the callback could + * fail. The generic glue is abstracting the backend + * and therefore it is not possible to report back or + * take a decision based on the error. In any case, + * if this call fails, it is not critical in the + * hierarchy creation, we can assume the underlying + * service is not found, so we continue without this + * branch in the tree but with a warning to log the + * information the node was not created. + */ + if (IS_ERR(dtpm)) { + pr_warn("Failed to create '%s' in the hierarchy\n", + hierarchy[i].name); + continue; + } + + ret = dtpm_for_each_child(hierarchy, &hierarchy[i], dtpm); + if (ret) + return ret; } return 0; } -late_initcall(init_dtpm); + +/** + * dtpm_create_hierarchy - Create the dtpm hierarchy + * @hierarchy: An array of struct dtpm_node describing the hierarchy + * + * The function is called by the platform specific code with the + * description of the different node in the hierarchy. It creates the + * tree in the sysfs filesystem under the powercap dtpm entry. + * + * The expected tree has the format: + * + * struct dtpm_node hierarchy[] = { + * [0] { .name = "topmost", type = DTPM_NODE_VIRTUAL }, + * [1] { .name = "package", .type = DTPM_NODE_VIRTUAL, .parent = &hierarchy[0] }, + * [2] { .name = "/cpus/cpu0", .type = DTPM_NODE_DT, .parent = &hierarchy[1] }, + * [3] { .name = "/cpus/cpu1", .type = DTPM_NODE_DT, .parent = &hierarchy[1] }, + * [4] { .name = "/cpus/cpu2", .type = DTPM_NODE_DT, .parent = &hierarchy[1] }, + * [5] { .name = "/cpus/cpu3", .type = DTPM_NODE_DT, .parent = &hierarchy[1] }, + * [6] { } + * }; + * + * The last element is always an empty one and marks the end of the + * array. + * + * Return: zero on success, a negative value in case of error. Errors + * are reported back from the underlying functions. + */ +int dtpm_create_hierarchy(struct of_device_id *dtpm_match_table) +{ + const struct of_device_id *match; + const struct dtpm_node *hierarchy; + struct device_node *np; + int i, ret; + + if (pct) + return -EBUSY; + + pct = powercap_register_control_type(NULL, "dtpm", NULL); + if (IS_ERR(pct)) { + pr_err("Failed to register control type\n"); + ret = PTR_ERR(pct); + goto out_pct; + } + + ret = -ENODEV; + np = of_find_node_by_path("/"); + if (!np) + goto out_err; + + match = of_match_node(dtpm_match_table, np); + + of_node_put(np); + + if (!match) + goto out_err; + + hierarchy = match->data; + if (!hierarchy) { + ret = -EFAULT; + goto out_err; + } + + ret = dtpm_for_each_child(hierarchy, NULL, NULL); + if (ret) + goto out_err; + + for (i = 0; i < ARRAY_SIZE(dtpm_subsys); i++) { + + if (!dtpm_subsys[i]->init) + continue; + + ret = dtpm_subsys[i]->init(); + if (ret) + pr_info("Failed to initialze '%s': %d", + dtpm_subsys[i]->name, ret); + } + + return 0; + +out_err: + powercap_unregister_control_type(pct); +out_pct: + pct = NULL; + + return ret; +} +EXPORT_SYMBOL_GPL(dtpm_create_hierarchy); diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index 506048158a50..f7a25c70dd4c 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -32,9 +32,23 @@ struct dtpm_ops { void (*release)(struct dtpm *); }; +struct device_node; + struct dtpm_subsys_ops { const char *name; int (*init)(void); + int (*setup)(struct dtpm *, struct device_node *); +}; + +enum DTPM_NODE_TYPE { + DTPM_NODE_VIRTUAL = 0, + DTPM_NODE_DT, +}; + +struct dtpm_node { + enum DTPM_NODE_TYPE type; + const char *name; + struct dtpm_node *parent; }; static inline struct dtpm *to_dtpm(struct powercap_zone *zone) @@ -52,4 +66,5 @@ void dtpm_unregister(struct dtpm *dtpm); int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent); +int dtpm_create_hierarchy(struct of_device_id *dtpm_match_table); #endif From 73dbcb6e37bf0c43bac8c15fe5bcab2bec2367fb Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 28 Jan 2022 17:35:35 +0100 Subject: [PATCH 05/58] powercap/drivers/dtpm: Add CPU DT initialization support Based on the previous DT changes in the core code, use the 'setup' callback to initialize the CPU DTPM backend. Code is reorganized to stick to the DTPM table description. No functional changes. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220128163537.212248-4-daniel.lezcano@linaro.org --- drivers/powercap/dtpm_cpu.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 5763e0ce2af5..eed5ad688d46 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,17 @@ static int cpuhp_dtpm_cpu_offline(unsigned int cpu) } static int cpuhp_dtpm_cpu_online(unsigned int cpu) +{ + struct dtpm_cpu *dtpm_cpu; + + dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); + if (dtpm_cpu) + return dtpm_update_power(&dtpm_cpu->dtpm); + + return 0; +} + +static int __dtpm_cpu_setup(int cpu, struct dtpm *parent) { struct dtpm_cpu *dtpm_cpu; struct cpufreq_policy *policy; @@ -183,6 +195,10 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) char name[CPUFREQ_NAME_LEN]; int ret = -ENOMEM; + dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); + if (dtpm_cpu) + return 0; + policy = cpufreq_cpu_get(cpu); if (!policy) return 0; @@ -191,10 +207,6 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) if (!pd) return -EINVAL; - dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); - if (dtpm_cpu) - return dtpm_update_power(&dtpm_cpu->dtpm); - dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL); if (!dtpm_cpu) return -ENOMEM; @@ -207,7 +219,7 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) snprintf(name, sizeof(name), "cpu%d-cpufreq", dtpm_cpu->cpu); - ret = dtpm_register(name, &dtpm_cpu->dtpm, NULL); + ret = dtpm_register(name, &dtpm_cpu->dtpm, parent); if (ret) goto out_kfree_dtpm_cpu; @@ -231,7 +243,18 @@ out_kfree_dtpm_cpu: return ret; } -static int __init dtpm_cpu_init(void) +static int dtpm_cpu_setup(struct dtpm *dtpm, struct device_node *np) +{ + int cpu; + + cpu = of_cpu_node_to_id(np); + if (cpu < 0) + return 0; + + return __dtpm_cpu_setup(cpu, dtpm); +} + +static int dtpm_cpu_init(void) { int ret; @@ -272,4 +295,5 @@ static int __init dtpm_cpu_init(void) struct dtpm_subsys_ops dtpm_cpu_ops = { .name = KBUILD_MODNAME, .init = dtpm_cpu_init, + .setup = dtpm_cpu_setup, }; From e446556173170e675a7a321e76ce5fa3587de724 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 28 Jan 2022 17:35:36 +0100 Subject: [PATCH 06/58] powercap/drivers/dtpm: Add dtpm devfreq with energy model support Currently the dtpm supports the CPUs via cpufreq and the energy model. This change provides the same for the device which supports devfreq. Each device supporting devfreq and having an energy model can be added to the hierarchy. The concept is the same as the cpufreq DTPM support: the QoS is used to aggregate the requests and the energy model gives the value of the instantaneous power consumption ponderated by the load of the device. Cc: Chanwoo Choi Cc: Lukasz Luba Cc: Kyungmin Park Cc: MyungJoo Ham Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220128163537.212248-5-daniel.lezcano@linaro.org --- drivers/powercap/Kconfig | 7 ++ drivers/powercap/Makefile | 1 + drivers/powercap/dtpm_devfreq.c | 203 ++++++++++++++++++++++++++++++++ drivers/powercap/dtpm_subsys.h | 4 + 4 files changed, 215 insertions(+) create mode 100644 drivers/powercap/dtpm_devfreq.c diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index b1ca339957e3..515e3ceb3393 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -57,4 +57,11 @@ config DTPM_CPU help This enables support for CPU power limitation based on energy model. + +config DTPM_DEVFREQ + bool "Add device power capping based on the energy model" + depends on DTPM && ENERGY_MODEL + help + This enables support for device power limitation based on + energy model. endif diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile index fabcf388a8d3..494617cdad88 100644 --- a/drivers/powercap/Makefile +++ b/drivers/powercap/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_DTPM) += dtpm.o obj-$(CONFIG_DTPM_CPU) += dtpm_cpu.o +obj-$(CONFIG_DTPM_DEVFREQ) += dtpm_devfreq.o obj-$(CONFIG_POWERCAP) += powercap_sys.o obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o diff --git a/drivers/powercap/dtpm_devfreq.c b/drivers/powercap/dtpm_devfreq.c new file mode 100644 index 000000000000..91276761a31d --- /dev/null +++ b/drivers/powercap/dtpm_devfreq.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Linaro Limited + * + * Author: Daniel Lezcano + * + * The devfreq device combined with the energy model and the load can + * give an estimation of the power consumption as well as limiting the + * power. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +struct dtpm_devfreq { + struct dtpm dtpm; + struct dev_pm_qos_request qos_req; + struct devfreq *devfreq; +}; + +static struct dtpm_devfreq *to_dtpm_devfreq(struct dtpm *dtpm) +{ + return container_of(dtpm, struct dtpm_devfreq, dtpm); +} + +static int update_pd_power_uw(struct dtpm *dtpm) +{ + struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm); + struct devfreq *devfreq = dtpm_devfreq->devfreq; + struct device *dev = devfreq->dev.parent; + struct em_perf_domain *pd = em_pd_get(dev); + + dtpm->power_min = pd->table[0].power; + dtpm->power_min *= MICROWATT_PER_MILLIWATT; + + dtpm->power_max = pd->table[pd->nr_perf_states - 1].power; + dtpm->power_max *= MICROWATT_PER_MILLIWATT; + + return 0; +} + +static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) +{ + struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm); + struct devfreq *devfreq = dtpm_devfreq->devfreq; + struct device *dev = devfreq->dev.parent; + struct em_perf_domain *pd = em_pd_get(dev); + unsigned long freq; + u64 power; + int i; + + for (i = 0; i < pd->nr_perf_states; i++) { + + power = pd->table[i].power * MICROWATT_PER_MILLIWATT; + if (power > power_limit) + break; + } + + freq = pd->table[i - 1].frequency; + + dev_pm_qos_update_request(&dtpm_devfreq->qos_req, freq); + + power_limit = pd->table[i - 1].power * MICROWATT_PER_MILLIWATT; + + return power_limit; +} + +static void _normalize_load(struct devfreq_dev_status *status) +{ + if (status->total_time > 0xfffff) { + status->total_time >>= 10; + status->busy_time >>= 10; + } + + status->busy_time <<= 10; + status->busy_time /= status->total_time ? : 1; + + status->busy_time = status->busy_time ? : 1; + status->total_time = 1024; +} + +static u64 get_pd_power_uw(struct dtpm *dtpm) +{ + struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm); + struct devfreq *devfreq = dtpm_devfreq->devfreq; + struct device *dev = devfreq->dev.parent; + struct em_perf_domain *pd = em_pd_get(dev); + struct devfreq_dev_status status; + unsigned long freq; + u64 power; + int i; + + mutex_lock(&devfreq->lock); + status = devfreq->last_status; + mutex_unlock(&devfreq->lock); + + freq = DIV_ROUND_UP(status.current_frequency, HZ_PER_KHZ); + _normalize_load(&status); + + for (i = 0; i < pd->nr_perf_states; i++) { + + if (pd->table[i].frequency < freq) + continue; + + power = pd->table[i].power * MICROWATT_PER_MILLIWATT; + power *= status.busy_time; + power >>= 10; + + return power; + } + + return 0; +} + +static void pd_release(struct dtpm *dtpm) +{ + struct dtpm_devfreq *dtpm_devfreq = to_dtpm_devfreq(dtpm); + + if (dev_pm_qos_request_active(&dtpm_devfreq->qos_req)) + dev_pm_qos_remove_request(&dtpm_devfreq->qos_req); + + kfree(dtpm_devfreq); +} + +static struct dtpm_ops dtpm_ops = { + .set_power_uw = set_pd_power_limit, + .get_power_uw = get_pd_power_uw, + .update_power_uw = update_pd_power_uw, + .release = pd_release, +}; + +static int __dtpm_devfreq_setup(struct devfreq *devfreq, struct dtpm *parent) +{ + struct device *dev = devfreq->dev.parent; + struct dtpm_devfreq *dtpm_devfreq; + struct em_perf_domain *pd; + int ret = -ENOMEM; + + pd = em_pd_get(dev); + if (!pd) { + ret = dev_pm_opp_of_register_em(dev, NULL); + if (ret) { + pr_err("No energy model available for '%s'\n", dev_name(dev)); + return -EINVAL; + } + } + + dtpm_devfreq = kzalloc(sizeof(*dtpm_devfreq), GFP_KERNEL); + if (!dtpm_devfreq) + return -ENOMEM; + + dtpm_init(&dtpm_devfreq->dtpm, &dtpm_ops); + + dtpm_devfreq->devfreq = devfreq; + + ret = dtpm_register(dev_name(dev), &dtpm_devfreq->dtpm, parent); + if (ret) { + pr_err("Failed to register '%s': %d\n", dev_name(dev), ret); + kfree(dtpm_devfreq); + return ret; + } + + ret = dev_pm_qos_add_request(dev, &dtpm_devfreq->qos_req, + DEV_PM_QOS_MAX_FREQUENCY, + PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); + if (ret) { + pr_err("Failed to add QoS request: %d\n", ret); + goto out_dtpm_unregister; + } + + dtpm_update_power(&dtpm_devfreq->dtpm); + + return 0; + +out_dtpm_unregister: + dtpm_unregister(&dtpm_devfreq->dtpm); + + return ret; +} + +static int dtpm_devfreq_setup(struct dtpm *dtpm, struct device_node *np) +{ + struct devfreq *devfreq; + + devfreq = devfreq_get_devfreq_by_node(np); + if (IS_ERR(devfreq)) + return 0; + + return __dtpm_devfreq_setup(devfreq, dtpm); +} + +struct dtpm_subsys_ops dtpm_devfreq_ops = { + .name = KBUILD_MODNAME, + .setup = dtpm_devfreq_setup, +}; diff --git a/drivers/powercap/dtpm_subsys.h b/drivers/powercap/dtpm_subsys.h index 2a3a2055f60e..db1712938a96 100644 --- a/drivers/powercap/dtpm_subsys.h +++ b/drivers/powercap/dtpm_subsys.h @@ -8,11 +8,15 @@ #define ___DTPM_SUBSYS_H__ extern struct dtpm_subsys_ops dtpm_cpu_ops; +extern struct dtpm_subsys_ops dtpm_devfreq_ops; struct dtpm_subsys_ops *dtpm_subsys[] = { #ifdef CONFIG_DTPM_CPU &dtpm_cpu_ops, #endif +#ifdef CONFIG_DTPM_DEVFREQ + &dtpm_devfreq_ops, +#endif }; #endif From b9d6c47a2be8d273ecc063afda6e9fd66a35116d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 28 Jan 2022 17:35:37 +0100 Subject: [PATCH 07/58] rockchip/soc/drivers: Add DTPM description for rk3399 The DTPM framework does support now the hierarchy description. The platform specific code can call the hierarchy creation function with an array of struct dtpm_node pointing to their parent. This patch provides a description of the big / Little CPUs and the GPU and tie them together under a virtual 'package' name. Only rk3399 is described now. The description could be extended in the future with the memory controller with devfreq. The description is always a module and it describes the soft dependencies. The userspace has to load the softdeps module in the right order. Signed-off-by: Daniel Lezcano Reviewed-by; Heiko Stuebner Link: https://lore.kernel.org/r/20220128163537.212248-6-daniel.lezcano@linaro.org --- drivers/soc/rockchip/Kconfig | 8 +++++ drivers/soc/rockchip/Makefile | 1 + drivers/soc/rockchip/dtpm.c | 59 +++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 drivers/soc/rockchip/dtpm.c diff --git a/drivers/soc/rockchip/Kconfig b/drivers/soc/rockchip/Kconfig index 25eb2c1e31bb..156ac0e0c8fe 100644 --- a/drivers/soc/rockchip/Kconfig +++ b/drivers/soc/rockchip/Kconfig @@ -34,4 +34,12 @@ config ROCKCHIP_PM_DOMAINS If unsure, say N. +config ROCKCHIP_DTPM + tristate "Rockchip DTPM hierarchy" + depends on DTPM && m + help + Describe the hierarchy for the Dynamic Thermal Power + Management tree on this platform. That will create all the + power capping capable devices. + endif diff --git a/drivers/soc/rockchip/Makefile b/drivers/soc/rockchip/Makefile index 875032f7344e..05f31a4e743c 100644 --- a/drivers/soc/rockchip/Makefile +++ b/drivers/soc/rockchip/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_ROCKCHIP_GRF) += grf.o obj-$(CONFIG_ROCKCHIP_IODOMAIN) += io-domain.o obj-$(CONFIG_ROCKCHIP_PM_DOMAINS) += pm_domains.o +obj-$(CONFIG_ROCKCHIP_DTPM) += dtpm.o diff --git a/drivers/soc/rockchip/dtpm.c b/drivers/soc/rockchip/dtpm.c new file mode 100644 index 000000000000..ebebb748488b --- /dev/null +++ b/drivers/soc/rockchip/dtpm.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Linaro Limited + * + * Author: Daniel Lezcano + * + * DTPM hierarchy description + */ +#include +#include +#include +#include + +static struct dtpm_node __initdata rk3399_hierarchy[] = { + [0]{ .name = "rk3399", + .type = DTPM_NODE_VIRTUAL }, + [1]{ .name = "package", + .type = DTPM_NODE_VIRTUAL, + .parent = &rk3399_hierarchy[0] }, + [2]{ .name = "/cpus/cpu@0", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [3]{ .name = "/cpus/cpu@1", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [4]{ .name = "/cpus/cpu@2", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [5]{ .name = "/cpus/cpu@3", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [6]{ .name = "/cpus/cpu@100", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [7]{ .name = "/cpus/cpu@101", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [8]{ .name = "/gpu@ff9a0000", + .type = DTPM_NODE_DT, + .parent = &rk3399_hierarchy[1] }, + [9]{ /* sentinel */ } +}; + +static struct of_device_id __initdata rockchip_dtpm_match_table[] = { + { .compatible = "rockchip,rk3399", .data = rk3399_hierarchy }, + {}, +}; + +static int __init rockchip_dtpm_init(void) +{ + return dtpm_create_hierarchy(rockchip_dtpm_match_table); +} +module_init(rockchip_dtpm_init); + +MODULE_SOFTDEP("pre: panfrost cpufreq-dt"); +MODULE_DESCRIPTION("Rockchip DTPM driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:dtpm"); +MODULE_AUTHOR("Daniel Lezcano Date: Sun, 23 Jan 2022 20:45:06 +0800 Subject: [PATCH 08/58] cpufreq: Move to_gov_attr_set() to cpufreq.h So it can be reused by other codes. Signed-off-by: Kevin Hao Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor_attr_set.c | 5 ----- include/linux/cpufreq.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor_attr_set.c b/drivers/cpufreq/cpufreq_governor_attr_set.c index a6f365b9cc1a..771770ea0ed0 100644 --- a/drivers/cpufreq/cpufreq_governor_attr_set.c +++ b/drivers/cpufreq/cpufreq_governor_attr_set.c @@ -8,11 +8,6 @@ #include "cpufreq_governor.h" -static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj) -{ - return container_of(kobj, struct gov_attr_set, kobj); -} - static inline struct governor_attr *to_gov_attr(struct attribute *attr) { return container_of(attr, struct governor_attr, attr); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1ab29e61b078..f0dfc0b260ec 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -658,6 +658,11 @@ struct gov_attr_set { /* sysfs ops for cpufreq governors */ extern const struct sysfs_ops governor_sysfs_ops; +static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj) +{ + return container_of(kobj, struct gov_attr_set, kobj); +} + void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node); void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node); unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node); From 53725c4cbd4567423ff6143c5d10300e53ecf52a Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sun, 23 Jan 2022 20:45:07 +0800 Subject: [PATCH 09/58] cpufreq: schedutil: Use to_gov_attr_set() to get the gov_attr_set The to_gov_attr_set() has been moved to the cpufreq.h, so use it to get the gov_attr_set. Signed-off-by: Kevin Hao Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 26778884d9ab..cffcd08f4ec8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov); static void sugov_tunables_free(struct kobject *kobj) { - struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); kfree(to_sugov_tunables(attr_set)); } From 7ddf5e37631ac7a96920f0f8aa3c8c4c289aaa25 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Jan 2022 20:43:59 +0100 Subject: [PATCH 10/58] cpufreq: longhaul: Replace acpi_bus_get_device() Replace acpi_bus_get_device() that is going to be dropped with acpi_fetch_acpi_dev(). No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/longhaul.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index c538a153ee82..3e000e1a75c6 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -668,9 +668,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle, u32 nesting_level, void *context, void **return_value) { - struct acpi_device *d; + struct acpi_device *d = acpi_fetch_acpi_dev(obj_handle); - if (acpi_bus_get_device(obj_handle, &d)) + if (!d) return 0; *return_value = acpi_driver_data(d); From 3f51aa9e296fe4af785d5761bb12556fb2494761 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 9 Feb 2022 19:29:51 +0800 Subject: [PATCH 11/58] PM: hibernate: fix load_image_and_restore() error path As 'swsusp_check' open 'hib_resume_bdev', if call 'create_basic_memory_bitmaps' failed, we need to close 'hib_resume_bdev' in 'load_image_and_restore' function. Signed-off-by: Ye Bin [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..49d1df0218cb 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -689,8 +689,10 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; + } error = swsusp_read(&flags); swsusp_close(FMODE_READ | FMODE_EXCL); From c8be60c12041145e663249af261286d402b4c5e3 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:17 +0800 Subject: [PATCH 12/58] cpupower: Add AMD P-State capability flag Add AMD P-State capability flag in cpupower to indicate AMD new P-State kernel module support on Ryzen processors. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/helpers.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 33ffacee7fcb..b4813efdfb00 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -73,6 +73,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_AMD_HW_PSTATE 0x00000100 #define CPUPOWER_CAP_AMD_PSTATEDEF 0x00000200 #define CPUPOWER_CAP_AMD_CPB_MSR 0x00000400 +#define CPUPOWER_CAP_AMD_PSTATE 0x00000800 #define CPUPOWER_AMD_CPBDIS 0x02000000 From 46c273a0958274f1e1e69f3540ae827a92e0660f Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:18 +0800 Subject: [PATCH 13/58] cpupower: Add the function to check AMD P-State enabled The processor with AMD P-State function also supports legacy ACPI hardware P-States feature as well. Once driver sets AMD P-State eanbled, the processor will respond the finer grain AMD P-State feature instead of legacy ACPI P-States. So it introduces the cpupower_amd_pstate_enabled() to check whether the current kernel enables AMD P-State or AMD CPUFreq module. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/helpers.h | 10 ++++++++++ tools/power/cpupower/utils/helpers/misc.c | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index b4813efdfb00..62771a086871 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -11,6 +11,7 @@ #include #include +#include #include "helpers/bitmask.h" #include @@ -136,6 +137,12 @@ extern int decode_pstates(unsigned int cpu, int boost_states, extern int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, int * states); + +/* AMD P-State stuff **************************/ +bool cpupower_amd_pstate_enabled(void); + +/* AMD P-State stuff **************************/ + /* * CPUID functions returning a single datum */ @@ -168,6 +175,9 @@ static inline int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, int * states) { return -1; } +static inline bool cpupower_amd_pstate_enabled(void) +{ return false; } + /* cpuid and cpuinfo helpers **************************/ static inline unsigned int cpuid_eax(unsigned int op) { return 0; }; diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index fc6e34511721..0c483cdefcc2 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -3,9 +3,11 @@ #include #include #include +#include #include "helpers/helpers.h" #include "helpers/sysfs.h" +#include "cpufreq.h" #if defined(__i386__) || defined(__x86_64__) @@ -83,6 +85,22 @@ int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val) return 0; } +bool cpupower_amd_pstate_enabled(void) +{ + char *driver = cpufreq_get_driver(0); + bool ret = false; + + if (!driver) + return ret; + + if (!strcmp(driver, "amd-pstate")) + ret = true; + + cpufreq_put_driver(driver); + + return ret; +} + #endif /* #if defined(__i386__) || defined(__x86_64__) */ /* get_cpustate From 083792f368b8ceea7ae035b6641e9cef3aceb366 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:19 +0800 Subject: [PATCH 14/58] cpupower: Initial AMD P-State capability If kernel starts the AMD P-State module, the cpupower will initial the capability flag as CPUPOWER_CAP_AMD_PSTATE. And once AMD P-State capability is set, it won't need to set legacy ACPI relative capabilities anymore. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/cpuid.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index 72eb43593180..eae91f11d187 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -149,6 +149,19 @@ out: if (ext_cpuid_level >= 0x80000008 && cpuid_ebx(0x80000008) & (1 << 4)) cpu_info->caps |= CPUPOWER_CAP_AMD_RDPRU; + + if (cpupower_amd_pstate_enabled()) { + cpu_info->caps |= CPUPOWER_CAP_AMD_PSTATE; + + /* + * If AMD P-State is enabled, the firmware will treat + * AMD P-State function as high priority. + */ + cpu_info->caps &= ~CPUPOWER_CAP_AMD_CPB; + cpu_info->caps &= ~CPUPOWER_CAP_AMD_CPB_MSR; + cpu_info->caps &= ~CPUPOWER_CAP_AMD_HW_PSTATE; + cpu_info->caps &= ~CPUPOWER_CAP_AMD_PSTATEDEF; + } } if (cpu_info->vendor == X86_VENDOR_INTEL) { From e3ede97657d8cfc4fd75aecad50269534bb55aed Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:20 +0800 Subject: [PATCH 15/58] cpupower: Add the function to get the sysfs value from specific table Expose the helper into cpufreq header, then cpufreq driver can use this function to get the sysfs value if it has any specific sysfs interfaces. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpufreq.c | 23 ++++++++++++++++------- tools/power/cpupower/lib/cpufreq.h | 12 ++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tools/power/cpupower/lib/cpufreq.c b/tools/power/cpupower/lib/cpufreq.c index c3b56db8b921..1516d23c17c9 100644 --- a/tools/power/cpupower/lib/cpufreq.c +++ b/tools/power/cpupower/lib/cpufreq.c @@ -83,20 +83,21 @@ static const char *cpufreq_value_files[MAX_CPUFREQ_VALUE_READ_FILES] = { [STATS_NUM_TRANSITIONS] = "stats/total_trans" }; - -static unsigned long sysfs_cpufreq_get_one_value(unsigned int cpu, - enum cpufreq_value which) +unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, + const char **table, + unsigned int index, + unsigned int size) { unsigned long value; unsigned int len; char linebuf[MAX_LINE_LEN]; char *endp; - if (which >= MAX_CPUFREQ_VALUE_READ_FILES) + if (!table || index >= size || !table[index]) return 0; - len = sysfs_cpufreq_read_file(cpu, cpufreq_value_files[which], - linebuf, sizeof(linebuf)); + len = sysfs_cpufreq_read_file(cpu, table[index], linebuf, + sizeof(linebuf)); if (len == 0) return 0; @@ -109,6 +110,14 @@ static unsigned long sysfs_cpufreq_get_one_value(unsigned int cpu, return value; } +static unsigned long sysfs_cpufreq_get_one_value(unsigned int cpu, + enum cpufreq_value which) +{ + return cpufreq_get_sysfs_value_from_table(cpu, cpufreq_value_files, + which, + MAX_CPUFREQ_VALUE_READ_FILES); +} + /* read access to files which contain one string */ enum cpufreq_string { @@ -124,7 +133,7 @@ static const char *cpufreq_string_files[MAX_CPUFREQ_STRING_FILES] = { static char *sysfs_cpufreq_get_one_string(unsigned int cpu, - enum cpufreq_string which) + enum cpufreq_string which) { char linebuf[MAX_LINE_LEN]; char *result; diff --git a/tools/power/cpupower/lib/cpufreq.h b/tools/power/cpupower/lib/cpufreq.h index 95f4fd9e2656..2f3c84035806 100644 --- a/tools/power/cpupower/lib/cpufreq.h +++ b/tools/power/cpupower/lib/cpufreq.h @@ -203,6 +203,18 @@ int cpufreq_modify_policy_governor(unsigned int cpu, char *governor); int cpufreq_set_frequency(unsigned int cpu, unsigned long target_frequency); +/* + * get the sysfs value from specific table + * + * Read the value with the sysfs file name from specific table. Does + * only work if the cpufreq driver has the specific sysfs interfaces. + */ + +unsigned long cpufreq_get_sysfs_value_from_table(unsigned int cpu, + const char **table, + unsigned int index, + unsigned int size); + #ifdef __cplusplus } #endif From 4a06806e5d4a781d2c81f6064985018562b2604b Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:21 +0800 Subject: [PATCH 16/58] cpupower: Introduce ACPI CPPC library Kernel ACPI subsytem introduced the sysfs attributes for acpi cppc library in below path: /sys/devices/system/cpu/cpuX/acpi_cppc/ And these attributes will be used for AMD P-State driver to provide some performance and frequency values. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 6 +-- tools/power/cpupower/lib/acpi_cppc.c | 59 ++++++++++++++++++++++++++++ tools/power/cpupower/lib/acpi_cppc.h | 21 ++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 tools/power/cpupower/lib/acpi_cppc.c create mode 100644 tools/power/cpupower/lib/acpi_cppc.h diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 3b1594447f29..e9b6de314654 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -143,9 +143,9 @@ UTIL_HEADERS = utils/helpers/helpers.h utils/idle_monitor/cpupower-monitor.h \ utils/helpers/bitmask.h \ utils/idle_monitor/idle_monitors.h utils/idle_monitor/idle_monitors.def -LIB_HEADERS = lib/cpufreq.h lib/cpupower.h lib/cpuidle.h -LIB_SRC = lib/cpufreq.c lib/cpupower.c lib/cpuidle.c -LIB_OBJS = lib/cpufreq.o lib/cpupower.o lib/cpuidle.o +LIB_HEADERS = lib/cpufreq.h lib/cpupower.h lib/cpuidle.h lib/acpi_cppc.h +LIB_SRC = lib/cpufreq.c lib/cpupower.c lib/cpuidle.c lib/acpi_cppc.c +LIB_OBJS = lib/cpufreq.o lib/cpupower.o lib/cpuidle.o lib/acpi_cppc.o LIB_OBJS := $(addprefix $(OUTPUT),$(LIB_OBJS)) override CFLAGS += -pipe diff --git a/tools/power/cpupower/lib/acpi_cppc.c b/tools/power/cpupower/lib/acpi_cppc.c new file mode 100644 index 000000000000..c401ac331e9f --- /dev/null +++ b/tools/power/cpupower/lib/acpi_cppc.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpupower_intern.h" +#include "acpi_cppc.h" + +/* ACPI CPPC sysfs access ***********************************************/ + +static int acpi_cppc_read_file(unsigned int cpu, const char *fname, + char *buf, size_t buflen) +{ + char path[SYSFS_PATH_MAX]; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/acpi_cppc/%s", + cpu, fname); + return cpupower_read_sysfs(path, buf, buflen); +} + +static const char * const acpi_cppc_value_files[] = { + [HIGHEST_PERF] = "highest_perf", + [LOWEST_PERF] = "lowest_perf", + [NOMINAL_PERF] = "nominal_perf", + [LOWEST_NONLINEAR_PERF] = "lowest_nonlinear_perf", + [LOWEST_FREQ] = "lowest_freq", + [NOMINAL_FREQ] = "nominal_freq", + [REFERENCE_PERF] = "reference_perf", + [WRAPAROUND_TIME] = "wraparound_time" +}; + +unsigned long acpi_cppc_get_data(unsigned int cpu, enum acpi_cppc_value which) +{ + unsigned long long value; + unsigned int len; + char linebuf[MAX_LINE_LEN]; + char *endp; + + if (which >= MAX_CPPC_VALUE_FILES) + return 0; + + len = acpi_cppc_read_file(cpu, acpi_cppc_value_files[which], + linebuf, sizeof(linebuf)); + if (len == 0) + return 0; + + value = strtoull(linebuf, &endp, 0); + + if (endp == linebuf || errno == ERANGE) + return 0; + + return value; +} diff --git a/tools/power/cpupower/lib/acpi_cppc.h b/tools/power/cpupower/lib/acpi_cppc.h new file mode 100644 index 000000000000..85ca83080316 --- /dev/null +++ b/tools/power/cpupower/lib/acpi_cppc.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ACPI_CPPC_H__ +#define __ACPI_CPPC_H__ + +enum acpi_cppc_value { + HIGHEST_PERF, + LOWEST_PERF, + NOMINAL_PERF, + LOWEST_NONLINEAR_PERF, + LOWEST_FREQ, + NOMINAL_FREQ, + REFERENCE_PERF, + WRAPAROUND_TIME, + MAX_CPPC_VALUE_FILES +}; + +unsigned long acpi_cppc_get_data(unsigned int cpu, + enum acpi_cppc_value which); + +#endif /* _ACPI_CPPC_H */ From 33e43f3636dffe84753847eee79ea0e3527105e6 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:22 +0800 Subject: [PATCH 17/58] cpupower: Add AMD P-State sysfs definition and access helper Introduce the marco definitions and access helper function for AMD P-State sysfs interfaces such as each performance goals and frequency levels in amd helper file. They will be used to read the sysfs attribute from AMD P-State cpufreq driver for cpupower utilities. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 97f2c857048e..4d45d1b44164 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -8,7 +8,10 @@ #include #include "helpers/helpers.h" +#include "cpufreq.h" +#include "acpi_cppc.h" +/* ACPI P-States Helper Functions for AMD Processors ***************/ #define MSR_AMD_PSTATE_STATUS 0xc0010063 #define MSR_AMD_PSTATE 0xc0010064 #define MSR_AMD_PSTATE_LIMIT 0xc0010061 @@ -146,4 +149,31 @@ int amd_pci_get_num_boost_states(int *active, int *states) pci_cleanup(pci_acc); return 0; } + +/* ACPI P-States Helper Functions for AMD Processors ***************/ + +/* AMD P-State Helper Functions ************************************/ +enum amd_pstate_value { + AMD_PSTATE_HIGHEST_PERF, + AMD_PSTATE_MAX_FREQ, + AMD_PSTATE_LOWEST_NONLINEAR_FREQ, + MAX_AMD_PSTATE_VALUE_READ_FILES, +}; + +static const char *amd_pstate_value_files[MAX_AMD_PSTATE_VALUE_READ_FILES] = { + [AMD_PSTATE_HIGHEST_PERF] = "amd_pstate_highest_perf", + [AMD_PSTATE_MAX_FREQ] = "amd_pstate_max_freq", + [AMD_PSTATE_LOWEST_NONLINEAR_FREQ] = "amd_pstate_lowest_nonlinear_freq", +}; + +static unsigned long amd_pstate_get_data(unsigned int cpu, + enum amd_pstate_value value) +{ + return cpufreq_get_sysfs_value_from_table(cpu, + amd_pstate_value_files, + value, + MAX_AMD_PSTATE_VALUE_READ_FILES); +} + +/* AMD P-State Helper Functions ************************************/ #endif /* defined(__i386__) || defined(__x86_64__) */ From bf9801baa81802dac7e2a5318944ca2f4bfa74ef Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:23 +0800 Subject: [PATCH 18/58] cpupower: Enable boost state support for AMD P-State module The legacy ACPI hardware P-States function has 3 P-States on ACPI table, the CPU frequency only can be switched between the 3 P-States. While the processor supports the boost state, it will have another boost state that the frequency can be higher than P0 state, and the state can be decoded by the function of decode_pstates() and read by amd_pci_get_num_boost_states(). However, the new AMD P-State function is different than legacy ACPI hardware P-State on AMD processors. That has a finer grain frequency range between the highest and lowest frequency. And boost frequency is actually the frequency which is mapped on highest performance ratio. The similar previous P0 frequency is mapped on nominal performance ratio. If the highest performance on the processor is higher than nominal performance, then we think the current processor supports the boost state. And it uses amd_pstate_boost_init() to initialize boost for AMD P-State function. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 18 ++++++++++++++++++ tools/power/cpupower/utils/helpers/helpers.h | 5 +++++ tools/power/cpupower/utils/helpers/misc.c | 2 ++ 3 files changed, 25 insertions(+) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 4d45d1b44164..f5ba528dc7db 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -175,5 +175,23 @@ static unsigned long amd_pstate_get_data(unsigned int cpu, MAX_AMD_PSTATE_VALUE_READ_FILES); } +void amd_pstate_boost_init(unsigned int cpu, int *support, int *active) +{ + unsigned long highest_perf, nominal_perf, cpuinfo_min, + cpuinfo_max, amd_pstate_max; + + highest_perf = amd_pstate_get_data(cpu, AMD_PSTATE_HIGHEST_PERF); + nominal_perf = acpi_cppc_get_data(cpu, NOMINAL_PERF); + + *support = highest_perf > nominal_perf ? 1 : 0; + if (!(*support)) + return; + + cpufreq_get_hardware_limits(cpu, &cpuinfo_min, &cpuinfo_max); + amd_pstate_max = amd_pstate_get_data(cpu, AMD_PSTATE_MAX_FREQ); + + *active = cpuinfo_max == amd_pstate_max ? 1 : 0; +} + /* AMD P-State Helper Functions ************************************/ #endif /* defined(__i386__) || defined(__x86_64__) */ diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 62771a086871..326491e11c6e 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -140,6 +140,8 @@ extern int cpufreq_has_boost_support(unsigned int cpu, int *support, /* AMD P-State stuff **************************/ bool cpupower_amd_pstate_enabled(void); +void amd_pstate_boost_init(unsigned int cpu, + int *support, int *active); /* AMD P-State stuff **************************/ @@ -177,6 +179,9 @@ static inline int cpufreq_has_boost_support(unsigned int cpu, int *support, static inline bool cpupower_amd_pstate_enabled(void) { return false; } +static inline void amd_pstate_boost_init(unsigned int cpu, int *support, + int *active) +{} /* cpuid and cpuinfo helpers **************************/ diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index 0c483cdefcc2..e0d3145434d3 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -41,6 +41,8 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, if (ret) return ret; } + } else if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATE) { + amd_pstate_boost_init(cpu, support, active); } else if (cpupower_cpu_info.caps & CPUPOWER_CAP_INTEL_IDA) *support = *active = 1; return 0; From 35fdf42d90d09d2d00ef65999fe338027a6b4d8e Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:24 +0800 Subject: [PATCH 19/58] cpupower: Move print_speed function into misc helper The print_speed can be as a common function, and expose it into misc helper header. Then it can be used on other helper files as well. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 59 ++++---------------- tools/power/cpupower/utils/helpers/helpers.h | 1 + tools/power/cpupower/utils/helpers/misc.c | 40 +++++++++++++ 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index f9895e31ff5a..b429454bf3ae 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -84,43 +84,6 @@ static void proc_cpufreq_output(void) } static int no_rounding; -static void print_speed(unsigned long speed) -{ - unsigned long tmp; - - if (no_rounding) { - if (speed > 1000000) - printf("%u.%06u GHz", ((unsigned int) speed/1000000), - ((unsigned int) speed%1000000)); - else if (speed > 1000) - printf("%u.%03u MHz", ((unsigned int) speed/1000), - (unsigned int) (speed%1000)); - else - printf("%lu kHz", speed); - } else { - if (speed > 1000000) { - tmp = speed%10000; - if (tmp >= 5000) - speed += 10000; - printf("%u.%02u GHz", ((unsigned int) speed/1000000), - ((unsigned int) (speed%1000000)/10000)); - } else if (speed > 100000) { - tmp = speed%1000; - if (tmp >= 500) - speed += 1000; - printf("%u MHz", ((unsigned int) speed/1000)); - } else if (speed > 1000) { - tmp = speed%100; - if (tmp >= 50) - speed += 100; - printf("%u.%01u MHz", ((unsigned int) speed/1000), - ((unsigned int) (speed%1000)/100)); - } - } - - return; -} - static void print_duration(unsigned long duration) { unsigned long tmp; @@ -254,11 +217,11 @@ static int get_boost_mode(unsigned int cpu) if (freqs) { printf(_(" boost frequency steps: ")); while (freqs->next) { - print_speed(freqs->frequency); + print_speed(freqs->frequency, no_rounding); printf(", "); freqs = freqs->next; } - print_speed(freqs->frequency); + print_speed(freqs->frequency, no_rounding); printf("\n"); cpufreq_put_available_frequencies(freqs); } @@ -277,7 +240,7 @@ static int get_freq_kernel(unsigned int cpu, unsigned int human) return -EINVAL; } if (human) { - print_speed(freq); + print_speed(freq, no_rounding); } else printf("%lu", freq); printf(_(" (asserted by call to kernel)\n")); @@ -296,7 +259,7 @@ static int get_freq_hardware(unsigned int cpu, unsigned int human) return -EINVAL; } if (human) { - print_speed(freq); + print_speed(freq, no_rounding); } else printf("%lu", freq); printf(_(" (asserted by call to hardware)\n")); @@ -316,9 +279,9 @@ static int get_hardware_limits(unsigned int cpu, unsigned int human) if (human) { printf(_(" hardware limits: ")); - print_speed(min); + print_speed(min, no_rounding); printf(" - "); - print_speed(max); + print_speed(max, no_rounding); printf("\n"); } else { printf("%lu %lu\n", min, max); @@ -350,9 +313,9 @@ static int get_policy(unsigned int cpu) return -EINVAL; } printf(_(" current policy: frequency should be within ")); - print_speed(policy->min); + print_speed(policy->min, no_rounding); printf(_(" and ")); - print_speed(policy->max); + print_speed(policy->max, no_rounding); printf(".\n "); printf(_("The governor \"%s\" may decide which speed to use\n" @@ -436,7 +399,7 @@ static int get_freq_stats(unsigned int cpu, unsigned int human) struct cpufreq_stats *stats = cpufreq_get_stats(cpu, &total_time); while (stats) { if (human) { - print_speed(stats->frequency); + print_speed(stats->frequency, no_rounding); printf(":%.2f%%", (100.0 * stats->time_in_state) / total_time); } else @@ -486,11 +449,11 @@ static void debug_output_one(unsigned int cpu) if (freqs) { printf(_(" available frequency steps: ")); while (freqs->next) { - print_speed(freqs->frequency); + print_speed(freqs->frequency, no_rounding); printf(", "); freqs = freqs->next; } - print_speed(freqs->frequency); + print_speed(freqs->frequency, no_rounding); printf("\n"); cpufreq_put_available_frequencies(freqs); } diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 326491e11c6e..fa2a8c1b1d26 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -200,5 +200,6 @@ extern struct bitmask *offline_cpus; void get_cpustate(void); void print_online_cpus(void); void print_offline_cpus(void); +void print_speed(unsigned long speed, int no_rounding); #endif /* __CPUPOWERUTILS_HELPERS__ */ diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index e0d3145434d3..9547b29254a7 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -164,3 +164,43 @@ void print_offline_cpus(void) printf(_("cpupower set operation was not performed on them\n")); } } + +/* + * print_speed + * + * Print the exact CPU frequency with appropriate unit + */ +void print_speed(unsigned long speed, int no_rounding) +{ + unsigned long tmp; + + if (no_rounding) { + if (speed > 1000000) + printf("%u.%06u GHz", ((unsigned int)speed / 1000000), + ((unsigned int)speed % 1000000)); + else if (speed > 1000) + printf("%u.%03u MHz", ((unsigned int)speed / 1000), + (unsigned int)(speed % 1000)); + else + printf("%lu kHz", speed); + } else { + if (speed > 1000000) { + tmp = speed % 10000; + if (tmp >= 5000) + speed += 10000; + printf("%u.%02u GHz", ((unsigned int)speed / 1000000), + ((unsigned int)(speed % 1000000) / 10000)); + } else if (speed > 100000) { + tmp = speed % 1000; + if (tmp >= 500) + speed += 1000; + printf("%u MHz", ((unsigned int)speed / 1000)); + } else if (speed > 1000) { + tmp = speed % 100; + if (tmp >= 50) + speed += 100; + printf("%u.%01u MHz", ((unsigned int)speed / 1000), + ((unsigned int)(speed % 1000) / 100)); + } + } +} From d8363e29178249bb505ae388ce1658484396fcde Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:25 +0800 Subject: [PATCH 20/58] cpupower: Add function to print AMD P-State performance capabilities AMD P-State kernel module is using the fine grain frequency instead of acpi hardware pstate. So add a function to print performance and frequency values. Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 9 ++++-- tools/power/cpupower/utils/helpers/amd.c | 29 ++++++++++++++++++++ tools/power/cpupower/utils/helpers/helpers.h | 5 ++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index b429454bf3ae..235243ec5ce0 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -146,9 +146,12 @@ static int get_boost_mode_x86(unsigned int cpu) printf(_(" Supported: %s\n"), support ? _("yes") : _("no")); printf(_(" Active: %s\n"), active ? _("yes") : _("no")); - if ((cpupower_cpu_info.vendor == X86_VENDOR_AMD && - cpupower_cpu_info.family >= 0x10) || - cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD && + cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATE) { + return 0; + } else if ((cpupower_cpu_info.vendor == X86_VENDOR_AMD && + cpupower_cpu_info.family >= 0x10) || + cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { ret = decode_pstates(cpu, b_states, pstates, &pstate_no); if (ret) return ret; diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index f5ba528dc7db..c519cc89c97f 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -193,5 +193,34 @@ void amd_pstate_boost_init(unsigned int cpu, int *support, int *active) *active = cpuinfo_max == amd_pstate_max ? 1 : 0; } +void amd_pstate_show_perf_and_freq(unsigned int cpu, int no_rounding) +{ + printf(_(" AMD PSTATE Highest Performance: %lu. Maximum Frequency: "), + amd_pstate_get_data(cpu, AMD_PSTATE_HIGHEST_PERF)); + /* + * If boost isn't active, the cpuinfo_max doesn't indicate real max + * frequency. So we read it back from amd-pstate sysfs entry. + */ + print_speed(amd_pstate_get_data(cpu, AMD_PSTATE_MAX_FREQ), no_rounding); + printf(".\n"); + + printf(_(" AMD PSTATE Nominal Performance: %lu. Nominal Frequency: "), + acpi_cppc_get_data(cpu, NOMINAL_PERF)); + print_speed(acpi_cppc_get_data(cpu, NOMINAL_FREQ) * 1000, + no_rounding); + printf(".\n"); + + printf(_(" AMD PSTATE Lowest Non-linear Performance: %lu. Lowest Non-linear Frequency: "), + acpi_cppc_get_data(cpu, LOWEST_NONLINEAR_PERF)); + print_speed(amd_pstate_get_data(cpu, AMD_PSTATE_LOWEST_NONLINEAR_FREQ), + no_rounding); + printf(".\n"); + + printf(_(" AMD PSTATE Lowest Performance: %lu. Lowest Frequency: "), + acpi_cppc_get_data(cpu, LOWEST_PERF)); + print_speed(acpi_cppc_get_data(cpu, LOWEST_FREQ) * 1000, no_rounding); + printf(".\n"); +} + /* AMD P-State Helper Functions ************************************/ #endif /* defined(__i386__) || defined(__x86_64__) */ diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index fa2a8c1b1d26..96e4bede078b 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -142,6 +142,8 @@ extern int cpufreq_has_boost_support(unsigned int cpu, int *support, bool cpupower_amd_pstate_enabled(void); void amd_pstate_boost_init(unsigned int cpu, int *support, int *active); +void amd_pstate_show_perf_and_freq(unsigned int cpu, + int no_rounding); /* AMD P-State stuff **************************/ @@ -182,6 +184,9 @@ static inline bool cpupower_amd_pstate_enabled(void) static inline void amd_pstate_boost_init(unsigned int cpu, int *support, int *active) {} +static inline void amd_pstate_show_perf_and_freq(unsigned int cpu, + int no_rounding) +{} /* cpuid and cpuinfo helpers **************************/ From 7b75bbdf5bedebed387aac6ad8411ed1cf3db5d0 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:03 +0100 Subject: [PATCH 21/58] powercap/dtpm: Change locking scheme The different functions are all called through the dtpm_create_hierarchy() which handle the mutex. The different functions are used in this context, consequently with the lock always held. Remove all locks taken in the function and add the lock in the hierarchy creation function. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220130210210.549877-1-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 95 ++++++++++++----------------------------- 1 file changed, 27 insertions(+), 68 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 414826a1509b..0b0121c37a1b 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -51,9 +51,7 @@ static int get_max_power_range_uw(struct powercap_zone *pcz, u64 *max_power_uw) { struct dtpm *dtpm = to_dtpm(pcz); - mutex_lock(&dtpm_lock); *max_power_uw = dtpm->power_max - dtpm->power_min; - mutex_unlock(&dtpm_lock); return 0; } @@ -83,14 +81,7 @@ static int __get_power_uw(struct dtpm *dtpm, u64 *power_uw) static int get_power_uw(struct powercap_zone *pcz, u64 *power_uw) { - struct dtpm *dtpm = to_dtpm(pcz); - int ret; - - mutex_lock(&dtpm_lock); - ret = __get_power_uw(dtpm, power_uw); - mutex_unlock(&dtpm_lock); - - return ret; + return __get_power_uw(to_dtpm(pcz), power_uw); } static void __dtpm_rebalance_weight(struct dtpm *dtpm) @@ -133,7 +124,16 @@ static void __dtpm_add_power(struct dtpm *dtpm) } } -static int __dtpm_update_power(struct dtpm *dtpm) +/** + * dtpm_update_power - Update the power on the dtpm + * @dtpm: a pointer to a dtpm structure to update + * + * Function to update the power values of the dtpm node specified in + * parameter. These new values will be propagated to the tree. + * + * Return: zero on success, -EINVAL if the values are inconsistent + */ +int dtpm_update_power(struct dtpm *dtpm) { int ret; @@ -155,26 +155,6 @@ static int __dtpm_update_power(struct dtpm *dtpm) return ret; } -/** - * dtpm_update_power - Update the power on the dtpm - * @dtpm: a pointer to a dtpm structure to update - * - * Function to update the power values of the dtpm node specified in - * parameter. These new values will be propagated to the tree. - * - * Return: zero on success, -EINVAL if the values are inconsistent - */ -int dtpm_update_power(struct dtpm *dtpm) -{ - int ret; - - mutex_lock(&dtpm_lock); - ret = __dtpm_update_power(dtpm); - mutex_unlock(&dtpm_lock); - - return ret; -} - /** * dtpm_release_zone - Cleanup when the node is released * @pcz: a pointer to a powercap_zone structure @@ -191,20 +171,14 @@ int dtpm_release_zone(struct powercap_zone *pcz) struct dtpm *dtpm = to_dtpm(pcz); struct dtpm *parent = dtpm->parent; - mutex_lock(&dtpm_lock); - - if (!list_empty(&dtpm->children)) { - mutex_unlock(&dtpm_lock); + if (!list_empty(&dtpm->children)) return -EBUSY; - } if (parent) list_del(&dtpm->sibling); __dtpm_sub_power(dtpm); - mutex_unlock(&dtpm_lock); - if (dtpm->ops) dtpm->ops->release(dtpm); @@ -216,23 +190,12 @@ int dtpm_release_zone(struct powercap_zone *pcz) return 0; } -static int __get_power_limit_uw(struct dtpm *dtpm, int cid, u64 *power_limit) -{ - *power_limit = dtpm->power_limit; - return 0; -} - static int get_power_limit_uw(struct powercap_zone *pcz, int cid, u64 *power_limit) { - struct dtpm *dtpm = to_dtpm(pcz); - int ret; - - mutex_lock(&dtpm_lock); - ret = __get_power_limit_uw(dtpm, cid, power_limit); - mutex_unlock(&dtpm_lock); - - return ret; + *power_limit = to_dtpm(pcz)->power_limit; + + return 0; } /* @@ -292,7 +255,7 @@ static int __set_power_limit_uw(struct dtpm *dtpm, int cid, u64 power_limit) ret = __set_power_limit_uw(child, cid, power); if (!ret) - ret = __get_power_limit_uw(child, cid, &power); + ret = get_power_limit_uw(&child->zone, cid, &power); if (ret) break; @@ -310,8 +273,6 @@ static int set_power_limit_uw(struct powercap_zone *pcz, struct dtpm *dtpm = to_dtpm(pcz); int ret; - mutex_lock(&dtpm_lock); - /* * Don't allow values outside of the power range previously * set when initializing the power numbers. @@ -323,8 +284,6 @@ static int set_power_limit_uw(struct powercap_zone *pcz, pr_debug("%s: power limit: %llu uW, power max: %llu uW\n", dtpm->zone.name, dtpm->power_limit, dtpm->power_max); - mutex_unlock(&dtpm_lock); - return ret; } @@ -335,11 +294,7 @@ static const char *get_constraint_name(struct powercap_zone *pcz, int cid) static int get_max_power_uw(struct powercap_zone *pcz, int id, u64 *max_power) { - struct dtpm *dtpm = to_dtpm(pcz); - - mutex_lock(&dtpm_lock); - *max_power = dtpm->power_max; - mutex_unlock(&dtpm_lock); + *max_power = to_dtpm(pcz)->power_max; return 0; } @@ -442,8 +397,6 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) if (IS_ERR(pcz)) return PTR_ERR(pcz); - mutex_lock(&dtpm_lock); - if (parent) { list_add_tail(&dtpm->sibling, &parent->children); dtpm->parent = parent; @@ -459,8 +412,6 @@ int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) pr_debug("Registered dtpm node '%s' / %llu-%llu uW, \n", dtpm->zone.name, dtpm->power_min, dtpm->power_max); - mutex_unlock(&dtpm_lock); - return 0; } @@ -605,8 +556,12 @@ int dtpm_create_hierarchy(struct of_device_id *dtpm_match_table) struct device_node *np; int i, ret; - if (pct) - return -EBUSY; + mutex_lock(&dtpm_lock); + + if (pct) { + ret = -EBUSY; + goto out_unlock; + } pct = powercap_register_control_type(NULL, "dtpm", NULL); if (IS_ERR(pct)) { @@ -648,12 +603,16 @@ int dtpm_create_hierarchy(struct of_device_id *dtpm_match_table) dtpm_subsys[i]->name, ret); } + mutex_unlock(&dtpm_lock); + return 0; out_err: powercap_unregister_control_type(pct); out_pct: pct = NULL; +out_unlock: + mutex_unlock(&dtpm_lock); return ret; } From 0aea2e4ec2a2bfa2d7e8820e37ba5b5ce04f20a5 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:04 +0100 Subject: [PATCH 22/58] powercap/dtpm_cpu: Reset per_cpu variable in the release function The release function does not reset the per cpu variable when it is called. That will prevent creation again as the variable will be already from the previous creation. Fix it by resetting them. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220130210210.549877-2-daniel.lezcano@linaro.org --- drivers/powercap/dtpm_cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index eed5ad688d46..71f45d2f5a60 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -151,10 +151,17 @@ static int update_pd_power_uw(struct dtpm *dtpm) static void pd_release(struct dtpm *dtpm) { struct dtpm_cpu *dtpm_cpu = to_dtpm_cpu(dtpm); + struct cpufreq_policy *policy; if (freq_qos_request_active(&dtpm_cpu->qos_req)) freq_qos_remove_request(&dtpm_cpu->qos_req); + policy = cpufreq_cpu_get(dtpm_cpu->cpu); + if (policy) { + for_each_cpu(dtpm_cpu->cpu, policy->related_cpus) + per_cpu(dtpm_per_cpu, dtpm_cpu->cpu) = NULL; + } + kfree(dtpm_cpu); } From 690de0b4013f6f35bc9fced12746b9f396c471ae Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:05 +0100 Subject: [PATCH 23/58] powercap/dtpm: Fixup kfree for virtual node When the node is virtual there is no release function associated which can free the memory. Free the memory when no 'ops' exists. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220130210210.549877-3-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 0b0121c37a1b..7bddd25a6767 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -181,12 +181,12 @@ int dtpm_release_zone(struct powercap_zone *pcz) if (dtpm->ops) dtpm->ops->release(dtpm); + else + kfree(dtpm); if (root == dtpm) root = NULL; - kfree(dtpm); - return 0; } From c404c64d64bc31bebe8a2015103671f7cd282731 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:06 +0100 Subject: [PATCH 24/58] powercap/dtpm: Destroy hierarchy function The hierarchy creation function exits but without a destroy hierarchy function. Due to that, the modules creating the hierarchy can not be unloaded properly because they don't have an exit callback. Provide the dtpm_destroy_hierarchy() function to remove the previously created hierarchy. The function relies on all the release mechanisms implemented by the underlying powercap framework. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220130210210.549877-4-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 43 +++++++++++++++++++++++++++++++++++++++++ include/linux/dtpm.h | 3 +++ 2 files changed, 46 insertions(+) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 7bddd25a6767..d9d74f981118 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -617,3 +617,46 @@ out_unlock: return ret; } EXPORT_SYMBOL_GPL(dtpm_create_hierarchy); + +static void __dtpm_destroy_hierarchy(struct dtpm *dtpm) +{ + struct dtpm *child, *aux; + + list_for_each_entry_safe(child, aux, &dtpm->children, sibling) + __dtpm_destroy_hierarchy(child); + + /* + * At this point, we know all children were removed from the + * recursive call before + */ + dtpm_unregister(dtpm); +} + +void dtpm_destroy_hierarchy(void) +{ + int i; + + mutex_lock(&dtpm_lock); + + if (!pct) + goto out_unlock; + + __dtpm_destroy_hierarchy(root); + + + for (i = 0; i < ARRAY_SIZE(dtpm_subsys); i++) { + + if (!dtpm_subsys[i]->exit) + continue; + + dtpm_subsys[i]->exit(); + } + + powercap_unregister_control_type(pct); + + pct = NULL; + +out_unlock: + mutex_unlock(&dtpm_lock); +} +EXPORT_SYMBOL_GPL(dtpm_destroy_hierarchy); diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index f7a25c70dd4c..a4a13514b730 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -37,6 +37,7 @@ struct device_node; struct dtpm_subsys_ops { const char *name; int (*init)(void); + void (*exit)(void); int (*setup)(struct dtpm *, struct device_node *); }; @@ -67,4 +68,6 @@ void dtpm_unregister(struct dtpm *dtpm); int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent); int dtpm_create_hierarchy(struct of_device_id *dtpm_match_table); + +void dtpm_destroy_hierarchy(void); #endif From 4712a236db409d5ee5dccb8c7e57fe54d7d3ec66 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:07 +0100 Subject: [PATCH 25/58] powercap/dtpm: Move the 'root' reset place The 'root' node is checked everytime a dtpm node is destroyed. When we reach the end of the hierarchy destruction function, we can unconditionnaly set the 'root' node to NULL again. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220130210210.549877-5-daniel.lezcano@linaro.org --- drivers/powercap/dtpm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index d9d74f981118..ec931a06d90a 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -184,9 +184,6 @@ int dtpm_release_zone(struct powercap_zone *pcz) else kfree(dtpm); - if (root == dtpm) - root = NULL; - return 0; } @@ -656,6 +653,8 @@ void dtpm_destroy_hierarchy(void) pct = NULL; + root = NULL; + out_unlock: mutex_unlock(&dtpm_lock); } From bfded2ca8f36935ff13b3b30f8e66d6135e178ac Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:08 +0100 Subject: [PATCH 26/58] powercap/dtpm_cpu: Add exit function Now that we can destroy the hierarchy, the code must remove what it had put in place at the creation. In our case, the cpu hotplug callbacks. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220130210210.549877-6-daniel.lezcano@linaro.org --- drivers/powercap/dtpm_cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 71f45d2f5a60..bca2f912d349 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -299,8 +299,15 @@ static int dtpm_cpu_init(void) return 0; } +static void dtpm_cpu_exit(void) +{ + cpuhp_remove_state_nocalls(CPUHP_AP_ONLINE_DYN); + cpuhp_remove_state_nocalls(CPUHP_AP_DTPM_CPU_DEAD); +} + struct dtpm_subsys_ops dtpm_cpu_ops = { .name = KBUILD_MODNAME, .init = dtpm_cpu_init, + .exit = dtpm_cpu_exit, .setup = dtpm_cpu_setup, }; From f1ebef9e55f3c49063b575e97d2019832b8f8ef9 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 30 Jan 2022 22:02:09 +0100 Subject: [PATCH 27/58] dtpm/soc/rk3399: Add the ability to unload the module The dtpm hierarchy can now be removed with the dtpm_destroy_hierarchy() function. Add the module_exit() callback so the module can be unloaded by removing the previously created hierarchy. Signed-off-by: Daniel Lezcano Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20220130210210.549877-7-daniel.lezcano@linaro.org --- drivers/soc/rockchip/dtpm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/soc/rockchip/dtpm.c b/drivers/soc/rockchip/dtpm.c index ebebb748488b..5a23784b5221 100644 --- a/drivers/soc/rockchip/dtpm.c +++ b/drivers/soc/rockchip/dtpm.c @@ -52,6 +52,12 @@ static int __init rockchip_dtpm_init(void) } module_init(rockchip_dtpm_init); +static void __exit rockchip_dtpm_exit(void) +{ + return dtpm_destroy_hierarchy(); +} +module_exit(rockchip_dtpm_exit); + MODULE_SOFTDEP("pre: panfrost cpufreq-dt"); MODULE_DESCRIPTION("Rockchip DTPM driver"); MODULE_LICENSE("GPL"); From 8382dce5e4835c045f33b8958a5f559d212cdd11 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Tue, 22 Feb 2022 23:34:26 +0800 Subject: [PATCH 28/58] cpupower: Add "perf" option to print AMD P-State information Add "-c --perf" option in cpupower-frequency-info to get the performance and frequency values for AMD P-State. Commit message amended: Shuah Khan Reviewed-by: Shuah Khan Signed-off-by: Huang Rui Signed-off-by: Shuah Khan --- .../cpupower/man/cpupower-frequency-info.1 | 3 +++ tools/power/cpupower/utils/cpufreq-info.c | 19 ++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/power/cpupower/man/cpupower-frequency-info.1 b/tools/power/cpupower/man/cpupower-frequency-info.1 index 6aa8d239dff9..dd545b499480 100644 --- a/tools/power/cpupower/man/cpupower-frequency-info.1 +++ b/tools/power/cpupower/man/cpupower-frequency-info.1 @@ -53,6 +53,9 @@ human\-readable output for the \-f, \-w, \-s and \-y parameters. \fB\-n\fR \fB\-\-no-rounding\fR Output frequencies and latencies without rounding off values. .TP +\fB\-c\fR \fB\-\-perf\fR +Get performances and frequencies capabilities of CPPC, by reading it from hardware (only available on the hardware with CPPC). +.TP .SH "REMARKS" .LP By default only values of core zero are displayed. How to display settings of diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 235243ec5ce0..0646f615fe2d 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -438,6 +438,17 @@ static int get_latency(unsigned int cpu, unsigned int human) return 0; } +/* --performance / -c */ + +static int get_perf_cap(unsigned int cpu) +{ + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD && + cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATE) + amd_pstate_show_perf_and_freq(cpu, no_rounding); + + return 0; +} + static void debug_output_one(unsigned int cpu) { struct cpufreq_available_frequencies *freqs; @@ -466,6 +477,7 @@ static void debug_output_one(unsigned int cpu) if (get_freq_hardware(cpu, 1) < 0) get_freq_kernel(cpu, 1); get_boost_mode(cpu); + get_perf_cap(cpu); } static struct option info_opts[] = { @@ -484,6 +496,7 @@ static struct option info_opts[] = { {"proc", no_argument, NULL, 'o'}, {"human", no_argument, NULL, 'm'}, {"no-rounding", no_argument, NULL, 'n'}, + {"performance", no_argument, NULL, 'c'}, { }, }; @@ -497,7 +510,7 @@ int cmd_freq_info(int argc, char **argv) int output_param = 0; do { - ret = getopt_long(argc, argv, "oefwldpgrasmybn", info_opts, + ret = getopt_long(argc, argv, "oefwldpgrasmybnc", info_opts, NULL); switch (ret) { case '?': @@ -520,6 +533,7 @@ int cmd_freq_info(int argc, char **argv) case 'e': case 's': case 'y': + case 'c': if (output_param) { output_param = -1; cont = 0; @@ -626,6 +640,9 @@ int cmd_freq_info(int argc, char **argv) case 'y': ret = get_latency(cpu, human); break; + case 'c': + ret = get_perf_cap(cpu); + break; } if (ret) return ret; From e7d90cfac5510f8c94baa18f9f3f7808859c8332 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 17 Feb 2022 13:49:50 +0100 Subject: [PATCH 29/58] PM: domains: Prevent power off for parent unless child is in deepest state A PM domain managed by genpd may support multiple idlestates (power-off states). During genpd_power_off() a genpd governor may be asked to select one of the idlestates based upon the dev PM QoS constraints, for example. However, there is a problem with the behaviour around this in genpd. More precisely, a parent-domain is allowed to be powered off, no matter of what idlestate that has been selected for the child-domain. For the stm32mp1 platform from STMicro, this behaviour doesn't play well. Instead, the parent-domain must not be powered off, unless the deepest idlestate has been selected for the child-domain. As the current behaviour in genpd is quite questionable anyway, let's simply change it into what is needed by the stm32mp1 platform. If it surprisingly turns out that other platforms may need a different behaviour from genpd, then we will have to revisit this to find a way to make it configurable. Signed-off-by: Ulf Hansson Reviewed-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 5db704f02e71..c87588c21700 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -636,6 +636,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, atomic_read(&genpd->sd_count) > 0) return -EBUSY; + /* + * The children must be in their deepest (powered-off) states to allow + * the parent to be powered off. Note that, there's no need for + * additional locking, as powering on a child, requires the parent's + * lock to be acquired first. + */ + list_for_each_entry(link, &genpd->parent_links, parent_node) { + struct generic_pm_domain *child = link->child; + if (child->state_idx < child->state_count - 1) + return -EBUSY; + } + list_for_each_entry(pdd, &genpd->dev_list, list_node) { enum pm_qos_flags_status stat; @@ -1073,6 +1085,13 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, || atomic_read(&genpd->sd_count) > 0) return; + /* Check that the children are in their deepest (powered-off) state. */ + list_for_each_entry(link, &genpd->parent_links, parent_node) { + struct generic_pm_domain *child = link->child; + if (child->state_idx < child->state_count - 1) + return; + } + /* Choose the deepest state when suspending */ genpd->state_idx = genpd->state_count - 1; if (_genpd_power_off(genpd, false)) From 9a6582b839281ee0e874621f1a2139d2aeb9489e Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Wed, 23 Feb 2022 09:03:23 +0100 Subject: [PATCH 30/58] PM: domains: use dev_err_probe() to simplify error handling dev_err_probe() can reduce code size, makes the code easier to read and has the added benefit of recording the defer reason for later read out. Use it where appropriate. This also fixes an issue, where an error message in __genpd_dev_pm_attach was not terminated by a line break. Signed-off-by: Ahmad Fatoum Signed-off-by: Sascha Hauer Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index c87588c21700..c0d9ad01b32c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2267,12 +2267,8 @@ int of_genpd_add_provider_simple(struct device_node *np, /* Parse genpd OPP table */ if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table(&genpd->dev); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&genpd->dev, "Failed to add OPP table: %d\n", - ret); - return ret; - } + if (ret) + return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n"); /* * Save table for faster processing while setting performance @@ -2331,9 +2327,8 @@ int of_genpd_add_provider_onecell(struct device_node *np, if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n", - i, ret); + dev_err_probe(&genpd->dev, ret, + "Failed to add OPP table for index %d\n", i); goto error; } @@ -2691,12 +2686,8 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev, ret = genpd_add_device(pd, dev, base_dev); mutex_unlock(&gpd_list_lock); - if (ret < 0) { - if (ret != -EPROBE_DEFER) - dev_err(dev, "failed to add to PM domain %s: %d", - pd->name, ret); - return ret; - } + if (ret < 0) + return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name); dev->pm_domain->detach = genpd_dev_pm_detach; dev->pm_domain->sync = genpd_dev_pm_sync; From f6bfe8b5b2c2a5ac8bd2fc7bca3706e6c3fc26d8 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Fri, 25 Feb 2022 14:48:15 +0800 Subject: [PATCH 31/58] PM: domains: Fix sleep-in-atomic bug caused by genpd_debug_remove() When a genpd with GENPD_FLAG_IRQ_SAFE gets removed, the following sleep-in-atomic bug will be seen, as genpd_debug_remove() will be called with a spinlock being held. [ 0.029183] BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1460 [ 0.029204] in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 1, name: swapper/0 [ 0.029219] preempt_count: 1, expected: 0 [ 0.029230] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.17.0-rc4+ #489 [ 0.029245] Hardware name: Thundercomm TurboX CM2290 (DT) [ 0.029256] Call trace: [ 0.029265] dump_backtrace.part.0+0xbc/0xd0 [ 0.029285] show_stack+0x3c/0xa0 [ 0.029298] dump_stack_lvl+0x7c/0xa0 [ 0.029311] dump_stack+0x18/0x34 [ 0.029323] __might_resched+0x10c/0x13c [ 0.029338] __might_sleep+0x4c/0x80 [ 0.029351] down_read+0x24/0xd0 [ 0.029363] lookup_one_len_unlocked+0x9c/0xcc [ 0.029379] lookup_positive_unlocked+0x10/0x50 [ 0.029392] debugfs_lookup+0x68/0xac [ 0.029406] genpd_remove.part.0+0x12c/0x1b4 [ 0.029419] of_genpd_remove_last+0xa8/0xd4 [ 0.029434] psci_cpuidle_domain_probe+0x174/0x53c [ 0.029449] platform_probe+0x68/0xe0 [ 0.029462] really_probe+0x190/0x430 [ 0.029473] __driver_probe_device+0x90/0x18c [ 0.029485] driver_probe_device+0x40/0xe0 [ 0.029497] __driver_attach+0xf4/0x1d0 [ 0.029508] bus_for_each_dev+0x70/0xd0 [ 0.029523] driver_attach+0x24/0x30 [ 0.029534] bus_add_driver+0x164/0x22c [ 0.029545] driver_register+0x78/0x130 [ 0.029556] __platform_driver_register+0x28/0x34 [ 0.029569] psci_idle_init_domains+0x1c/0x28 [ 0.029583] do_one_initcall+0x50/0x1b0 [ 0.029595] kernel_init_freeable+0x214/0x280 [ 0.029609] kernel_init+0x2c/0x13c [ 0.029622] ret_from_fork+0x10/0x20 It doesn't seem necessary to call genpd_debug_remove() with the lock, so move it out from locking to fix the problem. Fixes: 718072ceb211 ("PM: domains: create debugfs nodes when adding power domains") Signed-off-by: Shawn Guo Reviewed-by: Ulf Hansson Cc: 5.11+ # 5.11+ Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index c0d9ad01b32c..1ee878d126fd 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2077,9 +2077,9 @@ static int genpd_remove(struct generic_pm_domain *genpd) kfree(link); } - genpd_debug_remove(genpd); list_del(&genpd->gpd_list_node); genpd_unlock(genpd); + genpd_debug_remove(genpd); cancel_work_sync(&genpd->power_off_work); if (genpd_is_cpu_domain(genpd)) free_cpumask_var(genpd->cpus); From 7dfe105dfc724c82ed3d79a4c47439c516a2410b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 11 Feb 2022 08:10:27 -0800 Subject: [PATCH 32/58] PM: sleep: wakeup: Fix typos in comments Remove the second 'the'. Replace the second 'of' with 'the'. Replace 'couter' with 'counter'. Signed-off-by: Tom Rix Acked-by: Randy Dunlap Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeirq.c | 2 +- drivers/base/power/wakeup.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index 0004db4a9d3b..d487a6bac630 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); * * Enables wakeirq conditionally. We need to enable wake-up interrupt * lazily on the first rpm_suspend(). This is needed as the consumer device - * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would + * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would * otherwise try to disable already disabled wakeirq. The wake-up interrupt * starts disabled with IRQ_NOAUTOEN set. * diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 8666590201c9..a57d469676ca 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -587,7 +587,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM - * core of the event by incrementing the counter of of wakeup events being + * core of the event by incrementing the counter of the wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) @@ -733,7 +733,7 @@ static void wakeup_source_deactivate(struct wakeup_source *ws) /* * Increment the counter of registered wakeup events and decrement the - * couter of wakeup events in progress simultaneously. + * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); From 444e1154b2bf0b881b65ba1bba5bc8e691fac04a Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 15 Feb 2022 11:37:50 +0800 Subject: [PATCH 33/58] PM: hibernate: Clean up non-kernel-doc comments Address the following W=1 kernel build warning: kernel/power/swap.c:120: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..c51f5507b34f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -89,7 +89,7 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** +/* * The swap_map_handle structure is used for handling swap in * a file-alike way */ @@ -117,7 +117,7 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** +/* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ @@ -171,7 +171,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** +/* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ @@ -190,7 +190,7 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** +/* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. From a644161ba11df38e5582e718c99668e282ddbf36 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 28 Feb 2022 11:57:59 -0800 Subject: [PATCH 34/58] Documentation: admin-guide: pm: Document uncore frequency scaling Added documentation to configure uncore frequency limits in Intel Xeon processors. Signed-off-by: Srinivas Pandruvada [ rjw: Clean up the document wording ] Signed-off-by: Rafael J. Wysocki --- .../pm/intel_uncore_frequency_scaling.rst | 60 +++++++++++++++++++ .../admin-guide/pm/working-state.rst | 1 + 2 files changed, 61 insertions(+) create mode 100644 Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst diff --git a/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst new file mode 100644 index 000000000000..09169d935835 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================== +Intel Uncore Frequency Scaling +============================== + +:Copyright: |copy| 2022 Intel Corporation + +:Author: Srinivas Pandruvada + +Introduction +------------ + +The uncore can consume significant amount of power in Intel's Xeon servers based +on the workload characteristics. To optimize the total power and improve overall +performance, SoCs have internal algorithms for scaling uncore frequency. These +algorithms monitor workload usage of uncore and set a desirable frequency. + +It is possible that users have different expectations of uncore performance and +want to have control over it. The objective is similar to allowing users to set +the scaling min/max frequencies via cpufreq sysfs to improve CPU performance. +Users may have some latency sensitive workloads where they do not want any +change to uncore frequency. Also, users may have workloads which require +different core and uncore performance at distinct phases and they may want to +use both cpufreq and the uncore scaling interface to distribute power and +improve overall performance. + +Sysfs Interface +--------------- + +To control uncore frequency, a sysfs interface is provided in the directory: +`/sys/devices/system/cpu/intel_uncore_frequency/`. + +There is one directory for each package and die combination as the scope of +uncore scaling control is per die in multiple die/package SoCs or per +package for single die per package SoCs. The name represents the +scope of control. For example: 'package_00_die_00' is for package id 0 and +die 0. + +Each package_*_die_* contains the following attributes: + +``initial_max_freq_khz`` + Out of reset, this attribute represent the maximum possible frequency. + This is a read-only attribute. If users adjust max_freq_khz, + they can always go back to maximum using the value from this attribute. + +``initial_min_freq_khz`` + Out of reset, this attribute represent the minimum possible frequency. + This is a read-only attribute. If users adjust min_freq_khz, + they can always go back to minimum using the value from this attribute. + +``max_freq_khz`` + This attribute is used to set the maximum uncore frequency. + +``min_freq_khz`` + This attribute is used to set the minimum uncore frequency. + +``current_freq_khz`` + This attribute is used to get the current uncore frequency. diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index 5d2757e2de65..ee45887811ff 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -15,3 +15,4 @@ Working-State Power Management cpufreq_drivers intel_epb intel-speed-select + intel_uncore_frequency_scaling From ba7ffcd4c4da374b0f64666354eeeda7d3827131 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Feb 2022 14:05:32 -0800 Subject: [PATCH 35/58] PM: hibernate: fix __setup handler error handling If an invalid value is used in "resumedelay=", it is silently ignored. Add a warning message and then let the __setup handler return 1 to indicate that the kernel command line option has been handled. Fixes: 317cf7e5e85e3 ("PM / hibernate: convert simple_strtoul to kstrtoul") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49d1df0218cb..0ac805b753e5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1330,7 +1330,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } From 7a64ca17e4dd50d5f910769167f3553902777844 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Feb 2022 14:05:44 -0800 Subject: [PATCH 36/58] PM: suspend: fix return value of __setup handler If an invalid option is given for "test_suspend=