From 3f51aa9e296fe4af785d5761bb12556fb2494761 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 9 Feb 2022 19:29:51 +0800 Subject: [PATCH 01/11] PM: hibernate: fix load_image_and_restore() error path As 'swsusp_check' open 'hib_resume_bdev', if call 'create_basic_memory_bitmaps' failed, we need to close 'hib_resume_bdev' in 'load_image_and_restore' function. Signed-off-by: Ye Bin [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..49d1df0218cb 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -689,8 +689,10 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; + } error = swsusp_read(&flags); swsusp_close(FMODE_READ | FMODE_EXCL); From e7d90cfac5510f8c94baa18f9f3f7808859c8332 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 17 Feb 2022 13:49:50 +0100 Subject: [PATCH 02/11] PM: domains: Prevent power off for parent unless child is in deepest state A PM domain managed by genpd may support multiple idlestates (power-off states). During genpd_power_off() a genpd governor may be asked to select one of the idlestates based upon the dev PM QoS constraints, for example. However, there is a problem with the behaviour around this in genpd. More precisely, a parent-domain is allowed to be powered off, no matter of what idlestate that has been selected for the child-domain. For the stm32mp1 platform from STMicro, this behaviour doesn't play well. Instead, the parent-domain must not be powered off, unless the deepest idlestate has been selected for the child-domain. As the current behaviour in genpd is quite questionable anyway, let's simply change it into what is needed by the stm32mp1 platform. If it surprisingly turns out that other platforms may need a different behaviour from genpd, then we will have to revisit this to find a way to make it configurable. Signed-off-by: Ulf Hansson Reviewed-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 5db704f02e71..c87588c21700 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -636,6 +636,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, atomic_read(&genpd->sd_count) > 0) return -EBUSY; + /* + * The children must be in their deepest (powered-off) states to allow + * the parent to be powered off. Note that, there's no need for + * additional locking, as powering on a child, requires the parent's + * lock to be acquired first. + */ + list_for_each_entry(link, &genpd->parent_links, parent_node) { + struct generic_pm_domain *child = link->child; + if (child->state_idx < child->state_count - 1) + return -EBUSY; + } + list_for_each_entry(pdd, &genpd->dev_list, list_node) { enum pm_qos_flags_status stat; @@ -1073,6 +1085,13 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, || atomic_read(&genpd->sd_count) > 0) return; + /* Check that the children are in their deepest (powered-off) state. */ + list_for_each_entry(link, &genpd->parent_links, parent_node) { + struct generic_pm_domain *child = link->child; + if (child->state_idx < child->state_count - 1) + return; + } + /* Choose the deepest state when suspending */ genpd->state_idx = genpd->state_count - 1; if (_genpd_power_off(genpd, false)) From 9a6582b839281ee0e874621f1a2139d2aeb9489e Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Wed, 23 Feb 2022 09:03:23 +0100 Subject: [PATCH 03/11] PM: domains: use dev_err_probe() to simplify error handling dev_err_probe() can reduce code size, makes the code easier to read and has the added benefit of recording the defer reason for later read out. Use it where appropriate. This also fixes an issue, where an error message in __genpd_dev_pm_attach was not terminated by a line break. Signed-off-by: Ahmad Fatoum Signed-off-by: Sascha Hauer Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index c87588c21700..c0d9ad01b32c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2267,12 +2267,8 @@ int of_genpd_add_provider_simple(struct device_node *np, /* Parse genpd OPP table */ if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table(&genpd->dev); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&genpd->dev, "Failed to add OPP table: %d\n", - ret); - return ret; - } + if (ret) + return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n"); /* * Save table for faster processing while setting performance @@ -2331,9 +2327,8 @@ int of_genpd_add_provider_onecell(struct device_node *np, if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n", - i, ret); + dev_err_probe(&genpd->dev, ret, + "Failed to add OPP table for index %d\n", i); goto error; } @@ -2691,12 +2686,8 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev, ret = genpd_add_device(pd, dev, base_dev); mutex_unlock(&gpd_list_lock); - if (ret < 0) { - if (ret != -EPROBE_DEFER) - dev_err(dev, "failed to add to PM domain %s: %d", - pd->name, ret); - return ret; - } + if (ret < 0) + return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name); dev->pm_domain->detach = genpd_dev_pm_detach; dev->pm_domain->sync = genpd_dev_pm_sync; From f6bfe8b5b2c2a5ac8bd2fc7bca3706e6c3fc26d8 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Fri, 25 Feb 2022 14:48:15 +0800 Subject: [PATCH 04/11] PM: domains: Fix sleep-in-atomic bug caused by genpd_debug_remove() When a genpd with GENPD_FLAG_IRQ_SAFE gets removed, the following sleep-in-atomic bug will be seen, as genpd_debug_remove() will be called with a spinlock being held. [ 0.029183] BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1460 [ 0.029204] in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 1, name: swapper/0 [ 0.029219] preempt_count: 1, expected: 0 [ 0.029230] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.17.0-rc4+ #489 [ 0.029245] Hardware name: Thundercomm TurboX CM2290 (DT) [ 0.029256] Call trace: [ 0.029265] dump_backtrace.part.0+0xbc/0xd0 [ 0.029285] show_stack+0x3c/0xa0 [ 0.029298] dump_stack_lvl+0x7c/0xa0 [ 0.029311] dump_stack+0x18/0x34 [ 0.029323] __might_resched+0x10c/0x13c [ 0.029338] __might_sleep+0x4c/0x80 [ 0.029351] down_read+0x24/0xd0 [ 0.029363] lookup_one_len_unlocked+0x9c/0xcc [ 0.029379] lookup_positive_unlocked+0x10/0x50 [ 0.029392] debugfs_lookup+0x68/0xac [ 0.029406] genpd_remove.part.0+0x12c/0x1b4 [ 0.029419] of_genpd_remove_last+0xa8/0xd4 [ 0.029434] psci_cpuidle_domain_probe+0x174/0x53c [ 0.029449] platform_probe+0x68/0xe0 [ 0.029462] really_probe+0x190/0x430 [ 0.029473] __driver_probe_device+0x90/0x18c [ 0.029485] driver_probe_device+0x40/0xe0 [ 0.029497] __driver_attach+0xf4/0x1d0 [ 0.029508] bus_for_each_dev+0x70/0xd0 [ 0.029523] driver_attach+0x24/0x30 [ 0.029534] bus_add_driver+0x164/0x22c [ 0.029545] driver_register+0x78/0x130 [ 0.029556] __platform_driver_register+0x28/0x34 [ 0.029569] psci_idle_init_domains+0x1c/0x28 [ 0.029583] do_one_initcall+0x50/0x1b0 [ 0.029595] kernel_init_freeable+0x214/0x280 [ 0.029609] kernel_init+0x2c/0x13c [ 0.029622] ret_from_fork+0x10/0x20 It doesn't seem necessary to call genpd_debug_remove() with the lock, so move it out from locking to fix the problem. Fixes: 718072ceb211 ("PM: domains: create debugfs nodes when adding power domains") Signed-off-by: Shawn Guo Reviewed-by: Ulf Hansson Cc: 5.11+ # 5.11+ Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index c0d9ad01b32c..1ee878d126fd 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2077,9 +2077,9 @@ static int genpd_remove(struct generic_pm_domain *genpd) kfree(link); } - genpd_debug_remove(genpd); list_del(&genpd->gpd_list_node); genpd_unlock(genpd); + genpd_debug_remove(genpd); cancel_work_sync(&genpd->power_off_work); if (genpd_is_cpu_domain(genpd)) free_cpumask_var(genpd->cpus); From 7dfe105dfc724c82ed3d79a4c47439c516a2410b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 11 Feb 2022 08:10:27 -0800 Subject: [PATCH 05/11] PM: sleep: wakeup: Fix typos in comments Remove the second 'the'. Replace the second 'of' with 'the'. Replace 'couter' with 'counter'. Signed-off-by: Tom Rix Acked-by: Randy Dunlap Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeirq.c | 2 +- drivers/base/power/wakeup.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index 0004db4a9d3b..d487a6bac630 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); * * Enables wakeirq conditionally. We need to enable wake-up interrupt * lazily on the first rpm_suspend(). This is needed as the consumer device - * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would + * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would * otherwise try to disable already disabled wakeirq. The wake-up interrupt * starts disabled with IRQ_NOAUTOEN set. * diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 8666590201c9..a57d469676ca 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -587,7 +587,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM - * core of the event by incrementing the counter of of wakeup events being + * core of the event by incrementing the counter of the wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) @@ -733,7 +733,7 @@ static void wakeup_source_deactivate(struct wakeup_source *ws) /* * Increment the counter of registered wakeup events and decrement the - * couter of wakeup events in progress simultaneously. + * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); From 444e1154b2bf0b881b65ba1bba5bc8e691fac04a Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 15 Feb 2022 11:37:50 +0800 Subject: [PATCH 06/11] PM: hibernate: Clean up non-kernel-doc comments Address the following W=1 kernel build warning: kernel/power/swap.c:120: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..c51f5507b34f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -89,7 +89,7 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** +/* * The swap_map_handle structure is used for handling swap in * a file-alike way */ @@ -117,7 +117,7 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** +/* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ @@ -171,7 +171,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** +/* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ @@ -190,7 +190,7 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** +/* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. From a644161ba11df38e5582e718c99668e282ddbf36 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 28 Feb 2022 11:57:59 -0800 Subject: [PATCH 07/11] Documentation: admin-guide: pm: Document uncore frequency scaling Added documentation to configure uncore frequency limits in Intel Xeon processors. Signed-off-by: Srinivas Pandruvada [ rjw: Clean up the document wording ] Signed-off-by: Rafael J. Wysocki --- .../pm/intel_uncore_frequency_scaling.rst | 60 +++++++++++++++++++ .../admin-guide/pm/working-state.rst | 1 + 2 files changed, 61 insertions(+) create mode 100644 Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst diff --git a/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst new file mode 100644 index 000000000000..09169d935835 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================== +Intel Uncore Frequency Scaling +============================== + +:Copyright: |copy| 2022 Intel Corporation + +:Author: Srinivas Pandruvada + +Introduction +------------ + +The uncore can consume significant amount of power in Intel's Xeon servers based +on the workload characteristics. To optimize the total power and improve overall +performance, SoCs have internal algorithms for scaling uncore frequency. These +algorithms monitor workload usage of uncore and set a desirable frequency. + +It is possible that users have different expectations of uncore performance and +want to have control over it. The objective is similar to allowing users to set +the scaling min/max frequencies via cpufreq sysfs to improve CPU performance. +Users may have some latency sensitive workloads where they do not want any +change to uncore frequency. Also, users may have workloads which require +different core and uncore performance at distinct phases and they may want to +use both cpufreq and the uncore scaling interface to distribute power and +improve overall performance. + +Sysfs Interface +--------------- + +To control uncore frequency, a sysfs interface is provided in the directory: +`/sys/devices/system/cpu/intel_uncore_frequency/`. + +There is one directory for each package and die combination as the scope of +uncore scaling control is per die in multiple die/package SoCs or per +package for single die per package SoCs. The name represents the +scope of control. For example: 'package_00_die_00' is for package id 0 and +die 0. + +Each package_*_die_* contains the following attributes: + +``initial_max_freq_khz`` + Out of reset, this attribute represent the maximum possible frequency. + This is a read-only attribute. If users adjust max_freq_khz, + they can always go back to maximum using the value from this attribute. + +``initial_min_freq_khz`` + Out of reset, this attribute represent the minimum possible frequency. + This is a read-only attribute. If users adjust min_freq_khz, + they can always go back to minimum using the value from this attribute. + +``max_freq_khz`` + This attribute is used to set the maximum uncore frequency. + +``min_freq_khz`` + This attribute is used to set the minimum uncore frequency. + +``current_freq_khz`` + This attribute is used to get the current uncore frequency. diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index 5d2757e2de65..ee45887811ff 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -15,3 +15,4 @@ Working-State Power Management cpufreq_drivers intel_epb intel-speed-select + intel_uncore_frequency_scaling From ba7ffcd4c4da374b0f64666354eeeda7d3827131 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Feb 2022 14:05:32 -0800 Subject: [PATCH 08/11] PM: hibernate: fix __setup handler error handling If an invalid value is used in "resumedelay=", it is silently ignored. Add a warning message and then let the __setup handler return 1 to indicate that the kernel command line option has been handled. Fixes: 317cf7e5e85e3 ("PM / hibernate: convert simple_strtoul to kstrtoul") Signed-off-by: Randy Dunlap Reported-by: Igor Zhbanov Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49d1df0218cb..0ac805b753e5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1330,7 +1330,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } From 7a64ca17e4dd50d5f910769167f3553902777844 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Feb 2022 14:05:44 -0800 Subject: [PATCH 09/11] PM: suspend: fix return value of __setup handler If an invalid option is given for "test_suspend=