On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the cpus cannot be independently powered down, either due to sequencing restrictions (on Tegra 2, cpu 0 must be the last to power down), or due to HW bugs (on OMAP4460, a cpu powering up will corrupt the gic state unless the other cpu runs a work around). Each cpu has a power state that it can enter without coordinating with the other cpu (usually Wait For Interrupt, or WFI), and one or more "coupled" power states that affect blocks shared between the cpus (L2 cache, interrupt controller, and sometimes the whole SoC). Entering a coupled power state must be tightly controlled on both cpus. The easiest solution to implementing coupled cpu power states is to hotplug all but one cpu whenever possible, usually using a cpufreq governor that looks at cpu load to determine when to enable the secondary cpus. This causes problems, as hotplug is an expensive operation, so the number of hotplug transitions must be minimized, leading to very slow response to loads, often on the order of seconds. This file implements an alternative solution, where each cpu will wait in the WFI state until all cpus are ready to enter a coupled state, at which point the coupled state function will be called on all cpus at approximately the same time. Once all cpus are ready to enter idle, they are woken by an smp cross call. At this point, there is a chance that one of the cpus will find work to do, and choose not to enter idle. A final pass is needed to guarantee that all cpus will call the power state enter function at the same time. During this pass, each cpu will increment the ready counter, and continue once the ready counter matches the number of online coupled cpus. If any cpu exits idle, the other cpus will decrement their counter and retry. To use coupled cpuidle states, a cpuidle driver must: Set struct cpuidle_device.coupled_cpus to the mask of all coupled cpus, usually the same as cpu_possible_mask if all cpus are part of the same cluster. The coupled_cpus mask must be set in the struct cpuidle_device for each cpu. Set struct cpuidle_device.safe_state to a state that is not a coupled state. This is usually WFI. Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each state that affects multiple cpus. Provide a struct cpuidle_state.enter function for each state that affects multiple cpus. This function is guaranteed to be called on all cpus at approximately the same time. The driver should ensure that the cpus all abort together if any cpu tries to abort once the function is called. update1: cpuidle: coupled: fix count of online cpus online_count was never incremented on boot, and was also counting cpus that were not part of the coupled set. Fix both issues by introducting a new function that counts online coupled cpus, and call it from register as well as the hotplug notifier. update2: cpuidle: coupled: fix decrementing ready count cpuidle_coupled_set_not_ready sometimes refuses to decrement the ready count in order to prevent a race condition. This makes it unsuitable for use when finished with idle. Add a new function cpuidle_coupled_set_done that decrements both the ready count and waiting count, and call it after idle is complete. Cc: Amit Kucheria <amit.kucheria@linaro.org> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Trinabh Gupta <g.trinabh@gmail.com> Cc: Deepthi Dharwar <deepthi@linux.vnet.ibm.com> Reviewed-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Tested-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Reviewed-by: Kevin Hilman <khilman@ti.com> Tested-by: Kevin Hilman <khilman@ti.com> Signed-off-by: Colin Cross <ccross@android.com> Acked-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Len Brown <len.brown@intel.com>
543 lines
12 KiB
C
543 lines
12 KiB
C
/*
|
|
* cpuidle.c - core cpuidle infrastructure
|
|
*
|
|
* (C) 2006-2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
|
|
* Shaohua Li <shaohua.li@intel.com>
|
|
* Adam Belay <abelay@novell.com>
|
|
*
|
|
* This code is licenced under the GPL.
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/pm_qos.h>
|
|
#include <linux/cpu.h>
|
|
#include <linux/cpuidle.h>
|
|
#include <linux/ktime.h>
|
|
#include <linux/hrtimer.h>
|
|
#include <linux/module.h>
|
|
#include <trace/events/power.h>
|
|
|
|
#include "cpuidle.h"
|
|
|
|
DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
|
|
|
|
DEFINE_MUTEX(cpuidle_lock);
|
|
LIST_HEAD(cpuidle_detected_devices);
|
|
|
|
static int enabled_devices;
|
|
static int off __read_mostly;
|
|
static int initialized __read_mostly;
|
|
|
|
int cpuidle_disabled(void)
|
|
{
|
|
return off;
|
|
}
|
|
void disable_cpuidle(void)
|
|
{
|
|
off = 1;
|
|
}
|
|
|
|
#if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
|
|
static void cpuidle_kick_cpus(void)
|
|
{
|
|
cpu_idle_wait();
|
|
}
|
|
#elif defined(CONFIG_SMP)
|
|
# error "Arch needs cpu_idle_wait() equivalent here"
|
|
#else /* !CONFIG_ARCH_HAS_CPU_IDLE_WAIT && !CONFIG_SMP */
|
|
static void cpuidle_kick_cpus(void) {}
|
|
#endif
|
|
|
|
static int __cpuidle_register_device(struct cpuidle_device *dev);
|
|
|
|
static inline int cpuidle_enter(struct cpuidle_device *dev,
|
|
struct cpuidle_driver *drv, int index)
|
|
{
|
|
struct cpuidle_state *target_state = &drv->states[index];
|
|
return target_state->enter(dev, drv, index);
|
|
}
|
|
|
|
static inline int cpuidle_enter_tk(struct cpuidle_device *dev,
|
|
struct cpuidle_driver *drv, int index)
|
|
{
|
|
return cpuidle_wrap_enter(dev, drv, index, cpuidle_enter);
|
|
}
|
|
|
|
typedef int (*cpuidle_enter_t)(struct cpuidle_device *dev,
|
|
struct cpuidle_driver *drv, int index);
|
|
|
|
static cpuidle_enter_t cpuidle_enter_ops;
|
|
|
|
/**
|
|
* cpuidle_play_dead - cpu off-lining
|
|
*
|
|
* Returns in case of an error or no driver
|
|
*/
|
|
int cpuidle_play_dead(void)
|
|
{
|
|
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
|
|
struct cpuidle_driver *drv = cpuidle_get_driver();
|
|
int i, dead_state = -1;
|
|
int power_usage = -1;
|
|
|
|
if (!drv)
|
|
return -ENODEV;
|
|
|
|
/* Find lowest-power state that supports long-term idle */
|
|
for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
|
|
struct cpuidle_state *s = &drv->states[i];
|
|
|
|
if (s->power_usage < power_usage && s->enter_dead) {
|
|
power_usage = s->power_usage;
|
|
dead_state = i;
|
|
}
|
|
}
|
|
|
|
if (dead_state != -1)
|
|
return drv->states[dead_state].enter_dead(dev, dead_state);
|
|
|
|
return -ENODEV;
|
|
}
|
|
|
|
/**
|
|
* cpuidle_enter_state - enter the state and update stats
|
|
* @dev: cpuidle device for this cpu
|
|
* @drv: cpuidle driver for this cpu
|
|
* @next_state: index into drv->states of the state to enter
|
|
*/
|
|
int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
|
|
int next_state)
|
|
{
|
|
int entered_state;
|
|
|
|
entered_state = cpuidle_enter_ops(dev, drv, next_state);
|
|
|
|
if (entered_state >= 0) {
|
|
/* Update cpuidle counters */
|
|
/* This can be moved to within driver enter routine
|
|
* but that results in multiple copies of same code.
|
|
*/
|
|
dev->states_usage[entered_state].time +=
|
|
(unsigned long long)dev->last_residency;
|
|
dev->states_usage[entered_state].usage++;
|
|
} else {
|
|
dev->last_residency = 0;
|
|
}
|
|
|
|
return entered_state;
|
|
}
|
|
|
|
/**
|
|
* cpuidle_idle_call - the main idle loop
|
|
*
|
|
* NOTE: no locks or semaphores should be used here
|
|
* return non-zero on failure
|
|
*/
|
|
int cpuidle_idle_call(void)
|
|
{
|
|
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
|
|
struct cpuidle_driver *drv = cpuidle_get_driver();
|
|
int next_state, entered_state;
|
|
|
|
if (off)
|
|
return -ENODEV;
|
|
|
|
if (!initialized)
|
|
return -ENODEV;
|
|
|
|
/* check if the device is ready */
|
|
if (!dev || !dev->enabled)
|
|
return -EBUSY;
|
|
|
|
#if 0
|
|
/* shows regressions, re-enable for 2.6.29 */
|
|
/*
|
|
* run any timers that can be run now, at this point
|
|
* before calculating the idle duration etc.
|
|
*/
|
|
hrtimer_peek_ahead_timers();
|
|
#endif
|
|
|
|
/* ask the governor for the next state */
|
|
next_state = cpuidle_curr_governor->select(drv, dev);
|
|
if (need_resched()) {
|
|
local_irq_enable();
|
|
return 0;
|
|
}
|
|
|
|
trace_power_start_rcuidle(POWER_CSTATE, next_state, dev->cpu);
|
|
trace_cpu_idle_rcuidle(next_state, dev->cpu);
|
|
|
|
if (cpuidle_state_is_coupled(dev, drv, next_state))
|
|
entered_state = cpuidle_enter_state_coupled(dev, drv,
|
|
next_state);
|
|
else
|
|
entered_state = cpuidle_enter_state(dev, drv, next_state);
|
|
|
|
trace_power_end_rcuidle(dev->cpu);
|
|
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
|
|
|
|
/* give the governor an opportunity to reflect on the outcome */
|
|
if (cpuidle_curr_governor->reflect)
|
|
cpuidle_curr_governor->reflect(dev, entered_state);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* cpuidle_install_idle_handler - installs the cpuidle idle loop handler
|
|
*/
|
|
void cpuidle_install_idle_handler(void)
|
|
{
|
|
if (enabled_devices) {
|
|
/* Make sure all changes finished before we switch to new idle */
|
|
smp_wmb();
|
|
initialized = 1;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
|
|
*/
|
|
void cpuidle_uninstall_idle_handler(void)
|
|
{
|
|
if (enabled_devices) {
|
|
initialized = 0;
|
|
cpuidle_kick_cpus();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* cpuidle_pause_and_lock - temporarily disables CPUIDLE
|
|
*/
|
|
void cpuidle_pause_and_lock(void)
|
|
{
|
|
mutex_lock(&cpuidle_lock);
|
|
cpuidle_uninstall_idle_handler();
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
|
|
|
|
/**
|
|
* cpuidle_resume_and_unlock - resumes CPUIDLE operation
|
|
*/
|
|
void cpuidle_resume_and_unlock(void)
|
|
{
|
|
cpuidle_install_idle_handler();
|
|
mutex_unlock(&cpuidle_lock);
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
|
|
|
|
/**
|
|
* cpuidle_wrap_enter - performs timekeeping and irqen around enter function
|
|
* @dev: pointer to a valid cpuidle_device object
|
|
* @drv: pointer to a valid cpuidle_driver object
|
|
* @index: index of the target cpuidle state.
|
|
*/
|
|
int cpuidle_wrap_enter(struct cpuidle_device *dev,
|
|
struct cpuidle_driver *drv, int index,
|
|
int (*enter)(struct cpuidle_device *dev,
|
|
struct cpuidle_driver *drv, int index))
|
|
{
|
|
ktime_t time_start, time_end;
|
|
s64 diff;
|
|
|
|
time_start = ktime_get();
|
|
|
|
index = enter(dev, drv, index);
|
|
|
|
time_end = ktime_get();
|
|
|
|
local_irq_enable();
|
|
|
|
diff = ktime_to_us(ktime_sub(time_end, time_start));
|
|
if (diff > INT_MAX)
|
|
diff = INT_MAX;
|
|
|
|
dev->last_residency = (int) diff;
|
|
|
|
return index;
|
|
}
|
|
|
|
#ifdef CONFIG_ARCH_HAS_CPU_RELAX
|
|
static int poll_idle(struct cpuidle_device *dev,
|
|
struct cpuidle_driver *drv, int index)
|
|
{
|
|
ktime_t t1, t2;
|
|
s64 diff;
|
|
|
|
t1 = ktime_get();
|
|
local_irq_enable();
|
|
while (!need_resched())
|
|
cpu_relax();
|
|
|
|
t2 = ktime_get();
|
|
diff = ktime_to_us(ktime_sub(t2, t1));
|
|
if (diff > INT_MAX)
|
|
diff = INT_MAX;
|
|
|
|
dev->last_residency = (int) diff;
|
|
|
|
return index;
|
|
}
|
|
|
|
static void poll_idle_init(struct cpuidle_driver *drv)
|
|
{
|
|
struct cpuidle_state *state = &drv->states[0];
|
|
|
|
snprintf(state->name, CPUIDLE_NAME_LEN, "POLL");
|
|
snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
|
|
state->exit_latency = 0;
|
|
state->target_residency = 0;
|
|
state->power_usage = -1;
|
|
state->flags = 0;
|
|
state->enter = poll_idle;
|
|
state->disable = 0;
|
|
}
|
|
#else
|
|
static void poll_idle_init(struct cpuidle_driver *drv) {}
|
|
#endif /* CONFIG_ARCH_HAS_CPU_RELAX */
|
|
|
|
/**
|
|
* cpuidle_enable_device - enables idle PM for a CPU
|
|
* @dev: the CPU
|
|
*
|
|
* This function must be called between cpuidle_pause_and_lock and
|
|
* cpuidle_resume_and_unlock when used externally.
|
|
*/
|
|
int cpuidle_enable_device(struct cpuidle_device *dev)
|
|
{
|
|
int ret, i;
|
|
struct cpuidle_driver *drv = cpuidle_get_driver();
|
|
|
|
if (dev->enabled)
|
|
return 0;
|
|
if (!drv || !cpuidle_curr_governor)
|
|
return -EIO;
|
|
if (!dev->state_count)
|
|
dev->state_count = drv->state_count;
|
|
|
|
if (dev->registered == 0) {
|
|
ret = __cpuidle_register_device(dev);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
cpuidle_enter_ops = drv->en_core_tk_irqen ?
|
|
cpuidle_enter_tk : cpuidle_enter;
|
|
|
|
poll_idle_init(drv);
|
|
|
|
if ((ret = cpuidle_add_state_sysfs(dev)))
|
|
return ret;
|
|
|
|
if (cpuidle_curr_governor->enable &&
|
|
(ret = cpuidle_curr_governor->enable(drv, dev)))
|
|
goto fail_sysfs;
|
|
|
|
for (i = 0; i < dev->state_count; i++) {
|
|
dev->states_usage[i].usage = 0;
|
|
dev->states_usage[i].time = 0;
|
|
}
|
|
dev->last_residency = 0;
|
|
|
|
smp_wmb();
|
|
|
|
dev->enabled = 1;
|
|
|
|
enabled_devices++;
|
|
return 0;
|
|
|
|
fail_sysfs:
|
|
cpuidle_remove_state_sysfs(dev);
|
|
|
|
return ret;
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpuidle_enable_device);
|
|
|
|
/**
|
|
* cpuidle_disable_device - disables idle PM for a CPU
|
|
* @dev: the CPU
|
|
*
|
|
* This function must be called between cpuidle_pause_and_lock and
|
|
* cpuidle_resume_and_unlock when used externally.
|
|
*/
|
|
void cpuidle_disable_device(struct cpuidle_device *dev)
|
|
{
|
|
if (!dev->enabled)
|
|
return;
|
|
if (!cpuidle_get_driver() || !cpuidle_curr_governor)
|
|
return;
|
|
|
|
dev->enabled = 0;
|
|
|
|
if (cpuidle_curr_governor->disable)
|
|
cpuidle_curr_governor->disable(cpuidle_get_driver(), dev);
|
|
|
|
cpuidle_remove_state_sysfs(dev);
|
|
enabled_devices--;
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpuidle_disable_device);
|
|
|
|
/**
|
|
* __cpuidle_register_device - internal register function called before register
|
|
* and enable routines
|
|
* @dev: the cpu
|
|
*
|
|
* cpuidle_lock mutex must be held before this is called
|
|
*/
|
|
static int __cpuidle_register_device(struct cpuidle_device *dev)
|
|
{
|
|
int ret;
|
|
struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
|
|
struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver();
|
|
|
|
if (!dev)
|
|
return -EINVAL;
|
|
if (!try_module_get(cpuidle_driver->owner))
|
|
return -EINVAL;
|
|
|
|
init_completion(&dev->kobj_unregister);
|
|
|
|
per_cpu(cpuidle_devices, dev->cpu) = dev;
|
|
list_add(&dev->device_list, &cpuidle_detected_devices);
|
|
ret = cpuidle_add_sysfs(cpu_dev);
|
|
if (ret)
|
|
goto err_sysfs;
|
|
|
|
ret = cpuidle_coupled_register_device(dev);
|
|
if (ret)
|
|
goto err_coupled;
|
|
|
|
dev->registered = 1;
|
|
return 0;
|
|
|
|
err_coupled:
|
|
cpuidle_remove_sysfs(cpu_dev);
|
|
wait_for_completion(&dev->kobj_unregister);
|
|
err_sysfs:
|
|
list_del(&dev->device_list);
|
|
per_cpu(cpuidle_devices, dev->cpu) = NULL;
|
|
module_put(cpuidle_driver->owner);
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* cpuidle_register_device - registers a CPU's idle PM feature
|
|
* @dev: the cpu
|
|
*/
|
|
int cpuidle_register_device(struct cpuidle_device *dev)
|
|
{
|
|
int ret;
|
|
|
|
mutex_lock(&cpuidle_lock);
|
|
|
|
if ((ret = __cpuidle_register_device(dev))) {
|
|
mutex_unlock(&cpuidle_lock);
|
|
return ret;
|
|
}
|
|
|
|
cpuidle_enable_device(dev);
|
|
cpuidle_install_idle_handler();
|
|
|
|
mutex_unlock(&cpuidle_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpuidle_register_device);
|
|
|
|
/**
|
|
* cpuidle_unregister_device - unregisters a CPU's idle PM feature
|
|
* @dev: the cpu
|
|
*/
|
|
void cpuidle_unregister_device(struct cpuidle_device *dev)
|
|
{
|
|
struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
|
|
struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver();
|
|
|
|
if (dev->registered == 0)
|
|
return;
|
|
|
|
cpuidle_pause_and_lock();
|
|
|
|
cpuidle_disable_device(dev);
|
|
|
|
cpuidle_remove_sysfs(cpu_dev);
|
|
list_del(&dev->device_list);
|
|
wait_for_completion(&dev->kobj_unregister);
|
|
per_cpu(cpuidle_devices, dev->cpu) = NULL;
|
|
|
|
cpuidle_coupled_unregister_device(dev);
|
|
|
|
cpuidle_resume_and_unlock();
|
|
|
|
module_put(cpuidle_driver->owner);
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpuidle_unregister_device);
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
static void smp_callback(void *v)
|
|
{
|
|
/* we already woke the CPU up, nothing more to do */
|
|
}
|
|
|
|
/*
|
|
* This function gets called when a part of the kernel has a new latency
|
|
* requirement. This means we need to get all processors out of their C-state,
|
|
* and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that
|
|
* wakes them all right up.
|
|
*/
|
|
static int cpuidle_latency_notify(struct notifier_block *b,
|
|
unsigned long l, void *v)
|
|
{
|
|
smp_call_function(smp_callback, NULL, 1);
|
|
return NOTIFY_OK;
|
|
}
|
|
|
|
static struct notifier_block cpuidle_latency_notifier = {
|
|
.notifier_call = cpuidle_latency_notify,
|
|
};
|
|
|
|
static inline void latency_notifier_init(struct notifier_block *n)
|
|
{
|
|
pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, n);
|
|
}
|
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
#define latency_notifier_init(x) do { } while (0)
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
/**
|
|
* cpuidle_init - core initializer
|
|
*/
|
|
static int __init cpuidle_init(void)
|
|
{
|
|
int ret;
|
|
|
|
if (cpuidle_disabled())
|
|
return -ENODEV;
|
|
|
|
ret = cpuidle_add_interface(cpu_subsys.dev_root);
|
|
if (ret)
|
|
return ret;
|
|
|
|
latency_notifier_init(&cpuidle_latency_notifier);
|
|
|
|
return 0;
|
|
}
|
|
|
|
module_param(off, int, 0444);
|
|
core_initcall(cpuidle_init);
|