From e5f1b245870d59be0e6cc3b33edf5406a3b59648 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 5 Oct 2016 09:33:12 +0200 Subject: [PATCH 01/17] cpuidle: governors: Remove remaining old module code The governor's code use try_module_get() and put_module() to refcount the governor's module. But the governors are not compiled as module. The refcount does not prevent to switch the governor or unload a module as they aren't compiled as modules. The code is pointless, so remove it. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governor.c | 4 ---- drivers/cpuidle/governors/ladder.c | 2 -- drivers/cpuidle/governors/menu.c | 2 -- include/linux/cpuidle.h | 2 -- 4 files changed, 10 deletions(-) diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index fb9f511cca23..4e78263e34a4 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -9,7 +9,6 @@ */ #include -#include #include #include "cpuidle.h" @@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) if (cpuidle_curr_governor) { list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_disable_device(dev); - module_put(cpuidle_curr_governor->owner); } cpuidle_curr_governor = gov; if (gov) { - if (!try_module_get(cpuidle_curr_governor->owner)) - return -EINVAL; list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_enable_device(dev); cpuidle_install_idle_handler(); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 63bd5a403e22..fe8f08948fcb 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = { .enable = ladder_enable_device, .select = ladder_select_state, .reflect = ladder_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 03d38c291de6..d9b5b9398a0f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,7 +19,6 @@ #include #include #include -#include /* * Please note when changing the tuning values: @@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = { .enable = menu_enable_device, .select = menu_select, .reflect = menu_reflect, - .owner = THIS_MODULE, }; /** diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb31373c3478..15deea449edc 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -235,8 +235,6 @@ struct cpuidle_governor { int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev); void (*reflect) (struct cpuidle_device *dev, int index); - - struct module *owner; }; #ifdef CONFIG_CPU_IDLE From a94e502c22b61072f808f53392a8433bc948b03d Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 10 Nov 2016 14:24:33 +0000 Subject: [PATCH 02/17] cpuidle: dt: assign ->enter_freeze to same as ->enter callback function enter_freeze() callback is expected atleast to do the same as enter() but it has to guarantee that interrupts aren't enabled at any point in its execution, as the tick is frozen. CPUs execute ->enter_freeze with the local tick or entire timekeeping suspended, so it must not re-enable interrupts at any point (even temporarily) or attempt to change states of clock event devices. It will be called when the system goes to suspend-to-idle and will reduce power usage because CPUs won't be awaken for unnecessary IRQs (i.e. woken up only on IRQs from "wakeup sources") We can reuse the same code for both the enter() and enter_freeze() callbacks as along as they don't re-enable interrupts. Only "coupled" cpuidle mechanism enables interrupts and doing that with timekeeping suspended is generally not safe. Since this generic DT based idle driver doesn't support "coupled" states, it is safe to assume that the interrupts are not re-enabled. This patch assign enter_freeze to same as enter callback function which helps to save power without any intermittent spurious wakeups from suspend-to-idle. Signed-off-by: Sudeep Holla Tested-by: Andy Gross Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/dt_idle_states.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index a5c111b67f37..ffca4fc0061d 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state, * state enter function. */ idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits allowing the tick to be frozen + * safely. So enter() can be also enter_freeze() callback. + */ + idle_state->enter_freeze = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); From ed61390bff3ad43166a2552651b09ebd95dd1da5 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Tue, 22 Nov 2016 16:08:06 +1100 Subject: [PATCH 03/17] cpuidle/powernv: staticise powernv_idle_driver powernv_idle_driver isn't exported, it can be made static. Found by sparse. Signed-off-by: Andrew Donnellan Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-powernv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7fe442ca38f4..0835a37a5f3a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -22,7 +22,7 @@ #define POWERNV_THRESHOLD_LATENCY_NS 200000 -struct cpuidle_driver powernv_idle_driver = { +static struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", .owner = THIS_MODULE, }; From bb8313b603eb8fd52de48a079bfcd72dcab2ef1e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 23:03:04 -0800 Subject: [PATCH 04/17] cpuidle: Allow enforcing deepest idle state selection When idle injection is used to cap power, we need to override the governor's choice of idle states. For this reason, make it possible the deepest idle state selection to be enforced by setting a flag on a given CPU to achieve the maximum potential power draw reduction. Signed-off-by: Jacob Pan [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 13 ++++++++++++- include/linux/cpuidle.h | 7 ++++++- kernel/sched/idle.c | 13 ++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..afc005b917fe 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,17 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/* Set the current cpu to use the deepest idle state, override governors */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +119,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 15deea449edc..da346f2817a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..513e4dfeeae7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); From c1de45ca831acee9b72c9320dde447edafadb43f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2016 23:03:05 -0800 Subject: [PATCH 05/17] sched/idle: Add support for tasks that inject idle Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use realtime tasks to take control of CPU then inject idle. There are two issues with this approach: 1. Low efficiency: injected idle task is treated as busy so sched ticks do not stop during injected idle period, the result of these unwanted wakeups can be ~20% loss in power savings. 2. Idle accounting: injected idle time is presented to user as busy. This patch addresses the issues by introducing a new PF_IDLE flag which allows any given task to be treated as idle task while the flag is set. Therefore, idle injection tasks can run through the normal flow of NOHZ idle enter/exit to get the correct accounting as well as tick stop when possible. The implication is that idle task is then no longer limited to PID == 0. Acked-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- include/linux/cpu.h | 2 + include/linux/sched.h | 3 +- kernel/fork.c | 2 +- kernel/sched/core.c | 1 + kernel/sched/idle.c | 162 ++++++++++++++++++++++++++---------------- 5 files changed, 107 insertions(+), 63 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..ac0efae38072 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..114c7fcb6af6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..5074b2f0827b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..63b3a8a49884 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 513e4dfeeae7..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -205,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ + __current_set_polling(); + tick_nohz_idle_enter(); - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + while (!need_resched()) { + check_pgt_cache(); + rmb(); - while (!need_resched()) { - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } - - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } From 14f3f7d8cbceedab17f16cf301414fa3384117fe Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 28 Nov 2016 13:44:49 -0800 Subject: [PATCH 06/17] thermal/intel_powerclamp: Remove duplicated code that starts the kthread This patch removes code duplication. It does not modify the functionality. Signed-off-by: Petr Mladek Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel_powerclamp.c | 45 +++++++++++++----------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 0e4dc0afcfd2..63657d193db5 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -508,10 +508,27 @@ static void poll_pkg_cstate(struct work_struct *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } +static void start_power_clamp_thread(unsigned long cpu) +{ + struct task_struct **p = per_cpu_ptr(powerclamp_thread, cpu); + struct task_struct *thread; + + thread = kthread_create_on_node(clamp_thread, + (void *) cpu, + cpu_to_node(cpu), + "kidle_inject/%ld", cpu); + if (IS_ERR(thread)) + return; + + /* bind to cpu here */ + kthread_bind(thread, cpu); + wake_up_process(thread); + *p = thread; +} + static int start_power_clamp(void) { unsigned long cpu; - struct task_struct *thread; set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); /* prevent cpu hotplug */ @@ -527,20 +544,7 @@ static int start_power_clamp(void) /* start one thread per online cpu */ for_each_online_cpu(cpu) { - struct task_struct **p = - per_cpu_ptr(powerclamp_thread, cpu); - - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%ld", cpu); - /* bind to cpu here */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *p = thread; - } - + start_power_clamp_thread(cpu); } put_online_cpus(); @@ -572,7 +576,6 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned long cpu = (unsigned long)hcpu; - struct task_struct *thread; struct task_struct **percpu_thread = per_cpu_ptr(powerclamp_thread, cpu); @@ -581,15 +584,7 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%lu", cpu); - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *percpu_thread = thread; - } + start_power_clamp_thread(cpu); /* prefer BSP as controlling CPU */ if (cpu == 0) { control_cpu = 0; From 8d962ac7f396bc83fb381469521c27aed7b70f84 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 28 Nov 2016 13:44:50 -0800 Subject: [PATCH 07/17] thermal/intel_powerclamp: Convert the kthread to kthread worker API Kthreads are currently implemented as an infinite loop. Each has its own variant of checks for terminating, freezing, awakening. In many cases it is unclear to say in which state it is and sometimes it is done a wrong way. The plan is to convert kthreads into kthread_worker or workqueues API. It allows to split the functionality into separate operations. It helps to make a better structure. Also it defines a clean state where no locks are taken, IRQs blocked, the kthread might sleep or even be safely migrated. The kthread worker API is useful when we want to have a dedicated single thread for the work. It helps to make sure that it is available when needed. Also it allows a better control, e.g. define a scheduling priority. This patch converts the intel powerclamp kthreads into the kthread worker because they need to have a good control over the assigned CPUs. IMHO, the most natural way is to split one cycle into two works. First one does some balancing and let the CPU work normal way for some time. The second work checks what the CPU has done in the meantime and put it into C-state to reach the required idle time ratio. The delay between the two works is achieved by the delayed kthread work. The two works have to share some data that used to be local variables of the single kthread function. This is achieved by the new per-CPU struct kthread_worker_data. It might look as a complication. On the other hand, the long original kthread function was not nice either. The patch tries to avoid extra init and cleanup works. All the actions might be done outside the thread. They are moved to the functions that create or destroy the worker. Especially, I checked that the timers are assigned to the right CPU. The two works are queuing each other. It makes it a bit tricky to break it when we want to stop the worker. We use the global and per-worker "clamping" variables to make sure that the re-queuing eventually stops. We also cancel the works to make it faster. Note that the canceling is not reliable because the handling of the two variables and queuing is not synchronized via a lock. But it is not a big deal because it is just an optimization. The job is stopped faster than before in most cases. Signed-off-by: Petr Mladek Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel_powerclamp.c | 286 +++++++++++++++++------------ 1 file changed, 167 insertions(+), 119 deletions(-) diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 63657d193db5..a94f7c849a4e 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -86,11 +86,27 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update */ static bool clamping; +static const struct sched_param sparam = { + .sched_priority = MAX_USER_RT_PRIO / 2, +}; +struct powerclamp_worker_data { + struct kthread_worker *worker; + struct kthread_work balancing_work; + struct kthread_delayed_work idle_injection_work; + struct timer_list wakeup_timer; + unsigned int cpu; + unsigned int count; + unsigned int guard; + unsigned int window_size_now; + unsigned int target_ratio; + unsigned int duration_jiffies; + bool clamping; +}; -static struct task_struct * __percpu *powerclamp_thread; +static struct powerclamp_worker_data * __percpu worker_data; static struct thermal_cooling_device *cooling_dev; static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu - * clamping thread + * clamping kthread worker */ static unsigned int duration; @@ -368,103 +384,104 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, return set_target_ratio + guard <= current_ratio; } -static int clamp_thread(void *arg) +static void clamp_balancing_func(struct kthread_work *work) { - int cpunr = (unsigned long)arg; - DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - unsigned int count = 0; - unsigned int target_ratio; + struct powerclamp_worker_data *w_data; + int sleeptime; + unsigned long target_jiffies; + unsigned int compensated_ratio; + int interval; /* jiffies to sleep for each attempt */ - set_bit(cpunr, cpu_clamping_mask); - set_freezable(); - init_timer_on_stack(&wakeup_timer); - sched_setscheduler(current, SCHED_FIFO, ¶m); + w_data = container_of(work, struct powerclamp_worker_data, + balancing_work); - while (true == clamping && !kthread_should_stop() && - cpu_online(cpunr)) { - int sleeptime; - unsigned long target_jiffies; - unsigned int guard; - unsigned int compensated_ratio; - int interval; /* jiffies to sleep for each attempt */ - unsigned int duration_jiffies = msecs_to_jiffies(duration); - unsigned int window_size_now; + /* + * make sure user selected ratio does not take effect until + * the next round. adjust target_ratio if user has changed + * target such that we can converge quickly. + */ + w_data->target_ratio = READ_ONCE(set_target_ratio); + w_data->guard = 1 + w_data->target_ratio / 20; + w_data->window_size_now = window_size; + w_data->duration_jiffies = msecs_to_jiffies(duration); + w_data->count++; - try_to_freeze(); - /* - * make sure user selected ratio does not take effect until - * the next round. adjust target_ratio if user has changed - * target such that we can converge quickly. - */ - target_ratio = set_target_ratio; - guard = 1 + target_ratio/20; - window_size_now = window_size; - count++; + /* + * systems may have different ability to enter package level + * c-states, thus we need to compensate the injected idle ratio + * to achieve the actual target reported by the HW. + */ + compensated_ratio = w_data->target_ratio + + get_compensation(w_data->target_ratio); + if (compensated_ratio <= 0) + compensated_ratio = 1; + interval = w_data->duration_jiffies * 100 / compensated_ratio; - /* - * systems may have different ability to enter package level - * c-states, thus we need to compensate the injected idle ratio - * to achieve the actual target reported by the HW. - */ - compensated_ratio = target_ratio + - get_compensation(target_ratio); - if (compensated_ratio <= 0) - compensated_ratio = 1; - interval = duration_jiffies * 100 / compensated_ratio; + /* align idle time */ + target_jiffies = roundup(jiffies, interval); + sleeptime = target_jiffies - jiffies; + if (sleeptime <= 0) + sleeptime = 1; - /* align idle time */ - target_jiffies = roundup(jiffies, interval); - sleeptime = target_jiffies - jiffies; - if (sleeptime <= 0) - sleeptime = 1; - schedule_timeout_interruptible(sleeptime); - /* - * only elected controlling cpu can collect stats and update - * control parameters. - */ - if (cpunr == control_cpu && !(count%window_size_now)) { - should_skip = - powerclamp_adjust_controls(target_ratio, - guard, window_size_now); - smp_mb(); - } + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_delayed_work(w_data->worker, + &w_data->idle_injection_work, + sleeptime); +} - if (should_skip) - continue; +static void clamp_idle_injection_func(struct kthread_work *work) +{ + struct powerclamp_worker_data *w_data; + unsigned long target_jiffies; - target_jiffies = jiffies + duration_jiffies; - mod_timer(&wakeup_timer, target_jiffies); - if (unlikely(local_softirq_pending())) - continue; - /* - * stop tick sched during idle time, interrupts are still - * allowed. thus jiffies are updated properly. - */ - preempt_disable(); - /* mwait until target jiffies is reached */ - while (time_before(jiffies, target_jiffies)) { - unsigned long ecx = 1; - unsigned long eax = target_mwait; + w_data = container_of(work, struct powerclamp_worker_data, + idle_injection_work.work); - /* - * REVISIT: may call enter_idle() to notify drivers who - * can save power during cpu idle. same for exit_idle() - */ - local_touch_nmi(); - stop_critical_timings(); - mwait_idle_with_hints(eax, ecx); - start_critical_timings(); - atomic_inc(&idle_wakeup_counter); - } - preempt_enable(); + /* + * only elected controlling cpu can collect stats and update + * control parameters. + */ + if (w_data->cpu == control_cpu && + !(w_data->count % w_data->window_size_now)) { + should_skip = + powerclamp_adjust_controls(w_data->target_ratio, + w_data->guard, + w_data->window_size_now); + smp_mb(); } - del_timer_sync(&wakeup_timer); - clear_bit(cpunr, cpu_clamping_mask); - return 0; + if (should_skip) + goto balance; + + target_jiffies = jiffies + w_data->duration_jiffies; + mod_timer(&w_data->wakeup_timer, target_jiffies); + if (unlikely(local_softirq_pending())) + goto balance; + /* + * stop tick sched during idle time, interrupts are still + * allowed. thus jiffies are updated properly. + */ + preempt_disable(); + /* mwait until target jiffies is reached */ + while (time_before(jiffies, target_jiffies)) { + unsigned long ecx = 1; + unsigned long eax = target_mwait; + + /* + * REVISIT: may call enter_idle() to notify drivers who + * can save power during cpu idle. same for exit_idle() + */ + local_touch_nmi(); + stop_critical_timings(); + mwait_idle_with_hints(eax, ecx); + start_critical_timings(); + atomic_inc(&idle_wakeup_counter); + } + preempt_enable(); + +balance: + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_work(w_data->worker, &w_data->balancing_work); } /* @@ -508,22 +525,58 @@ static void poll_pkg_cstate(struct work_struct *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } -static void start_power_clamp_thread(unsigned long cpu) +static void start_power_clamp_worker(unsigned long cpu) { - struct task_struct **p = per_cpu_ptr(powerclamp_thread, cpu); - struct task_struct *thread; + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + struct kthread_worker *worker; - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%ld", cpu); - if (IS_ERR(thread)) + worker = kthread_create_worker_on_cpu(cpu, KTW_FREEZABLE, + "kidle_inject/%ld", cpu); + if (IS_ERR(worker)) return; - /* bind to cpu here */ - kthread_bind(thread, cpu); - wake_up_process(thread); - *p = thread; + w_data->worker = worker; + w_data->count = 0; + w_data->cpu = cpu; + w_data->clamping = true; + set_bit(cpu, cpu_clamping_mask); + setup_timer(&w_data->wakeup_timer, noop_timer, 0); + sched_setscheduler(worker->task, SCHED_FIFO, &sparam); + kthread_init_work(&w_data->balancing_work, clamp_balancing_func); + kthread_init_delayed_work(&w_data->idle_injection_work, + clamp_idle_injection_func); + kthread_queue_work(w_data->worker, &w_data->balancing_work); +} + +static void stop_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + + if (!w_data->worker) + return; + + w_data->clamping = false; + /* + * Make sure that all works that get queued after this point see + * the clamping disabled. The counter part is not needed because + * there is an implicit memory barrier when the queued work + * is proceed. + */ + smp_wmb(); + kthread_cancel_work_sync(&w_data->balancing_work); + kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); + /* + * The balancing work still might be queued here because + * the handling of the "clapming" variable, cancel, and queue + * operations are not synchronized via a lock. But it is not + * a big deal. The balancing work is fast and destroy kthread + * will wait for it. + */ + del_timer_sync(&w_data->wakeup_timer); + clear_bit(w_data->cpu, cpu_clamping_mask); + kthread_destroy_worker(w_data->worker); + + w_data->worker = NULL; } static int start_power_clamp(void) @@ -542,9 +595,9 @@ static int start_power_clamp(void) clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); - /* start one thread per online cpu */ + /* start one kthread worker per online cpu */ for_each_online_cpu(cpu) { - start_power_clamp_thread(cpu); + start_power_clamp_worker(cpu); } put_online_cpus(); @@ -554,20 +607,17 @@ static int start_power_clamp(void) static void end_power_clamp(void) { int i; - struct task_struct *thread; - clamping = false; /* - * make clamping visible to other cpus and give per cpu clamping threads - * sometime to exit, or gets killed later. + * Block requeuing in all the kthread workers. They will flush and + * stop faster. */ - smp_mb(); - msleep(20); + clamping = false; if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { - pr_debug("clamping thread for cpu %d alive, kill\n", i); - thread = *per_cpu_ptr(powerclamp_thread, i); - kthread_stop(thread); + pr_debug("clamping worker for cpu %d alive, destroy\n", + i); + stop_power_clamp_worker(i); } } } @@ -576,15 +626,13 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned long cpu = (unsigned long)hcpu; - struct task_struct **percpu_thread = - per_cpu_ptr(powerclamp_thread, cpu); if (false == clamping) goto exit_ok; switch (action) { case CPU_ONLINE: - start_power_clamp_thread(cpu); + start_power_clamp_worker(cpu); /* prefer BSP as controlling CPU */ if (cpu == 0) { control_cpu = 0; @@ -595,7 +643,7 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb, if (test_bit(cpu, cpu_clamping_mask)) { pr_err("cpu %lu dead but powerclamping thread is not\n", cpu); - kthread_stop(*percpu_thread); + stop_power_clamp_worker(cpu); } if (cpu == control_cpu) { control_cpu = smp_processor_id(); @@ -759,8 +807,8 @@ static int __init powerclamp_init(void) window_size = 2; register_hotcpu_notifier(&powerclamp_cpu_notifier); - powerclamp_thread = alloc_percpu(struct task_struct *); - if (!powerclamp_thread) { + worker_data = alloc_percpu(struct powerclamp_worker_data); + if (!worker_data) { retval = -ENOMEM; goto exit_unregister; } @@ -780,7 +828,7 @@ static int __init powerclamp_init(void) return 0; exit_free_thread: - free_percpu(powerclamp_thread); + free_percpu(worker_data); exit_unregister: unregister_hotcpu_notifier(&powerclamp_cpu_notifier); exit_free: @@ -793,7 +841,7 @@ static void __exit powerclamp_exit(void) { unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); - free_percpu(powerclamp_thread); + free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask); From cb91fef1b71954e3edc79fb4171b43f6aa2028c7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 28 Nov 2016 13:44:51 -0800 Subject: [PATCH 08/17] thermal/intel_powerclamp: Convert to CPU hotplug state This is a conversation to the new hotplug state machine with the difference that CPU_DEAD becomes CPU_PREDOWN. At the same time it makes the handling of the two states symmetrical. stop_power_clamp_worker() is called unconditionally and the controversial error message is removed. Finally, the hotplug state callbacks are removed after the powerclamping is stopped to avoid a potential race. Signed-off-by: Sebastian Andrzej Siewior [pmladek@suse.com: Fixed the possible race in powerclamp_exit()] Signed-off-by: Petr Mladek Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel_powerclamp.c | 75 +++++++++++++++--------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index a94f7c849a4e..c99af71c0ae7 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -530,8 +529,7 @@ static void start_power_clamp_worker(unsigned long cpu) struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); struct kthread_worker *worker; - worker = kthread_create_worker_on_cpu(cpu, KTW_FREEZABLE, - "kidle_inject/%ld", cpu); + worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); if (IS_ERR(worker)) return; @@ -622,42 +620,34 @@ static void end_power_clamp(void) } } -static int powerclamp_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int powerclamp_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - - if (false == clamping) - goto exit_ok; - - switch (action) { - case CPU_ONLINE: - start_power_clamp_worker(cpu); - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - break; - case CPU_DEAD: - if (test_bit(cpu, cpu_clamping_mask)) { - pr_err("cpu %lu dead but powerclamping thread is not\n", - cpu); - stop_power_clamp_worker(cpu); - } - if (cpu == control_cpu) { - control_cpu = smp_processor_id(); - smp_mb(); - } + if (clamping == false) + return 0; + start_power_clamp_worker(cpu); + /* prefer BSP as controlling CPU */ + if (cpu == 0) { + control_cpu = 0; + smp_mb(); } - -exit_ok: - return NOTIFY_OK; + return 0; } -static struct notifier_block powerclamp_cpu_notifier = { - .notifier_call = powerclamp_cpu_callback, -}; +static int powerclamp_cpu_predown(unsigned int cpu) +{ + if (clamping == false) + return 0; + + stop_power_clamp_worker(cpu); + if (cpu != control_cpu) + return 0; + + control_cpu = cpumask_first(cpu_online_mask); + if (control_cpu == cpu) + control_cpu = cpumask_next(cpu, cpu_online_mask); + smp_mb(); + return 0; +} static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -788,6 +778,8 @@ file_error: debugfs_remove_recursive(debug_dir); } +static enum cpuhp_state hp_state; + static int __init powerclamp_init(void) { int retval; @@ -805,7 +797,14 @@ static int __init powerclamp_init(void) /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - register_hotcpu_notifier(&powerclamp_cpu_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "thermal/intel_powerclamp:online", + powerclamp_cpu_online, + powerclamp_cpu_predown); + if (retval < 0) + goto exit_free; + + hp_state = retval; worker_data = alloc_percpu(struct powerclamp_worker_data); if (!worker_data) { @@ -830,7 +829,7 @@ static int __init powerclamp_init(void) exit_free_thread: free_percpu(worker_data); exit_unregister: - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); + cpuhp_remove_state_nocalls(hp_state); exit_free: kfree(cpu_clamping_mask); return retval; @@ -839,8 +838,8 @@ module_init(powerclamp_init); static void __exit powerclamp_exit(void) { - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); + cpuhp_remove_state_nocalls(hp_state); free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask); From feb6cd6a0f9f7d214351624d79e408cb2af91631 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 13:44:52 -0800 Subject: [PATCH 09/17] thermal/intel_powerclamp: stop sched tick in forced idle With the introduction of play_idle(), idle injection kthread can go through the normal idle task processing to get correct accounting and turn off scheduler tick when possible. Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel_powerclamp.c | 35 +----------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index c99af71c0ae7..6d93f1d65dba 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -92,7 +92,6 @@ struct powerclamp_worker_data { struct kthread_worker *worker; struct kthread_work balancing_work; struct kthread_delayed_work idle_injection_work; - struct timer_list wakeup_timer; unsigned int cpu; unsigned int count; unsigned int guard; @@ -277,11 +276,6 @@ static u64 pkg_state_counter(void) return count; } -static void noop_timer(unsigned long foo) -{ - /* empty... just the fact that we get the interrupt wakes us up */ -} - static unsigned int get_compensation(int ratio) { unsigned int comp = 0; @@ -431,7 +425,6 @@ static void clamp_balancing_func(struct kthread_work *work) static void clamp_idle_injection_func(struct kthread_work *work) { struct powerclamp_worker_data *w_data; - unsigned long target_jiffies; w_data = container_of(work, struct powerclamp_worker_data, idle_injection_work.work); @@ -452,31 +445,7 @@ static void clamp_idle_injection_func(struct kthread_work *work) if (should_skip) goto balance; - target_jiffies = jiffies + w_data->duration_jiffies; - mod_timer(&w_data->wakeup_timer, target_jiffies); - if (unlikely(local_softirq_pending())) - goto balance; - /* - * stop tick sched during idle time, interrupts are still - * allowed. thus jiffies are updated properly. - */ - preempt_disable(); - /* mwait until target jiffies is reached */ - while (time_before(jiffies, target_jiffies)) { - unsigned long ecx = 1; - unsigned long eax = target_mwait; - - /* - * REVISIT: may call enter_idle() to notify drivers who - * can save power during cpu idle. same for exit_idle() - */ - local_touch_nmi(); - stop_critical_timings(); - mwait_idle_with_hints(eax, ecx); - start_critical_timings(); - atomic_inc(&idle_wakeup_counter); - } - preempt_enable(); + play_idle(jiffies_to_msecs(w_data->duration_jiffies)); balance: if (clamping && w_data->clamping && cpu_online(w_data->cpu)) @@ -538,7 +507,6 @@ static void start_power_clamp_worker(unsigned long cpu) w_data->cpu = cpu; w_data->clamping = true; set_bit(cpu, cpu_clamping_mask); - setup_timer(&w_data->wakeup_timer, noop_timer, 0); sched_setscheduler(worker->task, SCHED_FIFO, &sparam); kthread_init_work(&w_data->balancing_work, clamp_balancing_func); kthread_init_delayed_work(&w_data->idle_injection_work, @@ -570,7 +538,6 @@ static void stop_power_clamp_worker(unsigned long cpu) * a big deal. The balancing work is fast and destroy kthread * will wait for it. */ - del_timer_sync(&w_data->wakeup_timer); clear_bit(w_data->cpu, cpu_clamping_mask); kthread_destroy_worker(w_data->worker); From 5e7ec268fd48d63cfd0e3a9be6c6443f01673bd4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 25 Oct 2016 17:11:39 +0300 Subject: [PATCH 10/17] x86/intel_idle: Add CPU model 0x4a (Atom Z34xx series) Add CPU ID for Atom Z34xx processors. Datasheets indicate support for this, detailed information about potential quirks or limitations are missing, though. So we just reuse the definition from official BSP code. Signed-off-by: Andy Shevchenko Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 49 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4466a2f969d7..5ded9b22b015 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -724,6 +724,50 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state tangier_cstates[] = { + { + .name = "C1-TNG", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 4, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C4-TNG", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 100, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-TNG", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-TNG", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-TNG", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; static struct cpuidle_state avn_cstates[] = { { .name = "C1-AVN", @@ -978,6 +1022,10 @@ static const struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; +static const struct idle_cpu idle_cpu_tangier = { + .state_table = tangier_cstates, +}; + static const struct idle_cpu idle_cpu_lincroft = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, @@ -1066,6 +1114,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), From a2c1bc645e87346150516b3abf1933ed29d0f48b Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Thu, 13 Oct 2016 17:30:58 +0200 Subject: [PATCH 11/17] x86/intel_idle: Add Knights Mill CPUID Add Knights Mill (KNM) to the list of CPUIDs supported by intel_idle. Signed-off-by: Piotr Luc Reviewed-by: Dave Hansen Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 5ded9b22b015..d6377c16b3f8 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1133,6 +1133,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), + ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), {} From 2ed38cbe05e7ea27022e29d75e7b93c20ccc5eaa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Nov 2016 02:32:17 +0100 Subject: [PATCH 12/17] MAINTAINERS: Add bug tracking system location entries for cpuidle The kernel Bugzilla is used for tracking bugs in the cpuidle core and intel_idle, so document that. Signed-off-by: Rafael J. Wysocki Acked-by: Len Brown --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1cd38a7e0064..08753a5a23be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3367,6 +3367,7 @@ M: Daniel Lezcano L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git +B: https://bugzilla.kernel.org F: drivers/cpuidle/* F: include/linux/cpuidle.h @@ -6271,6 +6272,7 @@ INTEL IDLE DRIVER M: Len Brown L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git +B: https://bugzilla.kernel.org S: Supported F: drivers/idle/intel_idle.c From 6af33995318fdfb4914fb1c5e67450fdb3d32084 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Nov 2016 02:33:07 +0100 Subject: [PATCH 13/17] MAINTAINERS: Add Jacob Pan as a new intel_idle maintainer The intel_idle driver is going to be maintained by Jacob Pan now, so update MAINTAINERS accordingly. Signed-off-by: Rafael J. Wysocki Acked-by: Len Brown --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 08753a5a23be..0248f13c8adf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6269,6 +6269,7 @@ S: Maintained F: drivers/platform/x86/intel-vbtn.c INTEL IDLE DRIVER +M: Jacob Pan M: Len Brown L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git From 29d7bbada98e0969dd2ea159104aa55052a1c439 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Sun, 27 Nov 2016 00:13:31 +0100 Subject: [PATCH 14/17] intel_idle: Remove superfluous SMP fuction call Since commit 1cf4f629d9d2 ("cpu/hotplug: Move online calls to hotplugged cpu") the CPU_ONLINE and CPU_DOWN_PREPARE notifiers are always run on the hot plugged CPU, and as of commit 3b9d6da67e11 ("cpu/hotplug: Fix rollback during error-out in __cpu_disable()") the CPU_DOWN_FAILED notifier also runs on the hot plugged CPU. This patch converts the SMP functional calls into direct calls. smp_function_call_single() executes the function with interrupts disabled. This calling convention is not preserved, because tick_broadcast_enable() and tick_braodcast_disable() handle interrupts themselves. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Acked-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index d6377c16b3f8..3828babfb77b 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -971,8 +971,7 @@ static int cpu_hotplug_notify(struct notifier_block *n, case CPU_ONLINE: if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - smp_call_function_single(hotcpu, __setup_broadcast_timer, - (void *)true, 1); + __setup_broadcast_timer((void *)true); /* * Some systems can hotplug a cpu at runtime after From fb1013a01673acf7e94e38cda169828ac76b345a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2016 10:51:43 +0100 Subject: [PATCH 15/17] intel_idle: Convert to hotplug state machine Install the callbacks via the state machine and let the core invoke the callbacks on the already online CPUs. The two smp_call_function_single() invocations in intel_idle_cpu_init() have been removed because intel_idle_cpu_init() is now invoked via the hotplug callback which runs on the target CPU. The IRQ-off calling convention for auto_demotion_disable() and c1e_promotion_disable() has not been preserved because only those two modify the MSR during CPU intialization. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 103 +++++++++++++++----------------------- 1 file changed, 39 insertions(+), 64 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3828babfb77b..7d8ea3d5fda6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static void intel_idle_freeze(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -static int intel_idle_cpu_init(int cpu); - static struct cpuidle_state *cpuidle_state_table; /* @@ -951,50 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static void __setup_broadcast_timer(void *arg) +static void __setup_broadcast_timer(bool on) { - unsigned long on = (unsigned long)arg; - if (on) tick_broadcast_enable(); else tick_broadcast_disable(); } -static int cpu_hotplug_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - __setup_broadcast_timer((void *)true); - - /* - * Some systems can hotplug a cpu at runtime after - * the kernel has booted, we have to initialize the - * driver in this case - */ - dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (dev->registered) - break; - - if (intel_idle_cpu_init(hotcpu)) - return NOTIFY_BAD; - - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_hotplug_notifier = { - .notifier_call = cpu_hotplug_notify, -}; - -static void auto_demotion_disable(void *dummy) +static void auto_demotion_disable(void) { unsigned long long msr_bits; @@ -1002,7 +965,7 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } -static void c1e_promotion_disable(void *dummy) +static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -1422,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void) * allocate, initialize, register cpuidle_devices * @cpu: cpu/core to initialize */ -static int intel_idle_cpu_init(int cpu) +static int intel_idle_cpu_init(unsigned int cpu) { struct cpuidle_device *dev; dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); - dev->cpu = cpu; if (cpuidle_register_device(dev)) { @@ -1436,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu) } if (icpu->auto_demotion_disable_flags) - smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); + auto_demotion_disable(); if (icpu->disable_promotion_to_c1e) - smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); + c1e_promotion_disable(); + + return 0; +} + +static int intel_idle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + __setup_broadcast_timer(true); + + /* + * Some systems can hotplug a cpu at runtime after + * the kernel has booted, we have to initialize the + * driver in this case + */ + dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); + if (!dev->registered) + return intel_idle_cpu_init(cpu); return 0; } static int __init intel_idle_init(void) { - int retval, i; + int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) @@ -1466,35 +1447,29 @@ static int __init intel_idle_init(void) struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? drv->name : "none"); - free_percpu(intel_idle_cpuidle_devices); - return retval; + goto init_driver_fail; } - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - retval = intel_idle_cpu_init(i); - if (retval) { - intel_idle_cpuidle_devices_uninit(); - cpu_notifier_register_done(); - cpuidle_unregister_driver(&intel_idle_driver); - free_percpu(intel_idle_cpuidle_devices); - return retval; - } - } - __register_cpu_notifier(&cpu_hotplug_notifier); - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - cpu_notifier_register_done(); + retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", + intel_idle_cpu_online, NULL); + if (retval < 0) + goto hp_setup_fail; pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", lapic_timer_reliable_states); return 0; + +hp_setup_fail: + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); +init_driver_fail: + free_percpu(intel_idle_cpuidle_devices); + return retval; + } device_initcall(intel_idle_init); From 8f6040cebd2382a5cfb201419d40e7a4a193a412 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sat, 3 Dec 2016 23:02:27 +0800 Subject: [PATCH 16/17] cpuidle: fix improper return value on error In function cpuidle_add_state_sysfs(), variable ret takes the return value. Its value should be negative on errors. Because ret is reset in the loop, its value will be 0 during the second and after repeat of the loop. If kzalloc() returns a NULL pointer then, it will return 0. It may be better to explicitly assign "-ENOMEM" when the call to kzalloc() fails. Link: https://bugzilla.kernel.org/show_bug.cgi?id=188901 Signed-off-by: Pan Bian Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 832a2c3f01ff..c5adc8c9ac43 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) /* state statistics */ for (i = 0; i < drv->state_count; i++) { kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); - if (!kobj) + if (!kobj) { + ret = -ENOMEM; goto error_state; + } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; init_completion(&kobj->kobj_unregister); From 0e7414b7aa8b294fddefbad020798f7c8ebe1622 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 Dec 2016 23:31:32 +0100 Subject: [PATCH 17/17] cpuidle: Add a kerneldoc comment to cpuidle_use_deepest_state() Since cpuidle_use_deepest_state() is not static, add a proper kerneldoc comment to it to document its purpose. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index afc005b917fe..62810ff3b00f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,13 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -/* Set the current cpu to use the deepest idle state, override governors */ +/** + * cpuidle_use_deepest_state - Set/clear governor override flag. + * @enable: New value of the flag. + * + * Set/unset the current CPU to use the deepest idle state (override governors + * going forward if set). + */ void cpuidle_use_deepest_state(bool enable) { struct cpuidle_device *dev;