forked from Minki/linux
b8f8c3cf0a
Jack Ren and Eric Miao tracked down the following long standing problem in the NOHZ code: scheduler switch to idle task enable interrupts Window starts here ----> interrupt happens (does not set NEED_RESCHED) irq_exit() stops the tick ----> interrupt happens (does set NEED_RESCHED) return from schedule() cpu_idle(): preempt_disable(); Window ends here The interrupts can happen at any point inside the race window. The first interrupt stops the tick, the second one causes the scheduler to rerun and switch away from idle again and we end up with the tick disabled. The fact that it needs two interrupts where the first one does not set NEED_RESCHED and the second one does made the bug obscure and extremely hard to reproduce and analyse. Kudos to Jack and Eric. Solution: Limit the NOHZ functionality to the idle loop to make sure that we can not run into such a situation ever again. cpu_idle() { preempt_disable(); while(1) { tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we are in the idle loop while (!need_resched()) halt(); tick_nohz_restart_sched_tick(); <- disables NOHZ mode preempt_enable_no_resched(); schedule(); preempt_disable(); } } In hindsight we should have done this forever, but ... /me grabs a large brown paperbag. Debugged-by: Jack Ren <jack.ren@marvell.com>, Debugged-by: eric miao <eric.y.miao@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
140 lines
3.2 KiB
C
140 lines
3.2 KiB
C
/*
 * Idle daemon for PowerPC.  Idle daemon will handle any action
 * that needs to be taken when the system becomes idle.
 *
 * Originally written by Cort Dougan (cort@cs.nmt.edu).
 * Subsequent 32-bit hacking by Tom Rini, Armin Kuster,
 * Paul Mackerras and others.
 *
 * iSeries support added by Mike Corrigan <mikejc@us.ibm.com>
 *
 * Additional shared processor, SMT, and firmware support
 *    Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
 *
 * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/cpu.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/tick.h>
|
|
|
|
#include <asm/system.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/cputable.h>
|
|
#include <asm/time.h>
|
|
#include <asm/machdep.h>
|
|
#include <asm/smp.h>
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
/*
 * True when the current CPU has been marked offline and the system is either
 * running or still booting.  The SYSTEM_BOOTING case is needed because this
 * is used for software suspend, and that shuts down CPUs even while the
 * system is still booting.
 */
#define cpu_should_die()	(cpu_is_offline(smp_processor_id()) && \
				 (system_state == SYSTEM_RUNNING \
				  || system_state == SYSTEM_BOOTING))
#else
/* Without CPU hotplug support a CPU is never asked to die. */
#define cpu_should_die()	0
#endif
|
|
|
|
static int __init powersave_off(char *arg)
|
|
{
|
|
ppc_md.power_save = NULL;
|
|
return 0;
|
|
}
|
|
__setup("powersave=off", powersave_off);
|
|
|
|
/*
|
|
* The body of the idle task.
|
|
*/
|
|
void cpu_idle(void)
|
|
{
|
|
if (ppc_md.idle_loop)
|
|
ppc_md.idle_loop(); /* doesn't return */
|
|
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
|
while (1) {
|
|
tick_nohz_stop_sched_tick(1);
|
|
while (!need_resched() && !cpu_should_die()) {
|
|
ppc64_runlatch_off();
|
|
|
|
if (ppc_md.power_save) {
|
|
clear_thread_flag(TIF_POLLING_NRFLAG);
|
|
/*
|
|
* smp_mb is so clearing of TIF_POLLING_NRFLAG
|
|
* is ordered w.r.t. need_resched() test.
|
|
*/
|
|
smp_mb();
|
|
local_irq_disable();
|
|
|
|
/* check again after disabling irqs */
|
|
if (!need_resched() && !cpu_should_die())
|
|
ppc_md.power_save();
|
|
|
|
local_irq_enable();
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
|
|
|
} else {
|
|
/*
|
|
* Go into low thread priority and possibly
|
|
* low power mode.
|
|
*/
|
|
HMT_low();
|
|
HMT_very_low();
|
|
}
|
|
}
|
|
|
|
HMT_medium();
|
|
ppc64_runlatch_on();
|
|
tick_nohz_restart_sched_tick();
|
|
if (cpu_should_die())
|
|
cpu_die();
|
|
preempt_enable_no_resched();
|
|
schedule();
|
|
preempt_disable();
|
|
}
|
|
}
|
|
|
|
/*
 * Integer exported read/write through the "powersave-nap" sysctl entry
 * defined in this file when CONFIG_SYSCTL is set.  Nothing in this file
 * reads it; presumably it is consumed by platform idle code elsewhere -
 * verify against the users of this symbol.
 */
int powersave_nap;
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
/*
|
|
* Register the sysctl to set/clear powersave_nap.
|
|
*/
|
|
static ctl_table powersave_nap_ctl_table[]={
|
|
{
|
|
.ctl_name = KERN_PPC_POWERSAVE_NAP,
|
|
.procname = "powersave-nap",
|
|
.data = &powersave_nap,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = &proc_dointvec,
|
|
},
|
|
{}
|
|
};
|
|
static ctl_table powersave_nap_sysctl_root[] = {
|
|
{
|
|
.ctl_name = CTL_KERN,
|
|
.procname = "kernel",
|
|
.mode = 0555,
|
|
.child = powersave_nap_ctl_table,
|
|
},
|
|
{}
|
|
};
|
|
|
|
static int __init
|
|
register_powersave_nap_sysctl(void)
|
|
{
|
|
register_sysctl_table(powersave_nap_sysctl_root);
|
|
|
|
return 0;
|
|
}
|
|
__initcall(register_powersave_nap_sysctl);
|
|
#endif
|