Merge branch 'linux-2.6'

Paul Mackerras
2008-01-31 11:25:51 +11:00
4565 changed files with 386028 additions and 214774 deletions


@@ -54,3 +54,5 @@ config HZ
default 300 if HZ_300
default 1000 if HZ_1000
config SCHED_HRTICK
def_bool HIGH_RES_TIMERS && X86


@@ -52,14 +52,13 @@ config PREEMPT
endchoice
config PREEMPT_BKL
bool "Preempt The Big Kernel Lock"
depends on SMP || PREEMPT
config RCU_TRACE
bool "Enable tracing for RCU - currently stats in debugfs"
select DEBUG_FS
default y
help
This option reduces the latency of the kernel by making the
big kernel lock preemptible.
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
Say Y here if you are building a kernel for a desktop system.
Say Y here if you want to enable RCU tracing
Say N if you are unsure.


@@ -36,6 +36,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -52,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
ifeq ($(CONFIG_PREEMPT_RCU),y)
obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
endif
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

kernel/backtracetest.c (new file, 48 lines)

@@ -0,0 +1,48 @@
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
static struct timer_list backtrace_timer;
static void backtrace_test_timer(unsigned long data)
{
printk("Testing a backtrace from irq context.\n");
printk("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
static int backtrace_regression_test(void)
{
printk("====[ backtrace testing ]===========\n");
printk("Testing a backtrace from process context.\n");
printk("The following trace is a kernel self test and not a bug!\n");
dump_stack();
init_timer(&backtrace_timer);
backtrace_timer.function = backtrace_test_timer;
mod_timer(&backtrace_timer, jiffies + 10);
msleep(10);
printk("====[ end of backtrace testing ]====\n");
return 0;
}
static void exitf(void)
{
}
module_init(backtrace_regression_test);
module_exit(exitf);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");


@@ -15,9 +15,8 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>
/* This protects CPUs going up and down... */
/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);
static DEFINE_MUTEX(cpu_bitmask_lock);
static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
*/
static int cpu_hotplug_disabled;
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
int refcount;
wait_queue_head_t writer_queue;
} cpu_hotplug;
#define writer_exists() (cpu_hotplug.active_writer != NULL)
void __init cpu_hotplug_init(void)
{
cpu_hotplug.active_writer = NULL;
mutex_init(&cpu_hotplug.lock);
cpu_hotplug.refcount = 0;
init_waitqueue_head(&cpu_hotplug.writer_queue);
}
#ifdef CONFIG_HOTPLUG_CPU
/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
static struct task_struct *recursive;
static int recursive_depth;
void lock_cpu_hotplug(void)
void get_online_cpus(void)
{
struct task_struct *tsk = current;
if (tsk == recursive) {
static int warnings = 10;
if (warnings) {
printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
WARN_ON(1);
warnings--;
}
recursive_depth++;
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
}
mutex_lock(&cpu_bitmask_lock);
recursive = tsk;
}
EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
void unlock_cpu_hotplug(void)
}
EXPORT_SYMBOL_GPL(get_online_cpus);
void put_online_cpus(void)
{
WARN_ON(recursive != current);
if (recursive_depth) {
recursive_depth--;
if (cpu_hotplug.active_writer == current)
return;
}
recursive = NULL;
mutex_unlock(&cpu_bitmask_lock);
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount--;
if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
wake_up(&cpu_hotplug.writer_queue);
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
EXPORT_SYMBOL_GPL(put_online_cpus);
#endif /* CONFIG_HOTPLUG_CPU */
/*
* The following two APIs must be used when attempting
* to serialize the updates to cpu_online_map, cpu_present_map.
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
* will be blocked by the cpu_hotplug.lock
*
* Since cpu_hotplug_begin() is always called after invoking
* cpu_maps_update_begin(), we can be sure that only one writer is active.
*
* Note that theoretically, there is a possibility of a livelock:
* - Refcount goes to zero, last reader wakes up the sleeping
* writer.
* - Last reader unlocks the cpu_hotplug.lock.
* - A new reader arrives at this moment, bumps up the refcount.
* - The writer acquires the cpu_hotplug.lock finds the refcount
* non zero and goes to sleep again.
*
* However, this is very difficult to achieve in practice since
* get_online_cpus() is not an api which is called all that often.
*
*/
static void cpu_hotplug_begin(void)
{
DECLARE_WAITQUEUE(wait, current);
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.active_writer = current;
add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
while (cpu_hotplug.refcount) {
set_current_state(TASK_UNINTERRUPTIBLE);
mutex_unlock(&cpu_hotplug.lock);
schedule();
mutex_lock(&cpu_hotplug.lock);
}
remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
}
static void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
}
/* Need to know about CPUs going up/down? */
int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
int ret;
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
ret = raw_notifier_chain_register(&cpu_chain, nb);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return ret;
}
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
{
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
cpu_hotplug_begin();
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
cpu_clear(cpu, tmp);
set_cpus_allowed(current, tmp);
mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
mutex_unlock(&cpu_bitmask_lock);
if (IS_ERR(p) || cpu_online(cpu)) {
/* CPU didn't die: tell everyone. Can't complain. */
@@ -202,7 +270,7 @@ out_thread:
out_allowed:
set_cpus_allowed(current, old_allowed);
out_release:
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
cpu_hotplug_done();
return err;
}
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
{
int err = 0;
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_down(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return err;
}
#endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
cpu_hotplug_begin();
ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-1, &nr_calls);
if (ret == NOTIFY_BAD) {
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
}
/* Arch-specific enabling code. */
mutex_lock(&cpu_bitmask_lock);
ret = __cpu_up(cpu);
mutex_unlock(&cpu_bitmask_lock);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@@ -257,7 +323,7 @@ out_notify:
if (ret != 0)
__raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
cpu_hotplug_done();
return ret;
}
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
return -EINVAL;
}
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_up(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return err;
}
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return error;
}
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
cpu_hotplug_disabled = 0;
if (cpus_empty(frozen_cpus))
goto out;
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
}
cpus_clear(frozen_cpus);
out:
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
}
#endif /* CONFIG_PM_SLEEP_SMP */
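
The hunks above replace lock_cpu_hotplug()/unlock_cpu_hotplug() with a refcounted read side: get_online_cpus() bumps cpu_hotplug.refcount under cpu_hotplug.lock, and cpu_hotplug_begin() sleeps on writer_queue until the count drains to zero. As an illustration of how a reader is expected to use the new API (a sketch, not part of this diff):

/* Illustrative only: walk the online map without racing a hotplug writer. */
#include <linux/cpu.h>
#include <linux/cpumask.h>

static unsigned int count_online_cpus_example(void)
{
        unsigned int cpu, n = 0;

        get_online_cpus();      /* blocks while cpu_hotplug_begin() holds cpu_hotplug.lock */
        for_each_online_cpu(cpu)
                n++;
        put_online_cpus();      /* the last reader wakes a writer sleeping on writer_queue */

        return n;
}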


@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
*
* Call with cgroup_mutex held. May take callback_mutex during
* call due to the kfifo_alloc() and kmalloc() calls. May nest
* a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
* a call to the get_online_cpus()/put_online_cpus() pair.
* Must not be called holding callback_mutex, because we must not
* call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
* the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
* call get_online_cpus() while holding callback_mutex. Elsewhere
* the kernel nests callback_mutex inside get_online_cpus() calls.
* So the reverse nesting would risk an ABBA deadlock.
*
* The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
rebuild:
/* Have scheduler rebuild sched domains */
lock_cpu_hotplug();
get_online_cpus();
partition_sched_domains(ndoms, doms);
unlock_cpu_hotplug();
put_online_cpus();
done:
if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
* will call rebuild_sched_domains(). The lock_cpu_hotplug()
* will call rebuild_sched_domains(). The get_online_cpus()
* call in rebuild_sched_domains() must not be made while holding
* callback_mutex. Elsewhere the kernel nests callback_mutex inside
* lock_cpu_hotplug() calls. So the reverse nesting would risk an
* get_online_cpus() calls. So the reverse nesting would risk an
* ABBA deadlock.
*/
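
The nesting rule these comments describe (callback_mutex always taken inside get_online_cpus()) amounts to the following ordering in any cpuset.c caller; a minimal sketch for illustration, not code from this commit:

static void cpuset_nesting_example(void)
{
        get_online_cpus();              /* outer: CPU-hotplug read side */
        mutex_lock(&callback_mutex);    /* inner: cpuset's callback_mutex */
        /* ... work that reads cpusets and cpu_online_map ... */
        mutex_unlock(&callback_mutex);
        put_online_cpus();
        /* Acquiring them in the opposite order elsewhere is the ABBA
         * deadlock the comment warns about. */
}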


@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr)
addr <= (unsigned long)_etext)
return 1;
if (addr >= (unsigned long)_sinittext &&
if (system_state == SYSTEM_BOOTING &&
addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
return 1;
return 0;


@@ -51,6 +51,7 @@
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
destroy_context(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
/*
* Decrement the use count and release all resources for an mm.
@@ -791,6 +793,31 @@ out:
return error;
}
static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
struct io_context *ioc = current->io_context;
if (!ioc)
return 0;
/*
* Share io context with parent, if CLONE_IO is set
*/
if (clone_flags & CLONE_IO) {
tsk->io_context = ioc_task_link(ioc);
if (unlikely(!tsk->io_context))
return -ENOMEM;
} else if (ioprio_valid(ioc->ioprio)) {
tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
if (unlikely(!tsk->io_context))
return -ENOMEM;
tsk->io_context->ioprio = ioc->ioprio;
}
#endif
return 0;
}
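
For context, copy_io() above is what backs the new CLONE_IO clone flag: with CLONE_IO the child shares the parent's io_context (and thus its I/O priority and queue), otherwise it only inherits the ioprio value. A hedged userspace sketch of using the flag; CLONE_IO's numeric value is taken from the kernel headers of this series, and error handling is trimmed:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>

#ifndef CLONE_IO
#define CLONE_IO 0x80000000     /* share io_context with the parent */
#endif

static int io_worker(void *arg)
{
        /* runs with the parent's io_context when CLONE_IO was passed */
        return 0;
}

int spawn_io_sharing_child(void)
{
        const size_t stack_size = 64 * 1024;
        char *stack = malloc(stack_size);

        if (!stack)
                return -1;
        /* the stack grows down on most architectures, so pass its top */
        return clone(io_worker, stack + stack_size, CLONE_IO | SIGCHLD, NULL);
}
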
/*
* Helper to unshare the files of the current task.
* We don't want to expose copy_files internals to
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
#ifdef CONFIG_PREEMPT_RCU
p->rcu_read_lock_nesting = 0;
p->rcu_flipctr_idx = 0;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
@@ -1059,6 +1090,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
#ifdef CONFIG_DETECT_SOFTLOCKUP
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
#endif
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
@@ -1147,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_mm;
if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespaces;
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
retval = -ENOMEM;
pid = alloc_pid(task_active_pid_ns(p));
if (!pid)
goto bad_fork_cleanup_namespaces;
goto bad_fork_cleanup_io;
if (clone_flags & CLONE_NEWPID) {
retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1196,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
clear_all_latency_tracing(p);
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
@@ -1224,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
p->ioprio = current->ioprio;
/*
* The task hasn't been attached yet, so its cpus_allowed mask will
* not be changed, nor will its assigned CPU.
@@ -1237,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* parent's CPU). This avoids a lot of nasty races.
*/
p->cpus_allowed = current->cpus_allowed;
p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
!cpu_online(task_cpu(p))))
set_task_cpu(p, smp_processor_id());
@@ -1317,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_io:
put_io_context(p->io_context);
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_keys:


@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
}
#endif /* BITS_PER_LONG >= 64 */
/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
{
return timer->state & HRTIMER_STATE_PENDING;
}
/*
* Remove a timer from the callback pending list
*/
static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
{
list_del_init(&timer->cb_entry);
}
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -493,22 +509,6 @@ void hres_timers_resume(void)
retrigger_next_event(NULL);
}
/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
{
return timer->state & HRTIMER_STATE_PENDING;
}
/*
* Remove a timer from the callback pending list
*/
static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
{
list_del_init(&timer->cb_entry);
}
/*
* Initialize the high resolution related parts of cpu_base
*/
@@ -516,7 +516,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next.tv64 = KTIME_MAX;
base->hres_active = 0;
INIT_LIST_HEAD(&base->cb_pending);
}
/*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
*/
static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
{
INIT_LIST_HEAD(&timer->cb_entry);
}
/*
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
{
return 0;
}
static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
static inline int hrtimer_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
return 0;
}
#endif /* CONFIG_HIGH_RES_TIMERS */
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
clock_id = CLOCK_MONOTONIC;
timer->base = &cpu_base->clock_base[clock_id];
INIT_LIST_HEAD(&timer->cb_entry);
hrtimer_init_timer_hres(timer);
#ifdef CONFIG_TIMER_STATS
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
}
EXPORT_SYMBOL_GPL(hrtimer_get_res);
static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
{
spin_lock_irq(&cpu_base->lock);
while (!list_empty(&cpu_base->cb_pending)) {
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart == HRTIMER_RESTART) {
BUG_ON(hrtimer_active(timer));
/*
* Enqueue the timer, allow reprogramming of the event
* device
*/
enqueue_hrtimer(timer, timer->base, 1);
} else if (hrtimer_active(timer)) {
/*
* If the timer was rearmed on another CPU, reprogram
* the event device.
*/
if (timer->base->first == &timer->node)
hrtimer_reprogram(timer, timer->base);
}
}
spin_unlock_irq(&cpu_base->lock);
}
static void __run_hrtimer(struct hrtimer *timer)
{
struct hrtimer_clock_base *base = timer->base;
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
/*
* Used for scheduler timers, avoid lock inversion with
* rq->lock and tasklist_lock.
*
* These timers are required to deal with enqueue expiry
* themselves and are not allowed to migrate.
*/
spin_unlock(&cpu_base->lock);
restart = fn(timer);
spin_lock(&cpu_base->lock);
} else
restart = fn(timer);
/*
* Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
* reprogramming of the event hardware. This happens at the end of this
* function anyway.
*/
if (restart != HRTIMER_NORESTART) {
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base, 0);
}
timer->state &= ~HRTIMER_STATE_CALLBACK;
}
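
__run_hrtimer() above centralizes the callback contract: a callback that returns HRTIMER_RESTART is re-enqueued (after advancing its own expiry), anything else is not. An illustrative user of that contract against the 2.6.24-era hrtimer API, not part of this diff:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/time.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timeout(struct hrtimer *t)
{
        /* ... expiry work ... */
        return HRTIMER_NORESTART;       /* HRTIMER_RESTART would re-enqueue us */
}

static void example_arm(void)
{
        hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        example_timer.function = example_timeout;
        /* fire roughly 10ms from now */
        hrtimer_start(&example_timer, ktime_set(0, 10 * NSEC_PER_MSEC), HRTIMER_MODE_REL);
}
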
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
continue;
}
__remove_hrtimer(timer, base,
HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
/*
* Note: We clear the CALLBACK bit after
* enqueue_hrtimer to avoid reprogramming of
* the event hardware. This happens at the end
* of this function anyway.
*/
if (timer->function(timer) != HRTIMER_NORESTART) {
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base, 0);
}
timer->state &= ~HRTIMER_STATE_CALLBACK;
__run_hrtimer(timer);
}
spin_unlock(&cpu_base->lock);
base++;
@@ -1122,98 +1189,11 @@ void hrtimer_interrupt(struct clock_event_device *dev)
static void run_hrtimer_softirq(struct softirq_action *h)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
spin_lock_irq(&cpu_base->lock);
while (!list_empty(&cpu_base->cb_pending)) {
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart == HRTIMER_RESTART) {
BUG_ON(hrtimer_active(timer));
/*
* Enqueue the timer, allow reprogramming of the event
* device
*/
enqueue_hrtimer(timer, timer->base, 1);
} else if (hrtimer_active(timer)) {
/*
* If the timer was rearmed on another CPU, reprogram
* the event device.
*/
if (timer->base->first == &timer->node)
hrtimer_reprogram(timer, timer->base);
}
}
spin_unlock_irq(&cpu_base->lock);
run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
}
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
* Expire the per base hrtimer-queue:
*/
static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
int index)
{
struct rb_node *node;
struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
if (!base->first)
return;
if (base->get_softirq_time)
base->softirq_time = base->get_softirq_time();
spin_lock_irq(&cpu_base->lock);
while ((node = base->first)) {
struct hrtimer *timer;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
timer = rb_entry(node, struct hrtimer, node);
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
#ifdef CONFIG_HIGH_RES_TIMERS
WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
#endif
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart != HRTIMER_NORESTART) {
BUG_ON(hrtimer_active(timer));
enqueue_hrtimer(timer, base, 0);
}
}
spin_unlock_irq(&cpu_base->lock);
}
/*
* Called from timer softirq every jiffy, expire hrtimers:
*
@@ -1221,10 +1201,9 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
* softirq context in case the hrtimer initialization failed or has
* not been done yet.
*/
void hrtimer_run_queues(void)
void hrtimer_run_pending(void)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
int i;
if (hrtimer_hres_active())
return;
@@ -1238,8 +1217,54 @@ void hrtimer_run_queues(void)
* deadlock vs. xtime_lock.
*/
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
if (hrtimer_switch_to_hres())
return;
hrtimer_switch_to_hres();
run_hrtimer_pending(cpu_base);
}
/*
* Called from hardirq context every jiffy
*/
static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
int index)
{
struct rb_node *node;
struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
if (!base->first)
return;
if (base->get_softirq_time)
base->softirq_time = base->get_softirq_time();
spin_lock(&cpu_base->lock);
while ((node = base->first)) {
struct hrtimer *timer;
timer = rb_entry(node, struct hrtimer, node);
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
list_add_tail(&timer->cb_entry,
&base->cpu_base->cb_pending);
continue;
}
__run_hrtimer(timer);
}
spin_unlock(&cpu_base->lock);
}
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
int i;
if (hrtimer_hres_active())
return;
hrtimer_get_softirq_time(cpu_base);
@@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
#endif
}
@@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start(&t->timer, t->timer.expires, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;
if (likely(t->task))
schedule();
@@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
cpu_base->clock_base[i].cpu_base = cpu_base;
INIT_LIST_HEAD(&cpu_base->cb_pending);
hrtimer_init_hres(cpu_base);
}


@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
return;
}
printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
#ifdef CONFIG_DEBUG_SHIRQ
dump_stack();
#endif
spin_unlock_irqrestore(&desc->lock, flags);
return;
}


@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
#endif
static int irq_spurious_read(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
struct irq_desc *d = &irq_desc[(long) data];
return sprintf(page, "count %u\n"
"unhandled %u\n"
"last_unhandled %u ms\n",
d->irq_count,
d->irqs_unhandled,
jiffies_to_msecs(d->last_unhandled));
}
#define MAX_NAMELEN 128
static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
void register_irq_proc(unsigned int irq)
{
char name [MAX_NAMELEN];
struct proc_dir_entry *entry;
if (!root_irq_dir ||
(irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
#ifdef CONFIG_SMP
{
struct proc_dir_entry *entry;
/* create /proc/irq/<irq>/smp_affinity */
entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
}
}
#endif
entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
if (entry) {
entry->data = (void *)(long)irq;
entry->read_proc = irq_spurious_read;
}
}
#undef MAX_NAMELEN


@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
static int irqfixup __read_mostly;
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
}
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
}
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode");
static int __init irqpoll_setup(char *str)
{


@@ -233,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr,
int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
unsigned long *offset)
{
char namebuf[KSYM_NAME_LEN];
if (is_ksym_addr(addr))
return !!get_symbol_pos(addr, symbolsize, offset);
return !!module_address_lookup(addr, symbolsize, offset, NULL);
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
}
/*
@@ -251,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr,
unsigned long *offset,
char **modname, char *namebuf)
{
const char *msym;
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
@@ -268,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr,
}
/* see if it's in a module */
msym = module_address_lookup(addr, symbolsize, offset, modname);
if (msym)
return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
return module_address_lookup(addr, symbolsize, offset, modname,
namebuf);
return NULL;
}


@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
if (!err)
init_test_probes();
return err;
}


@@ -17,30 +17,34 @@
#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
static struct subsys_attribute _name##_attr = \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kset *kset, char *page)
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kset *kset, char *page)
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(page, "%s\n", uevent_helper);
return sprintf(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count)
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (count+1 > UEVENT_HELPER_PATH_LEN)
return -ENOENT;
memcpy(uevent_helper, page, count);
memcpy(uevent_helper, buf, count);
uevent_helper[count] = '\0';
if (count && uevent_helper[count-1] == '\n')
uevent_helper[count-1] = '\0';
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper);
#endif
#ifdef CONFIG_KEXEC
static ssize_t kexec_loaded_show(struct kset *kset, char *page)
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(page, "%d\n", !!kexec_image);
return sprintf(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(page, "%d\n", !!kexec_crash_image);
return sprintf(buf, "%d\n", !!kexec_crash_image);
}
KERNEL_ATTR_RO(kexec_crash_loaded);
static ssize_t vmcoreinfo_show(struct kset *kset, char *page)
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(page, "%lx %x\n",
return sprintf(buf, "%lx %x\n",
paddr_vmcoreinfo_note(),
(unsigned int)vmcoreinfo_max_size);
}
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = {
.read = &notes_read,
};
decl_subsys(kernel, NULL, NULL);
EXPORT_SYMBOL_GPL(kernel_subsys);
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = {
static int __init ksysfs_init(void)
{
int error = subsystem_register(&kernel_subsys);
if (!error)
error = sysfs_create_group(&kernel_subsys.kobj,
&kernel_attr_group);
int error;
if (!error && notes_size > 0) {
kernel_kobj = kobject_create_and_add("kernel", NULL);
if (!kernel_kobj) {
error = -ENOMEM;
goto exit;
}
error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
if (error)
goto kset_exit;
if (notes_size > 0) {
notes_attr.size = notes_size;
error = sysfs_create_bin_file(&kernel_subsys.kobj,
&notes_attr);
error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
if (error)
goto group_exit;
}
/*
* Create "/sys/kernel/uids" directory and corresponding root user's
* directory under it.
*/
if (!error)
error = uids_kobject_init();
/* create the /sys/kernel/uids/ directory */
error = uids_sysfs_init();
if (error)
goto notes_exit;
return 0;
notes_exit:
if (notes_size > 0)
sysfs_remove_bin_file(kernel_kobj, &notes_attr);
group_exit:
sysfs_remove_group(kernel_kobj, &kernel_attr_group);
kset_exit:
kobject_put(kernel_kobj);
exit:
return error;
}
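
The conversion above moves /sys/kernel from the old subsys_attribute callbacks to plain kobj_attribute show/store signatures on a kobject created with kobject_create_and_add(). A minimal sketch of adding another read-only file in the new style (the name "example" is hypothetical):

static ssize_t example_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%d\n", 42);
}
KERNEL_ATTR_RO(example);
/* ...and list &example_attr.attr in kernel_attrs[] so the group picks it up. */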


@@ -15,6 +15,8 @@
#include <linux/mutex.h>
#include <asm/semaphore.h>
#define KTHREAD_NICE_LEVEL (-5)
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
if (pid < 0) {
create->result = ERR_PTR(pid);
} else {
struct sched_param param = { .sched_priority = 0 };
wait_for_completion(&create->started);
read_lock(&tasklist_lock);
create->result = find_task_by_pid(pid);
read_unlock(&tasklist_lock);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
sched_setscheduler(create->result, SCHED_NORMAL, &param);
set_user_nice(create->result, KTHREAD_NICE_LEVEL);
set_cpus_allowed(create->result, CPU_MASK_ALL);
}
complete(&create->done);
}
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_user_nice(tsk, -5);
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed(tsk, CPU_MASK_ALL);
current->flags |= PF_NOFREEZE;

kernel/latencytop.c (new file, 239 lines)

@@ -0,0 +1,239 @@
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
static DEFINE_SPINLOCK(latency_lock);
#define MAXLR 128
static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
void clear_all_latency_tracing(struct task_struct *p)
{
unsigned long flags;
if (!latencytop_enabled)
return;
spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
spin_unlock_irqrestore(&latency_lock, flags);
}
static void clear_global_latency_tracing(void)
{
unsigned long flags;
spin_lock_irqsave(&latency_lock, flags);
memset(&latency_record, 0, sizeof(latency_record));
spin_unlock_irqrestore(&latency_lock, flags);
}
static void __sched
account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
{
int firstnonnull = MAXLR + 1;
int i;
if (!latencytop_enabled)
return;
/* skip kernel threads for now */
if (!tsk->mm)
return;
for (i = 0; i < MAXLR; i++) {
int q;
int same = 1;
/* Nothing stored: */
if (!latency_record[i].backtrace[0]) {
if (firstnonnull > i)
firstnonnull = i;
continue;
}
for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
if (latency_record[i].backtrace[q] !=
lat->backtrace[q])
same = 0;
if (same && lat->backtrace[q] == 0)
break;
if (same && lat->backtrace[q] == ULONG_MAX)
break;
}
if (same) {
latency_record[i].count++;
latency_record[i].time += lat->time;
if (lat->time > latency_record[i].max)
latency_record[i].max = lat->time;
return;
}
}
i = firstnonnull;
if (i >= MAXLR - 1)
return;
/* Allocated a new one: */
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
{
struct stack_trace trace;
memset(&trace, 0, sizeof(trace));
trace.max_entries = LT_BACKTRACEDEPTH;
trace.entries = &lat->backtrace[0];
trace.skip = 0;
save_stack_trace_tsk(tsk, &trace);
}
void __sched
account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
unsigned long flags;
int i, q;
struct latency_record lat;
if (!latencytop_enabled)
return;
/* Long interruptible waits are generally user requested... */
if (inter && usecs > 5000)
return;
memset(&lat, 0, sizeof(lat));
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
store_stacktrace(tsk, &lat);
spin_lock_irqsave(&latency_lock, flags);
account_global_scheduler_latency(tsk, &lat);
/*
* short term hack; if we're > 32 we stop; future we recycle:
*/
tsk->latency_record_count++;
if (tsk->latency_record_count >= LT_SAVECOUNT)
goto out_unlock;
for (i = 0; i < LT_SAVECOUNT ; i++) {
struct latency_record *mylat;
int same = 1;
mylat = &tsk->latency_record[i];
for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
if (mylat->backtrace[q] !=
lat.backtrace[q])
same = 0;
if (same && lat.backtrace[q] == 0)
break;
if (same && lat.backtrace[q] == ULONG_MAX)
break;
}
if (same) {
mylat->count++;
mylat->time += lat.time;
if (lat.time > mylat->max)
mylat->max = lat.time;
goto out_unlock;
}
}
/* Allocated a new one: */
i = tsk->latency_record_count;
memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
out_unlock:
spin_unlock_irqrestore(&latency_lock, flags);
}
static int lstats_show(struct seq_file *m, void *v)
{
int i;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < MAXLR; i++) {
if (latency_record[i].backtrace[0]) {
int q;
seq_printf(m, "%i %li %li ",
latency_record[i].count,
latency_record[i].time,
latency_record[i].max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
char sym[KSYM_NAME_LEN];
char *c;
if (!latency_record[i].backtrace[q])
break;
if (latency_record[i].backtrace[q] == ULONG_MAX)
break;
sprint_symbol(sym, latency_record[i].backtrace[q]);
c = strchr(sym, '+');
if (c)
*c = 0;
seq_printf(m, "%s ", sym);
}
seq_printf(m, "\n");
}
}
return 0;
}
static ssize_t
lstats_write(struct file *file, const char __user *buf, size_t count,
loff_t *offs)
{
clear_global_latency_tracing();
return count;
}
static int lstats_open(struct inode *inode, struct file *filp)
{
return single_open(filp, lstats_show, NULL);
}
static struct file_operations lstats_fops = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
.llseek = seq_lseek,
.release = single_release,
};
static int __init init_lstats_procfs(void)
{
struct proc_dir_entry *pe;
pe = create_proc_entry("latency_stats", 0644, NULL);
if (!pe)
return -ENOMEM;
pe->proc_fops = &lstats_fops;
return 0;
}
__initcall(init_lstats_procfs);
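
The file is consumed through the /proc/latency_stats entry created in init_lstats_procfs(): each record line carries the hit count, total and maximum latency in microseconds, then the symbolized backtrace, and writing anything to the file clears the global table (see lstats_write()). A hedged userspace sketch of reading it:

#include <stdio.h>

int dump_latency_stats(void)
{
        char line[4096];
        FILE *f = fopen("/proc/latency_stats", "r");

        if (!f)
                return -1;
        /* first line is the "Latency Top version" banner, then one record per line */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}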


@@ -2932,7 +2932,7 @@ static void zap_class(struct lock_class *class)
}
static inline int within(void *addr, void *start, unsigned long size)
static inline int within(const void *addr, void *start, unsigned long size)
{
return addr >= start && addr < start + size;
}
@@ -2955,9 +2955,12 @@ void lockdep_free_key_range(void *start, unsigned long size)
head = classhash_table + i;
if (list_empty(head))
continue;
list_for_each_entry_safe(class, next, head, hash_entry)
list_for_each_entry_safe(class, next, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
zap_class(class);
}
}
if (locked)
@@ -3203,7 +3206,11 @@ retry:
EXPORT_SYMBOL_GPL(debug_show_all_locks);
void debug_show_held_locks(struct task_struct *task)
/*
* Careful: only use this function if you are sure that
* the task cannot run in parallel!
*/
void __debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
@@ -3211,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
}
lockdep_print_held_locks(task);
}
EXPORT_SYMBOL_GPL(__debug_show_held_locks);
void debug_show_held_locks(struct task_struct *task)
{
__debug_show_held_locks(task);
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);


@@ -47,8 +47,6 @@
#include <asm/cacheflush.h>
#include <linux/license.h>
extern int module_sysfs_initialized;
#if 0
#define DEBUGP printk
#else
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized;
static DEFINE_MUTEX(module_mutex);
static LIST_HEAD(modules);
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
int register_module_notifier(struct notifier_block * nb)
@@ -86,8 +87,11 @@ EXPORT_SYMBOL(unregister_module_notifier);
static inline int strong_try_module_get(struct module *mod)
{
if (mod && mod->state == MODULE_STATE_COMING)
return -EBUSY;
if (try_module_get(mod))
return 0;
return try_module_get(mod);
else
return -ENOENT;
}
static inline void add_taint_module(struct module *mod, unsigned flag)
@@ -426,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
}
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
{
int cpu;
for_each_possible_cpu(cpu)
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
}
static int percpu_modinit(void)
{
pcpu_num_used = 2;
@@ -498,6 +510,8 @@ static struct module_attribute modinfo_##field = { \
MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);
static char last_unloaded_module[MODULE_NAME_LEN+1];
#ifdef CONFIG_MODULE_UNLOAD
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
@@ -539,11 +553,21 @@ static int already_uses(struct module *a, struct module *b)
static int use_module(struct module *a, struct module *b)
{
struct module_use *use;
int no_warn;
int no_warn, err;
if (b == NULL || already_uses(a, b)) return 1;
if (!strong_try_module_get(b))
/* If we're interrupted or time out, we fail. */
if (wait_event_interruptible_timeout(
module_wq, (err = strong_try_module_get(b)) != -EBUSY,
30 * HZ) <= 0) {
printk("%s: gave up waiting for init of module %s.\n",
a->name, b->name);
return 0;
}
/* If strong_try_module_get() returned a different error, we fail. */
if (err)
return 0;
DEBUGP("Allocating new usage for %s.\n", a->name);
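
use_module() now parks on module_wq using the standard wait_event idiom: the condition re-evaluates strong_try_module_get() on every wakeup and captures its result, and sys_init_module() wakes the queue once the target module finishes (or fails) init. A stripped-down sketch of the same pattern outside module.c, with hypothetical names:

#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int my_state = -EBUSY;   /* -EBUSY while initializing, then 0 or -ENOENT */

static int wait_for_ready(void)
{
        int err = -EBUSY;

        /* whoever changes my_state must call wake_up(&my_wq), as
         * sys_init_module() does with module_wq in this diff */
        if (wait_event_interruptible_timeout(my_wq,
                        (err = my_state) != -EBUSY, 30 * HZ) <= 0)
                return -ETIMEDOUT;      /* interrupted or timed out */
        return err;                     /* 0 on success, -ENOENT if it went away */
}
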
@@ -721,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
mod->exit();
mutex_lock(&module_mutex);
}
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
free_module(mod);
out:
@@ -814,7 +840,7 @@ static inline void module_unload_free(struct module *mod)
static inline int use_module(struct module *a, struct module *b)
{
return strong_try_module_get(b);
return strong_try_module_get(b) == 0;
}
static inline void module_unload_init(struct module *mod)
@@ -1122,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
++loaded;
}
notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes");
notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
if (!notes_attrs->dir)
goto out;
@@ -1212,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod)
int mod_sysfs_init(struct module *mod)
{
int err;
struct kobject *kobj;
if (!module_sysfs_initialized) {
printk(KERN_ERR "%s: module sysfs not initialized\n",
@@ -1219,15 +1246,25 @@ int mod_sysfs_init(struct module *mod)
err = -EINVAL;
goto out;
}
memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
if (err)
kobj = kset_find_obj(module_kset, mod->name);
if (kobj) {
printk(KERN_ERR "%s: module is already loaded\n", mod->name);
kobject_put(kobj);
err = -EINVAL;
goto out;
kobj_set_kset_s(&mod->mkobj, module_subsys);
}
mod->mkobj.mod = mod;
kobject_init(&mod->mkobj.kobj);
memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
mod->mkobj.kobj.kset = module_kset;
err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
"%s", mod->name);
if (err)
kobject_put(&mod->mkobj.kobj);
/* delay uevent until full sysfs population */
out:
return err;
}
@@ -1238,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod,
{
int err;
/* delay uevent until full sysfs population */
err = kobject_add(&mod->mkobj.kobj);
if (err)
goto out;
mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
if (!mod->holders_dir) {
err = -ENOMEM;
goto out_unreg;
@@ -1263,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod,
out_unreg_param:
module_param_sysfs_remove(mod);
out_unreg_holders:
kobject_unregister(mod->holders_dir);
kobject_put(mod->holders_dir);
out_unreg:
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
out:
return err;
}
#endif
@@ -1276,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod)
{
module_remove_modinfo_attrs(mod);
module_param_sysfs_remove(mod);
kobject_unregister(mod->mkobj.drivers_dir);
kobject_unregister(mod->holders_dir);
kobject_unregister(&mod->mkobj.kobj);
kobject_put(mod->mkobj.drivers_dir);
kobject_put(mod->holders_dir);
kobject_put(&mod->mkobj.kobj);
}
/*
* link the module with the whole machine is stopped with interrupts off
* - this defends against kallsyms not taking locks
*/
static int __link_module(void *_mod)
{
struct module *mod = _mod;
list_add(&mod->list, &modules);
return 0;
}
/*
@@ -1330,7 +1371,7 @@ void *__symbol_get(const char *symbol)
preempt_disable();
value = __find_symbol(symbol, &owner, &crc, 1);
if (value && !strong_try_module_get(owner))
if (value && strong_try_module_get(owner) != 0)
value = 0;
preempt_enable();
@@ -1884,16 +1925,16 @@ static struct module *load_module(void __user *umod,
/* Now we've moved module, initialize linked lists, etc. */
module_unload_init(mod);
/* Initialize kobject, so we can reference it. */
/* add kobject, so we can reference it. */
err = mod_sysfs_init(mod);
if (err)
goto cleanup;
goto free_unload;
/* Set up license info based on the info section */
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
if (strcmp(mod->name, "ndiswrapper") == 0)
add_taint(TAINT_PROPRIETARY_MODULE);
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -2023,6 +2064,11 @@ static struct module *load_module(void __user *umod,
printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
mod->name);
/* Now sew it into the lists so we can get lockdep and oops
* info during argument parsing. No one should access us, since
* strong_try_module_get() will fail. */
stop_machine_run(__link_module, mod, NR_CPUS);
/* Size of section 0 is 0, so this works well if no params */
err = parse_args(mod->name, mod->args,
(struct kernel_param *)
@@ -2031,7 +2077,7 @@ static struct module *load_module(void __user *umod,
/ sizeof(struct kernel_param),
NULL);
if (err < 0)
goto arch_cleanup;
goto unlink;
err = mod_sysfs_setup(mod,
(struct kernel_param *)
@@ -2039,7 +2085,7 @@ static struct module *load_module(void __user *umod,
sechdrs[setupindex].sh_size
/ sizeof(struct kernel_param));
if (err < 0)
goto arch_cleanup;
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2054,9 +2100,13 @@ static struct module *load_module(void __user *umod,
/* Done! */
return mod;
arch_cleanup:
unlink:
stop_machine_run(__unlink_module, mod, NR_CPUS);
module_arch_cleanup(mod);
cleanup:
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
free_unload:
module_unload_free(mod);
module_free(mod, mod->module_init);
free_core:
@@ -2076,17 +2126,6 @@ static struct module *load_module(void __user *umod,
goto free_hdr;
}
/*
* link the module with the whole machine is stopped with interrupts off
* - this defends against kallsyms not taking locks
*/
static int __link_module(void *_mod)
{
struct module *mod = _mod;
list_add(&mod->list, &modules);
return 0;
}
/* This is where the real work happens */
asmlinkage long
sys_init_module(void __user *umod,
@@ -2111,10 +2150,6 @@ sys_init_module(void __user *umod,
return PTR_ERR(mod);
}
/* Now sew it into the lists. They won't access us, since
strong_try_module_get() will fail. */
stop_machine_run(__link_module, mod, NR_CPUS);
/* Drop lock so they can recurse */
mutex_unlock(&module_mutex);
@@ -2133,6 +2168,7 @@ sys_init_module(void __user *umod,
mutex_lock(&module_mutex);
free_module(mod);
mutex_unlock(&module_mutex);
wake_up(&module_wq);
return ret;
}
@@ -2147,6 +2183,7 @@ sys_init_module(void __user *umod,
mod->init_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
wake_up(&module_wq);
return 0;
}
@@ -2211,14 +2248,13 @@ static const char *get_ksymbol(struct module *mod,
return mod->strtab + mod->symtab[best].st_name;
}
/* For kallsyms to ask for address resolution. NULL means not found.
We don't lock, as this is used for oops resolution and races are a
lesser concern. */
/* FIXME: Risky: returns a pointer into a module w/o lock */
const char *module_address_lookup(unsigned long addr,
unsigned long *size,
unsigned long *offset,
char **modname)
/* For kallsyms to ask for address resolution. NULL means not found. Careful
* not to lock to avoid deadlock on oopses, simply disable preemption. */
char *module_address_lookup(unsigned long addr,
unsigned long *size,
unsigned long *offset,
char **modname,
char *namebuf)
{
struct module *mod;
const char *ret = NULL;
@@ -2233,8 +2269,13 @@ const char *module_address_lookup(unsigned long addr,
break;
}
}
/* Make a copy in here where it's safe */
if (ret) {
strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
ret = namebuf;
}
preempt_enable();
return ret;
return (char *)ret;
}
int lookup_module_symbol_name(unsigned long addr, char *symname)
@@ -2362,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p)
mutex_unlock(&module_mutex);
}
static char *taint_flags(unsigned int taints, char *buf)
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
if (taints) {
if (mod->taints ||
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
buf[bx++] = '(';
if (taints & TAINT_PROPRIETARY_MODULE)
if (mod->taints & TAINT_PROPRIETARY_MODULE)
buf[bx++] = 'P';
if (taints & TAINT_FORCED_MODULE)
if (mod->taints & TAINT_FORCED_MODULE)
buf[bx++] = 'F';
/*
* TAINT_FORCED_RMMOD: could be added.
* TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
* apply to modules.
*/
/* Show a - for module-is-being-unloaded */
if (mod->state == MODULE_STATE_GOING)
buf[bx++] = '-';
/* Show a + for module-is-being-loaded */
if (mod->state == MODULE_STATE_COMING)
buf[bx++] = '+';
buf[bx++] = ')';
}
buf[bx] = '\0';
@@ -2403,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p)
/* Taints info */
if (mod->taints)
seq_printf(m, " %s", taint_flags(mod->taints, buf));
seq_printf(m, " %s", module_flags(mod, buf));
seq_printf(m, "\n");
return 0;
@@ -2498,97 +2548,12 @@ void print_modules(void)
printk("Modules linked in:");
list_for_each_entry(mod, &modules, list)
printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
printk(" %s%s", mod->name, module_flags(mod, buf));
if (last_unloaded_module[0])
printk(" [last unloaded: %s]", last_unloaded_module);
printk("\n");
}
#ifdef CONFIG_SYSFS
static char *make_driver_name(struct device_driver *drv)
{
char *driver_name;
driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
GFP_KERNEL);
if (!driver_name)
return NULL;
sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
return driver_name;
}
static void module_create_drivers_dir(struct module_kobject *mk)
{
if (!mk || mk->drivers_dir)
return;
mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers");
}
void module_add_driver(struct module *mod, struct device_driver *drv)
{
char *driver_name;
int no_warn;
struct module_kobject *mk = NULL;
if (!drv)
return;
if (mod)
mk = &mod->mkobj;
else if (drv->mod_name) {
struct kobject *mkobj;
/* Lookup built-in module entry in /sys/modules */
mkobj = kset_find_obj(&module_subsys, drv->mod_name);
if (mkobj) {
mk = container_of(mkobj, struct module_kobject, kobj);
/* remember our module structure */
drv->mkobj = mk;
/* kset_find_obj took a reference */
kobject_put(mkobj);
}
}
if (!mk)
return;
/* Don't check return codes; these calls are idempotent */
no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module");
driver_name = make_driver_name(drv);
if (driver_name) {
module_create_drivers_dir(mk);
no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj,
driver_name);
kfree(driver_name);
}
}
EXPORT_SYMBOL(module_add_driver);
void module_remove_driver(struct device_driver *drv)
{
struct module_kobject *mk = NULL;
char *driver_name;
if (!drv)
return;
sysfs_remove_link(&drv->kobj, "module");
if (drv->owner)
mk = &drv->owner->mkobj;
else if (drv->mkobj)
mk = drv->mkobj;
if (mk && mk->drivers_dir) {
driver_name = make_driver_name(drv);
if (driver_name) {
sysfs_remove_link(mk->drivers_dir, driver_name);
kfree(driver_name);
}
}
}
EXPORT_SYMBOL(module_remove_driver);
#endif
#ifdef CONFIG_MODVERSIONS
/* Generate the signature for struct module here, too, for modversions. */
void struct_module(struct module *mod) { return; }


@@ -20,6 +20,7 @@
#include <linux/kexec.h>
#include <linux/debug_locks.h>
#include <linux/random.h>
#include <linux/kallsyms.h>
int panic_on_oops;
int tainted;
@@ -280,6 +281,13 @@ static int init_oops_id(void)
}
late_initcall(init_oops_id);
static void print_oops_end_marker(void)
{
init_oops_id();
printk(KERN_WARNING "---[ end trace %016llx ]---\n",
(unsigned long long)oops_id);
}
/*
* Called when the architecture exits its oops handler, after printing
* everything.
@@ -287,11 +295,26 @@ late_initcall(init_oops_id);
void oops_exit(void)
{
do_oops_enter_exit();
init_oops_id();
printk(KERN_WARNING "---[ end trace %016llx ]---\n",
(unsigned long long)oops_id);
print_oops_end_marker();
}
#ifdef WANT_WARN_ON_SLOWPATH
void warn_on_slowpath(const char *file, int line)
{
char function[KSYM_SYMBOL_LEN];
unsigned long caller = (unsigned long) __builtin_return_address(0);
sprint_symbol(function, caller);
printk(KERN_WARNING "------------[ cut here ]------------\n");
printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
line, function);
print_modules();
dump_stack();
print_oops_end_marker();
}
EXPORT_SYMBOL(warn_on_slowpath);
#endif
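
warn_on_slowpath() is the out-of-line body behind WARN_ON() on architectures that define WANT_WARN_ON_SLOWPATH; roughly, the generic macro is expected to expand along these lines (a sketch of the asm-generic pattern, not part of this hunk):

#define WARN_ON(condition) ({                                   \
        int __ret_warn_on = !!(condition);                      \
        if (unlikely(__ret_warn_on))                            \
                warn_on_slowpath(__FILE__, __LINE__);           \
        unlikely(__ret_warn_on);                                \
})
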
#ifdef CONFIG_CC_STACKPROTECTOR
/*
* Called when gcc's -fstack-protector feature is used, and


@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp)
extern struct kernel_param __start___param[], __stop___param[];
#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
struct param_attribute
{
struct module_attribute mattr;
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk,
sizeof(mp->grp.attrs[0]));
size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
mp = kmalloc(size[0] + size[1], GFP_KERNEL);
mp = kzalloc(size[0] + size[1], GFP_KERNEL);
if (!mp)
return ERR_PTR(-ENOMEM);
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name,
BUG_ON(!mk);
mk->mod = THIS_MODULE;
kobj_set_kset_s(mk, module_subsys);
kobject_set_name(&mk->kobj, name);
kobject_init(&mk->kobj);
ret = kobject_add(&mk->kobj);
mk->kobj.kset = module_kset;
ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
if (ret) {
kobject_put(&mk->kobj);
printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
"error number %d\n", name, ret);
printk(KERN_ERR "The system will be unstable now.\n");
@@ -588,7 +585,7 @@ static void __init param_sysfs_builtin(void)
{
struct kernel_param *kp, *kp_begin = NULL;
unsigned int i, name_len, count = 0;
char modname[MAX_KBUILD_MODNAME + 1] = "";
char modname[MODULE_NAME_LEN + 1] = "";
for (i=0; i < __stop___param - __start___param; i++) {
char *dot;
@@ -596,12 +593,12 @@ static void __init param_sysfs_builtin(void)
kp = &__start___param[i];
max_name_len =
min_t(size_t, MAX_KBUILD_MODNAME, strlen(kp->name));
min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
dot = memchr(kp->name, '.', max_name_len);
if (!dot) {
DEBUGP("couldn't find period in first %d characters "
"of %s\n", MAX_KBUILD_MODNAME, kp->name);
"of %s\n", MODULE_NAME_LEN, kp->name);
continue;
}
name_len = dot - kp->name;
@@ -679,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
static struct kobj_type module_ktype;
static int uevent_filter(struct kset *kset, struct kobject *kobj)
{
struct kobj_type *ktype = get_ktype(kobj);
@@ -694,21 +689,11 @@ static struct kset_uevent_ops module_uevent_ops = {
.filter = uevent_filter,
};
decl_subsys(module, &module_ktype, &module_uevent_ops);
struct kset *module_kset;
int module_sysfs_initialized;
static void module_release(struct kobject *kobj)
{
/*
* Stupid empty release function to allow the memory for the kobject to
* be properly cleaned up. This will not need to be present for 2.6.25
* with the upcoming kobject core rework.
*/
}
static struct kobj_type module_ktype = {
struct kobj_type module_ktype = {
.sysfs_ops = &module_sysfs_ops,
.release = module_release,
};
/*
@@ -716,13 +701,11 @@ static struct kobj_type module_ktype = {
*/
static int __init param_sysfs_init(void)
{
int ret;
ret = subsystem_register(&module_subsys);
if (ret < 0) {
printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
__FILE__, __LINE__, ret);
return ret;
module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
if (!module_kset) {
printk(KERN_WARNING "%s (%d): error creating kset\n",
__FILE__, __LINE__);
return -ENOMEM;
}
module_sysfs_initialized = 1;
@@ -732,14 +715,7 @@ static int __init param_sysfs_init(void)
}
subsys_initcall(param_sysfs_init);
#else
#if 0
static struct sysfs_ops module_sysfs_ops = {
.show = NULL,
.store = NULL,
};
#endif
#endif
#endif /* CONFIG_SYSFS */
EXPORT_SYMBOL(param_set_byte);
EXPORT_SYMBOL(param_get_byte);
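Aside, not part of the diff: the hunk above replaces the old decl_subsys()/subsystem_register() pair with kset_create_and_add(). A minimal sketch of that new-style pattern for a hypothetical "example" kset (no uevent ops, parented at the top of sysfs) might look like this:

#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/module.h>

static struct kset *example_kset;

static int __init example_init(void)
{
	/* creates /sys/example; NULL uevent ops, NULL parent (top level) */
	example_kset = kset_create_and_add("example", NULL, NULL);
	if (!example_kset)
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	kset_unregister(example_kset);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");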

View File

@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
{
int maxfire;
struct list_head *timers = tsk->cpu_timers;
struct signal_struct *const sig = tsk->signal;
maxfire = 20;
tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
t->firing = 1;
list_move_tail(&t->entry, firing);
}
/*
* Check for the special case thread timers.
*/
if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
if (sig->rlim[RLIMIT_RTTIME].rlim_cur
< sig->rlim[RLIMIT_RTTIME].rlim_max) {
sig->rlim[RLIMIT_RTTIME].rlim_cur +=
USEC_PER_SEC;
}
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
}
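Aside, not part of the diff: the RLIMIT_RTTIME checks above are driven by a per-process limit that userspace sets through setrlimit(2); the values are microseconds of CPU time a realtime task may consume without sleeping. A hedged userspace sketch (the RLIMIT_RTTIME constant, and its value 15, are assumptions if your headers predate this series):

#include <sys/resource.h>
#include <stdio.h>

#ifndef RLIMIT_RTTIME
#define RLIMIT_RTTIME 15	/* assumption: matches the kernel's numbering */
#endif

int main(void)
{
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft limit: SIGXCPU after 0.5s of busy RT CPU time */
		.rlim_max = 1000000,	/* hard limit: SIGKILL after 1s */
	};

	if (setrlimit(RLIMIT_RTTIME, &rl))
		perror("setrlimit(RLIMIT_RTTIME)");
	return 0;
}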
/*

View File

@@ -567,7 +567,8 @@ static const char * const hibernation_modes[] = {
* supports it (as determined by having hibernation_ops).
*/
static ssize_t disk_show(struct kset *kset, char *buf)
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
int i;
char *start = buf;
@@ -597,7 +598,8 @@ static ssize_t disk_show(struct kset *kset, char *buf)
}
static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
int error = 0;
int i;
@@ -642,13 +644,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
power_attr(disk);
static ssize_t resume_show(struct kset *kset, char *buf)
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
unsigned int maj, min;
dev_t res;
@@ -674,12 +678,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
power_attr(resume);
static ssize_t image_size_show(struct kset *kset, char *buf)
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%lu\n", image_size);
}
static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n)
static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
unsigned long size;
@@ -708,7 +714,7 @@ static struct attribute_group attr_group = {
static int __init pm_disk_init(void)
{
return sysfs_create_group(&power_subsys.kobj, &attr_group);
return sysfs_create_group(power_kobj, &attr_group);
}
core_initcall(pm_disk_init);

View File

@@ -276,8 +276,7 @@ EXPORT_SYMBOL(pm_suspend);
#endif /* CONFIG_SUSPEND */
decl_subsys(power,NULL,NULL);
struct kobject *power_kobj;
/**
* state - control system power state.
@@ -290,7 +289,8 @@ decl_subsys(power,NULL,NULL);
* proper enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kset *kset, char *buf)
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
@@ -311,7 +311,8 @@ static ssize_t state_show(struct kset *kset, char *buf)
return (s - buf);
}
static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -348,13 +349,15 @@ power_attr(state);
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
static ssize_t pm_trace_show(struct kset *kset, char *buf)
static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
pm_trace_store(struct kset *kset, const char *buf, size_t n)
pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
int val;
@@ -386,10 +389,10 @@ static struct attribute_group attr_group = {
static int __init pm_init(void)
{
int error = subsystem_register(&power_subsys);
if (!error)
error = sysfs_create_group(&power_subsys.kobj,&attr_group);
return error;
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
return sysfs_create_group(power_kobj, &attr_group);
}
core_initcall(pm_init);

View File

@@ -54,7 +54,7 @@ extern int pfn_is_nosave(unsigned long);
extern struct mutex pm_mutex;
#define power_attr(_name) \
static struct subsys_attribute _name##_attr = { \
static struct kobj_attribute _name##_attr = { \
.attr = { \
.name = __stringify(_name), \
.mode = 0644, \
@@ -63,8 +63,6 @@ static struct subsys_attribute _name##_attr = { \
.store = _name##_store, \
}
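Aside, not part of the diff: with the kset-based subsys_attribute gone, power_attr(image_size) now expands roughly as below, and image_size_show()/image_size_store() must use the kobject/kobj_attribute signatures seen in the main.c and disk.c hunks.

static struct kobj_attribute image_size_attr = {
	.attr = {
		.name = "image_size",	/* __stringify(image_size) */
		.mode = 0644,
	},
	.show = image_size_show,	/* ssize_t (*)(struct kobject *, struct kobj_attribute *, char *) */
	.store = image_size_store,	/* ssize_t (*)(struct kobject *, struct kobj_attribute *, const char *, size_t) */
};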
extern struct kset power_subsys;
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
extern int in_suspend;

View File

@@ -36,6 +36,13 @@
#include <asm/uaccess.h>
/*
* Architectures can override it:
*/
void __attribute__((weak)) early_printk(const char *fmt, ...)
{
}
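Aside, not part of the diff: an architecture overrides the weak stub above simply by providing a strong early_printk() with the same signature. A hedged sketch, where early_console_write() stands in for whatever early output hook the architecture actually has:

#include <stdarg.h>
#include <linux/kernel.h>

void early_printk(const char *fmt, ...)
{
	char buf[512];
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vscnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	early_console_write(buf, n);	/* hypothetical arch-specific sink */
}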
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/* printk's without a loglevel use this.. */
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str)
__setup("time", printk_time_setup);
__attribute__((weak)) unsigned long long printk_clock(void)
{
return sched_clock();
}
/* Check if we have any console registered that can be called early in boot. */
static int have_callable_console(void)
{
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...)
/* cpu currently holding logbuf_lock */
static volatile unsigned int printk_cpu = UINT_MAX;
const char printk_recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int printk_recursion_bug;
asmlinkage int vprintk(const char *fmt, va_list args)
{
unsigned long flags;
int printed_len;
char *p;
static char printk_buf[1024];
static int log_level_unknown = 1;
static char printk_buf[1024];
unsigned long flags;
int printed_len = 0;
int this_cpu;
char *p;
boot_delay_msec();
preempt_disable();
if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
/* If a crash is occurring during printk() on this CPU,
* make sure we can't deadlock */
zap_locks();
/* This stops the holder of console_sem just where we want him */
raw_local_irq_save(flags);
this_cpu = smp_processor_id();
/*
* Ouch, printk recursed into itself!
*/
if (unlikely(printk_cpu == this_cpu)) {
/*
* If a crash is occurring during printk() on this CPU,
* then try to get the crash message out but make sure
* we can't deadlock. Otherwise just return to avoid the
* recursion and return - but flag the recursion so that
* it can be printed at the next appropriate moment:
*/
if (!oops_in_progress) {
printk_recursion_bug = 1;
goto out_restore_irqs;
}
zap_locks();
}
lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = smp_processor_id();
printk_cpu = this_cpu;
if (printk_recursion_bug) {
printk_recursion_bug = 0;
strcpy(printk_buf, printk_recursion_bug_msg);
printed_len = sizeof(printk_recursion_bug_msg);
}
/* Emit the output into the temporary buffer */
printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf), fmt, args);
/*
* Copy the output into log_buf. If the caller didn't provide
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
loglev_char = default_message_loglevel
+ '0';
}
t = printk_clock();
t = cpu_clock(printk_cpu);
nanosec_rem = do_div(t, 1000000000);
tlen = sprintf(tbuf,
"<%c>[%5lu.%06lu] ",
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
lockdep_on();
out_restore_irqs:
raw_local_irq_restore(flags);
}

View File

@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */
static int __init profile_setup(char * str)
static int __init profile_setup(char *str)
{
static char __initdata schedstr[] = "schedule";
static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
void __init profile_init(void)
{
if (!prof_on)
if (!prof_on)
return;
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}
/* Profile event notifications */
#ifdef CONFIG_PROFILING
static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
void profile_task_exit(struct task_struct * task)
void profile_task_exit(struct task_struct *task)
{
blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}
int profile_handoff_task(struct task_struct * task)
int profile_handoff_task(struct task_struct *task)
{
int ret;
ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}
int task_handoff_register(struct notifier_block * n)
int task_handoff_register(struct notifier_block *n)
{
return atomic_notifier_chain_register(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_register);
int task_handoff_unregister(struct notifier_block * n)
int task_handoff_unregister(struct notifier_block *n)
{
return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_unregister);
int profile_event_register(enum profile_type type, struct notifier_block * n)
int profile_event_register(enum profile_type type, struct notifier_block *n)
{
int err = -EINVAL;
switch (type) {
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_register(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_register(
&munmap_notifier, n);
break;
}
return err;
}
int profile_event_unregister(enum profile_type type, struct notifier_block * n)
{
int err = -EINVAL;
switch (type) {
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_unregister(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_unregister(
&munmap_notifier, n);
break;
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_register(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_register(
&munmap_notifier, n);
break;
}
return err;
}
EXPORT_SYMBOL_GPL(profile_event_register);
int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
int err = -EINVAL;
switch (type) {
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_unregister(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_unregister(
&munmap_notifier, n);
break;
}
return err;
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
int register_timer_hook(int (*hook)(struct pt_regs *))
{
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
timer_hook = hook;
return 0;
}
EXPORT_SYMBOL_GPL(register_timer_hook);
void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
/* make sure all CPUs see the NULL hook */
synchronize_sched(); /* Allow ongoing interrupts to complete. */
}
EXPORT_SYMBOL_GPL(register_timer_hook);
EXPORT_SYMBOL_GPL(unregister_timer_hook);
EXPORT_SYMBOL_GPL(task_handoff_register);
EXPORT_SYMBOL_GPL(task_handoff_unregister);
EXPORT_SYMBOL_GPL(profile_event_register);
EXPORT_SYMBOL_GPL(profile_event_unregister);
#endif /* CONFIG_PROFILING */
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
}
break;
out_free:
out_free:
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
EXPORT_SYMBOL_GPL(profile_hits);
void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
#include <asm/uaccess.h>
#include <asm/ptrace.h>
static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
return len;
}
static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
unsigned long count, void *data)
static int prof_cpu_mask_write_proc(struct file *file,
const char __user *buffer, unsigned long count, void *data)
{
cpumask_t *mask = (cpumask_t *)data;
unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
struct proc_dir_entry *entry;
/* create /proc/irq/prof_cpu_mask */
if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
if (!entry)
return;
entry->data = (void *)&prof_cpu_mask;
entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
unsigned long p = *ppos;
ssize_t read;
char * pnt;
char *pnt;
unsigned int sample_step = 1 << prof_shift;
profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
read = 0;
while (p < sizeof(unsigned int) && count > 0) {
if (put_user(*((char *)(&sample_step)+p),buf))
if (put_user(*((char *)(&sample_step)+p), buf))
return -EFAULT;
buf++; p++; count--; read++;
}
pnt = (char *)prof_buffer + p - sizeof(atomic_t);
if (copy_to_user(buf,(void *)pnt,count))
if (copy_to_user(buf, (void *)pnt, count))
return -EFAULT;
read += count;
*ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
extern int setup_profiling_timer (unsigned int multiplier);
extern int setup_profiling_timer(unsigned int multiplier);
if (count == sizeof(int)) {
unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
return 0;
if (create_hash_tables())
return -1;
if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
if (!entry)
return 0;
entry->proc_fops = &proc_profile_operations;
entry->size = (1+prof_len) * sizeof(atomic_t);

View File

@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
return error;
}
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
#else
#define is_singlestep(request) 0
#endif
#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
#else
#define is_singleblock(request) 0
#endif
#ifdef PTRACE_SYSEMU
#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
#else
#define is_sysemu_singlestep(request) 0
#endif
static int ptrace_resume(struct task_struct *child, long request, long data)
{
if (!valid_signal(data))
return -EIO;
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
else
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
else
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
if (is_singleblock(request)) {
if (unlikely(!arch_has_block_step()))
return -EIO;
user_enable_block_step(child);
} else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
if (unlikely(!arch_has_single_step()))
return -EIO;
user_enable_single_step(child);
}
else
user_disable_single_step(child);
child->exit_code = data;
wake_up_process(child);
return 0;
}
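Aside, not part of the diff: from userspace, the requests folded into ptrace_resume() above are the familiar ptrace(2) resume family. A hedged tracer sketch exercising PTRACE_SINGLESTEP and PTRACE_CONT (both now end up in ptrace_resume(), with SINGLESTEP additionally gated on arch_has_single_step()):

#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	pid_t child = fork();
	int i;

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execl("/bin/true", "true", NULL);
		_exit(1);
	}

	waitpid(child, NULL, 0);			/* child stopped at exec */
	for (i = 0; i < 5; i++) {			/* step a few instructions */
		if (ptrace(PTRACE_SINGLESTEP, child, NULL, 0) < 0) {
			perror("PTRACE_SINGLESTEP");
			break;
		}
		waitpid(child, NULL, 0);
	}
	ptrace(PTRACE_CONT, child, NULL, 0);		/* data == 0: no signal injected */
	waitpid(child, NULL, 0);
	return 0;
}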
int ptrace_request(struct task_struct *child, long request,
long addr, long data)
{
int ret = -EIO;
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
return generic_ptrace_peekdata(child, addr, data);
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
return generic_ptrace_pokedata(child, addr, data);
#ifdef PTRACE_OLDSETOPTIONS
case PTRACE_OLDSETOPTIONS:
#endif
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_DETACH: /* detach a process that was attached. */
ret = ptrace_detach(child, data);
break;
#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
#endif
#ifdef PTRACE_SINGLEBLOCK
case PTRACE_SINGLEBLOCK:
#endif
#ifdef PTRACE_SYSEMU
case PTRACE_SYSEMU:
case PTRACE_SYSEMU_SINGLESTEP:
#endif
case PTRACE_SYSCALL:
case PTRACE_CONT:
return ptrace_resume(child, request, data);
case PTRACE_KILL:
if (child->exit_state) /* already dead */
return 0;
return ptrace_resume(child, request, SIGKILL);
default:
break;
}
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
lock_kernel();
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
if (!ret)
arch_ptrace_attach(current);
goto out;
}
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
return (copied == sizeof(data)) ? 0 : -EIO;
}
#ifdef CONFIG_COMPAT
#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_ulong_t addr, compat_ulong_t data)
{
compat_ulong_t __user *datap = compat_ptr(data);
compat_ulong_t word;
int ret;
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
ret = access_process_vm(child, addr, &word, sizeof(word), 0);
if (ret != sizeof(word))
ret = -EIO;
else
ret = put_user(word, datap);
break;
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
ret = access_process_vm(child, addr, &data, sizeof(data), 1);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
case PTRACE_GETEVENTMSG:
ret = put_user((compat_ulong_t) child->ptrace_message, datap);
break;
default:
ret = ptrace_request(child, request, addr, data);
}
return ret;
}
#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
compat_long_t addr, compat_long_t data)
{
struct task_struct *child;
long ret;
/*
* This lock_kernel fixes a subtle race with suid exec
*/
lock_kernel();
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
goto out;
}
child = ptrace_get_task_struct(pid);
if (IS_ERR(child)) {
ret = PTR_ERR(child);
goto out;
}
if (request == PTRACE_ATTACH) {
ret = ptrace_attach(child);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
*/
if (!ret)
arch_ptrace_attach(child);
goto out_put_task_struct;
}
ret = ptrace_check_attach(child, request == PTRACE_KILL);
if (!ret)
ret = compat_arch_ptrace(child, request, addr, data);
out_put_task_struct:
put_task_struct(child);
out:
unlock_kernel();
return ret;
}
#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
#endif /* CONFIG_COMPAT */

575
kernel/rcuclassic.c Normal file
View File

@@ -0,0 +1,575 @@
/*
* Read-Copy Update mechanism for mutual exclusion
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
static int blimit = 10;
static int qhimark = 10000;
static int qlowmark = 100;
#ifdef CONFIG_SMP
static void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
int cpu;
cpumask_t cpumask;
set_need_resched();
if (unlikely(!rcp->signaled)) {
rcp->signaled = 1;
/*
* Don't send IPI to itself. With irqs disabled,
* rdp->cpu is the current cpu.
*/
cpumask = rcp->cpumask;
cpu_clear(rdp->cpu, cpumask);
for_each_cpu_mask(cpu, cpumask)
smp_send_reschedule(cpu);
}
}
#else
static inline void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
set_need_resched();
}
#endif
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_ctrlblk);
}
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
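Aside, not part of the diff: the usual caller pattern for call_rcu(), as documented in Documentation/RCU, embeds the rcu_head in the protected object and frees it from the callback. Sketch only; struct foo is illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *rcu)
{
	struct foo *fp = container_of(rcu, struct foo, rcu);

	kfree(fp);	/* runs only after a full grace period */
}

static void foo_delete(struct foo *fp)
{
	/* unlink fp from every RCU-protected structure first ... */
	call_rcu(&fp->rcu, foo_reclaim);
}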
/**
* call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by rcu_read_lock() and
 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()

* and rcu_read_unlock_bh(), if in process context. These may be nested.
*/
void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_bh_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_bh_ctrlblk);
}
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed(void)
{
return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed_bh(void)
{
return rcu_bh_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/* Raises the softirq for processing rcu_callbacks. */
static inline void raise_rcu_softirq(void)
{
raise_softirq(RCU_SOFTIRQ);
/*
* The smp_mb() here is required to ensure that this cpu's
* __rcu_process_callbacks() reads the most recently updated
* value of rcu->cur.
*/
smp_mb();
}
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
struct rcu_head *next, *list;
int count = 0;
list = rdp->donelist;
while (list) {
next = list->next;
prefetch(next);
list->func(list);
list = next;
if (++count >= rdp->blimit)
break;
}
rdp->donelist = list;
local_irq_disable();
rdp->qlen -= count;
local_irq_enable();
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
raise_rcu_softirq();
}
/*
* Grace period handling:
 * The grace period handling consists of two steps:
* - A new grace period is started.
* This is done by rcu_start_batch. The start is not broadcasted to
* all cpus, they must pick this up by comparing rcp->cur with
* rdp->quiescbatch. All cpus are recorded in the
* rcu_ctrlblk.cpumask bitmap.
* - All cpus must go through a quiescent state.
* Since the start of the grace period is not broadcasted, at least two
* calls to rcu_check_quiescent_state are required:
* The first call just notices that a new grace period is running. The
* following calls check if there was a quiescent state since the beginning
* of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
* the bitmap is empty, then the grace period is completed.
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
* period (if necessary).
*/
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
* Caller must hold rcu_ctrlblk.lock.
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
if (rcp->next_pending &&
rcp->completed == rcp->cur) {
rcp->next_pending = 0;
/*
* next_pending == 0 must be visible in
* __rcu_process_callbacks() before it can see new value of cur.
*/
smp_wmb();
rcp->cur++;
/*
 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 * barrier. Otherwise it can cause tickless idle CPUs to be
 * included in rcp->cpumask, which will extend grace periods
 * unnecessarily.
*/
smp_mb();
cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
rcp->signaled = 0;
}
}
/*
* cpu went through a quiescent state since the beginning of the grace period.
* Clear it from the cpu mask and complete the grace period if it was the last
* cpu. Start another grace period if someone has further entries pending
*/
static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
{
cpu_clear(cpu, rcp->cpumask);
if (cpus_empty(rcp->cpumask)) {
/* batch completed ! */
rcp->completed = rcp->cur;
rcu_start_batch(rcp);
}
}
/*
* Check if the cpu has gone through a quiescent state (say context
* switch). If so and if it already hasn't done so in this RCU
* quiescent cycle, then indicate that it has done so.
*/
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
rdp->quiescbatch = rcp->cur;
return;
}
/* Grace period already completed for this cpu?
* qs_pending is checked instead of the actual bitmap to avoid
 * cacheline thrashing.
*/
if (!rdp->qs_pending)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
if (!rdp->passed_quiesc)
return;
rdp->qs_pending = 0;
spin_lock(&rcp->lock);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
*/
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp);
spin_unlock(&rcp->lock);
}
#ifdef CONFIG_HOTPLUG_CPU
/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
* locking requirements, the list it's pulling from has to belong to a cpu
* which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
struct rcu_head **tail)
{
local_irq_disable();
*this_rdp->nxttail = list;
if (list)
this_rdp->nxttail = tail;
local_irq_enable();
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* if the cpu going offline owns the grace period
* we can block indefinitely waiting for it, so flush
* it here
*/
spin_lock_bh(&rcp->lock);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp);
spin_unlock_bh(&rcp->lock);
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
}
static void rcu_offline_cpu(int cpu)
{
struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
&per_cpu(rcu_data, cpu));
__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
&per_cpu(rcu_bh_data, cpu));
put_cpu_var(rcu_data);
put_cpu_var(rcu_bh_data);
}
#else
static void rcu_offline_cpu(int cpu)
{
}
#endif
/*
* This does the RCU processing work from softirq context.
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
*rdp->donetail = rdp->curlist;
rdp->donetail = rdp->curtail;
rdp->curlist = NULL;
rdp->curtail = &rdp->curlist;
}
if (rdp->nxtlist && !rdp->curlist) {
local_irq_disable();
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
rdp->nxttail = &rdp->nxtlist;
local_irq_enable();
/*
* start the next batch of callbacks
*/
/* determine batch number */
rdp->batch = rcp->cur + 1;
/* see the comment and corresponding wmb() in
* the rcu_start_batch()
*/
smp_rmb();
if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
spin_lock(&rcp->lock);
rcp->next_pending = 1;
rcu_start_batch(rcp);
spin_unlock(&rcp->lock);
}
}
rcu_check_quiescent_state(rcp, rdp);
if (rdp->donelist)
rcu_do_batch(rdp);
}
static void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
return 1;
/* This cpu has no pending entries, but there are new entries */
if (!rdp->curlist && rdp->nxtlist)
return 1;
/* This cpu has finished callbacks to invoke */
if (rdp->donelist)
return 1;
/* The rcu core waits for a quiescent state from the cpu */
if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
return 1;
/* nothing to do */
return 0;
}
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
int rcu_pending(int cpu)
{
return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
}
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*/
int rcu_needs_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
}
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
rcu_qsctr_inc(cpu);
rcu_bh_qsctr_inc(cpu);
} else if (!in_softirq())
rcu_bh_qsctr_inc(cpu);
raise_rcu_softirq();
}
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
rdp->blimit = blimit;
}
static void __cpuinit rcu_online_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
/*
* Initializes rcu mechanism. Assumed to be called early.
* That is before local timer(SMP) or jiffie timer (uniproc) is setup.
* Note that rcu_qsctr and friends are implicitly
* initialized due to the choice of ``0'' for RCU_CTR_INVALID.
*/
void __init __rcu_init(void)
{
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);

View File

@@ -15,7 +15,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2001
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
@@ -35,572 +35,27 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
/* Fake initialization required by compiler */
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
static int blimit = 10;
static int qhimark = 10000;
static int qlowmark = 100;
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
#ifdef CONFIG_SMP
static void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
int cpu;
cpumask_t cpumask;
set_need_resched();
if (unlikely(!rcp->signaled)) {
rcp->signaled = 1;
/*
* Don't send IPI to itself. With irqs disabled,
* rdp->cpu is the current cpu.
*/
cpumask = rcp->cpumask;
cpu_clear(rdp->cpu, cpumask);
for_each_cpu_mask(cpu, cpumask)
smp_send_reschedule(cpu);
}
}
#else
static inline void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
set_need_resched();
}
#endif
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
void fastcall call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_ctrlblk);
}
local_irq_restore(flags);
}
/**
* call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by rcu_read_lock() and
* rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
* and rcu_read_unlock_bh(), if in process context. These may be nested.
*/
void fastcall call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_bh_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_bh_ctrlblk);
}
local_irq_restore(flags);
}
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed(void)
{
return rcu_ctrlblk.completed;
}
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed_bh(void)
{
return rcu_bh_ctrlblk.completed;
}
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
}
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
static void rcu_barrier_func(void *notused)
{
int cpu = smp_processor_id();
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_head *head;
head = &rdp->barrier;
atomic_inc(&rcu_barrier_cpu_count);
call_rcu(head, rcu_barrier_callback);
}
/**
* rcu_barrier - Wait until all the in-flight RCUs are complete.
*/
void rcu_barrier(void)
{
BUG_ON(in_interrupt());
/* Take cpucontrol mutex to protect against CPU hotplug */
mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 0);
on_each_cpu(rcu_barrier_func, NULL, 0, 1);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
struct rcu_head *next, *list;
int count = 0;
list = rdp->donelist;
while (list) {
next = list->next;
prefetch(next);
list->func(list);
list = next;
if (++count >= rdp->blimit)
break;
}
rdp->donelist = list;
local_irq_disable();
rdp->qlen -= count;
local_irq_enable();
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
}
/*
* Grace period handling:
* The grace period handling consists out of two steps:
* - A new grace period is started.
* This is done by rcu_start_batch. The start is not broadcasted to
* all cpus, they must pick this up by comparing rcp->cur with
* rdp->quiescbatch. All cpus are recorded in the
* rcu_ctrlblk.cpumask bitmap.
* - All cpus must go through a quiescent state.
* Since the start of the grace period is not broadcasted, at least two
* calls to rcu_check_quiescent_state are required:
* The first call just notices that a new grace period is running. The
* following calls check if there was a quiescent state since the beginning
* of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
* the bitmap is empty, then the grace period is completed.
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
* period (if necessary).
*/
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
* Caller must hold rcu_ctrlblk.lock.
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
if (rcp->next_pending &&
rcp->completed == rcp->cur) {
rcp->next_pending = 0;
/*
* next_pending == 0 must be visible in
* __rcu_process_callbacks() before it can see new value of cur.
*/
smp_wmb();
rcp->cur++;
/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
* Barrier Otherwise it can cause tickless idle CPUs to be
* included in rcp->cpumask, which will extend graceperiods
* unnecessarily.
*/
smp_mb();
cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
rcp->signaled = 0;
}
}
/*
* cpu went through a quiescent state since the beginning of the grace period.
* Clear it from the cpu mask and complete the grace period if it was the last
* cpu. Start another grace period if someone has further entries pending
*/
static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
{
cpu_clear(cpu, rcp->cpumask);
if (cpus_empty(rcp->cpumask)) {
/* batch completed ! */
rcp->completed = rcp->cur;
rcu_start_batch(rcp);
}
}
/*
* Check if the cpu has gone through a quiescent state (say context
* switch). If so and if it already hasn't done so in this RCU
* quiescent cycle, then indicate that it has done so.
*/
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
rdp->quiescbatch = rcp->cur;
return;
}
/* Grace period already completed for this cpu?
* qs_pending is checked instead of the actual bitmap to avoid
* cacheline trashing.
*/
if (!rdp->qs_pending)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
if (!rdp->passed_quiesc)
return;
rdp->qs_pending = 0;
spin_lock(&rcp->lock);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
*/
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp);
spin_unlock(&rcp->lock);
}
#ifdef CONFIG_HOTPLUG_CPU
/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
* locking requirements, the list it's pulling from has to belong to a cpu
* which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
struct rcu_head **tail)
{
local_irq_disable();
*this_rdp->nxttail = list;
if (list)
this_rdp->nxttail = tail;
local_irq_enable();
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* if the cpu going offline owns the grace period
* we can block indefinitely waiting for it, so flush
* it here
*/
spin_lock_bh(&rcp->lock);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp);
spin_unlock_bh(&rcp->lock);
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
}
static void rcu_offline_cpu(int cpu)
{
struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
&per_cpu(rcu_data, cpu));
__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
&per_cpu(rcu_bh_data, cpu));
put_cpu_var(rcu_data);
put_cpu_var(rcu_bh_data);
tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
}
#else
static void rcu_offline_cpu(int cpu)
{
}
#endif
/*
* This does the RCU processing work from tasklet context.
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
*rdp->donetail = rdp->curlist;
rdp->donetail = rdp->curtail;
rdp->curlist = NULL;
rdp->curtail = &rdp->curlist;
}
if (rdp->nxtlist && !rdp->curlist) {
local_irq_disable();
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
rdp->nxttail = &rdp->nxtlist;
local_irq_enable();
/*
* start the next batch of callbacks
*/
/* determine batch number */
rdp->batch = rcp->cur + 1;
/* see the comment and corresponding wmb() in
* the rcu_start_batch()
*/
smp_rmb();
if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
spin_lock(&rcp->lock);
rcp->next_pending = 1;
rcu_start_batch(rcp);
spin_unlock(&rcp->lock);
}
}
rcu_check_quiescent_state(rcp, rdp);
if (rdp->donelist)
rcu_do_batch(rdp);
}
static void rcu_process_callbacks(unsigned long unused)
{
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
return 1;
/* This cpu has no pending entries, but there are new entries */
if (!rdp->curlist && rdp->nxtlist)
return 1;
/* This cpu has finished callbacks to invoke */
if (rdp->donelist)
return 1;
/* The rcu core waits for a quiescent state from the cpu */
if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
return 1;
/* nothing to do */
return 0;
}
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
int rcu_pending(int cpu)
{
return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
}
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*/
int rcu_needs_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
}
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
rcu_qsctr_inc(cpu);
rcu_bh_qsctr_inc(cpu);
} else if (!in_softirq())
rcu_bh_qsctr_inc(cpu);
tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
}
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
rdp->blimit = blimit;
}
static void __cpuinit rcu_online_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
/*
* Initializes rcu mechanism. Assumed to be called early.
* That is before local timer(SMP) or jiffie timer (uniproc) is setup.
* Note that rcu_qsctr and friends are implicitly
* initialized due to the choice of ``0'' for RCU_CTR_INVALID.
*/
void __init rcu_init(void)
{
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}
#include <linux/module.h>
struct rcu_synchronize {
struct rcu_head head;
struct completion completion;
};
static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
/* Because of FASTCALL declaration of complete, we use this wrapper */
static void wakeme_after_rcu(struct rcu_head *head)
{
@@ -618,9 +73,6 @@ static void wakeme_after_rcu(struct rcu_head *head)
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*
* If your read-side code is not protected by rcu_read_lock(), do -not-
* use synchronize_rcu().
*/
void synchronize_rcu(void)
{
@@ -633,12 +85,54 @@ void synchronize_rcu(void)
/* Wait for it */
wait_for_completion(&rcu.completion);
}
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
EXPORT_SYMBOL_GPL(rcu_batches_completed);
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
EXPORT_SYMBOL_GPL(call_rcu);
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_rcu);
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
}
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
static void rcu_barrier_func(void *notused)
{
int cpu = smp_processor_id();
struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
atomic_inc(&rcu_barrier_cpu_count);
call_rcu(head, rcu_barrier_callback);
}
/**
* rcu_barrier - Wait until all the in-flight RCUs are complete.
*/
void rcu_barrier(void)
{
BUG_ON(in_interrupt());
/* Take cpucontrol mutex to protect against CPU hotplug */
mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 0);
/*
* The queueing of callbacks in all CPUs must be atomic with
* respect to RCU, otherwise one CPU may queue a callback,
* wait for a grace period, decrement barrier count and call
* complete(), while other CPUs have not yet queued anything.
* So, we need to make sure that grace periods cannot complete
* until all the callbacks are queued.
*/
rcu_read_lock();
on_each_cpu(rcu_barrier_func, NULL, 0, 1);
rcu_read_unlock();
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
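Aside, not part of the diff: the usual reason to call rcu_barrier() is module unload, where callbacks queued with call_rcu() must finish before the callback code and its data disappear. Sketch; example_stop_queueing() and example_cache are placeholders:

static void __exit example_exit(void)
{
	example_stop_queueing();	/* ensure no new call_rcu() after this */
	rcu_barrier();			/* wait for all in-flight callbacks */
	kmem_cache_destroy(example_cache);
}
module_exit(example_exit);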
void __init rcu_init(void)
{
__rcu_init();
}

953
kernel/rcupreempt.c Normal file
View File

@@ -0,0 +1,953 @@
/*
* Read-Copy Update mechanism for mutual exclusion, realtime implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
* With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
* for pushing me away from locks and towards counters, and
* to Suparna Bhattacharya for pushing me completely away
* from atomic instructions on the read side.
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* Design Document: http://lwn.net/Articles/253651/
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/random.h>
#include <linux/delay.h>
#include <linux/byteorder/swabb.h>
#include <linux/cpumask.h>
#include <linux/rcupreempt_trace.h>
/*
* Macro that prevents the compiler from reordering accesses, but does
* absolutely -nothing- to prevent CPUs from reordering. This is used
* only to mediate communication between mainline code and hardware
* interrupt and NMI handlers.
*/
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
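Aside, not part of the diff: a typical use of ACCESS_ONCE() is to force a fresh load on every pass of a polling loop, e.g. when an interrupt or NMI handler may flip a flag behind the compiler's back. Sketch; need_to_stop is illustrative:

static int need_to_stop;	/* written from irq/NMI context (assumption) */

static void wait_for_stop(void)
{
	while (ACCESS_ONCE(need_to_stop) == 0)
		cpu_relax();	/* each iteration re-reads need_to_stop from memory */
}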
/*
* PREEMPT_RCU data structures.
*/
/*
* GP_STAGES specifies the number of times the state machine has
* to go through the all the rcu_try_flip_states (see below)
* in a single Grace Period.
*
* GP in GP_STAGES stands for Grace Period ;)
*/
#define GP_STAGES 2
struct rcu_data {
spinlock_t lock; /* Protect rcu_data fields. */
long completed; /* Number of last completed batch. */
int waitlistcount;
struct tasklet_struct rcu_tasklet;
struct rcu_head *nextlist;
struct rcu_head **nexttail;
struct rcu_head *waitlist[GP_STAGES];
struct rcu_head **waittail[GP_STAGES];
struct rcu_head *donelist;
struct rcu_head **donetail;
long rcu_flipctr[2];
#ifdef CONFIG_RCU_TRACE
struct rcupreempt_trace trace;
#endif /* #ifdef CONFIG_RCU_TRACE */
};
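/*
 * Callback flow through the lists above, summarized for clarity:
 * call_rcu() appends to ->nextlist; on each counter flip observed by
 * __rcu_advance_callbacks(), ->nextlist moves to ->waitlist[0] and each
 * ->waitlist[i] shifts toward ->waitlist[GP_STAGES - 1], whose contents
 * are appended to ->donelist and eventually invoked from
 * rcu_process_callbacks().
 */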
/*
* States for rcu_try_flip() and friends.
*/
enum rcu_try_flip_states {
/*
* Stay here if nothing is happening. Flip the counter if something
* starts happening. Denoted by "I"
*/
rcu_try_flip_idle_state,
/*
* Wait here for all CPUs to notice that the counter has flipped. This
* prevents the old set of counters from ever being incremented once
* we leave this state, which in turn is necessary because we cannot
* test any individual counter for zero -- we can only check the sum.
* Denoted by "A".
*/
rcu_try_flip_waitack_state,
/*
* Wait here for the sum of the old per-CPU counters to reach zero.
* Denoted by "Z".
*/
rcu_try_flip_waitzero_state,
/*
* Wait here for each of the other CPUs to execute a memory barrier.
* This is necessary to ensure that these other CPUs really have
* completed executing their RCU read-side critical sections, despite
* their CPUs wildly reordering memory. Denoted by "M".
*/
rcu_try_flip_waitmb_state,
};
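/*
 * The grace-period state machine in rcu_try_flip() cycles through these
 * states in order: "I" -> "A" -> "Z" -> "M" and back to "I", advancing
 * one step at a time as each rcu_try_flip_*() helper below reports
 * completion.
 */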
struct rcu_ctrlblk {
spinlock_t fliplock; /* Protect state-machine transitions. */
long completed; /* Number of last completed batch. */
enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
the rcu state machine */
};
static DEFINE_PER_CPU(struct rcu_data, rcu_data);
static struct rcu_ctrlblk rcu_ctrlblk = {
.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
.completed = 0,
.rcu_try_flip_state = rcu_try_flip_idle_state,
};
#ifdef CONFIG_RCU_TRACE
static char *rcu_try_flip_state_names[] =
{ "idle", "waitack", "waitzero", "waitmb" };
#endif /* #ifdef CONFIG_RCU_TRACE */
static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
/*
* Enum and per-CPU flag to determine when each CPU has seen
* the most recent counter flip.
*/
enum rcu_flip_flag_values {
rcu_flip_seen, /* Steady/initial state, last flip seen. */
/* Only GP detector can update. */
rcu_flipped /* Flip just completed, need confirmation. */
/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
= rcu_flip_seen;
/*
* Enum and per-CPU flag to determine when each CPU has executed the
* needed memory barrier to fence in memory references from its last RCU
* read-side critical section in the just-completed grace period.
*/
enum rcu_mb_flag_values {
rcu_mb_done, /* Steady/initial state, no mb()s required. */
/* Only GP detector can update. */
rcu_mb_needed /* Flip just completed, need an mb(). */
/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
= rcu_mb_done;
/*
* RCU_DATA_ME: find the current CPU's rcu_data structure.
* RCU_DATA_CPU: find the specified CPU's rcu_data structure.
*/
#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
/*
* Helper macro for tracing when the appropriate rcu_data is not
* cached in a local variable, but where the CPU number is so cached.
*/
#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
/*
* Helper macro for tracing when the appropriate rcu_data is not
* cached in a local variable.
*/
#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
/*
* Helper macro for tracing when the appropriate rcu_data is pointed
* to by a local variable.
*/
#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed(void)
{
return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
void __rcu_read_lock(void)
{
int idx;
struct task_struct *t = current;
int nesting;
nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
if (nesting != 0) {
/* An earlier rcu_read_lock() covers us, just count it. */
t->rcu_read_lock_nesting = nesting + 1;
} else {
unsigned long flags;
/*
* We disable interrupts for the following reasons:
* - If we get scheduling clock interrupt here, and we
* end up acking the counter flip, it's like a promise
* that we will never increment the old counter again.
* Thus we will break that promise if that
* scheduling clock interrupt happens between the time
* we pick the .completed field and the time that we
* increment our counter.
*
* - We don't want to be preempted out here.
*
* NMIs can still occur, of course, and might themselves
* contain rcu_read_lock().
*/
local_irq_save(flags);
/*
* Outermost nesting of rcu_read_lock(), so increment
* the current counter for the current CPU. Use volatile
* casts to prevent the compiler from reordering.
*/
idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
/*
* Now that the per-CPU counter has been incremented, we
* are protected from races with rcu_read_lock() invoked
* from NMI handlers on this CPU. We can therefore safely
* increment the nesting counter, relieving further NMIs
* of the need to increment the per-CPU counter.
*/
ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
/*
* Now that we have prevented any NMIs from storing
* to the ->rcu_flipctr_idx, we can safely use it to
* remember which counter to decrement in the matching
* rcu_read_unlock().
*/
ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
local_irq_restore(flags);
}
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
void __rcu_read_unlock(void)
{
int idx;
struct task_struct *t = current;
int nesting;
nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
if (nesting > 1) {
/*
* We are still protected by the enclosing rcu_read_lock(),
* so simply decrement the counter.
*/
t->rcu_read_lock_nesting = nesting - 1;
} else {
unsigned long flags;
/*
* Disable local interrupts to prevent the grace-period
* detection state machine from seeing us half-done.
* NMIs can still occur, of course, and might themselves
* contain rcu_read_lock() and rcu_read_unlock().
*/
local_irq_save(flags);
/*
* Outermost nesting of rcu_read_unlock(), so we must
* decrement the current counter for the current CPU.
* This must be done carefully, because NMIs can
* occur at any point in this code, and any rcu_read_lock()
* and rcu_read_unlock() pairs in the NMI handlers
* must interact non-destructively with this code.
* Lots of volatile casts, and -very- careful ordering.
*
* Changes to this code, including this one, must be
* inspected, validated, and tested extremely carefully!!!
*/
/*
* First, pick up the index.
*/
idx = ACCESS_ONCE(t->rcu_flipctr_idx);
/*
* Now that we have fetched the counter index, it is
* safe to decrement the per-task RCU nesting counter.
* After this, any interrupts or NMIs will increment and
* decrement the per-CPU counters.
*/
ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
/*
* It is now safe to decrement this task's nesting count.
* NMIs that occur after this statement will route their
* rcu_read_lock() calls through this "else" clause, and
* will thus start incrementing the per-CPU counter on
* their own. They will also clobber ->rcu_flipctr_idx,
* but that is OK, since we have already fetched it.
*/
ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
local_irq_restore(flags);
}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
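/*
 * Typical reader-side usage of these primitives, via the
 * rcu_read_lock()/rcu_read_unlock() wrappers (illustrative sketch only;
 * "gp" and "do_something_with()" are placeholders, not symbols defined
 * in this file):
 *
 *	rcu_read_lock();
 *	p = rcu_dereference(gp);
 *	if (p != NULL)
 *		do_something_with(p);
 *	rcu_read_unlock();
 */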
/*
* If a global counter flip has occurred since the last time that we
* advanced callbacks, advance them. Hardware interrupts must be
* disabled when calling this function.
*/
static void __rcu_advance_callbacks(struct rcu_data *rdp)
{
int cpu;
int i;
int wlc = 0;
if (rdp->completed != rcu_ctrlblk.completed) {
if (rdp->waitlist[GP_STAGES - 1] != NULL) {
*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
rdp->donetail = rdp->waittail[GP_STAGES - 1];
RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
}
for (i = GP_STAGES - 2; i >= 0; i--) {
if (rdp->waitlist[i] != NULL) {
rdp->waitlist[i + 1] = rdp->waitlist[i];
rdp->waittail[i + 1] = rdp->waittail[i];
wlc++;
} else {
rdp->waitlist[i + 1] = NULL;
rdp->waittail[i + 1] =
&rdp->waitlist[i + 1];
}
}
if (rdp->nextlist != NULL) {
rdp->waitlist[0] = rdp->nextlist;
rdp->waittail[0] = rdp->nexttail;
wlc++;
rdp->nextlist = NULL;
rdp->nexttail = &rdp->nextlist;
RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
} else {
rdp->waitlist[0] = NULL;
rdp->waittail[0] = &rdp->waitlist[0];
}
rdp->waitlistcount = wlc;
rdp->completed = rcu_ctrlblk.completed;
}
/*
* Check to see if this CPU needs to report that it has seen
* the most recent counter flip, thereby declaring that all
* subsequent rcu_read_lock() invocations will respect this flip.
*/
cpu = raw_smp_processor_id();
if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
smp_mb(); /* Subsequent counter accesses must see new value */
per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
smp_mb(); /* Subsequent RCU read-side critical sections */
/* seen -after- acknowledgement. */
}
}
/*
* Get here when RCU is idle. Decide whether we need to
* move out of idle state, and return non-zero if so.
* "Straightforward" approach for the moment, might later
* use callback-list lengths, grace-period duration, or
* some such to determine when to exit idle state.
* Might also need a pre-idle test that does not acquire
* the lock, but let's get the simple case working first...
*/
static int
rcu_try_flip_idle(void)
{
int cpu;
RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
if (!rcu_pending(smp_processor_id())) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
return 0;
}
/*
* Do the flip.
*/
RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
/*
* Need a memory barrier so that other CPUs see the new
* counter value before they see the subsequent change of all
* the rcu_flip_flag instances to rcu_flipped.
*/
smp_mb(); /* see above block comment. */
/* Now ask each CPU for acknowledgement of the flip. */
for_each_cpu_mask(cpu, rcu_cpu_online_map)
per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
return 1;
}
/*
* Wait for CPUs to acknowledge the flip.
*/
static int
rcu_try_flip_waitack(void)
{
int cpu;
RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
return 0;
}
/*
* Make sure our checks above don't bleed into subsequent
* waiting for the sum of the counters to reach zero.
*/
smp_mb(); /* see above block comment. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
return 1;
}
/*
* Wait for collective ``last'' counter to reach zero,
* then tell all CPUs to do an end-of-grace-period memory barrier.
*/
static int
rcu_try_flip_waitzero(void)
{
int cpu;
int lastidx = !(rcu_ctrlblk.completed & 0x1);
int sum = 0;
/* Check to see if the sum of the "last" counters is zero. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
if (sum != 0) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
return 0;
}
/*
* This ensures that the other CPUs see the call for
* memory barriers -after- the sum has been observed to reach
* zero here.
*/
smp_mb(); /* ^^^^^^^^^^^^ */
/* Call for a memory barrier from each CPU. */
for_each_cpu_mask(cpu, rcu_cpu_online_map)
per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
return 1;
}
/*
* Wait for all CPUs to do their end-of-grace-period memory barrier.
* Return 1 once all CPUs have done so, and 0 otherwise.
*/
static int
rcu_try_flip_waitmb(void)
{
int cpu;
RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
return 0;
}
smp_mb(); /* Ensure that the above checks precede any following flip. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
return 1;
}
/*
* Attempt a single flip of the counters. Remember, a single flip does
* -not- constitute a grace period. Instead, the interval between
* at least GP_STAGES consecutive flips is a grace period.
*
* If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
* on a large SMP, they might want to use a hierarchical organization of
* the per-CPU-counter pairs.
*/
static void rcu_try_flip(void)
{
unsigned long flags;
RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
return;
}
/*
* Take the next transition(s) through the RCU grace-period
* flip-counter state machine.
*/
switch (rcu_ctrlblk.rcu_try_flip_state) {
case rcu_try_flip_idle_state:
if (rcu_try_flip_idle())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_waitack_state;
break;
case rcu_try_flip_waitack_state:
if (rcu_try_flip_waitack())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_waitzero_state;
break;
case rcu_try_flip_waitzero_state:
if (rcu_try_flip_waitzero())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_waitmb_state;
break;
case rcu_try_flip_waitmb_state:
if (rcu_try_flip_waitmb())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_idle_state;
}
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
/*
* Check to see if this CPU needs to do a memory barrier in order to
* ensure that any prior RCU read-side critical sections have committed
* their counter manipulations and critical-section memory references
* before declaring the grace period to be completed.
*/
static void rcu_check_mb(int cpu)
{
if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
smp_mb(); /* Ensure RCU read-side accesses are visible. */
per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
}
}
void rcu_check_callbacks(int cpu, int user)
{
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
rcu_check_mb(cpu);
if (rcu_ctrlblk.completed == rdp->completed)
rcu_try_flip();
spin_lock_irqsave(&rdp->lock, flags);
RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
__rcu_advance_callbacks(rdp);
if (rdp->donelist == NULL) {
spin_unlock_irqrestore(&rdp->lock, flags);
} else {
spin_unlock_irqrestore(&rdp->lock, flags);
raise_softirq(RCU_SOFTIRQ);
}
}
/*
* Needed by dynticks, to make sure all RCU processing has finished
* when we go idle:
*/
void rcu_advance_callbacks(int cpu, int user)
{
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
if (rcu_ctrlblk.completed == rdp->completed) {
rcu_try_flip();
if (rcu_ctrlblk.completed == rdp->completed)
return;
}
spin_lock_irqsave(&rdp->lock, flags);
RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
__rcu_advance_callbacks(rdp);
spin_unlock_irqrestore(&rdp->lock, flags);
}
#ifdef CONFIG_HOTPLUG_CPU
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
*dsttail = srclist; \
if (srclist != NULL) { \
dsttail = srctail; \
srclist = NULL; \
srctail = &srclist;\
} \
} while (0)
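/*
 * The macro above splices srclist onto *dsttail and, when srclist was
 * non-empty, advances dsttail to srctail before resetting the source
 * list to empty. Callback order is thereby preserved, which
 * rcu_barrier() depends on.
 */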
void rcu_offline_cpu(int cpu)
{
int i;
struct rcu_head *list = NULL;
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
struct rcu_head **tail = &list;
/*
* Remove all callbacks from the newly dead CPU, retaining order.
* Otherwise rcu_barrier() will fail.
*/
spin_lock_irqsave(&rdp->lock, flags);
rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
for (i = GP_STAGES - 1; i >= 0; i--)
rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
list, tail);
rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
spin_unlock_irqrestore(&rdp->lock, flags);
rdp->waitlistcount = 0;
/* Disengage the newly dead CPU from the grace-period computation. */
spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
rcu_check_mb(cpu);
if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
smp_mb(); /* Subsequent counter accesses must see new value */
per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
smp_mb(); /* Subsequent RCU read-side critical sections */
/* seen -after- acknowledgement. */
}
RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
cpu_clear(cpu, rcu_cpu_online_map);
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
/*
* Place the removed callbacks on the current CPU's queue.
* Make them all start a new grace period: simple approach,
* in theory could starve a given set of callbacks, but
* you would need to be doing some serious CPU hotplugging
* to make this happen. If this becomes a problem, adding
* a synchronize_rcu() to the hotplug path would be a simple
* fix.
*/
rdp = RCU_DATA_ME();
spin_lock_irqsave(&rdp->lock, flags);
*rdp->nexttail = list;
if (list)
rdp->nexttail = tail;
spin_unlock_irqrestore(&rdp->lock, flags);
}
void __devinit rcu_online_cpu(int cpu)
{
unsigned long flags;
spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
cpu_set(cpu, rcu_cpu_online_map);
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
void rcu_offline_cpu(int cpu)
{
}
void __devinit rcu_online_cpu(int cpu)
{
}
#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_process_callbacks(struct softirq_action *unused)
{
unsigned long flags;
struct rcu_head *next, *list;
struct rcu_data *rdp = RCU_DATA_ME();
spin_lock_irqsave(&rdp->lock, flags);
list = rdp->donelist;
if (list == NULL) {
spin_unlock_irqrestore(&rdp->lock, flags);
return;
}
rdp->donelist = NULL;
rdp->donetail = &rdp->donelist;
RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
spin_unlock_irqrestore(&rdp->lock, flags);
while (list) {
next = list->next;
list->func(list);
list = next;
RCU_TRACE_ME(rcupreempt_trace_invoke);
}
}
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = RCU_DATA_ME();
spin_lock(&rdp->lock);
__rcu_advance_callbacks(rdp);
*rdp->nexttail = head;
rdp->nexttail = &head->next;
RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
spin_unlock(&rdp->lock);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
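/*
 * Typical call_rcu() usage, shown for illustration only ("struct foo",
 * its "rcu" member, and foo_reclaim() are placeholders, not symbols in
 * this file):
 *
 *	static void foo_reclaim(struct rcu_head *rhp)
 *	{
 *		struct foo *fp = container_of(rhp, struct foo, rcu);
 *
 *		kfree(fp);
 *	}
 *
 *	call_rcu(&fp->rcu, foo_reclaim);
 */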
/*
* Wait until all currently running preempt_disable() code segments
* (including hardware-irq-disable segments) complete. Note that
* in -rt this does -not- necessarily result in all currently executing
* interrupt -handlers- having completed.
*/
void __synchronize_sched(void)
{
cpumask_t oldmask;
int cpu;
if (sched_getaffinity(0, &oldmask) < 0)
oldmask = cpu_possible_map;
for_each_online_cpu(cpu) {
sched_setaffinity(0, cpumask_of_cpu(cpu));
schedule();
}
sched_setaffinity(0, oldmask);
}
EXPORT_SYMBOL_GPL(__synchronize_sched);
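/*
 * The implementation above relies on a side effect of sched_setaffinity():
 * forcing the current task onto each online CPU in turn guarantees a
 * context switch on that CPU, and a context switch cannot happen inside a
 * preempt_disable() (or irq-disabled) region, so any such region that was
 * executing when __synchronize_sched() was called must have completed by
 * the time the loop finishes.
 */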
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. Assumes that notifiers would take care of handling any
* outstanding requests from the RCU core.
*
* This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*/
int rcu_needs_cpu(int cpu)
{
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
return (rdp->donelist != NULL ||
!!rdp->waitlistcount ||
rdp->nextlist != NULL);
}
int rcu_pending(int cpu)
{
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
/* The CPU has at least one callback queued somewhere. */
if (rdp->donelist != NULL ||
!!rdp->waitlistcount ||
rdp->nextlist != NULL)
return 1;
/* The RCU core needs an acknowledgement from this CPU. */
if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
(per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
return 1;
/* This CPU has fallen behind the global grace-period number. */
if (rdp->completed != rcu_ctrlblk.completed)
return 1;
/* Nothing needed from this CPU. */
return 0;
}
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
void __init __rcu_init(void)
{
int cpu;
int i;
struct rcu_data *rdp;
printk(KERN_NOTICE "Preemptible RCU implementation.\n");
for_each_possible_cpu(cpu) {
rdp = RCU_DATA_CPU(cpu);
spin_lock_init(&rdp->lock);
rdp->completed = 0;
rdp->waitlistcount = 0;
rdp->nextlist = NULL;
rdp->nexttail = &rdp->nextlist;
for (i = 0; i < GP_STAGES; i++) {
rdp->waitlist[i] = NULL;
rdp->waittail[i] = &rdp->waitlist[i];
}
rdp->donelist = NULL;
rdp->donetail = &rdp->donelist;
rdp->rcu_flipctr[0] = 0;
rdp->rcu_flipctr[1] = 0;
}
register_cpu_notifier(&rcu_nb);
/*
* We don't need protection against CPU-Hotplug here
* since
* a) If a CPU comes online while we are iterating over the
* cpu_online_map below, we would only end up making a
* duplicate call to rcu_online_cpu() which sets the corresponding
* CPU's mask in the rcu_cpu_online_map.
*
* b) A CPU cannot go offline at this point in time since the user
* does not have access to the sysfs interface, nor do we
* suspend the system.
*/
for_each_online_cpu(cpu)
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}
/*
* Deprecated, use synchronize_rcu() or synchronize_sched() instead.
*/
void synchronize_kernel(void)
{
synchronize_rcu();
}
#ifdef CONFIG_RCU_TRACE
long *rcupreempt_flipctr(int cpu)
{
return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
}
EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
int rcupreempt_flip_flag(int cpu)
{
return per_cpu(rcu_flip_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
int rcupreempt_mb_flag(int cpu)
{
return per_cpu(rcu_mb_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
char *rcupreempt_try_flip_state_name(void)
{
return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
{
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
return &rdp->trace;
}
EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
#endif /* #ifdef CONFIG_RCU_TRACE */

330
kernel/rcupreempt_trace.c Normal file
View File

@@ -0,0 +1,330 @@
/*
* Read-Copy Update tracing for realtime implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2006
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/rcupreempt_trace.h>
#include <linux/debugfs.h>
static struct mutex rcupreempt_trace_mutex;
static char *rcupreempt_trace_buf;
#define RCUPREEMPT_TRACE_BUF_SIZE 4096
void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
{
trace->done_length += trace->wait_length;
trace->done_add += trace->wait_length;
trace->wait_length = 0;
}
void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
{
trace->wait_length += trace->next_length;
trace->wait_add += trace->next_length;
trace->next_length = 0;
}
void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
{
atomic_inc(&trace->rcu_try_flip_1);
}
void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
{
atomic_inc(&trace->rcu_try_flip_e1);
}
void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_i1++;
}
void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_ie1++;
}
void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_g1++;
}
void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_a1++;
}
void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_ae1++;
}
void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_a2++;
}
void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_z1++;
}
void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_ze1++;
}
void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_z2++;
}
void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_m1++;
}
void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_me1++;
}
void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_m2++;
}
void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
{
trace->rcu_check_callbacks++;
}
void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
{
trace->done_remove += trace->done_length;
trace->done_length = 0;
}
void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
{
atomic_inc(&trace->done_invoked);
}
void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
{
trace->next_add++;
trace->next_length++;
}
static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
{
struct rcupreempt_trace *cp;
int cpu;
memset(sp, 0, sizeof(*sp));
for_each_possible_cpu(cpu) {
cp = rcupreempt_trace_cpu(cpu);
sp->next_length += cp->next_length;
sp->next_add += cp->next_add;
sp->wait_length += cp->wait_length;
sp->wait_add += cp->wait_add;
sp->done_length += cp->done_length;
sp->done_add += cp->done_add;
sp->done_remove += cp->done_remove;
atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
sp->rcu_check_callbacks += cp->rcu_check_callbacks;
atomic_set(&sp->rcu_try_flip_1,
atomic_read(&cp->rcu_try_flip_1));
atomic_set(&sp->rcu_try_flip_e1,
atomic_read(&cp->rcu_try_flip_e1));
sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
}
}
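/*
 * Note that the summation above reads the per-CPU counters without
 * locking, so the totals are a best-effort snapshot: counters may change
 * while the scan is in progress. That is acceptable for the debugfs
 * statistics below.
 */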
static ssize_t rcustats_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
struct rcupreempt_trace trace;
ssize_t bcount;
int cnt = 0;
rcupreempt_trace_sum(&trace);
mutex_lock(&rcupreempt_trace_mutex);
cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"ggp=%ld rcc=%ld\n",
rcu_batches_completed(),
trace.rcu_check_callbacks);
snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
"1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
"z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
trace.next_add, trace.next_length,
trace.wait_add, trace.wait_length,
trace.done_add, trace.done_length,
trace.done_remove, atomic_read(&trace.done_invoked),
atomic_read(&trace.rcu_try_flip_1),
atomic_read(&trace.rcu_try_flip_e1),
trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
trace.rcu_try_flip_g1,
trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
trace.rcu_try_flip_a2,
trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
trace.rcu_try_flip_z2,
trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
trace.rcu_try_flip_m2);
bcount = simple_read_from_buffer(buffer, count, ppos,
rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
mutex_unlock(&rcupreempt_trace_mutex);
return bcount;
}
static ssize_t rcugp_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
long oldgp = rcu_batches_completed();
ssize_t bcount;
mutex_lock(&rcupreempt_trace_mutex);
synchronize_rcu();
snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
"oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
bcount = simple_read_from_buffer(buffer, count, ppos,
rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
mutex_unlock(&rcupreempt_trace_mutex);
return bcount;
}
static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
int cnt = 0;
int cpu;
int f = rcu_batches_completed() & 0x1;
ssize_t bcount;
mutex_lock(&rcupreempt_trace_mutex);
cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
"CPU last cur F M\n");
for_each_online_cpu(cpu) {
long *flipctr = rcupreempt_flipctr(cpu);
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"%3d %4ld %3ld %d %d\n",
cpu,
flipctr[!f],
flipctr[f],
rcupreempt_flip_flag(cpu),
rcupreempt_mb_flag(cpu));
}
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"ggp = %ld, state = %s\n",
rcu_batches_completed(),
rcupreempt_try_flip_state_name());
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"\n");
bcount = simple_read_from_buffer(buffer, count, ppos,
rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
mutex_unlock(&rcupreempt_trace_mutex);
return bcount;
}
static struct file_operations rcustats_fops = {
.owner = THIS_MODULE,
.read = rcustats_read,
};
static struct file_operations rcugp_fops = {
.owner = THIS_MODULE,
.read = rcugp_read,
};
static struct file_operations rcuctrs_fops = {
.owner = THIS_MODULE,
.read = rcuctrs_read,
};
static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
static int rcupreempt_debugfs_init(void)
{
rcudir = debugfs_create_dir("rcu", NULL);
if (!rcudir)
goto out;
statdir = debugfs_create_file("rcustats", 0444, rcudir,
NULL, &rcustats_fops);
if (!statdir)
goto free_out;
gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
if (!gpdir)
goto free_out;
ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
NULL, &rcuctrs_fops);
if (!ctrsdir)
goto free_out;
return 0;
free_out:
if (statdir)
debugfs_remove(statdir);
if (gpdir)
debugfs_remove(gpdir);
debugfs_remove(rcudir);
out:
return 1;
}
static int __init rcupreempt_trace_init(void)
{
mutex_init(&rcupreempt_trace_mutex);
rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
if (!rcupreempt_trace_buf)
return 1;
return rcupreempt_debugfs_init();
}
static void __exit rcupreempt_trace_cleanup(void)
{
debugfs_remove(statdir);
debugfs_remove(gpdir);
debugfs_remove(ctrsdir);
debugfs_remove(rcudir);
kfree(rcupreempt_trace_buf);
}
module_init(rcupreempt_trace_init);
module_exit(rcupreempt_trace_cleanup);

View File

@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
cpumask_t tmp_mask = CPU_MASK_ALL;
int i;
lock_cpu_hotplug();
get_online_cpus();
/* No point in shuffling if there is only one online CPU (ex: UP) */
if (num_online_cpus() == 1) {
unlock_cpu_hotplug();
put_online_cpus();
return;
}
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
else
rcu_idle_cpu--;
unlock_cpu_hotplug();
put_online_cpus();
}
/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the

View File

@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
static struct sysdev_class rttest_sysclass = {
set_kset_name("rttest"),
.name = "rttest",
};
static int init_test_thread(int id)

File diff suppressed because it is too large

View File

@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
PN(prev_clock_raw);
P(clock_warps);
P(clock_overflows);
P(clock_underflows);
P(clock_deep_idle_events);
PN(clock_max_delta);
P(cpu_load[0]);
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_max);
PN(se.slice_max);
PN(se.wait_max);
PN(se.wait_sum);
P(se.wait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.nr_migrations_cold);
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
p->se.wait_max = 0;
p->se.wait_sum = 0;
p->se.wait_count = 0;
p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0;
p->se.block_max = 0;

View File

@@ -20,6 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*/
#include <linux/latencytop.h>
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
unsigned long nr_latency = sched_nr_latency;
if (unlikely(nr_running > nr_latency)) {
period = sysctl_sched_min_granularity;
period *= nr_running;
do_div(period, nr_latency);
}
return period;
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->wait_max, max(se->wait_max,
rq_of(cfs_rq)->clock - se->wait_start));
schedstat_set(se->wait_count, se->wait_count + 1);
schedstat_set(se->wait_sum, se->wait_sum +
rq_of(cfs_rq)->clock - se->wait_start);
schedstat_set(se->wait_start, 0);
}
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_SCHEDSTATS
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
account_scheduler_latency(tsk, delta >> 10, 1);
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
struct task_struct *tsk = task_of(se);
profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
#endif
}
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
cfs_rq->curr = NULL;
}
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued)
return resched_task(rq_of(cfs_rq)->curr);
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
#define GROUP_IMBALANCE_PCT 20
#else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
int requeue = rq->curr == p;
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
WARN_ON(task_rq(p) != rq);
if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
if (delta < 0) {
if (rq->curr == p)
resched_task(p);
return;
}
/*
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense. Rely on vruntime for fairness.
*/
if (!requeue)
delta = max(10000LL, delta);
hrtick_start(rq, delta, requeue);
}
}
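/*
 * In the function above, "slice" is the time the entity is entitled to
 * run in the current period and "ran" is how long it has run since last
 * being picked; if the slice is already exhausted (delta < 0) the task
 * is rescheduled immediately, otherwise the hrtick timer is armed to
 * fire when the slice expires, clamped to at least 10000ns when p is
 * not the task currently running on the runqueue.
 */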
#else
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
#endif
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
struct sched_entity *se = &p->se,
*topse = NULL; /* Highest schedulable entity */
int incload = 1;
for_each_sched_entity(se) {
if (se->on_rq)
topse = se;
if (se->on_rq) {
incload = 0;
break;
}
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
/* Increment cpu load if we just enqueued the first task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (incload)
inc_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
/*
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
struct sched_entity *se = &p->se,
*topse = NULL; /* Highest schedulable entity */
int decload = 1;
for_each_sched_entity(se) {
topse = se;
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
if (cfs_rq->load.weight) {
if (parent_entity(se))
decload = 0;
break;
}
sleep = 1;
}
/* Decrement cpu load if we just dequeued the last task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (decload)
dec_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
/*
@@ -835,6 +926,154 @@ static void yield_task_fair(struct rq *rq)
se->vruntime = rightmost->vruntime + 1;
}
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
cpumask_t tmp;
struct sched_domain *sd;
int i;
/*
* If it is idle, then it is the best cpu to run this task.
*
* This cpu is also the best, if it has more than one task already.
* Siblings must also be busy (in most cases), as they didn't already
* pick up the extra load from this cpu, and hence we need not check
* sibling runqueue info. This will avoid the checks and cache-miss
* penalties associated with that.
*/
if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
}
return i;
}
}
} else {
break;
}
}
return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
}
#endif
#ifdef CONFIG_SMP
static int select_task_rq_fair(struct task_struct *p, int sync)
{
int cpu, this_cpu;
struct rq *rq;
struct sched_domain *sd, *this_sd = NULL;
int new_cpu;
cpu = task_cpu(p);
rq = task_rq(p);
this_cpu = smp_processor_id();
new_cpu = cpu;
if (cpu == this_cpu)
goto out_set_cpu;
for_each_domain(this_cpu, sd) {
if (cpu_isset(cpu, sd->span)) {
this_sd = sd;
break;
}
}
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
/*
* Check for affine wakeup and passive balancing possibilities.
*/
if (this_sd) {
int idx = this_sd->wake_idx;
unsigned int imbalance;
unsigned long load, this_load;
imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
load = source_load(cpu, idx);
this_load = target_load(this_cpu, idx);
new_cpu = this_cpu; /* Wake to this CPU if we can */
if (this_sd->flags & SD_WAKE_AFFINE) {
unsigned long tl = this_load;
unsigned long tl_per_task;
/*
* Attract cache-cold tasks on sync wakeups:
*/
if (sync && !task_hot(p, rq->clock, this_sd))
goto out_set_cpu;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync)
tl -= current->se.load.weight;
if ((tl <= load &&
tl + target_load(cpu, idx) <= tl_per_task) ||
100*(tl + p->se.load.weight) <= imbalance*load) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
schedstat_inc(this_sd, ttwu_move_affine);
schedstat_inc(p, se.nr_wakeups_affine);
goto out_set_cpu;
}
}
/*
* Start passive balancing when half the imbalance_pct
* limit is reached.
*/
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(p, se.nr_wakeups_passive);
goto out_set_cpu;
}
}
}
new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
return task_of(se);
p = task_of(se);
hrtick_start_fair(rq, p);
return p;
}
/*
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr;
struct task_struct *p;
if (!cfs_rq->nr_running)
return MAX_PRIO;
curr = cfs_rq->curr;
if (!curr)
curr = __pick_next_entity(cfs_rq);
p = task_of(curr);
return p->prio;
}
#endif
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
unsigned long load_moved;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *this_cfs_rq;
long imbalance;
unsigned long maxload;
struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
unsigned long maxload, task_load, group_weight;
unsigned long thisload, per_task_load;
struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
task_load = busy_cfs_rq->load.weight;
group_weight = se->load.weight;
imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if (imbalance <= 0)
/*
* 'group_weight' is contributed by tasks of total weight
* 'task_load'. To move 'rem_load_move' worth of weight only,
* we need to move a maximum task load of:
*
* maxload = (remload / group_weight) * task_load;
*/
maxload = (rem_load_move * task_load) / group_weight;
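/*
 * Worked example (illustrative numbers only): with rem_load_move = 1024,
 * task_load = 3072 and group_weight = 2048, maxload works out to
 * 1024 * 3072 / 2048 = 1536.
 */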
if (!maxload || !task_load)
continue;
/* Don't pull more than imbalance/2 */
imbalance /= 2;
maxload = min(rem_load_move, imbalance);
per_task_load = task_load / busy_cfs_rq->nr_running;
/*
* balance_tasks will try to forcibly move at least one task if
* possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
* maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
*/
if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
continue;
*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
/* Disable priority-based load balance */
*this_best_prio = 0;
thisload = this_cfs_rq->load.weight;
#else
# define maxload rem_load_move
#endif
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator.arg = busy_cfs_rq;
rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
load_moved = balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned,
this_best_prio,
&cfs_rq_iterator);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* load_moved holds the task load that was moved. The
* effective (group) weight moved would be:
* load_moved_eff = load_moved/task_load * group_weight;
*/
load_moved = (group_weight * load_moved) / task_load;
/* Adjust shares on both cpus to reflect load_moved */
group_weight -= load_moved;
set_se_shares(se, group_weight);
se = busy_cfs_rq->tg->se[this_cpu];
if (!thisload)
group_weight = load_moved;
else
group_weight = se->load.weight + load_moved;
set_se_shares(se, group_weight);
#endif
rem_load_move -= load_moved;
if (rem_load_move <= 0)
break;
}
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
/*
* scheduler tick hitting a task of our scheduling class:
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se);
entity_tick(cfs_rq, se, queued);
}
}
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
resched_task(rq->curr);
}
/*
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
static void prio_changed_fair(struct rq *rq, struct task_struct *p,
int oldprio, int running)
{
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
}
/*
* We switched to the sched_fair class.
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p,
int running)
{
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
}
/* Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq->curr field when a task
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_wakeup,
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = {
.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_new = task_new_fair,
.prio_changed = prio_changed_fair,
.switched_to = switched_to_fair,
};
#ifdef CONFIG_SCHED_DEBUG
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
rcu_read_lock();
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
#endif

View File

@@ -5,6 +5,12 @@
* handled in sched_fair.c)
*/
#ifdef CONFIG_SMP
static int select_task_rq_idle(struct task_struct *p, int sync)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
*/
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
}
#endif
static void task_tick_idle(struct rq *rq, struct task_struct *curr)
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p,
int running)
{
/* Can this actually happen?? */
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
}
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
int oldprio, int running)
{
/* This can happen for hot-plugged CPUs */
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_idle,
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
/* no .task_new for idle tasks */
};

File diff suppressed because it is too large

View File

@@ -733,13 +733,13 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
current->comm, task_pid_nr(current), signr);
#if defined(__i386__) && !defined(__arch_um__)
printk("code at %08lx: ", regs->eip);
printk("code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
unsigned char insn;
__get_user(insn, (unsigned char *)(regs->eip + i));
__get_user(insn, (unsigned char *)(regs->ip + i));
printk("%02x ", insn);
}
}

View File

@@ -3,7 +3,9 @@
*
* Copyright (C) 1992 Linus Torvalds
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
* Distribute under GPLv2.
*
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*/
#include <linux/module.h>
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void)
*/
void irq_enter(void)
{
#ifdef CONFIG_NO_HZ
int cpu = smp_processor_id();
if (idle_cpu(cpu) && !in_interrupt())
tick_nohz_stop_idle(cpu);
#endif
__irq_enter();
#ifdef CONFIG_NO_HZ
if (idle_cpu(smp_processor_id()))
if (idle_cpu(cpu))
tick_nohz_update_jiffies();
#endif
}

View File

@@ -8,6 +8,7 @@
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
static DEFINE_PER_CPU(unsigned long, print_timestamp);
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
static int did_panic;
int softlockup_thresh = 10;
static int __read_mostly did_panic;
unsigned long __read_mostly softlockup_thresh = 60;
static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
*/
static unsigned long get_timestamp(int this_cpu)
{
return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}
void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
now = get_timestamp(this_cpu);
/* Wake up the high-prio watchdog task every second: */
if (now > (touch_timestamp + 1))
wake_up_process(per_cpu(watchdog_task, this_cpu));
/* Warn about unreasonable 10+ seconds delays: */
/* Warn about unreasonable delays: */
if (now <= (touch_timestamp + softlockup_thresh))
return;
@@ -121,12 +118,94 @@ void softlockup_tick(void)
spin_unlock(&print_lock);
}
/*
* Have a reasonable limit on the number of tasks checked:
*/
unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
/*
* Only do the hung-tasks check on one CPU:
*/
static int check_cpu __read_mostly = -1;
static void check_hung_task(struct task_struct *t, unsigned long now)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
if (t->flags & PF_FROZEN)
return;
if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
t->last_switch_count = switch_count;
t->last_switch_timestamp = now;
return;
}
if ((long)(now - t->last_switch_timestamp) <
sysctl_hung_task_timeout_secs)
return;
if (sysctl_hung_task_warnings < 0)
return;
sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
printk(KERN_ERR "INFO: task %s:%d blocked for more than "
"%ld seconds.\n", t->comm, t->pid,
sysctl_hung_task_timeout_secs);
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
__debug_show_held_locks(t);
t->last_switch_timestamp = now;
touch_nmi_watchdog();
}
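/*
 * In other words, a task is reported as hung only when its voluntary and
 * involuntary context-switch counts have not changed since the previous
 * scan, the last recorded switch is older than
 * sysctl_hung_task_timeout_secs, and the warning budget has not yet been
 * exhausted.
 */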
/*
* Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
static void check_hung_uninterruptible_tasks(int this_cpu)
{
int max_count = sysctl_hung_task_check_count;
unsigned long now = get_timestamp(this_cpu);
struct task_struct *g, *t;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if ((tainted & TAINT_DIE) || did_panic)
return;
read_lock(&tasklist_lock);
do_each_thread(g, t) {
if (!--max_count)
break;
if (t->state & TASK_UNINTERRUPTIBLE)
check_hung_task(t, now);
} while_each_thread(g, t);
read_unlock(&tasklist_lock);
}
/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 10 seconds then the
* If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
schedule();
msleep_interruptible(10000);
if (this_cpu != check_cpu)
continue;
if (sysctl_hung_task_timeout_secs)
check_hung_uninterruptible_tasks(this_cpu);
}
return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
check_cpu = any_online_cpu(cpu_online_map);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
if (hotcpu == check_cpu) {
cpumask_t temp_cpu_online_map = cpu_online_map;
cpu_clear(hotcpu, temp_cpu_online_map);
check_cpu = any_online_cpu(temp_cpu_online_map);
}
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
p = per_cpu(watchdog_task, hotcpu);

View File

@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
* even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
* not re-enabled during lock-acquire (which the preempt-spin-ops do):
*/
#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
defined(CONFIG_DEBUG_LOCK_ALLOC)
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
void __lockfunc _read_lock(rwlock_t *lock)
{

View File

@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
int ret;
/* No CPUs can come up or down during this. */
lock_cpu_hotplug();
get_online_cpus();
p = __stop_machine_run(fn, data, cpu);
if (!IS_ERR(p))
ret = kthread_stop(p);
else
ret = PTR_ERR(p);
unlock_cpu_hotplug();
put_online_cpus();
return ret;
}

View File

@@ -53,6 +53,7 @@
#ifdef CONFIG_X86
#include <asm/nmi.h>
#include <asm/stacktrace.h>
#include <asm/io.h>
#endif
static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -81,6 +82,7 @@ extern int compat_log;
extern int maps_protect;
extern int sysctl_stat_interval;
extern int audit_argv_kb;
extern int latencytop_enabled;
/* Constants used for minimum and maximum */
#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -156,8 +158,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
#endif
static struct ctl_table root_table[];
static struct ctl_table_header root_table_header =
{ root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
.ctl_table = root_table,
.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
.root = &sysctl_table_root,
};
static struct ctl_table_root sysctl_table_root = {
.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
.header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
};
static struct ctl_table kern_table[];
static struct ctl_table vm_table[];
@@ -191,14 +201,6 @@ static struct ctl_table root_table[] = {
.mode = 0555,
.child = vm_table,
},
#ifdef CONFIG_NET
{
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
.child = net_table,
},
#endif
{
.ctl_name = CTL_FS,
.procname = "fs",
@@ -306,9 +308,43 @@ static struct ctl_table kern_table[] = {
.procname = "sched_nr_migrate",
.data = &sysctl_sched_nr_migrate,
.maxlen = sizeof(unsigned int),
.mode = 644,
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_period_ms",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_ratio",
.data = &sysctl_sched_rt_ratio,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_min_bal_int_shares",
.data = &sysctl_sched_min_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_max_bal_int_shares",
.data = &sysctl_sched_max_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#endif
{
.ctl_name = CTL_UNNUMBERED,
@@ -382,6 +418,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec_taint,
},
#endif
#ifdef CONFIG_LATENCYTOP
{
.procname = "latencytop",
.data = &latencytop_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_SECURITY_CAPABILITIES
{
.procname = "cap-bound",
@@ -683,6 +728,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "io_delay_type",
.data = &io_delay_type,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#if defined(CONFIG_MMU)
{
@@ -728,13 +781,40 @@ static struct ctl_table kern_table[] = {
.ctl_name = CTL_UNNUMBERED,
.procname = "softlockup_thresh",
.data = &softlockup_thresh,
.maxlen = sizeof(int),
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &one,
.extra2 = &sixty,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
.data = &sysctl_hung_task_check_count,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_timeout_secs",
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
},
#endif
#ifdef CONFIG_COMPAT
{
@@ -1300,12 +1380,27 @@ void sysctl_head_finish(struct ctl_table_header *head)
spin_unlock(&sysctl_lock);
}
struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
static struct list_head *
lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
{
struct list_head *header_list;
header_list = &root->header_list;
if (root->lookup)
header_list = root->lookup(root, namespaces);
return header_list;
}
struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
struct ctl_table_header *prev)
{
struct ctl_table_root *root;
struct list_head *header_list;
struct ctl_table_header *head;
struct list_head *tmp;
spin_lock(&sysctl_lock);
if (prev) {
head = prev;
tmp = &prev->ctl_entry;
unuse_table(prev);
goto next;
@@ -1319,14 +1414,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
spin_unlock(&sysctl_lock);
return head;
next:
root = head->root;
tmp = tmp->next;
if (tmp == &root_table_header.ctl_entry)
break;
header_list = lookup_header_list(root, namespaces);
if (tmp != header_list)
continue;
do {
root = list_entry(root->root_list.next,
struct ctl_table_root, root_list);
if (root == &sysctl_table_root)
goto out;
header_list = lookup_header_list(root, namespaces);
} while (list_empty(header_list));
tmp = header_list->next;
}
out:
spin_unlock(&sysctl_lock);
return NULL;
}
struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
{
return __sysctl_head_next(current->nsproxy, prev);
}
void register_sysctl_root(struct ctl_table_root *root)
{
spin_lock(&sysctl_lock);
list_add_tail(&root->root_list, &sysctl_table_root.root_list);
spin_unlock(&sysctl_lock);
}
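
A sketch of how another subsystem could hook into the multi-root lookup added above; everything named example_* is hypothetical, only struct ctl_table_root, its lookup callback and register_sysctl_root() come from this change:

#include <linux/sysctl.h>
#include <linux/nsproxy.h>
#include <linux/list.h>

/*
 * A real user (e.g. per-namespace sysctls) would return a list derived
 * from @namespaces; returning the root's own list keeps this sketch
 * self-contained.
 */
static struct list_head *example_lookup(struct ctl_table_root *root,
					struct nsproxy *namespaces)
{
	return &root->header_list;
}

static struct ctl_table_root example_root = {
	.header_list	= LIST_HEAD_INIT(example_root.header_list),
	.lookup		= example_lookup,
};

static int __init example_root_init(void)
{
	register_sysctl_root(&example_root);
	return 0;
}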
#ifdef CONFIG_SYSCTL_SYSCALL
int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen)
@@ -1483,18 +1602,21 @@ static __init int sysctl_init(void)
{
int err;
sysctl_set_parent(NULL, root_table);
err = sysctl_check_table(root_table);
err = sysctl_check_table(current->nsproxy, root_table);
return 0;
}
core_initcall(sysctl_init);
/**
* register_sysctl_table - register a sysctl hierarchy
* __register_sysctl_paths - register a sysctl hierarchy
* @root: List of sysctl headers to register on
* @namespaces: Data to compute which lists of sysctl entries are visible
* @path: The path to the directory the sysctl table is in.
* @table: the top-level table structure
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. An entry with a ctl_name of 0 terminates the table.
* array. A completely 0 filled entry terminates the table.
*
* The members of the &struct ctl_table structure are used as follows:
*
@@ -1557,25 +1679,99 @@ core_initcall(sysctl_init);
* This routine returns %NULL on a failure to register, and a pointer
* to the table header on success.
*/
struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_root *root,
struct nsproxy *namespaces,
const struct ctl_path *path, struct ctl_table *table)
{
struct ctl_table_header *tmp;
tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
if (!tmp)
struct list_head *header_list;
struct ctl_table_header *header;
struct ctl_table *new, **prevp;
unsigned int n, npath;
/* Count the path components */
for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
;
/*
* For each path component, allocate a 2-element ctl_table array.
* The first array element will be filled with the sysctl entry
* for this, the second will be the sentinel (ctl_name == 0).
*
* We allocate everything in one go so that we don't have to
* worry about freeing additional memory in unregister_sysctl_table.
*/
header = kzalloc(sizeof(struct ctl_table_header) +
(2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
if (!header)
return NULL;
tmp->ctl_table = table;
INIT_LIST_HEAD(&tmp->ctl_entry);
tmp->used = 0;
tmp->unregistering = NULL;
sysctl_set_parent(NULL, table);
if (sysctl_check_table(tmp->ctl_table)) {
kfree(tmp);
new = (struct ctl_table *) (header + 1);
/* Now connect the dots */
prevp = &header->ctl_table;
for (n = 0; n < npath; ++n, ++path) {
/* Copy the procname */
new->procname = path->procname;
new->ctl_name = path->ctl_name;
new->mode = 0555;
*prevp = new;
prevp = &new->child;
new += 2;
}
*prevp = table;
header->ctl_table_arg = table;
INIT_LIST_HEAD(&header->ctl_entry);
header->used = 0;
header->unregistering = NULL;
header->root = root;
sysctl_set_parent(NULL, header->ctl_table);
if (sysctl_check_table(namespaces, header->ctl_table)) {
kfree(header);
return NULL;
}
spin_lock(&sysctl_lock);
list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
header_list = lookup_header_list(root, namespaces);
list_add_tail(&header->ctl_entry, header_list);
spin_unlock(&sysctl_lock);
return tmp;
return header;
}
/**
* register_sysctl_paths - register a sysctl table hierarchy
* @path: The path to the directory the sysctl table is in.
* @table: the top-level table structure
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_paths for more details.
*/
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
path, table);
}
/**
* register_sysctl_table - register a sysctl table hierarchy
* @table: the top-level table structure
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See register_sysctl_paths for more details.
*/
struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
{
static const struct ctl_path null_path[] = { {} };
return register_sysctl_paths(null_path, table);
}
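
As a usage sketch of the new path-based registration (the directory and value names below are invented for illustration):

#include <linux/sysctl.h>

static int example_value;

static struct ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{}
};

static const struct ctl_path example_path[] = {
	{ .procname = "kernel", .ctl_name = CTL_KERN, },
	{ .procname = "example", .ctl_name = CTL_UNNUMBERED, },
	{}
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/kernel/example/value. */
	example_header = register_sysctl_paths(example_path, example_table);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_sysctl_exit(void)
{
	unregister_sysctl_table(example_header);
}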
/**
@@ -1604,6 +1800,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
return NULL;
}
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
return NULL;
}
void unregister_sysctl_table(struct ctl_table_header * table)
{
}
@@ -2662,6 +2864,7 @@ EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
EXPORT_SYMBOL(register_sysctl_table);
EXPORT_SYMBOL(register_sysctl_paths);
EXPORT_SYMBOL(sysctl_intvec);
EXPORT_SYMBOL(sysctl_jiffies);
EXPORT_SYMBOL(sysctl_ms_jiffies);


@@ -1,6 +1,5 @@
#include <linux/stat.h>
#include <linux/sysctl.h>
#include "../arch/s390/appldata/appldata.h"
#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
#include <linux/sunrpc/debug.h>
#include <linux/string.h>
@@ -1343,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table)
}
}
static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
struct ctl_table *table)
{
struct ctl_table_header *head;
struct ctl_table *ref, *test;
@@ -1351,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
depth = sysctl_depth(table);
for (head = sysctl_head_next(NULL); head;
head = sysctl_head_next(head)) {
for (head = __sysctl_head_next(namespaces, NULL); head;
head = __sysctl_head_next(namespaces, head)) {
cur_depth = depth;
ref = head->ctl_table;
repeat:
@@ -1397,13 +1397,14 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
*fail = str;
}
static int sysctl_check_dir(struct ctl_table *table)
static int sysctl_check_dir(struct nsproxy *namespaces,
struct ctl_table *table)
{
struct ctl_table *ref;
int error;
error = 0;
ref = sysctl_check_lookup(table);
ref = sysctl_check_lookup(namespaces, table);
if (ref) {
int match = 0;
if ((!table->procname && !ref->procname) ||
@@ -1428,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table)
return error;
}
static void sysctl_check_leaf(struct ctl_table *table, const char **fail)
static void sysctl_check_leaf(struct nsproxy *namespaces,
struct ctl_table *table, const char **fail)
{
struct ctl_table *ref;
ref = sysctl_check_lookup(table);
ref = sysctl_check_lookup(namespaces, table);
if (ref && (ref != table))
set_fail(fail, table, "Sysctl already exists");
}
@@ -1456,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
}
}
int sysctl_check_table(struct ctl_table *table)
int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
{
int error = 0;
for (; table->ctl_name || table->procname; table++) {
@@ -1486,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table)
set_fail(&fail, table, "Directory with extra1");
if (table->extra2)
set_fail(&fail, table, "Directory with extra2");
if (sysctl_check_dir(table))
if (sysctl_check_dir(namespaces, table))
set_fail(&fail, table, "Inconsistent directory names");
} else {
if ((table->strategy == sysctl_data) ||
@@ -1535,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table)
if (!table->procname && table->proc_handler)
set_fail(&fail, table, "proc_handler without procname");
#endif
sysctl_check_leaf(table, &fail);
sysctl_check_leaf(namespaces, table, &fail);
}
sysctl_check_bin_path(table, &fail);
if (fail) {
@@ -1543,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table)
error = -EINVAL;
}
if (table->child)
error |= sysctl_check_table(table->child);
error |= sysctl_check_table(namespaces, table->child);
}
return error;
}

216
kernel/test_kprobes.c Normal file

@@ -0,0 +1,216 @@
/*
* test_kprobes.c - simple sanity test for *probes
*
* Copyright IBM Corp. 2008
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/random.h>
#define div_factor 3
static u32 rand1, preh_val, posth_val, jph_val;
static int errors, handler_errors, num_tests;
static noinline u32 kprobe_target(u32 value)
{
/*
* gcc ignores noinline on some architectures unless we stuff
* sufficient lard into the function. The get_kprobe() here is
* just for that.
*
* NOTE: We aren't concerned about the correctness of get_kprobe()
* here; hence, this call is neither under !preempt nor with the
* kprobe_mutex held. This is fine(tm)
*/
if (get_kprobe((void *)0xdeadbeef))
printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
return (value / div_factor);
}
static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
preh_val = (rand1 / div_factor);
return 0;
}
static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
printk(KERN_ERR "Kprobe smoke test failed: "
"incorrect value in post_handler\n");
}
posth_val = preh_val + div_factor;
}
static struct kprobe kp = {
.symbol_name = "kprobe_target",
.pre_handler = kp_pre_handler,
.post_handler = kp_post_handler
};
static int test_kprobe(void)
{
int ret;
ret = register_kprobe(&kp);
if (ret < 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"register_kprobe returned %d\n", ret);
return ret;
}
ret = kprobe_target(rand1);
unregister_kprobe(&kp);
if (preh_val == 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"kprobe post_handler not called\n");
handler_errors++;
}
return 0;
}
static u32 j_kprobe_target(u32 value)
{
if (value != rand1) {
handler_errors++;
printk(KERN_ERR "Kprobe smoke test failed: "
"incorrect value in jprobe handler\n");
}
jph_val = rand1;
jprobe_return();
return 0;
}
static struct jprobe jp = {
.entry = j_kprobe_target,
.kp.symbol_name = "kprobe_target"
};
static int test_jprobe(void)
{
int ret;
ret = register_jprobe(&jp);
if (ret < 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"register_jprobe returned %d\n", ret);
return ret;
}
ret = kprobe_target(rand1);
unregister_jprobe(&jp);
if (jph_val == 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"jprobe handler not called\n");
handler_errors++;
}
return 0;
}
#ifdef CONFIG_KRETPROBES
static u32 krph_val;
static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long ret = regs_return_value(regs);
if (ret != (rand1 / div_factor)) {
handler_errors++;
printk(KERN_ERR "Kprobe smoke test failed: "
"incorrect value in kretprobe handler\n");
}
krph_val = (rand1 / div_factor);
return 0;
}
static struct kretprobe rp = {
.handler = return_handler,
.kp.symbol_name = "kprobe_target"
};
static int test_kretprobe(void)
{
int ret;
ret = register_kretprobe(&rp);
if (ret < 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"register_kretprobe returned %d\n", ret);
return ret;
}
ret = kprobe_target(rand1);
unregister_kretprobe(&rp);
if (krph_val == 0) {
printk(KERN_ERR "Kprobe smoke test failed: "
"kretprobe handler not called\n");
handler_errors++;
}
return 0;
}
#endif /* CONFIG_KRETPROBES */
int init_test_probes(void)
{
int ret;
do {
rand1 = random32();
} while (rand1 <= div_factor);
printk(KERN_INFO "Kprobe smoke test started\n");
num_tests++;
ret = test_kprobe();
if (ret < 0)
errors++;
num_tests++;
ret = test_jprobe();
if (ret < 0)
errors++;
#ifdef CONFIG_KRETPROBES
num_tests++;
ret = test_kretprobe();
if (ret < 0)
errors++;
#endif /* CONFIG_KRETPROBES */
if (errors)
printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
"%d tests failed\n", errors, num_tests);
else if (handler_errors)
printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
"running handlers\n", handler_errors);
else
printk(KERN_INFO "Kprobe smoke test passed successfully\n");
return 0;
}


@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
{
u64 clc = ((u64) latch << evt->shift);
if (unlikely(!evt->mult)) {
evt->mult = 1;
WARN_ON(1);
}
do_div(clc, evt->mult);
if (clc < 1000)
clc = 1000;
@@ -151,6 +156,14 @@ static void clockevents_notify_released(void)
void clockevents_register_device(struct clock_event_device *dev)
{
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
}
spin_lock(&clockevents_lock);


@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
}
if (!list_empty(&watchdog_list)) {
__mod_timer(&watchdog_timer,
watchdog_timer.expires + WATCHDOG_INTERVAL);
/* Cycle through CPUs to check if the CPUs stay synchronized to
* each other. */
int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
if (next_cpu >= NR_CPUS)
next_cpu = first_cpu(cpu_online_map);
watchdog_timer.expires += WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, next_cpu);
}
spin_unlock(&watchdog_lock);
}
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (!started && watchdog) {
watchdog_last = watchdog->read();
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer(&watchdog_timer);
add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
}
} else {
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (watchdog)
del_timer(&watchdog_timer);
watchdog = cs;
init_timer(&watchdog_timer);
init_timer_deferrable(&watchdog_timer);
watchdog_timer.function = clocksource_watchdog;
/* Reset watchdog cycles */
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
watchdog_last = watchdog->read();
watchdog_timer.expires =
jiffies + WATCHDOG_INTERVAL;
add_timer(&watchdog_timer);
add_timer_on(&watchdog_timer,
first_cpu(cpu_online_map));
}
}
}
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
spin_unlock_irqrestore(&clocksource_lock, flags);
}
/**
* clocksource_unregister - remove a registered clocksource
*/
void clocksource_unregister(struct clocksource *cs)
{
unsigned long flags;
spin_lock_irqsave(&clocksource_lock, flags);
list_del(&cs->list);
if (clocksource_override == cs)
clocksource_override = NULL;
next_clocksource = select_clocksource();
spin_unlock_irqrestore(&clocksource_lock, flags);
}
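
With unregistration available, a clocksource driver can now live in a module and remove itself cleanly; a minimal sketch, where the example_* names and the placeholder read/mult/shift values are assumptions, not part of this patch:

#include <linux/clocksource.h>

static cycle_t example_read(void)
{
	return 0;	/* a real driver would read its hardware counter */
}

static struct clocksource example_cs = {
	.name	= "example",
	.rating	= 100,
	.read	= example_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.mult	= 1,	/* placeholder; compute from the counter frequency */
	.shift	= 0,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_cs_init(void)
{
	return clocksource_register(&example_cs);
}

static void __exit example_cs_exit(void)
{
	clocksource_unregister(&example_cs);
}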
#ifdef CONFIG_SYSFS
/**
* sysfs_show_current_clocksources - sysfs interface for current clocksource
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600,
sysfs_show_available_clocksources, NULL);
static struct sysdev_class clocksource_sysclass = {
set_kset_name("clocksource"),
.name = "clocksource",
};
static struct sys_device device_clocksource = {


@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
/*
* Broadcast the event to the cpus, which are set in the mask
*/
int tick_do_broadcast(cpumask_t mask)
static void tick_do_broadcast(cpumask_t mask)
{
int ret = 0, cpu = smp_processor_id();
int cpu = smp_processor_id();
struct tick_device *td;
/*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
cpu_clear(cpu, mask);
td = &per_cpu(tick_cpu_device, cpu);
td->evtdev->event_handler(td->evtdev);
ret = 1;
}
if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
cpu = first_cpu(mask);
td = &per_cpu(tick_cpu_device, cpu);
td->evtdev->broadcast(mask);
ret = 1;
}
return ret;
}
/*


@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
* Broadcasting support
*/
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern int tick_do_broadcast(cpumask_t mask);
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
extern int tick_check_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);


@@ -9,7 +9,7 @@
*
* Started by: Thomas Gleixner and Ingo Molnar
*
* For licencing details see kernel-base/COPYING
* Distribute under GPLv2.
*/
#include <linux/cpu.h>
#include <linux/err.h>
@@ -143,6 +143,44 @@ void tick_nohz_update_jiffies(void)
local_irq_restore(flags);
}
void tick_nohz_stop_idle(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
if (ts->idle_active) {
ktime_t now, delta;
now = ktime_get();
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_lastupdate = now;
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_active = 0;
}
}
static ktime_t tick_nohz_start_idle(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, delta;
now = ktime_get();
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_lastupdate = now;
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
}
ts->idle_entrytime = now;
ts->idle_active = 1;
return now;
}
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
*last_update_time = ktime_to_us(ts->idle_lastupdate);
return ktime_to_us(ts->idle_sleeptime);
}
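
The idle statistics above are intended for consumers such as cpufreq governors; a hedged sketch of turning two samples into a busy percentage (the sampling helper itself is hypothetical):

#include <linux/tick.h>

/*
 * Illustrative only: deltas over a short sampling interval fit in an
 * unsigned long, which avoids 64-bit division on 32-bit kernels.
 */
static unsigned int sample_cpu_busy(int cpu, u64 *prev_update, u64 *prev_idle)
{
	u64 update, idle;
	unsigned long update_delta, idle_delta;

	idle = get_cpu_idle_time_us(cpu, &update);

	update_delta = (unsigned long)(update - *prev_update);
	idle_delta = (unsigned long)(idle - *prev_idle);
	*prev_update = update;
	*prev_idle = idle;

	/* No idle bookkeeping ran at all: the CPU never went idle. */
	if (!update_delta)
		return 100;
	if (idle_delta > update_delta)
		idle_delta = update_delta;

	return 100 * (update_delta - idle_delta) / update_delta;
}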
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
*
@@ -153,14 +191,16 @@ void tick_nohz_update_jiffies(void)
void tick_nohz_stop_sched_tick(void)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
ktime_t last_update, expires, now;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
int cpu;
local_irq_save(flags);
cpu = smp_processor_id();
now = tick_nohz_start_idle(cpu);
ts = &per_cpu(tick_cpu_sched, cpu);
/*
@@ -192,19 +232,7 @@ void tick_nohz_stop_sched_tick(void)
}
}
now = ktime_get();
/*
* When called from irq_exit we need to account the idle sleep time
* correctly.
*/
if (ts->tick_stopped) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
}
ts->idle_entrytime = now;
ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&xtime_lock);
@@ -216,6 +244,10 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
rt_jiffies = rt_needs_cpu(cpu);
if (rt_jiffies && rt_jiffies < delta_jiffies)
delta_jiffies = rt_jiffies;
if (rcu_needs_cpu(cpu))
delta_jiffies = 1;
/*
@@ -291,7 +323,7 @@ void tick_nohz_stop_sched_tick(void)
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
goto out;
} else if(!tick_program_event(expires, 0))
} else if (!tick_program_event(expires, 0))
goto out;
/*
* We are past the event already. So we crossed a
@@ -332,23 +364,22 @@ void tick_nohz_restart_sched_tick(void)
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
unsigned long ticks;
ktime_t now, delta;
if (!ts->tick_stopped)
return;
/* Update jiffies first */
now = ktime_get();
ktime_t now;
local_irq_disable();
tick_nohz_stop_idle(cpu);
if (!ts->tick_stopped) {
local_irq_enable();
return;
}
/* Update jiffies first */
select_nohz_load_balancer(0);
now = ktime_get();
tick_do_update_jiffies64(now);
cpu_clear(cpu, nohz_cpu_mask);
/* Account the idle time */
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
/*
* We stopped the tick in idle. Update process times would miss the
* time we slept as update_process_times does only a 1 tick
@@ -502,14 +533,13 @@ static inline void tick_nohz_switch_to_nohz(void) { }
*/
#ifdef CONFIG_HIGH_RES_TIMERS
/*
* We rearm the timer until we get disabled by the idle code
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled and timer->base->cpu_base->lock held.
*/
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
int cpu = smp_processor_id();
@@ -547,15 +577,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
touch_softlockup_watchdog();
ts->idle_jiffies++;
}
/*
* update_process_times() might take tasklist_lock, hence
* drop the base lock. sched-tick hrtimers are per-CPU and
* never accessible by userspace APIs, so this is safe to do.
*/
spin_unlock(&base->lock);
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
spin_lock(&base->lock);
}
/* Do not restart, when we are in the idle loop */


@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
}
/**
* __get_realtime_clock_ts - Returns the time of day in a timespec
* getnstimeofday - Returns the time of day in a timespec
* @ts: pointer to the timespec to be set
*
* Returns the time of day in a timespec. Used by
* do_gettimeofday() and get_realtime_clock_ts().
* Returns the time of day in a timespec.
*/
static inline void __get_realtime_clock_ts(struct timespec *ts)
void getnstimeofday(struct timespec *ts)
{
unsigned long seq;
s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
timespec_add_ns(ts, nsecs);
}
/**
* getnstimeofday - Returns the time of day in a timespec
* @ts: pointer to the timespec to be set
*
* Returns the time of day in a timespec.
*/
void getnstimeofday(struct timespec *ts)
{
__get_realtime_clock_ts(ts);
}
EXPORT_SYMBOL(getnstimeofday);
/**
* do_gettimeofday - Returns the time of day in a timeval
* @tv: pointer to the timeval to be set
*
* NOTE: Users should be converted to using get_realtime_clock_ts()
* NOTE: Users should be converted to using getnstimeofday()
*/
void do_gettimeofday(struct timeval *tv)
{
struct timespec now;
__get_realtime_clock_ts(&now);
getnstimeofday(&now);
tv->tv_sec = now.tv_sec;
tv->tv_usec = now.tv_nsec/1000;
}
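
Following the updated note, a small sketch of moving a caller from do_gettimeofday() to getnstimeofday():

#include <linux/time.h>

static void timestamp_example(void)
{
	struct timeval tv;
	struct timespec ts;

	/* Old style: microsecond resolution. */
	do_gettimeofday(&tv);

	/* Preferred: nanosecond resolution, same wall-clock source. */
	getnstimeofday(&ts);
}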
@@ -198,7 +186,8 @@ static void change_clocksource(void)
clock->error = 0;
clock->xtime_nsec = 0;
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clocksource_calculate_interval(clock,
(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
tick_clock_notify();
@@ -255,7 +244,8 @@ void __init timekeeping_init(void)
ntp_clear();
clock = clocksource_get_next();
clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clocksource_calculate_interval(clock,
(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
clock->cycle_last = clocksource_read(clock);
xtime.tv_sec = sec;
@@ -335,9 +325,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
/* sysfs resume/suspend bits for timekeeping */
static struct sysdev_class timekeeping_sysclass = {
.name = "timekeeping",
.resume = timekeeping_resume,
.suspend = timekeeping_suspend,
set_kset_name("timekeeping"),
};
static struct sys_device device_timer = {


@@ -26,7 +26,7 @@
* the pid and cmdline from the owner process if applicable.
*
* Start/stop data collection:
* # echo 1[0] >/proc/timer_stats
* # echo [1|0] >/proc/timer_stats
*
* Display the information collected so far:
* # cat /proc/timer_stats


@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
typedef struct tvec_s {
struct tvec {
struct list_head vec[TVN_SIZE];
} tvec_t;
};
typedef struct tvec_root_s {
struct tvec_root {
struct list_head vec[TVR_SIZE];
} tvec_root_t;
};
struct tvec_t_base_s {
struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
unsigned long timer_jiffies;
tvec_root_t tv1;
tvec_t tv2;
tvec_t tv3;
tvec_t tv4;
tvec_t tv5;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
struct tvec tv4;
struct tvec tv5;
} ____cacheline_aligned;
typedef struct tvec_t_base_s tvec_base_t;
tvec_base_t boot_tvec_bases;
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/*
* Note that all tvec_bases is 2 byte aligned and lower bit of
* Note that all tvec_bases are 2 byte aligned and lower bit of
* base in timer_list is guaranteed to be zero. Use the LSB for
* the new flag to indicate whether the timer is deferrable
*/
#define TBASE_DEFERRABLE_FLAG (0x1)
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
}
static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
{
return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
}
static inline void timer_set_deferrable(struct timer_list *timer)
{
timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
TBASE_DEFERRABLE_FLAG));
}
static inline void
timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
{
timer->base = (tvec_base_t *)((unsigned long)(new_base) |
timer->base = (struct tvec_base *)((unsigned long)(new_base) |
tbase_get_deferrable(timer->base));
}
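
The tagging above works because struct tvec_base is cacheline aligned, so the pointer's least significant bit is always zero and can carry the deferrable flag; an illustrative, file-local round trip through the helpers (not part of the patch):

/* The flag survives a base switch because timer_set_base() preserves the
 * low bit while replacing the pointer bits. */
static void deferrable_flag_example(struct timer_list *timer,
				    struct tvec_base *new_base)
{
	timer_set_deferrable(timer);		/* sets the LSB tag  */
	timer_set_base(timer, new_base);	/* keeps the LSB tag */
	BUG_ON(!tbase_get_deferrable(timer->base));
	BUG_ON(tbase_get_base(timer->base) != new_base);
}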
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
EXPORT_SYMBOL_GPL(round_jiffies_relative);
static inline void set_running_timer(tvec_base_t *base,
static inline void set_running_timer(struct tvec_base *base,
struct timer_list *timer)
{
#ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
#endif
}
static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
* possible to set timer->base = NULL and drop the lock: the timer remains
* locked.
*/
static tvec_base_t *lock_timer_base(struct timer_list *timer,
static struct tvec_base *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
__acquires(timer->base->lock)
{
tvec_base_t *base;
struct tvec_base *base;
for (;;) {
tvec_base_t *prelock_base = timer->base;
struct tvec_base *prelock_base = timer->base;
base = tbase_get_base(prelock_base);
if (likely(base != NULL)) {
spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
int __mod_timer(struct timer_list *timer, unsigned long expires)
{
tvec_base_t *base, *new_base;
struct tvec_base *base, *new_base;
unsigned long flags;
int ret = 0;
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
tvec_base_t *base = per_cpu(tvec_bases, cpu);
struct tvec_base *base = per_cpu(tvec_bases, cpu);
unsigned long flags;
timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
*/
int del_timer(struct timer_list *timer)
{
tvec_base_t *base;
struct tvec_base *base;
unsigned long flags;
int ret = 0;
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
tvec_base_t *base;
struct tvec_base *base;
unsigned long flags;
int ret = -1;
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
static int cascade(tvec_base_t *base, tvec_t *tv, int index)
static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
/* cascade all the timers from tv up one level */
struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
* This function cascades all vectors and executes all expired timer
* vectors.
*/
static inline void __run_timers(tvec_base_t *base)
static inline void __run_timers(struct tvec_base *base)
{
struct timer_list *timer;
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base)
int preempt_count = preempt_count();
fn(data);
if (preempt_count != preempt_count()) {
printk(KERN_WARNING "huh, entered %p "
printk(KERN_ERR "huh, entered %p "
"with preempt_count %08x, exited"
" with %08x?\n",
fn, preempt_count,
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
* is used on S/390 to stop all activity when a CPU is idle.
* This function needs to be called with interrupts disabled.
*/
static unsigned long __next_timer_interrupt(tvec_base_t *base)
static unsigned long __next_timer_interrupt(struct tvec_base *base)
{
unsigned long timer_jiffies = base->timer_jiffies;
unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
int index, slot, array, found = 0;
struct timer_list *nte;
tvec_t *varray[4];
struct tvec *varray[4];
/* Look for timer events in tv1. */
index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
varray[3] = &base->tv5;
for (array = 0; array < 4; array++) {
tvec_t *varp = varray[array];
struct tvec *varp = varray[array];
index = slot = timer_jiffies & TVN_MASK;
do {
@@ -795,7 +793,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
*/
unsigned long get_next_timer_interrupt(unsigned long now)
{
tvec_base_t *base = __get_cpu_var(tvec_bases);
struct tvec_base *base = __get_cpu_var(tvec_bases);
unsigned long expires;
spin_lock(&base->lock);
@@ -894,9 +892,9 @@ static inline void calc_load(unsigned long ticks)
*/
static void run_timer_softirq(struct softirq_action *h)
{
tvec_base_t *base = __get_cpu_var(tvec_bases);
struct tvec_base *base = __get_cpu_var(tvec_bases);
hrtimer_run_queues();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@@ -907,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h)
*/
void run_local_timers(void)
{
hrtimer_run_queues();
raise_softirq(TIMER_SOFTIRQ);
softlockup_tick();
}
@@ -1222,7 +1221,7 @@ static struct lock_class_key base_lock_keys[NR_CPUS];
static int __cpuinit init_timers_cpu(int cpu)
{
int j;
tvec_base_t *base;
struct tvec_base *base;
static char __cpuinitdata tvec_base_done[NR_CPUS];
if (!tvec_base_done[cpu]) {
@@ -1277,7 +1276,7 @@ static int __cpuinit init_timers_cpu(int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
{
struct timer_list *timer;
@@ -1291,8 +1290,8 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
static void __cpuinit migrate_timers(int cpu)
{
tvec_base_t *old_base;
tvec_base_t *new_base;
struct tvec_base *old_base;
struct tvec_base *new_base;
int i;
BUG_ON(cpu_online(cpu));


@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { }
#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
static DEFINE_MUTEX(uids_mutex);
static inline void uids_mutex_lock(void)
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void)
mutex_unlock(&uids_mutex);
}
/* return cpu shares held by the user */
static ssize_t cpu_shares_show(struct kset *kset, char *buffer)
/* uid directory attributes */
static ssize_t cpu_shares_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
struct user_struct *up = container_of(kset, struct user_struct, kset);
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
}
/* modify cpu shares held by the user */
static ssize_t cpu_shares_store(struct kset *kset, const char *buffer,
size_t size)
static ssize_t cpu_shares_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t size)
{
struct user_struct *up = container_of(kset, struct user_struct, kset);
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
unsigned long shares;
int rc;
sscanf(buffer, "%lu", &shares);
sscanf(buf, "%lu", &shares);
rc = sched_group_set_shares(up->tg, shares);
return (rc ? rc : size);
}
static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
static struct kobj_attribute cpu_share_attr =
__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
/* default attributes per uid directory */
static struct attribute *uids_attributes[] = {
&cpu_share_attr.attr,
NULL
};
/* the lifetime of user_struct is not managed by the core (now) */
static void uids_release(struct kobject *kobj)
{
sa->attr.name = name;
sa->attr.mode = mode;
sa->show = cpu_shares_show;
sa->store = cpu_shares_store;
return;
}
/* Create "/sys/kernel/uids/<uid>" directory and
* "/sys/kernel/uids/<uid>/cpu_share" file for this user.
*/
static int user_kobject_create(struct user_struct *up)
static struct kobj_type uids_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.default_attrs = uids_attributes,
.release = uids_release,
};
/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
static int uids_user_create(struct user_struct *up)
{
struct kset *kset = &up->kset;
struct kobject *kobj = &kset->kobj;
struct kobject *kobj = &up->kobj;
int error;
memset(kset, 0, sizeof(struct kset));
kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
kobject_set_name(kobj, "%d", up->uid);
kset_init(kset);
user_attr_init(&up->user_attr, "cpu_share", 0644);
error = kobject_add(kobj);
if (error)
memset(kobj, 0, sizeof(struct kobject));
kobj->kset = uids_kset;
error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
if (error) {
kobject_put(kobj);
goto done;
error = sysfs_create_file(kobj, &up->user_attr.attr);
if (error)
kobject_del(kobj);
}
kobject_uevent(kobj, KOBJ_ADD);
done:
return error;
}
/* create these in sysfs filesystem:
/* create these entries in sysfs:
* "/sys/kernel/uids" directory
* "/sys/kernel/uids/0" directory (for root user)
* "/sys/kernel/uids/0/cpu_share" file (for root user)
*/
int __init uids_kobject_init(void)
int __init uids_sysfs_init(void)
{
int error;
uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
if (!uids_kset)
return -ENOMEM;
/* create under /sys/kernel dir */
uids_kobject.parent = &kernel_subsys.kobj;
uids_kobject.kset = &kernel_subsys;
kobject_set_name(&uids_kobject, "uids");
kobject_init(&uids_kobject);
error = kobject_add(&uids_kobject);
if (!error)
error = user_kobject_create(&root_user);
return error;
return uids_user_create(&root_user);
}
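
With kobj_attribute in place, further per-uid files would just be extra entries in the attribute list; a hypothetical read-only attribute, shown only as a sketch of the pattern the code above now uses (not part of this patch):

/* Hypothetical extra per-uid attribute, file-local to user.c. */
static ssize_t uid_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	struct user_struct *up = container_of(kobj, struct user_struct, kobj);

	return sprintf(buf, "%u\n", up->uid);
}

static struct kobj_attribute uid_attr = __ATTR(uid, 0444, uid_show, NULL);

/* It would then sit alongside cpu_share_attr in the default list: */
static struct attribute *uids_attributes_example[] = {
	&cpu_share_attr.attr,
	&uid_attr.attr,
	NULL
};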
/* work function to remove sysfs directory for a user and free up
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void)
static void remove_user_sysfs_dir(struct work_struct *w)
{
struct user_struct *up = container_of(w, struct user_struct, work);
struct kobject *kobj = &up->kset.kobj;
unsigned long flags;
int remove_user = 0;
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w)
if (!remove_user)
goto done;
sysfs_remove_file(kobj, &up->user_attr.attr);
kobject_uevent(kobj, KOBJ_REMOVE);
kobject_del(kobj);
kobject_uevent(&up->kobj, KOBJ_REMOVE);
kobject_del(&up->kobj);
kobject_put(&up->kobj);
sched_destroy_user(up);
key_put(up->uid_keyring);
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
static inline int user_kobject_create(struct user_struct *up) { return 0; }
int uids_sysfs_init(void) { return 0; }
static inline int uids_user_create(struct user_struct *up) { return 0; }
static inline void uids_mutex_lock(void) { }
static inline void uids_mutex_unlock(void) { }
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up)
struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
{
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
struct user_struct *up, *new;
/* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
* atomic.
*/
uids_mutex_lock();
@@ -334,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_unlock_irq(&uidhash_lock);
if (!up) {
struct user_struct *new;
new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
if (!new) {
uids_mutex_unlock();
return NULL;
}
if (!new)
goto out_unlock;
new->uid = uid;
atomic_set(&new->__count, 1);
@@ -356,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
#endif
new->locked_shm = 0;
if (alloc_uid_keyring(new, current) < 0) {
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
if (alloc_uid_keyring(new, current) < 0)
goto out_free_user;
if (sched_create_user(new) < 0) {
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
if (sched_create_user(new) < 0)
goto out_put_keys;
if (user_kobject_create(new)) {
sched_destroy_user(new);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
if (uids_user_create(new))
goto out_destroy_sched;
/*
* Before adding this, check whether we raced
@@ -405,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
uids_mutex_unlock();
return up;
out_destroy_sched:
sched_destroy_user(new);
out_put_keys:
key_put(new->uid_keyring);
key_put(new->session_keyring);
out_free_user:
kmem_cache_free(uid_cachep, new);
out_unlock:
uids_mutex_unlock();
return NULL;
}
void switch_uid(struct user_struct *new_user)


@@ -67,9 +67,8 @@ struct workqueue_struct {
#endif
};
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
threads to each one as cpus come/go. */
static DEFINE_MUTEX(workqueue_mutex);
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
* Returns zero on success.
* Returns -ve errno on failure.
*
* Appears to be racy against CPU hotplug.
*
* schedule_on_each_cpu() is very slow.
*/
int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;
preempt_disable(); /* CPU hotplug */
get_online_cpus();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
preempt_enable();
flush_workqueue(keventd_wq);
put_online_cpus();
free_percpu(works);
return 0;
}
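
A usage sketch for schedule_on_each_cpu(); the work function is illustrative. The call blocks until every online CPU has run it, which is why the comment above notes it is slow:

#include <linux/workqueue.h>
#include <linux/smp.h>

static void announce_cpu(struct work_struct *work)
{
	printk(KERN_INFO "keventd callback on CPU %d\n", smp_processor_id());
}

static int run_everywhere(void)
{
	/* Returns 0 on success, -ENOMEM if the per-cpu work allocation fails. */
	return schedule_on_each_cpu(announce_cpu);
}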
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, singlethread_cpu);
start_workqueue_thread(cwq, -1);
} else {
mutex_lock(&workqueue_mutex);
get_online_cpus();
spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
spin_unlock(&workqueue_lock);
for_each_possible_cpu(cpu) {
cwq = init_cpu_workqueue(wq, cpu);
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, cpu);
start_workqueue_thread(cwq, cpu);
}
mutex_unlock(&workqueue_mutex);
put_online_cpus();
}
if (err) {
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
/*
* Our caller is either destroy_workqueue() or CPU_DEAD,
* workqueue_mutex protects cwq->thread
* get_online_cpus() protects cwq->thread.
*/
if (cwq->thread == NULL)
return;
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct cpu_workqueue_struct *cwq;
int cpu;
mutex_lock(&workqueue_mutex);
get_online_cpus();
spin_lock(&workqueue_lock);
list_del(&wq->list);
mutex_unlock(&workqueue_mutex);
spin_unlock(&workqueue_lock);
put_online_cpus();
for_each_cpu_mask(cpu, *cpu_map) {
cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
action &= ~CPU_TASKS_FROZEN;
switch (action) {
case CPU_LOCK_ACQUIRE:
mutex_lock(&workqueue_mutex);
return NOTIFY_OK;
case CPU_LOCK_RELEASE:
mutex_unlock(&workqueue_mutex);
return NOTIFY_OK;
case CPU_UP_PREPARE:
cpu_set(cpu, cpu_populated_map);
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
if (!create_workqueue_thread(cwq, cpu))
break;
printk(KERN_ERR "workqueue for %i failed\n", cpu);
printk(KERN_ERR "workqueue [%s] for %i failed\n",
wq->name, cpu);
return NOTIFY_BAD;
case CPU_ONLINE: