Merge commit 'v2.6.29-rc1' into core/urgent

This commit is contained in:
Ingo Molnar
2009-01-11 03:41:39 +01:00
1595 changed files with 116589 additions and 30482 deletions

View File

@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
async.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files

335
kernel/async.c Normal file
View File

@@ -0,0 +1,335 @@
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
/*
Goals and Theory of Operation
The primary goal of this feature is to reduce the kernel boot time,
by doing various independent hardware delays and discovery operations
decoupled and not strictly serialized.
More specifically, the asynchronous function call concept allows
certain operations (primarily during system boot) to happen
asynchronously, out of order, while these operations still
have their externally visible parts happen sequentially and in-order.
(not unlike how out-of-order CPUs retire their instructions in order)
Key to the asynchronous function call implementation is the concept of
a "sequence cookie" (which, although it has an abstracted type, can be
thought of as a monotonically incrementing number).
The async core will assign each scheduled event such a sequence cookie and
pass this to the called functions.
The asynchronously called function should before doing a globally visible
operation, such as registering device numbers, call the
async_synchronize_cookie() function and pass in its own cookie. The
async_synchronize_cookie() function will make sure that all asynchronous
operations that were scheduled prior to the operation corresponding with the
cookie have completed.
Subsystem/driver initialization code that scheduled asynchronous probe
functions, but which shares global resources with other drivers/subsystems
that do not use the asynchronous call feature, need to do a full
synchronization with the async_synchronize_full() function, before returning
from their init function. This is to maintain strict ordering between the
asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
#define MAX_THREADS 256
#define MAX_WORK 32768
static LIST_HEAD(async_pending);
static LIST_HEAD(async_running);
static DEFINE_SPINLOCK(async_lock);
static int async_enabled = 0;
struct async_entry {
struct list_head list;
async_cookie_t cookie;
async_func_ptr *func;
void *data;
struct list_head *running;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static DECLARE_WAIT_QUEUE_HEAD(async_new);
static atomic_t entry_count;
static atomic_t thread_count;
extern int initcall_debug;
/*
* MUST be called with the lock held!
*/
static async_cookie_t __lowest_in_progress(struct list_head *running)
{
struct async_entry *entry;
if (!list_empty(&async_pending)) {
entry = list_first_entry(&async_pending,
struct async_entry, list);
return entry->cookie;
} else if (!list_empty(running)) {
entry = list_first_entry(running,
struct async_entry, list);
return entry->cookie;
} else {
/* nothing in progress... next_cookie is "infinity" */
return next_cookie;
}
}
/*
* pick the first pending entry and run it
*/
static void run_one_entry(void)
{
unsigned long flags;
struct async_entry *entry;
ktime_t calltime, delta, rettime;
/* 1) pick one task from the pending queue */
spin_lock_irqsave(&async_lock, flags);
if (list_empty(&async_pending))
goto out;
entry = list_first_entry(&async_pending, struct async_entry, list);
/* 2) move it to the running queue */
list_del(&entry->list);
list_add_tail(&entry->list, &async_running);
spin_unlock_irqrestore(&async_lock, flags);
/* 3) run it (and print duration)*/
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
entry->func, ktime_to_ns(delta) >> 10);
}
/* 4) remove it from the running queue */
spin_lock_irqsave(&async_lock, flags);
list_del(&entry->list);
/* 5) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* 6) wake up any waiters. */
wake_up(&async_done);
return;
out:
spin_unlock_irqrestore(&async_lock, flags);
}
static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
/*
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
ptr(data, newcookie);
return newcookie;
}
entry->func = ptr;
entry->data = data;
entry->running = running;
spin_lock_irqsave(&async_lock, flags);
newcookie = entry->cookie = next_cookie++;
list_add_tail(&entry->list, &async_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
wake_up(&async_new);
return newcookie;
}
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
{
return __async_schedule(ptr, data, &async_pending);
}
EXPORT_SYMBOL_GPL(async_schedule);
async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
{
return __async_schedule(ptr, data, running);
}
EXPORT_SYMBOL_GPL(async_schedule_special);
void async_synchronize_full(void)
{
do {
async_synchronize_cookie(next_cookie);
} while (!list_empty(&async_running) || !list_empty(&async_pending));
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
void async_synchronize_full_special(struct list_head *list)
{
async_synchronize_cookie_special(next_cookie, list);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_special);
void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
{
ktime_t starttime, delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, __lowest_in_progress(running) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
printk("async_continuing @ %i after %lli usec\n",
task_pid_nr(current), ktime_to_ns(delta) >> 10);
}
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
void async_synchronize_cookie(async_cookie_t cookie)
{
async_synchronize_cookie_special(cookie, &async_running);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
static int async_thread(void *unused)
{
DECLARE_WAITQUEUE(wq, current);
add_wait_queue(&async_new, &wq);
while (!kthread_should_stop()) {
int ret = HZ;
set_current_state(TASK_INTERRUPTIBLE);
/*
* check the list head without lock.. false positives
* are dealt with inside run_one_entry() while holding
* the lock.
*/
rmb();
if (!list_empty(&async_pending))
run_one_entry();
else
ret = schedule_timeout(HZ);
if (ret == 0) {
/*
* we timed out, this means we as thread are redundant.
* we sign off and die, but we to avoid any races there
* is a last-straw check to see if work snuck in.
*/
atomic_dec(&thread_count);
wmb(); /* manager must see our departure first */
if (list_empty(&async_pending))
break;
/*
* woops work came in between us timing out and us
* signing off; we need to stay alive and keep working.
*/
atomic_inc(&thread_count);
}
}
remove_wait_queue(&async_new, &wq);
return 0;
}
static int async_manager_thread(void *unused)
{
DECLARE_WAITQUEUE(wq, current);
add_wait_queue(&async_new, &wq);
while (!kthread_should_stop()) {
int tc, ec;
set_current_state(TASK_INTERRUPTIBLE);
tc = atomic_read(&thread_count);
rmb();
ec = atomic_read(&entry_count);
while (tc < ec && tc < MAX_THREADS) {
kthread_run(async_thread, NULL, "async/%i", tc);
atomic_inc(&thread_count);
tc++;
}
schedule();
}
remove_wait_queue(&async_new, &wq);
return 0;
}
static int __init async_init(void)
{
if (async_enabled)
kthread_run(async_manager_thread, NULL, "async/mgr");
return 0;
}
static int __init setup_async(char *str)
{
async_enabled = 1;
return 1;
}
__setup("fastboot", setup_async);
core_initcall(async_init);

View File

@@ -84,7 +84,7 @@ struct cgroupfs_root {
/* Tracks how many cgroups are currently defined in hierarchy.*/
int number_of_cgroups;
/* A list running through the mounted hierarchies */
/* A list running through the active hierarchies */
struct list_head root_list;
/* Hierarchy-specific flags */
@@ -148,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)
/* for_each_root() allows you to iterate across the active hierarchies */
#define for_each_root(_root) \
/* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
/* the list of cgroups eligible for automatic release. Protected by
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
rcu_read_lock();
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup *cgrp = cg->subsys[i]->cgroup;
struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
@@ -384,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
* @cg: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_cg_links,
struct css_set *cg, struct cgroup *cgrp)
{
struct cg_cgroup_link *link;
BUG_ON(list_empty(tmp_cg_links));
link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
cgrp_link_list);
link->cg = cg;
list_move(&link->cgrp_link_list, &cgrp->css_sets);
list_add(&link->cg_link_list, &cg->cg_links);
}
/*
* find_css_set() takes an existing cgroup group and a
* cgroup object, and returns a css_set object that's
@@ -399,7 +418,6 @@ static struct css_set *find_css_set(
int i;
struct list_head tmp_cg_links;
struct cg_cgroup_link *link;
struct hlist_head *hhead;
@@ -444,26 +462,11 @@ static struct css_set *find_css_set(
* only do it for the first subsystem in each
* hierarchy
*/
if (ss->root->subsys_list.next == &ss->sibling) {
BUG_ON(list_empty(&tmp_cg_links));
link = list_entry(tmp_cg_links.next,
struct cg_cgroup_link,
cgrp_link_list);
list_del(&link->cgrp_link_list);
list_add(&link->cgrp_link_list, &cgrp->css_sets);
link->cg = res;
list_add(&link->cg_link_list, &res->cg_links);
}
}
if (list_empty(&rootnode.subsys_list)) {
link = list_entry(tmp_cg_links.next,
struct cg_cgroup_link,
cgrp_link_list);
list_del(&link->cgrp_link_list);
list_add(&link->cgrp_link_list, &dummytop->css_sets);
link->cg = res;
list_add(&link->cg_link_list, &res->cg_links);
if (ss->root->subsys_list.next == &ss->sibling)
link_css_set(&tmp_cg_links, res, cgrp);
}
if (list_empty(&rootnode.subsys_list))
link_css_set(&tmp_cg_links, res, dummytop);
BUG_ON(!list_empty(&tmp_cg_links));
@@ -586,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
if (ss->pre_destroy)
ss->pre_destroy(ss, cgrp);
return;
}
static void free_cgroup_rcu(struct rcu_head *obj)
{
struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
kfree(cgrp);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -610,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
/*
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss) {
if (cgrp->subsys[ss->subsys_id])
ss->destroy(ss, cgrp);
}
for_each_subsys(cgrp->root, ss)
ss->destroy(ss, cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
/* Drop the active superblock reference that we took when we
* created the cgroup */
/*
* Drop the active superblock reference that we took when we
* created the cgroup
*/
deactivate_super(cgrp->root->sb);
kfree(cgrp);
call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
}
iput(inode);
}
@@ -712,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
mutex_lock(&ss->hierarchy_mutex);
cgrp->subsys[i] = dummytop->subsys[i];
cgrp->subsys[i]->cgroup = cgrp;
list_add(&ss->sibling, &root->subsys_list);
rcu_assign_pointer(ss->root, root);
list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
ss->bind(ss, cgrp);
mutex_unlock(&ss->hierarchy_mutex);
} else if (bit & removed_bits) {
/* We're removing this subsystem */
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
mutex_lock(&ss->hierarchy_mutex);
if (ss->bind)
ss->bind(ss, dummytop);
dummytop->subsys[i]->cgroup = dummytop;
cgrp->subsys[i] = NULL;
rcu_assign_pointer(subsys[i]->root, &rootnode);
list_del(&ss->sibling);
subsys[i]->root = &rootnode;
list_move(&ss->sibling, &rootnode.subsys_list);
mutex_unlock(&ss->hierarchy_mutex);
} else if (bit & final_bits) {
/* Subsystem state should already exist */
BUG_ON(!cgrp->subsys[i]);
@@ -990,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
root = NULL;
} else {
/* New superblock */
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup *root_cgrp = &root->top_cgroup;
struct inode *inode;
int i;
@@ -1031,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
list_add(&root->root_list, &roots);
root_count++;
sb->s_root->d_fsdata = &root->top_cgroup;
sb->s_root->d_fsdata = root_cgrp;
root->top_cgroup.dentry = sb->s_root;
/* Link the top cgroup in this hierarchy into all
@@ -1042,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
struct hlist_node *node;
struct css_set *cg;
hlist_for_each_entry(cg, node, hhead, hlist) {
struct cg_cgroup_link *link;
BUG_ON(list_empty(&tmp_cg_links));
link = list_entry(tmp_cg_links.next,
struct cg_cgroup_link,
cgrp_link_list);
list_del(&link->cgrp_link_list);
link->cg = cg;
list_add(&link->cgrp_link_list,
&root->top_cgroup.css_sets);
list_add(&link->cg_link_list, &cg->cg_links);
}
hlist_for_each_entry(cg, node, hhead, hlist)
link_css_set(&tmp_cg_links, cg, root_cgrp);
}
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
BUG_ON(!list_empty(&cgrp->sibling));
BUG_ON(!list_empty(&cgrp->children));
BUG_ON(!list_empty(&root_cgrp->sibling));
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
cgroup_populate_dir(cgrp);
cgroup_populate_dir(root_cgrp);
mutex_unlock(&inode->i_mutex);
mutex_unlock(&cgroup_mutex);
}
@@ -1113,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
}
write_unlock(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
root_count--;
}
list_del(&root->root_list);
root_count--;
mutex_unlock(&cgroup_mutex);
kfree(root);
@@ -1145,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
* Called with cgroup_mutex held. Writes path of cgroup into buf.
* Returns 0 on success, -errno on error.
* Called with cgroup_mutex held or else with an RCU-protected cgroup
* reference. Writes path of cgroup into buf. Returns 0 on success,
* -errno on error.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
char *start;
struct dentry *dentry = rcu_dereference(cgrp->dentry);
if (cgrp == dummytop) {
if (!dentry || cgrp == dummytop) {
/*
* Inactive subsystems have no dentry for their root
* cgroup
@@ -1165,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
*--start = '\0';
for (;;) {
int len = cgrp->dentry->d_name.len;
int len = dentry->d_name.len;
if ((start -= len) < buf)
return -ENAMETOOLONG;
memcpy(start, cgrp->dentry->d_name.name, len);
cgrp = cgrp->parent;
if (!cgrp)
break;
dentry = rcu_dereference(cgrp->dentry);
if (!cgrp->parent)
continue;
if (--start < buf)
@@ -1216,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
int retval = 0;
struct cgroup_subsys *ss;
struct cgroup *oldcgrp;
struct css_set *cg = tsk->cgroups;
struct css_set *cg;
struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
int subsys_id;
@@ -1236,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
task_lock(tsk);
cg = tsk->cgroups;
get_css_set(cg);
task_unlock(tsk);
/*
* Locate or allocate a new css_set for this task,
* based on its final set of cgroups
*/
newcg = find_css_set(cg, cgrp);
put_css_set(cg);
if (!newcg)
return -ENOMEM;
@@ -1445,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
if (!cft || cgroup_is_removed(cgrp))
if (cgroup_is_removed(cgrp))
return -ENODEV;
if (cft->write)
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1490,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
if (!cft || cgroup_is_removed(cgrp))
if (cgroup_is_removed(cgrp))
return -ENODEV;
if (cft->read)
@@ -1554,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
err = generic_file_open(inode, file);
if (err)
return err;
cft = __d_cft(file->f_dentry);
if (!cft)
return -ENODEV;
if (cft->read_map || cft->read_seq_string) {
struct cgroup_seqfile_state *state =
kzalloc(sizeof(*state), GFP_USER);
@@ -1671,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
if (!error) {
dentry->d_fsdata = cgrp;
inc_nlink(parent->d_inode);
cgrp->dentry = dentry;
rcu_assign_pointer(cgrp->dentry, dentry);
dget(dentry);
}
dput(dentry);
@@ -1812,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
{
struct task_struct *res;
struct list_head *l = it->task;
struct cg_cgroup_link *link;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cg_link)
@@ -1819,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
if (l == &res->cgroups->tasks) {
link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
if (l == &link->cg->tasks) {
/* We reached the end of this task list - move on to
* the next cg_cgroup_link */
cgroup_advance_iter(cgrp, it);
@@ -2013,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
*/
static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
{
int n = 0;
int n = 0, pid;
struct cgroup_iter it;
struct task_struct *tsk;
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
if (unlikely(n == npids))
break;
pidarray[n++] = task_pid_vnr(tsk);
pid = task_pid_vnr(tsk);
if (pid > 0)
pidarray[n++] = pid;
}
cgroup_iter_end(cgrp, &it);
return n;
@@ -2052,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
ret = 0;
cgrp = dentry->d_fsdata;
rcu_read_lock();
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2077,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
cgroup_iter_end(cgrp, &it);
rcu_read_unlock();
err:
return ret;
}
@@ -2324,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup *cgrp)
{
css->cgroup = cgrp;
atomic_set(&css->refcnt, 0);
atomic_set(&css->refcnt, 1);
css->flags = 0;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
@@ -2332,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
cgrp->subsys[ss->subsys_id] = css;
}
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
/* We need to take each hierarchy_mutex in a consistent order */
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->root == root)
mutex_lock_nested(&ss->hierarchy_mutex, i);
}
}
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
{
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->root == root)
mutex_unlock(&ss->hierarchy_mutex);
}
}
/*
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
@@ -2380,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
init_cgroup_css(css, ss, cgrp);
}
cgroup_lock_hierarchy(root);
list_add(&cgrp->sibling, &cgrp->parent->children);
cgroup_unlock_hierarchy(root);
root->number_of_cgroups++;
err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2431,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
{
/* Check the reference count on each subsystem. Since we
* already established that there are no tasks in the
* cgroup, if the css refcount is also 0, then there should
* cgroup, if the css refcount is also 1, then there should
* be no outstanding references, so the subsystem is safe to
* destroy. We scan across all subsystems rather than using
* the per-hierarchy linked list of mounted subsystems since
@@ -2452,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway. */
if (css && atomic_read(&css->refcnt))
if (css && (atomic_read(&css->refcnt) > 1))
return 1;
}
return 0;
}
/*
* Atomically mark all (or else none) of the cgroup's CSS objects as
* CSS_REMOVED. Return true on success, or false if the cgroup has
* busy subsystems. Call with cgroup_mutex held
*/
static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
unsigned long flags;
bool failed = false;
local_irq_save(flags);
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
int refcnt;
do {
/* We can only remove a CSS with a refcnt==1 */
refcnt = atomic_read(&css->refcnt);
if (refcnt > 1) {
failed = true;
goto done;
}
BUG_ON(!refcnt);
/*
* Drop the refcnt to 0 while we check other
* subsystems. This will cause any racing
* css_tryget() to spin until we set the
* CSS_REMOVED bits or abort
*/
} while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
}
done:
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
if (failed) {
/*
* Restore old refcnt if we previously managed
* to clear it from 1 to 0
*/
if (!atomic_read(&css->refcnt))
atomic_set(&css->refcnt, 1);
} else {
/* Commit the fact that the CSS is removed */
set_bit(CSS_REMOVED, &css->flags);
}
}
local_irq_restore(flags);
return !failed;
}
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
struct super_block *sb;
struct cgroupfs_root *root;
/* the vfs holds both inode->i_mutex already */
@@ -2487,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
root = cgrp->root;
sb = root->sb;
if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
|| cgroup_has_css_refs(cgrp)) {
|| !cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
@@ -2502,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
if (!list_empty(&cgrp->release_list))
list_del(&cgrp->release_list);
spin_unlock(&release_list_lock);
/* delete my sibling from parent->children */
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
list_del(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
spin_unlock(&d->d_lock);
@@ -2525,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
css = ss->create(ss, dummytop);
/* We don't handle early failures gracefully */
@@ -2544,6 +2629,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
mutex_init(&ss->hierarchy_mutex);
ss->active = 1;
}
@@ -2562,7 +2648,6 @@ int __init cgroup_init_early(void)
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&rootnode);
list_add(&rootnode.root_list, &roots);
root_count = 1;
init_task.cgroups = &init_css_set;
@@ -2669,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
mutex_lock(&cgroup_mutex);
for_each_root(root) {
for_each_active_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
int subsys_id;
int count = 0;
/* Skip this hierarchy if it has no active subsystems */
if (!root->actual_subsys_bits)
continue;
seq_printf(m, "%lu:", root->subsys_bits);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2800,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
task_lock(child);
if (list_empty(&child->cg_list))
list_add(&child->cg_list, &child->cgroups->tasks);
task_unlock(child);
write_unlock(&css_set_lock);
}
}
@@ -2907,6 +2991,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&cgroup_mutex);
return 0;
}
task_lock(tsk);
cg = tsk->cgroups;
parent = task_cgroup(tsk, subsys->subsys_id);
@@ -2919,6 +3004,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
/* Keep the cgroup alive */
get_css_set(cg);
task_unlock(tsk);
mutex_unlock(&cgroup_mutex);
/* Now do the VFS work to create a cgroup */
@@ -2937,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
}
/* Create the cgroup directory, which also creates the cgroup */
ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
ret = vfs_mkdir(inode, dentry, 0755);
child = __d_cgrp(dentry);
dput(dentry);
if (ret) {
@@ -2947,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
goto out_release;
}
if (!child) {
printk(KERN_INFO
"Couldn't find new cgroup %s\n", nodename);
ret = -ENOMEM;
goto out_release;
}
/* The cgroup now exists. Retake cgroup_mutex and check
* that we're still in the same state that we thought we
* were. */
@@ -3049,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
if ((atomic_dec_return(&css->refcnt) == 1) &&
notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}

View File

@@ -84,7 +84,7 @@ struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
struct cpuset *parent; /* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
.cpus_allowed = CPU_MASK_ALL,
.mems_allowed = NODE_MASK_ALL,
};
/*
@@ -278,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
};
/*
* Return in *pmask the portion of a cpusets's cpus_allowed that
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
* until we find one that does have some online cpus. If we get
* all the way to the top and still haven't found any online cpus,
@@ -291,15 +289,16 @@ static struct file_system_type cpuset_fs_type = {
* Call with callback_mutex held.
*/
static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
cs = cs->parent;
if (cs)
cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
*pmask = cpu_online_map;
BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
cpumask_copy(pmask, cpu_online_mask);
BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}
/*
@@ -375,14 +374,9 @@ void cpuset_update_task_memory_state(void)
struct task_struct *tsk = current;
struct cpuset *cs;
if (task_cs(tsk) == &top_cpuset) {
/* Don't need rcu for top_cpuset. It's never freed. */
my_cpusets_mem_gen = top_cpuset.mems_generation;
} else {
rcu_read_lock();
my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
rcu_read_unlock();
}
rcu_read_lock();
my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
rcu_read_unlock();
if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
mutex_lock(&callback_mutex);
@@ -414,12 +408,43 @@ void cpuset_update_task_memory_state(void)
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
return cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
}
/**
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
struct cpuset *trial;
trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
kfree(trial);
return NULL;
}
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
return trial;
}
/**
* free_trial_cpuset - free the trial cpuset
* @trial: the trial cpuset to be freed
*/
static void free_trial_cpuset(struct cpuset *trial)
{
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@@ -469,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
c = cgroup_cs(cont);
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
return -EINVAL;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
@@ -479,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
if (cgroup_task_count(cur->css.cgroup)) {
if (cpus_empty(trial->cpus_allowed) ||
if (cpumask_empty(trial->cpus_allowed) ||
nodes_empty(trial->mems_allowed)) {
return -ENOSPC;
}
@@ -494,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}
static void
@@ -519,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
cp = list_first_entry(&q, struct cpuset, stack_list);
list_del(q.next);
if (cpus_empty(cp->cpus_allowed))
if (cpumask_empty(cp->cpus_allowed))
continue;
if (is_sched_load_balance(cp))
@@ -586,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* element of the partition (one sched domain) to be passed to
* partition_sched_domains().
*/
static int generate_sched_domains(cpumask_t **domains,
/* FIXME: see the FIXME in partition_sched_domains() */
static int generate_sched_domains(struct cpumask **domains,
struct sched_domain_attr **attributes)
{
LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -594,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_t *doms; /* resulting partition; i.e. sched domains */
struct cpumask *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] cpumask_t slot */
int nslot; /* next empty doms[] struct cpumask slot */
doms = NULL;
dattr = NULL;
@@ -605,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
doms = kmalloc(cpumask_size(), GFP_KERNEL);
if (!doms)
goto done;
@@ -614,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
*doms = top_cpuset.cpus_allowed;
cpumask_copy(doms, top_cpuset.cpus_allowed);
ndoms = 1;
goto done;
@@ -633,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains,
cp = list_first_entry(&q, struct cpuset, stack_list);
list_del(q.next);
if (cpus_empty(cp->cpus_allowed))
if (cpumask_empty(cp->cpus_allowed))
continue;
/*
@@ -684,7 +710,7 @@ restart:
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
if (!doms)
goto done;
@@ -696,7 +722,7 @@ restart:
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
cpumask_t *dp;
struct cpumask *dp;
int apn = a->pn;
if (apn < 0) {
@@ -719,14 +745,14 @@ restart:
continue;
}
cpus_clear(*dp);
cpumask_clear(dp);
if (dattr)
*(dattr + nslot) = SD_ATTR_INIT;
for (j = i; j < csn; j++) {
struct cpuset *b = csa[j];
if (apn == b->pn) {
cpus_or(*dp, *dp, b->cpus_allowed);
cpumask_or(dp, dp, b->cpus_allowed);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -766,7 +792,7 @@ done:
static void do_rebuild_sched_domains(struct work_struct *unused)
{
struct sched_domain_attr *attr;
cpumask_t *doms;
struct cpumask *doms;
int ndoms;
get_online_cpus();
@@ -835,7 +861,7 @@ void rebuild_sched_domains(void)
static int cpuset_test_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
return !cpus_equal(tsk->cpus_allowed,
return !cpumask_equal(&tsk->cpus_allowed,
(cgroup_cs(scan->cg))->cpus_allowed);
}
@@ -853,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
}
/**
@@ -885,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
* @cs: the cpuset to consider
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, const char *buf)
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
struct ptr_heap heap;
struct cpuset trialcs;
int retval;
int is_load_balanced;
@@ -896,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
if (cs == &top_cpuset)
return -EACCES;
trialcs = *cs;
/*
* An empty cpus_allowed is ok only if the cpuset has no tasks.
* Since cpulist_parse() fails on an empty mask, we special case
@@ -905,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
* with tasks have cpus.
*/
if (!*buf) {
cpus_clear(trialcs.cpus_allowed);
cpumask_clear(trialcs->cpus_allowed);
} else {
retval = cpulist_parse(buf, &trialcs.cpus_allowed);
retval = cpulist_parse(buf, trialcs->cpus_allowed);
if (retval < 0)
return retval;
if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
return -EINVAL;
}
retval = validate_change(cs, &trialcs);
retval = validate_change(cs, trialcs);
if (retval < 0)
return retval;
/* Nothing to do if the cpus didn't change */
if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
if (retval)
return retval;
is_load_balanced = is_sched_load_balance(&trialcs);
is_load_balanced = is_sched_load_balance(trialcs);
mutex_lock(&callback_mutex);
cs->cpus_allowed = trialcs.cpus_allowed;
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
/*
@@ -1017,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
fudge = 10; /* spare mmarray[] slots */
fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
retval = -ENOMEM;
/*
@@ -1104,9 +1128,9 @@ done:
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, const char *buf)
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
struct cpuset trialcs;
nodemask_t oldmem;
int retval;
@@ -1117,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
if (cs == &top_cpuset)
return -EACCES;
trialcs = *cs;
/*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
* Since nodelist_parse() fails on an empty mask, we special case
@@ -1126,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
* with tasks have memory.
*/
if (!*buf) {
nodes_clear(trialcs.mems_allowed);
nodes_clear(trialcs->mems_allowed);
} else {
retval = nodelist_parse(buf, trialcs.mems_allowed);
retval = nodelist_parse(buf, trialcs->mems_allowed);
if (retval < 0)
goto done;
if (!nodes_subset(trialcs.mems_allowed,
if (!nodes_subset(trialcs->mems_allowed,
node_states[N_HIGH_MEMORY]))
return -EINVAL;
}
oldmem = cs->mems_allowed;
if (nodes_equal(oldmem, trialcs.mems_allowed)) {
if (nodes_equal(oldmem, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
retval = validate_change(cs, &trialcs);
retval = validate_change(cs, trialcs);
if (retval < 0)
goto done;
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs.mems_allowed;
cs->mems_allowed = trialcs->mems_allowed;
cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
@@ -1167,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
async_rebuild_sched_domains();
}
@@ -1186,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset trialcs;
struct cpuset *trialcs;
int err;
int balance_flag_changed;
trialcs = *cs;
if (turning_on)
set_bit(bit, &trialcs.flags);
else
clear_bit(bit, &trialcs.flags);
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
return -ENOMEM;
err = validate_change(cs, &trialcs);
if (turning_on)
set_bit(bit, &trialcs->flags);
else
clear_bit(bit, &trialcs->flags);
err = validate_change(cs, trialcs);
if (err < 0)
return err;
goto out;
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(&trialcs));
is_sched_load_balance(trialcs));
mutex_lock(&callback_mutex);
cs->flags = trialcs.flags;
cs->flags = trialcs->flags;
mutex_unlock(&callback_mutex);
if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
async_rebuild_sched_domains();
return 0;
out:
free_trial_cpuset(trialcs);
return err;
}
/*
@@ -1311,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
/* Protected by cgroup_lock */
static cpumask_var_t cpus_attach;
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct task_struct *tsk)
{
struct cpuset *cs = cgroup_cs(cont);
int ret = 0;
if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
if (tsk->flags & PF_THREAD_BOUND) {
cpumask_t mask;
if (tsk->flags & PF_THREAD_BOUND) {
mutex_lock(&callback_mutex);
mask = cs->cpus_allowed;
if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
ret = -EINVAL;
mutex_unlock(&callback_mutex);
if (!cpus_equal(tsk->cpus_allowed, mask))
return -EINVAL;
}
return security_task_setscheduler(tsk, 0, NULL);
return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
}
static void cpuset_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct cgroup *oldcont,
struct task_struct *tsk)
{
cpumask_t cpus;
nodemask_t from, to;
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
int err;
mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, &cpus);
err = set_cpus_allowed_ptr(tsk, &cpus);
mutex_unlock(&callback_mutex);
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
} else {
mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, cpus_attach);
mutex_unlock(&callback_mutex);
}
err = set_cpus_allowed_ptr(tsk, cpus_attach);
if (err)
return;
@@ -1359,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
cpuset_migrate_mm(mm, &from, &to);
mmput(mm);
}
}
/* The various types of files and directories in a cpuset file system */
@@ -1454,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf)
{
int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
return -ENOMEM;
switch (cft->private) {
case FILE_CPULIST:
retval = update_cpumask(cgroup_cs(cgrp), buf);
retval = update_cpumask(cs, trialcs, buf);
break;
case FILE_MEMLIST:
retval = update_nodemask(cgroup_cs(cgrp), buf);
retval = update_nodemask(cs, trialcs, buf);
break;
default:
retval = -EINVAL;
break;
}
free_trial_cpuset(trialcs);
cgroup_unlock();
return retval;
}
@@ -1487,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
cpumask_t mask;
int ret;
mutex_lock(&callback_mutex);
mask = cs->cpus_allowed;
ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
return cpulist_scnprintf(page, PAGE_SIZE, &mask);
return ret;
}
static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1729,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
parent_cs = cgroup_cs(parent);
cs->mems_allowed = parent_cs->mems_allowed;
cs->cpus_allowed = parent_cs->cpus_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
return;
}
@@ -1755,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create(
cs = kmalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
kfree(cs);
return ERR_PTR(-ENOMEM);
}
cpuset_update_task_memory_state();
cs->flags = 0;
@@ -1763,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create(
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpus_clear(cs->cpus_allowed);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
@@ -1790,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
@@ -1813,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = {
int __init cpuset_init_early(void)
{
alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
top_cpuset.mems_generation = cpuset_mems_generation++;
return 0;
}
@@ -1828,7 +1875,7 @@ int __init cpuset_init(void)
{
int err = 0;
cpus_setall(top_cpuset.cpus_allowed);
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
fmeter_init(&top_cpuset.fmeter);
@@ -1840,6 +1887,9 @@ int __init cpuset_init(void)
if (err < 0)
return err;
if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
BUG();
number_of_cpusets = 1;
return 0;
}
@@ -1914,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* has online cpus, so can't be empty).
*/
parent = cs->parent;
while (cpus_empty(parent->cpus_allowed) ||
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
parent = parent->parent;
@@ -1955,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
}
/* Continue past cpusets with all cpus, mems online */
if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
@@ -1963,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
cpu_online_mask);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
mutex_unlock(&callback_mutex);
/* Move tasks from the empty cpuset to a parent */
if (cpus_empty(cp->cpus_allowed) ||
if (cpumask_empty(cp->cpus_allowed) ||
nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
else {
@@ -1995,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
unsigned long phase, void *unused_cpu)
{
struct sched_domain_attr *attr;
cpumask_t *doms;
struct cpumask *doms;
int ndoms;
switch (phase) {
@@ -2010,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
}
cgroup_lock();
top_cpuset.cpus_allowed = cpu_online_map;
cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
scan_for_empty_cpusets(&top_cpuset);
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
@@ -2055,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
void __init cpuset_init_smp(void)
{
top_cpuset.cpus_allowed = cpu_online_map;
cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2065,15 +2116,15 @@ void __init cpuset_init_smp(void)
/**
* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
* @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
*
* Description: Returns the cpumask_t cpus_allowed of the cpuset
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_map, even if this means going outside the
* tasks cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
mutex_lock(&callback_mutex);
cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2084,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
* cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
* Must be called with callback_mutex held.
**/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
task_lock(tsk);
guarantee_online_cpus(task_cs(tsk), pmask);

View File

@@ -372,7 +372,8 @@ int commit_creds(struct cred *new)
old->fsuid != new->fsuid ||
old->fsgid != new->fsgid ||
!cap_issubset(new->cap_permitted, old->cap_permitted)) {
set_dumpable(task->mm, suid_dumpable);
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
smp_wmb();
}
@@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
else
old = get_cred(&init_cred);
*new = *old;
get_uid(new->user);
get_group_info(new->group_info);
@@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
error:
put_cred(new);
put_cred(old);
return NULL;
}
EXPORT_SYMBOL(prepare_kernel_cred);

View File

@@ -1126,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (pid != &init_struct_pid) {
retval = -ENOMEM;
pid = alloc_pid(task_active_pid_ns(p));
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
if (clone_flags & CLONE_NEWPID) {
retval = pid_ns_prepare_proc(task_active_pid_ns(p));
retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
if (retval < 0)
goto bad_fork_free_pid;
}
@@ -1481,12 +1481,10 @@ void __init proc_caches_init(void)
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct), 0,
SLAB_PANIC, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
mmap_init();
}
/*

View File

@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/async.h>
#include "internals.h"
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
unsigned int status;
int i;
/*
* quiesce the kernel, or at least the asynchronous portion
*/
async_synchronize_full();
mutex_lock(&probing_active);
/*
* something may have generated an irq long ago and we want to

View File

@@ -50,6 +50,7 @@
#include <asm/sections.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/async.h>
#if 0
#define DEBUGP printk
@@ -816,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
mod->exit();
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
async_synchronize_full();
mutex_lock(&module_mutex);
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));

View File

@@ -13,7 +13,6 @@
struct ns_cgroup {
struct cgroup_subsys_state css;
spinlock_t lock;
};
struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
if (!ns_cgroup)
return ERR_PTR(-ENOMEM);
spin_lock_init(&ns_cgroup->lock);
return &ns_cgroup->css;
}

View File

@@ -474,6 +474,12 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
}
EXPORT_SYMBOL(task_session_nr_ns);
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
/*
* Used by proc to find the first pid that is greater than or equal to nr.
*

View File

@@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode)
{
int error;
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
error = platform_begin(platform_mode);
if (error)
return error;
error = platform_begin(platform_mode);
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
if (error)
goto Close;

View File

@@ -25,6 +25,7 @@
#include <linux/syscalls.h>
#include <linux/console.h>
#include <linux/highmem.h>
#include <linux/list.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
return ret;
}
static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
{
free_list_of_pages(ca->chain, clear_page_nosave);
memset(ca, 0, sizeof(struct chain_allocator));
}
/**
* Data types related to memory bitmaps.
*
@@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
struct bm_block {
struct bm_block *next; /* next element of the list */
struct list_head hook; /* hook into a list of bitmap blocks */
unsigned long start_pfn; /* pfn represented by the first bit */
unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
unsigned long *data; /* bitmap representing pages */
@@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb)
return bb->end_pfn - bb->start_pfn;
}
struct zone_bitmap {
struct zone_bitmap *next; /* next element of the list */
unsigned long start_pfn; /* minimal pfn in this zone */
unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
struct bm_block *bm_blocks; /* list of bitmap blocks */
struct bm_block *cur_block; /* recently used bitmap block */
};
/* strcut bm_position is used for browsing memory bitmaps */
struct bm_position {
struct zone_bitmap *zone_bm;
struct bm_block *block;
int bit;
};
struct memory_bitmap {
struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
struct list_head blocks; /* list of bitmap blocks */
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
@@ -273,11 +259,7 @@ struct memory_bitmap {
static void memory_bm_position_reset(struct memory_bitmap *bm)
{
struct zone_bitmap *zone_bm;
zone_bm = bm->zone_bm_list;
bm->cur.zone_bm = zone_bm;
bm->cur.block = zone_bm->bm_blocks;
bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
bm->cur.bit = 0;
}
@@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
/**
* create_bm_block_list - create a list of block bitmap objects
* @nr_blocks - number of blocks to allocate
* @list - list to put the allocated blocks into
* @ca - chain allocator to be used for allocating memory
*/
static inline struct bm_block *
create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
static int create_bm_block_list(unsigned long pages,
struct list_head *list,
struct chain_allocator *ca)
{
struct bm_block *bblist = NULL;
unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
while (nr_blocks-- > 0) {
struct bm_block *bb;
bb = chain_alloc(ca, sizeof(struct bm_block));
if (!bb)
return NULL;
bb->next = bblist;
bblist = bb;
return -ENOMEM;
list_add(&bb->hook, list);
}
return 0;
}
struct mem_extent {
struct list_head hook;
unsigned long start;
unsigned long end;
};
/**
* free_mem_extents - free a list of memory extents
* @list - list of extents to empty
*/
static void free_mem_extents(struct list_head *list)
{
struct mem_extent *ext, *aux;
list_for_each_entry_safe(ext, aux, list, hook) {
list_del(&ext->hook);
kfree(ext);
}
return bblist;
}
/**
* create_zone_bm_list - create a list of zone bitmap objects
* create_mem_extents - create a list of memory extents representing
* contiguous ranges of PFNs
* @list - list to put the extents into
* @gfp_mask - mask to use for memory allocations
*/
static inline struct zone_bitmap *
create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
{
struct zone_bitmap *zbmlist = NULL;
struct zone *zone;
while (nr_zones-- > 0) {
struct zone_bitmap *zbm;
INIT_LIST_HEAD(list);
zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
if (!zbm)
return NULL;
for_each_zone(zone) {
unsigned long zone_start, zone_end;
struct mem_extent *ext, *cur, *aux;
zbm->next = zbmlist;
zbmlist = zbm;
if (!populated_zone(zone))
continue;
zone_start = zone->zone_start_pfn;
zone_end = zone->zone_start_pfn + zone->spanned_pages;
list_for_each_entry(ext, list, hook)
if (zone_start <= ext->end)
break;
if (&ext->hook == list || zone_end < ext->start) {
/* New extent is necessary */
struct mem_extent *new_ext;
new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
if (!new_ext) {
free_mem_extents(list);
return -ENOMEM;
}
new_ext->start = zone_start;
new_ext->end = zone_end;
list_add_tail(&new_ext->hook, &ext->hook);
continue;
}
/* Merge this zone's range of PFNs with the existing one */
if (zone_start < ext->start)
ext->start = zone_start;
if (zone_end > ext->end)
ext->end = zone_end;
/* More merging may be possible */
cur = ext;
list_for_each_entry_safe_continue(cur, aux, list, hook) {
if (zone_end < cur->start)
break;
if (zone_end < cur->end)
ext->end = cur->end;
list_del(&cur->hook);
kfree(cur);
}
}
return zbmlist;
return 0;
}
/**
* memory_bm_create - allocate memory for a memory bitmap
*/
static int
memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
{
struct chain_allocator ca;
struct zone *zone;
struct zone_bitmap *zone_bm;
struct bm_block *bb;
unsigned int nr;
struct list_head mem_extents;
struct mem_extent *ext;
int error;
chain_init(&ca, gfp_mask, safe_needed);
INIT_LIST_HEAD(&bm->blocks);
/* Compute the number of zones */
nr = 0;
for_each_zone(zone)
if (populated_zone(zone))
nr++;
error = create_mem_extents(&mem_extents, gfp_mask);
if (error)
return error;
/* Allocate the list of zones bitmap objects */
zone_bm = create_zone_bm_list(nr, &ca);
bm->zone_bm_list = zone_bm;
if (!zone_bm) {
chain_free(&ca, PG_UNSAFE_CLEAR);
return -ENOMEM;
}
list_for_each_entry(ext, &mem_extents, hook) {
struct bm_block *bb;
unsigned long pfn = ext->start;
unsigned long pages = ext->end - ext->start;
/* Initialize the zone bitmap objects */
for_each_zone(zone) {
unsigned long pfn;
bb = list_entry(bm->blocks.prev, struct bm_block, hook);
if (!populated_zone(zone))
continue;
error = create_bm_block_list(pages, bm->blocks.prev, &ca);
if (error)
goto Error;
zone_bm->start_pfn = zone->zone_start_pfn;
zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
/* Allocate the list of bitmap block objects */
nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
bb = create_bm_block_list(nr, &ca);
zone_bm->bm_blocks = bb;
zone_bm->cur_block = bb;
if (!bb)
goto Free;
nr = zone->spanned_pages;
pfn = zone->zone_start_pfn;
/* Initialize the bitmap block objects */
while (bb) {
unsigned long *ptr;
ptr = get_image_page(gfp_mask, safe_needed);
bb->data = ptr;
if (!ptr)
goto Free;
list_for_each_entry_continue(bb, &bm->blocks, hook) {
bb->data = get_image_page(gfp_mask, safe_needed);
if (!bb->data) {
error = -ENOMEM;
goto Error;
}
bb->start_pfn = pfn;
if (nr >= BM_BITS_PER_BLOCK) {
if (pages >= BM_BITS_PER_BLOCK) {
pfn += BM_BITS_PER_BLOCK;
nr -= BM_BITS_PER_BLOCK;
pages -= BM_BITS_PER_BLOCK;
} else {
/* This is executed only once in the loop */
pfn += nr;
pfn += pages;
}
bb->end_pfn = pfn;
bb = bb->next;
}
zone_bm = zone_bm->next;
}
bm->p_list = ca.chain;
memory_bm_position_reset(bm);
return 0;
Exit:
free_mem_extents(&mem_extents);
return error;
Free:
Error:
bm->p_list = ca.chain;
memory_bm_free(bm, PG_UNSAFE_CLEAR);
return -ENOMEM;
goto Exit;
}
/**
* memory_bm_free - free memory occupied by the memory bitmap @bm
*/
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
struct zone_bitmap *zone_bm;
struct bm_block *bb;
/* Free the list of bit blocks for each zone_bitmap object */
zone_bm = bm->zone_bm_list;
while (zone_bm) {
struct bm_block *bb;
list_for_each_entry(bb, &bm->blocks, hook)
if (bb->data)
free_image_page(bb->data, clear_nosave_free);
bb = zone_bm->bm_blocks;
while (bb) {
if (bb->data)
free_image_page(bb->data, clear_nosave_free);
bb = bb->next;
}
zone_bm = zone_bm->next;
}
free_list_of_pages(bm->p_list, clear_nosave_free);
bm->zone_bm_list = NULL;
INIT_LIST_HEAD(&bm->blocks);
}
/**
@@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
* to given pfn. The cur_zone_bm member of @bm and the cur_block member
* of @bm->cur_zone_bm are updated.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
{
struct zone_bitmap *zone_bm;
struct bm_block *bb;
/* Check if the pfn is from the current zone */
zone_bm = bm->cur.zone_bm;
if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = bm->zone_bm_list;
/* We don't assume that the zones are sorted by pfns */
while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = zone_bm->next;
if (!zone_bm)
return -EFAULT;
}
bm->cur.zone_bm = zone_bm;
}
/* Check if the pfn corresponds to the current bitmap block */
bb = zone_bm->cur_block;
/*
* Check if the pfn corresponds to the current bitmap block and find
* the block where it fits if this is not the case.
*/
bb = bm->cur.block;
if (pfn < bb->start_pfn)
bb = zone_bm->bm_blocks;
list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
if (pfn >= bb->start_pfn)
break;
while (pfn >= bb->end_pfn) {
bb = bb->next;
if (pfn >= bb->end_pfn)
list_for_each_entry_continue(bb, &bm->blocks, hook)
if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
break;
BUG_ON(!bb);
}
zone_bm->cur_block = bb;
if (&bb->hook == &bm->blocks)
return -EFAULT;
/* The block has been found */
bm->cur.block = bb;
pfn -= bb->start_pfn;
bm->cur.bit = pfn + 1;
*bit_nr = pfn;
*addr = bb->data;
return 0;
@@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
return test_bit(bit, addr);
}
static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
/**
* memory_bm_next_pfn - find the pfn that corresponds to the next set bit
* in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
struct zone_bitmap *zone_bm;
struct bm_block *bb;
int bit;
bb = bm->cur.block;
do {
bb = bm->cur.block;
do {
bit = bm->cur.bit;
bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
if (bit < bm_block_bits(bb))
goto Return_pfn;
bit = bm->cur.bit;
bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
if (bit < bm_block_bits(bb))
goto Return_pfn;
bb = list_entry(bb->hook.next, struct bm_block, hook);
bm->cur.block = bb;
bm->cur.bit = 0;
} while (&bb->hook != &bm->blocks);
bb = bb->next;
bm->cur.block = bb;
bm->cur.bit = 0;
} while (bb);
zone_bm = bm->cur.zone_bm->next;
if (zone_bm) {
bm->cur.zone_bm = zone_bm;
bm->cur.block = zone_bm->bm_blocks;
bm->cur.bit = 0;
}
} while (zone_bm);
memory_bm_position_reset(bm);
return BM_END_OF_MAP;
@@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void)
* We should save the page if it isn't Nosave or NosaveFree, or Reserved,
* and it isn't a part of a free chunk of pages.
*/
static struct page *saveable_highmem_page(unsigned long pfn)
static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
return NULL;
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
@@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void)
mark_free_pages(zone);
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (saveable_highmem_page(pfn))
if (saveable_highmem_page(zone, pfn))
n++;
}
return n;
}
#else
static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
{
return NULL;
}
#endif /* CONFIG_HIGHMEM */
/**
@@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
* of pages statically defined as 'unsaveable', and it isn't a part of
* a free chunk of pages.
*/
static struct page *saveable_page(unsigned long pfn)
static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn)
return NULL;
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -903,7 +918,7 @@ unsigned int count_data_pages(void)
mark_free_pages(zone);
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if(saveable_page(pfn))
if (saveable_page(zone, pfn))
n++;
}
return n;
@@ -944,7 +959,7 @@ static inline struct page *
page_is_saveable(struct zone *zone, unsigned long pfn)
{
return is_highmem(zone) ?
saveable_highmem_page(pfn) : saveable_page(pfn);
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
}
static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
@@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
* data modified by kmap_atomic()
*/
safe_copy_page(buffer, s_page);
dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
dst = kmap_atomic(d_page, KM_USER0);
memcpy(dst, buffer, PAGE_SIZE);
kunmap_atomic(dst, KM_USER0);
} else {
@@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
}
}
#else
#define page_is_saveable(zone, pfn) saveable_page(pfn)
#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
@@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info)
* unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
* the corresponding bit in the memory bitmap @bm
*/
static inline void
unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
{
int j;
@@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
memory_bm_set_bit(bm, buf[j]);
if (memory_bm_pfn_present(bm, buf[j]))
memory_bm_set_bit(bm, buf[j]);
else
return -EFAULT;
}
return 0;
}
/* List of "safe" pages that may be used to store data loaded from the suspend
@@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
if (!pbe) {
swsusp_free();
return NULL;
return ERR_PTR(-ENOMEM);
}
pbe->orig_page = page;
if (safe_highmem_pages > 0) {
@@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
static inline void *
get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
{
return NULL;
return ERR_PTR(-EINVAL);
}
static inline void copy_last_highmem_page(void) {}
@@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
struct page *page;
unsigned long pfn = memory_bm_next_pfn(bm);
if (pfn == BM_END_OF_MAP)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
if (PageHighMem(page))
return get_highmem_page_buffer(page, ca);
@@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
pbe = chain_alloc(ca, sizeof(struct pbe));
if (!pbe) {
swsusp_free();
return NULL;
return ERR_PTR(-ENOMEM);
}
pbe->orig_address = page_address(page);
pbe->address = safe_pages_list;
@@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
return error;
} else if (handle->prev <= nr_meta_pages) {
unpack_orig_pfns(buffer, &copy_bm);
error = unpack_orig_pfns(buffer, &copy_bm);
if (error)
return error;
if (handle->prev == nr_meta_pages) {
error = prepare_image(&orig_bm, &copy_bm);
if (error)
@@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
restore_pblist = NULL;
handle->buffer = get_buffer(&orig_bm, &ca);
handle->sync_read = 0;
if (!handle->buffer)
return -ENOMEM;
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
}
} else {
copy_last_highmem_page();
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
if (handle->buffer != buffer)
handle->sync_read = 0;
}

View File

@@ -262,3 +262,125 @@ int swsusp_shrink_memory(void)
return 0;
}
/*
* Platforms, like ACPI, may want us to save some memory used by them during
* hibernation and to restore the contents of this memory during the subsequent
* resume. The code below implements a mechanism allowing us to do that.
*/
struct nvs_page {
unsigned long phys_start;
unsigned int size;
void *kaddr;
void *data;
struct list_head node;
};
static LIST_HEAD(nvs_list);
/**
* hibernate_nvs_register - register platform NVS memory region to save
* @start - physical address of the region
* @size - size of the region
*
* The NVS region need not be page-aligned (both ends) and we arrange
* things so that the data from page-aligned addresses in this region will
* be copied into separate RAM pages.
*/
int hibernate_nvs_register(unsigned long start, unsigned long size)
{
struct nvs_page *entry, *next;
while (size > 0) {
unsigned int nr_bytes;
entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
if (!entry)
goto Error;
list_add_tail(&entry->node, &nvs_list);
entry->phys_start = start;
nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
entry->size = (size < nr_bytes) ? size : nr_bytes;
start += entry->size;
size -= entry->size;
}
return 0;
Error:
list_for_each_entry_safe(entry, next, &nvs_list, node) {
list_del(&entry->node);
kfree(entry);
}
return -ENOMEM;
}
/**
* hibernate_nvs_free - free data pages allocated for saving NVS regions
*/
void hibernate_nvs_free(void)
{
struct nvs_page *entry;
list_for_each_entry(entry, &nvs_list, node)
if (entry->data) {
free_page((unsigned long)entry->data);
entry->data = NULL;
if (entry->kaddr) {
iounmap(entry->kaddr);
entry->kaddr = NULL;
}
}
}
/**
* hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
*/
int hibernate_nvs_alloc(void)
{
struct nvs_page *entry;
list_for_each_entry(entry, &nvs_list, node) {
entry->data = (void *)__get_free_page(GFP_KERNEL);
if (!entry->data) {
hibernate_nvs_free();
return -ENOMEM;
}
}
return 0;
}
/**
* hibernate_nvs_save - save NVS memory regions
*/
void hibernate_nvs_save(void)
{
struct nvs_page *entry;
printk(KERN_INFO "PM: Saving platform NVS memory\n");
list_for_each_entry(entry, &nvs_list, node)
if (entry->data) {
entry->kaddr = ioremap(entry->phys_start, entry->size);
memcpy(entry->data, entry->kaddr, entry->size);
}
}
/**
* hibernate_nvs_restore - restore NVS memory regions
*
* This function is going to be called with interrupts disabled, so it
* cannot iounmap the virtual addresses used to access the NVS region.
*/
void hibernate_nvs_restore(void)
{
struct nvs_page *entry;
printk(KERN_INFO "PM: Restoring platform NVS memory\n");
list_for_each_entry(entry, &nvs_list, node)
if (entry->data)
memcpy(entry->kaddr, entry->data, entry->size);
}

View File

@@ -15,10 +15,11 @@
#include <linux/uaccess.h>
#include <linux/mm.h>
void res_counter_init(struct res_counter *counter)
void res_counter_init(struct res_counter *counter, struct res_counter *parent)
{
spin_lock_init(&counter->lock);
counter->limit = (unsigned long long)LLONG_MAX;
counter->parent = parent;
}
int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
return 0;
}
int res_counter_charge(struct res_counter *counter, unsigned long val)
int res_counter_charge(struct res_counter *counter, unsigned long val,
struct res_counter **limit_fail_at)
{
int ret;
unsigned long flags;
struct res_counter *c, *u;
spin_lock_irqsave(&counter->lock, flags);
ret = res_counter_charge_locked(counter, val);
spin_unlock_irqrestore(&counter->lock, flags);
*limit_fail_at = NULL;
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
ret = res_counter_charge_locked(c, val);
spin_unlock(&c->lock);
if (ret < 0) {
*limit_fail_at = c;
goto undo;
}
}
ret = 0;
goto done;
undo:
for (u = counter; u != c; u = u->parent) {
spin_lock(&u->lock);
res_counter_uncharge_locked(u, val);
spin_unlock(&u->lock);
}
done:
local_irq_restore(flags);
return ret;
}
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
void res_counter_uncharge(struct res_counter *counter, unsigned long val)
{
unsigned long flags;
struct res_counter *c;
spin_lock_irqsave(&counter->lock, flags);
res_counter_uncharge_locked(counter, val);
spin_unlock_irqrestore(&counter->lock, flags);
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
res_counter_uncharge_locked(c, val);
spin_unlock(&c->lock);
}
local_irq_restore(flags);
}

View File

@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
*/
struct resource * __request_region(struct resource *parent,
resource_size_t start, resource_size_t n,
const char *name)
const char *name, int flags)
{
struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
res->start = start;
res->end = start + n - 1;
res->flags = IORESOURCE_BUSY;
res->flags |= flags;
write_lock(&resource_lock);
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
{
struct resource * res;
res = __request_region(parent, start, n, "check-region");
res = __request_region(parent, start, n, "check-region", 0);
if (!res)
return -EBUSY;
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
dr->start = start;
dr->n = n;
res = __request_region(parent, start, n, name);
res = __request_region(parent, start, n, name, 0);
if (res)
devres_add(dev, dr);
else
@@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
return err;
}
#ifdef CONFIG_STRICT_DEVMEM
static int strict_iomem_checks = 1;
#else
static int strict_iomem_checks;
#endif
/*
* check if an address is reserved in the iomem resource tree
* returns 1 if reserved, 0 if not reserved.
*/
int iomem_is_exclusive(u64 addr)
{
struct resource *p = &iomem_resource;
int err = 0;
loff_t l;
int size = PAGE_SIZE;
if (!strict_iomem_checks)
return 0;
addr = addr & PAGE_MASK;
read_lock(&resource_lock);
for (p = p->child; p ; p = r_next(NULL, p, &l)) {
/*
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
if (p->start >= addr + size)
break;
if (p->end < addr)
continue;
if (p->flags & IORESOURCE_BUSY &&
p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
}
read_unlock(&resource_lock);
return err;
}
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
strict_iomem_checks = 0;
if (strstr(str, "strict"))
strict_iomem_checks = 1;
return 1;
}
__setup("iomem=", strict_iomem);

View File

@@ -3728,8 +3728,13 @@ redo:
}
double_unlock_balance(this_rq, busiest);
/*
* Should not call ttwu while holding a rq->lock
*/
spin_unlock(&this_rq->lock);
if (active_balance)
wake_up_process(busiest->migration_thread);
spin_lock(&this_rq->lock);
} else
sd->nr_balance_failed = 0;

View File

@@ -1617,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
}
}
#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
/*
* Share the fairness runtime between parent and child, thus the
* total amount of pressure for CPU stays equal - new tasks

View File

@@ -82,6 +82,9 @@ extern int percpu_pagelist_fraction;
extern int compat_log;
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
extern int sysctl_nr_trim_pages;
#endif
#ifdef CONFIG_RCU_TORTURE_TEST
extern int rcutorture_runnable;
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -1102,6 +1105,17 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec
},
#else
{
.ctl_name = CTL_UNNUMBERED,
.procname = "nr_trim_pages",
.data = &sysctl_nr_trim_pages,
.maxlen = sizeof(sysctl_nr_trim_pages),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
#endif
{
.ctl_name = VM_LAPTOP_MODE,

View File

@@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event)
*/
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
return rb_event_length(event);
unsigned length = rb_event_length(event);
if (event->type != RINGBUF_TYPE_DATA)
return length;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
length -= sizeof(event->array[0]);
return length;
}
EXPORT_SYMBOL_GPL(ring_buffer_event_length);