From 8bccd85ffbaf8ff1448d1235fa6594e207695531 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 29 Oct 2005 18:16:59 -0700 Subject: [PATCH] [PATCH] Implement sys_* do_* layering in the memory policy layer. - Do a separation between do_xxx and sys_xxx functions. sys_xxx functions take variable sized bitmaps from user space as arguments. do_xxx functions take fixed sized nodemask_t as arguments and may be used from inside the kernel. Doing so simplifies the initialization code. There is no fs = kernel_ds assumption anymore. - Split up get_nodes into get_nodes (which gets the node list) and contextualize_policy which restricts the nodes to those accessible to the task and updates cpusets. - Add comments explaining limitations of bind policy Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 276 +++++++++++++++++++++++++++++-------------------- 1 file changed, 162 insertions(+), 114 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 902d4c9eccdc..123925f50f86 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2,6 +2,7 @@ * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * Subject to the GNU Public License, version 2. * * NUMA policy allows the user to give hints in which node(s) memory should @@ -17,13 +18,19 @@ * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. + * * bind Only allocate memory on a specific set of nodes, * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * * preferred Try a specific node first before normal fallback. * As a special case node -1 here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } - -/* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, - unsigned long maxnode, int mode) -{ - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - - --maxnode; - nodes_clear(*nodes); - if (maxnode == 0 || !nmask) - return 0; - - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - /* AK: shouldn't this error out instead? */ - cpuset_restrict_to_mems_allowed(nodes_addr(*nodes)); - return mpol_check_policy(mode, nodes); -} - /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { @@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -/* Change policy for a memory range */ -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +static int contextualize_policy(int mode, nodemask_t *nodes) +{ + if (!nodes) + return 0; + + /* Update current mems_allowed */ + cpuset_update_current_mems_allowed(); + /* Ignore nodes not set in current->mems_allowed */ + cpuset_restrict_to_mems_allowed(nodes->bits); + return mpol_check_policy(mode, nodes); +} + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; - nodemask_t nodes; int err; if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) @@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, return -EINVAL; if (end == start) return 0; - - err = get_nodes(&nodes, nmask, maxnode, mode); - if (err) - return err; - - new = mpol_new(mode, &nodes); + if (contextualize_policy(mode, nmask)) + return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); @@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, &nodes, flags); + vma = check_range(mm, start, end, nmask, flags); err = PTR_ERR(vma); if (!IS_ERR(vma)) err = mbind_range(vma, start, end, new); @@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +long do_set_mempolicy(int mode, nodemask_t *nodes) { - int err; struct mempolicy *new; - nodemask_t nodes; - if (mode < 0 || mode > MPOL_MAX) + if (contextualize_policy(mode, nodes)) return -EINVAL; - err = get_nodes(&nodes, nmask, maxnode, mode); - if (err) - return err; - new = mpol_new(mode, &nodes); + new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); @@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes); + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + *nodes); break; case MPOL_DEFAULT: break; @@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) return err; } -/* Copy a kernel node mask to user space */ -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - nodemask_t *nodes) -{ - unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); - - if (copy > nbytes) { - if (copy > PAGE_SIZE) - return -EINVAL; - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) - return -EFAULT; - copy = nbytes; - } - return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; -} - /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { - int err, pval; + int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; - if (nmask != NULL && maxnode < MAX_NUMNODES) - return -EINVAL; if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, err = lookup_node(mm, addr); if (err < 0) goto out; - pval = err; + *policy = err; } else if (pol == current->mempolicy && pol->policy == MPOL_INTERLEAVE) { - pval = current->il_next; + *policy = current->il_next; } else { err = -EINVAL; goto out; } } else - pval = pol->policy; + *policy = pol->policy; if (vma) { up_read(¤t->mm->mmap_sem); vma = NULL; } - if (policy && put_user(pval, policy)) - return -EFAULT; - err = 0; - if (nmask) { - nodemask_t nodes; - get_zonemask(pol, &nodes); - err = copy_nodes_to_user(nmask, maxnode, &nodes); - } + if (nmask) + get_zonemask(pol, nmask); out: if (vma) @@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, return err; } +/* + * User space interface with variable sized bitmaps for nodelists. + */ + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; +} + +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) +{ + nodemask_t nodes; + int err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, &nodes, flags); +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) +{ + int err; + nodemask_t nodes; + + if (mode < 0 || mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, &nodes); +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_get_mempolicy(int __user *policy, @@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); + pol = vma->vm_ops->get_policy(vma, addr); else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; @@ -1147,14 +1197,12 @@ void __init numa_policy_init(void) /* Set interleaving policy for system init. This way not all the data structures allocated at system boot end up in node zero. */ - if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), - MAX_NUMNODES) < 0) + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) printk("numa_policy_init: interleaving failed\n"); } -/* Reset policy of current process to default. - * Assumes fs == KERNEL_DS */ +/* Reset policy of current process to default */ void numa_default_policy(void) { - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); + do_set_mempolicy(MPOL_DEFAULT, NULL); }