forked from Minki/linux
eb48c07146
Each page mapped in a process's address space must be correctly accounted for in _mapcount. Normally the rules for this are straightforward but hugetlbfs page table sharing is different. The page table pages at the PMD level are reference counted while the mapcount remains the same. If this accounting is wrong, it causes bugs like this one reported by Larry Woodman: kernel BUG at mm/filemap.c:135! invalid opcode: 0000 [#1] SMP CPU 22 Modules linked in: bridge stp llc sunrpc binfmt_misc dcdbas microcode pcspkr acpi_pad acpi] Pid: 18001, comm: mpitest Tainted: G W 3.3.0+ #4 Dell Inc. PowerEdge R620/07NDJ2 RIP: 0010:[<ffffffff8112cfed>] [<ffffffff8112cfed>] __delete_from_page_cache+0x15d/0x170 Process mpitest (pid: 18001, threadinfo ffff880428972000, task ffff880428b5cc20) Call Trace: delete_from_page_cache+0x40/0x80 truncate_hugepages+0x115/0x1f0 hugetlbfs_evict_inode+0x18/0x30 evict+0x9f/0x1b0 iput_final+0xe3/0x1e0 iput+0x3e/0x50 d_kill+0xf8/0x110 dput+0xe2/0x1b0 __fput+0x162/0x240 During fork(), copy_hugetlb_page_range() detects if huge_pte_alloc() shared page tables with the check dst_pte == src_pte. The logic is if the PMD page is the same, they must be shared. This assumes that the sharing is between the parent and child. However, if the sharing is with a different process entirely then this check fails as in this diagram: parent | ------------>pmd src_pte----------> data page ^ other--------->pmd--------------------| ^ child-----------| dst_pte For this situation to occur, it must be possible for Parent and Other to have faulted and failed to share page tables with each other. This is possible due to the following style of race. 
PROC A PROC B copy_hugetlb_page_range copy_hugetlb_page_range src_pte == huge_pte_offset src_pte == huge_pte_offset !src_pte so no sharing !src_pte so no sharing (time passes) hugetlb_fault hugetlb_fault huge_pte_alloc huge_pte_alloc huge_pmd_share huge_pmd_share LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) LOCK(i_mmap_mutex) find nothing, no sharing UNLOCK(i_mmap_mutex) pmd_alloc pmd_alloc LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) LOCK(instantiation_mutex) fault UNLOCK(instantiation_mutex) These two processes are now pointing to the same data page but are not sharing page tables because the opportunity was missed. When either process later forks, the src_pte == dst_pte check is potentially insufficient. As the check falls through, the wrong PTE information is copied in (harmless but wrong) and the mapcount is bumped for a page mapped by a shared page table leading to the BUG_ON. This patch addresses the issue by moving pmd_alloc into huge_pmd_share which guarantees that the shared pud is populated in the same critical section as pmd. This also means that huge_pte_offset test in huge_pmd_share is serialized correctly now which in turn means that the success of the sharing will be higher as the racing tasks see the pud and pmd populated together. Race identified and changelog written mostly by Mel Gorman. [akpm@linux-foundation.org: attempt to make the huge_pmd_share() comment comprehensible, clean up coding style] Reported-by: Larry Woodman <lwoodman@redhat.com> Tested-by: Larry Woodman <lwoodman@redhat.com> Reviewed-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Michal Hocko <mhocko@suse.cz> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: Ken Chen <kenchen@google.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Hillf Danton <dhillf@gmail.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
455 lines
11 KiB
C
455 lines
11 KiB
C
/*
|
|
* IA-32 Huge TLB Page Support for Kernel.
|
|
*
|
|
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
|
|
*/
|
|
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/err.h>
|
|
#include <linux/sysctl.h>
|
|
#include <asm/mman.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/pgalloc.h>
|
|
|
|
/*
 * Check whether the page table page that @svma uses for file page @idx may
 * be shared with @vma at @addr.
 *
 * Returns the address inside @svma that corresponds to @idx when sharing is
 * allowed, and 0 when it is not.
 */
static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	/* Address in svma that maps file page idx. */
	unsigned long saddr = svma->vm_start +
				((idx - svma->vm_pgoff) << PAGE_SHIFT);
	/* PUD-sized, PUD-aligned window that contains saddr. */
	unsigned long pud_start = saddr & PUD_MASK;
	unsigned long pud_end = pud_start + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	unsigned long flags = vma->vm_flags & ~VM_LOCKED;
	unsigned long sflags = svma->vm_flags & ~VM_LOCKED;

	/* Both addresses must select the same slot of the page table page. */
	if (pmd_index(addr) != pmd_index(saddr))
		return 0;

	/* Permissions (ignoring VM_LOCKED) must be identical. */
	if (flags != sflags)
		return 0;

	/* The whole PUD-sized window has to lie inside svma. */
	if (pud_start < svma->vm_start || svma->vm_end < pud_end)
		return 0;

	return saddr;
}
|
|
|
|
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
|
|
{
|
|
unsigned long base = addr & PUD_MASK;
|
|
unsigned long end = base + PUD_SIZE;
|
|
|
|
/*
|
|
* check on proper vm_flags and page table alignment
|
|
*/
|
|
if (vma->vm_flags & VM_MAYSHARE &&
|
|
vma->vm_start <= base && end <= vma->vm_end)
|
|
return 1;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
|
|
* and returns the corresponding pte. While this is not necessary for the
|
|
* !shared pmd case because we can allocate the pmd later as well, it makes the
|
|
* code much cleaner. pmd allocation is essential for the shared case because
|
|
* pud has to be populated inside the same i_mmap_mutex section - otherwise
|
|
* racing tasks could either miss the sharing (see huge_pte_offset) or select a
|
|
* bad pmd for sharing.
|
|
*/
|
|
static pte_t *
|
|
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
|
|
{
|
|
struct vm_area_struct *vma = find_vma(mm, addr);
|
|
struct address_space *mapping = vma->vm_file->f_mapping;
|
|
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
|
|
vma->vm_pgoff;
|
|
struct prio_tree_iter iter;
|
|
struct vm_area_struct *svma;
|
|
unsigned long saddr;
|
|
pte_t *spte = NULL;
|
|
pte_t *pte;
|
|
|
|
if (!vma_shareable(vma, addr))
|
|
return (pte_t *)pmd_alloc(mm, pud, addr);
|
|
|
|
mutex_lock(&mapping->i_mmap_mutex);
|
|
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
|
|
if (svma == vma)
|
|
continue;
|
|
|
|
saddr = page_table_shareable(svma, vma, addr, idx);
|
|
if (saddr) {
|
|
spte = huge_pte_offset(svma->vm_mm, saddr);
|
|
if (spte) {
|
|
get_page(virt_to_page(spte));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!spte)
|
|
goto out;
|
|
|
|
spin_lock(&mm->page_table_lock);
|
|
if (pud_none(*pud))
|
|
pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
|
|
else
|
|
put_page(virt_to_page(spte));
|
|
spin_unlock(&mm->page_table_lock);
|
|
out:
|
|
pte = (pte_t *)pmd_alloc(mm, pud, addr);
|
|
mutex_unlock(&mapping->i_mmap_mutex);
|
|
return pte;
|
|
}
|
|
|
|
/*
|
|
* unmap huge page backed by shared pte.
|
|
*
|
|
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
|
|
* indicated by page_count > 1, unmap is achieved by clearing pud and
|
|
* decrementing the ref count. If count == 1, the pte page is not shared.
|
|
*
|
|
* called with vma->vm_mm->page_table_lock held.
|
|
*
|
|
* returns: 1 successfully unmapped a shared pte page
|
|
* 0 the underlying pte page is not shared, or it is the last user
|
|
*/
|
|
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
|
|
{
|
|
pgd_t *pgd = pgd_offset(mm, *addr);
|
|
pud_t *pud = pud_offset(pgd, *addr);
|
|
|
|
BUG_ON(page_count(virt_to_page(ptep)) == 0);
|
|
if (page_count(virt_to_page(ptep)) == 1)
|
|
return 0;
|
|
|
|
pud_clear(pud);
|
|
put_page(virt_to_page(ptep));
|
|
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
|
|
return 1;
|
|
}
|
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm,
|
|
unsigned long addr, unsigned long sz)
|
|
{
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pte_t *pte = NULL;
|
|
|
|
pgd = pgd_offset(mm, addr);
|
|
pud = pud_alloc(mm, pgd, addr);
|
|
if (pud) {
|
|
if (sz == PUD_SIZE) {
|
|
pte = (pte_t *)pud;
|
|
} else {
|
|
BUG_ON(sz != PMD_SIZE);
|
|
if (pud_none(*pud))
|
|
pte = huge_pmd_share(mm, addr, pud);
|
|
else
|
|
pte = (pte_t *)pmd_alloc(mm, pud, addr);
|
|
}
|
|
}
|
|
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
|
|
|
|
return pte;
|
|
}
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
|
|
{
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd = NULL;
|
|
|
|
pgd = pgd_offset(mm, addr);
|
|
if (pgd_present(*pgd)) {
|
|
pud = pud_offset(pgd, addr);
|
|
if (pud_present(*pud)) {
|
|
if (pud_large(*pud))
|
|
return (pte_t *)pud;
|
|
pmd = pmd_offset(pud, addr);
|
|
}
|
|
}
|
|
return (pte_t *) pmd;
|
|
}
|
|
|
|
#if 0 /* This is just for testing */
|
|
struct page *
|
|
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
|
{
|
|
unsigned long start = address;
|
|
int length = 1;
|
|
int nr;
|
|
struct page *page;
|
|
struct vm_area_struct *vma;
|
|
|
|
vma = find_vma(mm, addr);
|
|
if (!vma || !is_vm_hugetlb_page(vma))
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
pte = huge_pte_offset(mm, address);
|
|
|
|
/* hugetlb should be locked, and hence, prefaulted */
|
|
WARN_ON(!pte || pte_none(*pte));
|
|
|
|
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
|
|
|
|
WARN_ON(!PageHead(page));
|
|
|
|
return page;
|
|
}
|
|
|
|
/*
 * Test-only stub (inside the #if 0 branch): always returns 0, i.e. no pmd
 * entry is ever reported as a huge mapping.
 */
int pmd_huge(pmd_t pmd)
{
	return 0;
}
|
|
|
|
/*
 * Test-only stub (inside the #if 0 branch): always returns 0, i.e. no pud
 * entry is ever reported as a huge mapping.
 */
int pud_huge(pud_t pud)
{
	return 0;
}
|
|
|
|
struct page *
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
pmd_t *pmd, int write)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
#else
|
|
|
|
struct page *
|
|
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
|
{
|
|
return ERR_PTR(-EINVAL);
|
|
}
|
|
|
|
/*
 * Returns 1 when the _PAGE_PSE bit is set in @pmd, 0 otherwise.
 */
int pmd_huge(pmd_t pmd)
{
	return (pmd_val(pmd) & _PAGE_PSE) != 0;
}
|
|
|
|
/*
 * Returns 1 when the _PAGE_PSE bit is set in @pud, 0 otherwise.
 */
int pud_huge(pud_t pud)
{
	return (pud_val(pud) & _PAGE_PSE) != 0;
}
|
|
|
|
struct page *
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
pmd_t *pmd, int write)
|
|
{
|
|
struct page *page;
|
|
|
|
page = pte_page(*(pte_t *)pmd);
|
|
if (page)
|
|
page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
|
|
return page;
|
|
}
|
|
|
|
struct page *
|
|
follow_huge_pud(struct mm_struct *mm, unsigned long address,
|
|
pud_t *pud, int write)
|
|
{
|
|
struct page *page;
|
|
|
|
page = pte_page(*(pte_t *)pud);
|
|
if (page)
|
|
page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
|
|
return page;
|
|
}
|
|
|
|
#endif
|
|
|
|
/* x86_64 also uses this file */
|
|
|
|
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
|
|
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
|
|
unsigned long addr, unsigned long len,
|
|
unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct hstate *h = hstate_file(file);
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma;
|
|
unsigned long start_addr;
|
|
|
|
if (len > mm->cached_hole_size) {
|
|
start_addr = mm->free_area_cache;
|
|
} else {
|
|
start_addr = TASK_UNMAPPED_BASE;
|
|
mm->cached_hole_size = 0;
|
|
}
|
|
|
|
full_search:
|
|
addr = ALIGN(start_addr, huge_page_size(h));
|
|
|
|
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
|
|
/* At this point: (!vma || addr < vma->vm_end). */
|
|
if (TASK_SIZE - len < addr) {
|
|
/*
|
|
* Start a new search - just in case we missed
|
|
* some holes.
|
|
*/
|
|
if (start_addr != TASK_UNMAPPED_BASE) {
|
|
start_addr = TASK_UNMAPPED_BASE;
|
|
mm->cached_hole_size = 0;
|
|
goto full_search;
|
|
}
|
|
return -ENOMEM;
|
|
}
|
|
if (!vma || addr + len <= vma->vm_start) {
|
|
mm->free_area_cache = addr + len;
|
|
return addr;
|
|
}
|
|
if (addr + mm->cached_hole_size < vma->vm_start)
|
|
mm->cached_hole_size = vma->vm_start - addr;
|
|
addr = ALIGN(vma->vm_end, huge_page_size(h));
|
|
}
|
|
}
|
|
|
|
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
|
|
unsigned long addr0, unsigned long len,
|
|
unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct hstate *h = hstate_file(file);
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma;
|
|
unsigned long base = mm->mmap_base;
|
|
unsigned long addr = addr0;
|
|
unsigned long largest_hole = mm->cached_hole_size;
|
|
unsigned long start_addr;
|
|
|
|
/* don't allow allocations above current base */
|
|
if (mm->free_area_cache > base)
|
|
mm->free_area_cache = base;
|
|
|
|
if (len <= largest_hole) {
|
|
largest_hole = 0;
|
|
mm->free_area_cache = base;
|
|
}
|
|
try_again:
|
|
start_addr = mm->free_area_cache;
|
|
|
|
/* make sure it can fit in the remaining address space */
|
|
if (mm->free_area_cache < len)
|
|
goto fail;
|
|
|
|
/* either no address requested or can't fit in requested address hole */
|
|
addr = (mm->free_area_cache - len) & huge_page_mask(h);
|
|
do {
|
|
/*
|
|
* Lookup failure means no vma is above this address,
|
|
* i.e. return with success:
|
|
*/
|
|
vma = find_vma(mm, addr);
|
|
if (!vma)
|
|
return addr;
|
|
|
|
if (addr + len <= vma->vm_start) {
|
|
/* remember the address as a hint for next time */
|
|
mm->cached_hole_size = largest_hole;
|
|
return (mm->free_area_cache = addr);
|
|
} else if (mm->free_area_cache == vma->vm_end) {
|
|
/* pull free_area_cache down to the first hole */
|
|
mm->free_area_cache = vma->vm_start;
|
|
mm->cached_hole_size = largest_hole;
|
|
}
|
|
|
|
/* remember the largest hole we saw so far */
|
|
if (addr + largest_hole < vma->vm_start)
|
|
largest_hole = vma->vm_start - addr;
|
|
|
|
/* try just below the current vma->vm_start */
|
|
addr = (vma->vm_start - len) & huge_page_mask(h);
|
|
} while (len <= vma->vm_start);
|
|
|
|
fail:
|
|
/*
|
|
* if hint left us with no space for the requested
|
|
* mapping then try again:
|
|
*/
|
|
if (start_addr != base) {
|
|
mm->free_area_cache = base;
|
|
largest_hole = 0;
|
|
goto try_again;
|
|
}
|
|
/*
|
|
* A failed mmap() very likely causes application failure,
|
|
* so fall back to the bottom-up function here. This scenario
|
|
* can happen with large stack limits and large mmap()
|
|
* allocations.
|
|
*/
|
|
mm->free_area_cache = TASK_UNMAPPED_BASE;
|
|
mm->cached_hole_size = ~0UL;
|
|
addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
|
|
len, pgoff, flags);
|
|
|
|
/*
|
|
* Restore the topdown base:
|
|
*/
|
|
mm->free_area_cache = base;
|
|
mm->cached_hole_size = ~0UL;
|
|
|
|
return addr;
|
|
}
|
|
|
|
unsigned long
|
|
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|
unsigned long len, unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct hstate *h = hstate_file(file);
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma;
|
|
|
|
if (len & ~huge_page_mask(h))
|
|
return -EINVAL;
|
|
if (len > TASK_SIZE)
|
|
return -ENOMEM;
|
|
|
|
if (flags & MAP_FIXED) {
|
|
if (prepare_hugepage_range(file, addr, len))
|
|
return -EINVAL;
|
|
return addr;
|
|
}
|
|
|
|
if (addr) {
|
|
addr = ALIGN(addr, huge_page_size(h));
|
|
vma = find_vma(mm, addr);
|
|
if (TASK_SIZE - len >= addr &&
|
|
(!vma || addr + len <= vma->vm_start))
|
|
return addr;
|
|
}
|
|
if (mm->get_unmapped_area == arch_get_unmapped_area)
|
|
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
|
|
pgoff, flags);
|
|
else
|
|
return hugetlb_get_unmapped_area_topdown(file, addr, len,
|
|
pgoff, flags);
|
|
}
|
|
|
|
#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
|
|
|
|
#ifdef CONFIG_X86_64
|
|
static __init int setup_hugepagesz(char *opt)
|
|
{
|
|
unsigned long ps = memparse(opt, &opt);
|
|
if (ps == PMD_SIZE) {
|
|
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
|
|
} else if (ps == PUD_SIZE && cpu_has_gbpages) {
|
|
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
|
|
} else {
|
|
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
|
|
ps >> 20);
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
__setup("hugepagesz=", setup_hugepagesz);
|
|
#endif
|