mirror of
https://github.com/torvalds/linux.git
synced 2024-12-02 09:01:34 +00:00
1363c3cd86
Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
295 lines
6.7 KiB
C
295 lines
6.7 KiB
C
/*
|
|
* IA-32 Huge TLB Page Support for Kernel.
|
|
*
|
|
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
|
|
*/
|
|
|
|
#include <linux/config.h>
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/smp_lock.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/err.h>
|
|
#include <linux/sysctl.h>
|
|
#include <asm/mman.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/tlbflush.h>
|
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
|
|
{
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd = NULL;
|
|
|
|
pgd = pgd_offset(mm, addr);
|
|
pud = pud_alloc(mm, pgd, addr);
|
|
pmd = pmd_alloc(mm, pud, addr);
|
|
return (pte_t *) pmd;
|
|
}
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
|
|
{
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd = NULL;
|
|
|
|
pgd = pgd_offset(mm, addr);
|
|
pud = pud_offset(pgd, addr);
|
|
pmd = pmd_offset(pud, addr);
|
|
return (pte_t *) pmd;
|
|
}
|
|
|
|
/*
|
|
* This function checks for proper alignment of input addr and len parameters.
|
|
*/
|
|
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
|
|
{
|
|
if (len & ~HPAGE_MASK)
|
|
return -EINVAL;
|
|
if (addr & ~HPAGE_MASK)
|
|
return -EINVAL;
|
|
return 0;
|
|
}
|
|
|
|
#if 0 /* This is just for testing */
|
|
struct page *
|
|
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
|
{
|
|
unsigned long start = address;
|
|
int length = 1;
|
|
int nr;
|
|
struct page *page;
|
|
struct vm_area_struct *vma;
|
|
|
|
vma = find_vma(mm, addr);
|
|
if (!vma || !is_vm_hugetlb_page(vma))
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
pte = huge_pte_offset(mm, address);
|
|
|
|
/* hugetlb should be locked, and hence, prefaulted */
|
|
WARN_ON(!pte || pte_none(*pte));
|
|
|
|
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
|
|
|
|
WARN_ON(!PageCompound(page));
|
|
|
|
return page;
|
|
}
|
|
|
|
int pmd_huge(pmd_t pmd)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
struct page *
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
pmd_t *pmd, int write)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
#else
|
|
|
|
struct page *
|
|
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
|
{
|
|
return ERR_PTR(-EINVAL);
|
|
}
|
|
|
|
int pmd_huge(pmd_t pmd)
|
|
{
|
|
return !!(pmd_val(pmd) & _PAGE_PSE);
|
|
}
|
|
|
|
struct page *
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
pmd_t *pmd, int write)
|
|
{
|
|
struct page *page;
|
|
|
|
page = pte_page(*(pte_t *)pmd);
|
|
if (page)
|
|
page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
|
|
return page;
|
|
}
|
|
#endif
|
|
|
|
void hugetlb_clean_stale_pgtable(pte_t *pte)
|
|
{
|
|
pmd_t *pmd = (pmd_t *) pte;
|
|
struct page *page;
|
|
|
|
page = pmd_page(*pmd);
|
|
pmd_clear(pmd);
|
|
dec_page_state(nr_page_table_pages);
|
|
page_cache_release(page);
|
|
}
|
|
|
|
/* x86_64 also uses this file */
|
|
|
|
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
|
|
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
|
|
unsigned long addr, unsigned long len,
|
|
unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma;
|
|
unsigned long start_addr;
|
|
|
|
if (len > mm->cached_hole_size) {
|
|
start_addr = mm->free_area_cache;
|
|
} else {
|
|
start_addr = TASK_UNMAPPED_BASE;
|
|
mm->cached_hole_size = 0;
|
|
}
|
|
|
|
full_search:
|
|
addr = ALIGN(start_addr, HPAGE_SIZE);
|
|
|
|
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
|
|
/* At this point: (!vma || addr < vma->vm_end). */
|
|
if (TASK_SIZE - len < addr) {
|
|
/*
|
|
* Start a new search - just in case we missed
|
|
* some holes.
|
|
*/
|
|
if (start_addr != TASK_UNMAPPED_BASE) {
|
|
start_addr = TASK_UNMAPPED_BASE;
|
|
mm->cached_hole_size = 0;
|
|
goto full_search;
|
|
}
|
|
return -ENOMEM;
|
|
}
|
|
if (!vma || addr + len <= vma->vm_start) {
|
|
mm->free_area_cache = addr + len;
|
|
return addr;
|
|
}
|
|
if (addr + mm->cached_hole_size < vma->vm_start)
|
|
mm->cached_hole_size = vma->vm_start - addr;
|
|
addr = ALIGN(vma->vm_end, HPAGE_SIZE);
|
|
}
|
|
}
|
|
|
|
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
|
|
unsigned long addr0, unsigned long len,
|
|
unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma, *prev_vma;
|
|
unsigned long base = mm->mmap_base, addr = addr0;
|
|
unsigned long largest_hole = mm->cached_hole_size;
|
|
int first_time = 1;
|
|
|
|
/* don't allow allocations above current base */
|
|
if (mm->free_area_cache > base)
|
|
mm->free_area_cache = base;
|
|
|
|
if (len <= largest_hole) {
|
|
largest_hole = 0;
|
|
mm->free_area_cache = base;
|
|
}
|
|
try_again:
|
|
/* make sure it can fit in the remaining address space */
|
|
if (mm->free_area_cache < len)
|
|
goto fail;
|
|
|
|
/* either no address requested or cant fit in requested address hole */
|
|
addr = (mm->free_area_cache - len) & HPAGE_MASK;
|
|
do {
|
|
/*
|
|
* Lookup failure means no vma is above this address,
|
|
* i.e. return with success:
|
|
*/
|
|
if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
|
|
return addr;
|
|
|
|
/*
|
|
* new region fits between prev_vma->vm_end and
|
|
* vma->vm_start, use it:
|
|
*/
|
|
if (addr + len <= vma->vm_start &&
|
|
(!prev_vma || (addr >= prev_vma->vm_end))) {
|
|
/* remember the address as a hint for next time */
|
|
mm->cached_hole_size = largest_hole;
|
|
return (mm->free_area_cache = addr);
|
|
} else {
|
|
/* pull free_area_cache down to the first hole */
|
|
if (mm->free_area_cache == vma->vm_end) {
|
|
mm->free_area_cache = vma->vm_start;
|
|
mm->cached_hole_size = largest_hole;
|
|
}
|
|
}
|
|
|
|
/* remember the largest hole we saw so far */
|
|
if (addr + largest_hole < vma->vm_start)
|
|
largest_hole = vma->vm_start - addr;
|
|
|
|
/* try just below the current vma->vm_start */
|
|
addr = (vma->vm_start - len) & HPAGE_MASK;
|
|
} while (len <= vma->vm_start);
|
|
|
|
fail:
|
|
/*
|
|
* if hint left us with no space for the requested
|
|
* mapping then try again:
|
|
*/
|
|
if (first_time) {
|
|
mm->free_area_cache = base;
|
|
largest_hole = 0;
|
|
first_time = 0;
|
|
goto try_again;
|
|
}
|
|
/*
|
|
* A failed mmap() very likely causes application failure,
|
|
* so fall back to the bottom-up function here. This scenario
|
|
* can happen with large stack limits and large mmap()
|
|
* allocations.
|
|
*/
|
|
mm->free_area_cache = TASK_UNMAPPED_BASE;
|
|
mm->cached_hole_size = ~0UL;
|
|
addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
|
|
len, pgoff, flags);
|
|
|
|
/*
|
|
* Restore the topdown base:
|
|
*/
|
|
mm->free_area_cache = base;
|
|
mm->cached_hole_size = ~0UL;
|
|
|
|
return addr;
|
|
}
|
|
|
|
unsigned long
|
|
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|
unsigned long len, unsigned long pgoff, unsigned long flags)
|
|
{
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma;
|
|
|
|
if (len & ~HPAGE_MASK)
|
|
return -EINVAL;
|
|
if (len > TASK_SIZE)
|
|
return -ENOMEM;
|
|
|
|
if (addr) {
|
|
addr = ALIGN(addr, HPAGE_SIZE);
|
|
vma = find_vma(mm, addr);
|
|
if (TASK_SIZE - len >= addr &&
|
|
(!vma || addr + len <= vma->vm_start))
|
|
return addr;
|
|
}
|
|
if (mm->get_unmapped_area == arch_get_unmapped_area)
|
|
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
|
|
pgoff, flags);
|
|
else
|
|
return hugetlb_get_unmapped_area_topdown(file, addr, len,
|
|
pgoff, flags);
|
|
}
|
|
|
|
#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
|
|
|