mirror of
https://github.com/torvalds/linux.git
synced 2024-11-01 17:51:43 +00:00
ec8acf20af
swap_lock is heavily contended when I test swapping to 3 fast SSDs (it is even slightly slower than swapping to 2 such SSDs). The main contention comes from swap_info_get(). This patch tries to close the gap by adding a new per-partition lock. Global data such as nr_swapfiles, total_swap_pages, least_priority and swap_list are still protected by swap_lock. nr_swap_pages is now an atomic, so it can be changed without holding swap_lock. In theory, it is possible that get_swap_page() finds no swap pages although there actually are free swap pages, but that does not sound like a big problem. Accessing partition-specific data (like scan_swap_map and so on) is protected only by swap_info_struct.lock. Changing swap_info_struct.flags requires holding both swap_lock and swap_info_struct.lock, because scan_swap_map() checks it; reading the flags is fine with either lock held. If both swap_lock and swap_info_struct.lock must be held, we always take the former first to avoid deadlock. swap_entry_free() can change swap_list. To delete that code, we add a new highest_priority_index. Whenever get_swap_page() is called, we check it; if it is valid, we use it. It is a pity that get_swap_page() still holds swap_lock. In practice, however, swap_lock is not heavily contended in my test with this patch (or, put differently, there are other much heavier bottlenecks such as TLB flush). And BTW, it looks like get_swap_page() doesn't really need the lock: we never free swap_info[] and we check the SWAP_WRITEOK flag. The only risk without the lock is that we could swap out to some low-priority swap, but we can quickly recover after several rounds of swap, so it does not sound like a big deal to me. But I'd prefer to fix this if it's a real problem. "swap: make each swap partition have one address_space" improved the swapout speed from 1.7G/s to 2G/s. This patch further improves the speed to 2.3G/s, so around 15% improvement. It's a multi-process test, so TLB flush isn't the biggest bottleneck before the patches. 
[arnd@arndb.de: fix it for nommu] [hughd@google.com: add missing unlock] [minchan@kernel.org: get rid of lockdep whinge on sys_swapon] Signed-off-by: Shaohua Li <shli@fusionio.com> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Seth Jennings <sjenning@linux.vnet.ibm.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Hugh Dickins <hughd@google.com> Signed-off-by: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
427 lines
11 KiB
C
427 lines
11 KiB
C
/*
 *  linux/arch/sparc/mm/init.c
 *
 *  Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
 *  Copyright (C) 1995 Eddie C. Dost (ecd@skynet.be)
 *  Copyright (C) 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 *  Copyright (C) 2000 Anton Blanchard (anton@samba.org)
 */
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/signal.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/initrd.h>
|
|
#include <linux/init.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/bootmem.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/poison.h>
|
|
#include <linux/gfp.h>
|
|
|
|
#include <asm/sections.h>
|
|
#include <asm/page.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/vaddrs.h>
|
|
#include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */
|
|
#include <asm/tlb.h>
|
|
#include <asm/prom.h>
|
|
#include <asm/leon.h>
|
|
|
|
unsigned long *sparc_valid_addr_bitmap;
|
|
EXPORT_SYMBOL(sparc_valid_addr_bitmap);
|
|
|
|
unsigned long phys_base;
|
|
EXPORT_SYMBOL(phys_base);
|
|
|
|
unsigned long pfn_base;
|
|
EXPORT_SYMBOL(pfn_base);
|
|
|
|
struct sparc_phys_banks sp_banks[SPARC_PHYS_BANKS+1];
|
|
|
|
/* Initial ramdisk setup */
|
|
extern unsigned int sparc_ramdisk_image;
|
|
extern unsigned int sparc_ramdisk_size;
|
|
|
|
unsigned long highstart_pfn, highend_pfn;
|
|
|
|
void show_mem(unsigned int filter)
|
|
{
|
|
printk("Mem-info:\n");
|
|
show_free_areas(filter);
|
|
printk("Free swap: %6ldkB\n",
|
|
get_nr_swap_pages() << (PAGE_SHIFT-10));
|
|
printk("%ld pages of RAM\n", totalram_pages);
|
|
printk("%ld free pages\n", nr_free_pages());
|
|
}
|
|
|
|
|
|
/* When non-zero, bootmem_init() trims sp_banks[] down to this many bytes. */
extern unsigned long cmdline_memory_size;

/* Used by mem_init() to size sparc_valid_addr_bitmap and to set max_mapnr. */
unsigned long last_valid_pfn;
|
|
|
|
unsigned long calc_highpages(void)
|
|
{
|
|
int i;
|
|
int nr = 0;
|
|
|
|
for (i = 0; sp_banks[i].num_bytes != 0; i++) {
|
|
unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
|
|
unsigned long end_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
|
|
|
|
if (end_pfn <= max_low_pfn)
|
|
continue;
|
|
|
|
if (start_pfn < max_low_pfn)
|
|
start_pfn = max_low_pfn;
|
|
|
|
nr += end_pfn - start_pfn;
|
|
}
|
|
|
|
return nr;
|
|
}
|
|
|
|
static unsigned long calc_max_low_pfn(void)
|
|
{
|
|
int i;
|
|
unsigned long tmp = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT);
|
|
unsigned long curr_pfn, last_pfn;
|
|
|
|
last_pfn = (sp_banks[0].base_addr + sp_banks[0].num_bytes) >> PAGE_SHIFT;
|
|
for (i = 1; sp_banks[i].num_bytes != 0; i++) {
|
|
curr_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
|
|
|
|
if (curr_pfn >= tmp) {
|
|
if (last_pfn < tmp)
|
|
tmp = last_pfn;
|
|
break;
|
|
}
|
|
|
|
last_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
|
|
}
|
|
|
|
return tmp;
|
|
}
|
|
|
|
/*
 * bootmem_init() - set up the boot-time allocator from sp_banks[].
 * @pages_avail: out parameter; receives the number of low-memory pages
 *               registered with the allocator minus the pages reserved here.
 *
 * Steps, in order:
 *  1. find the end of physical memory, trimming sp_banks[] to
 *     cmdline_memory_size when that is non-zero;
 *  2. derive max_pfn, max_low_pfn and the highmem pfn range;
 *  3. place the bootmem bitmap after the kernel image (or after the
 *     initrd when the initrd sits directly behind the kernel);
 *  4. free every low-memory bank into the allocator;
 *  5. reserve the initrd, the kernel image and the bitmap itself.
 *
 * Returns max_pfn.
 */
unsigned long __init bootmem_init(unsigned long *pages_avail)
{
	const unsigned long lowmem_limit_pfn = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT);
	unsigned long phys_end = 0UL;
	unsigned long total_bytes = 0UL;
	unsigned long kernel_end_pfn, map_pfn, map_bytes, bytes;
	int bank;

	for (bank = 0; sp_banks[bank].num_bytes != 0; bank++) {
		unsigned long excess;

		phys_end = sp_banks[bank].base_addr + sp_banks[bank].num_bytes;
		total_bytes += sp_banks[bank].num_bytes;

		if (!cmdline_memory_size || total_bytes <= cmdline_memory_size)
			continue;

		/* Over the requested limit: shrink this bank and end the list. */
		excess = total_bytes - cmdline_memory_size;
		total_bytes -= excess;
		phys_end -= excess;
		sp_banks[bank].num_bytes -= excess;

		if (sp_banks[bank].num_bytes != 0) {
			/* Bank survives in part; terminate right after it. */
			sp_banks[bank + 1].num_bytes = 0;
			sp_banks[bank + 1].base_addr = 0xdeadbeef;
		} else {
			/* Bank vanished entirely; it becomes the terminator. */
			sp_banks[bank].base_addr = 0xdeadbeef;
		}
		break;
	}

	/*
	 * Physical page frame number just past the page-aligned end of the
	 * kernel image.
	 */
	kernel_end_pfn = (unsigned long)__pa(PAGE_ALIGN((unsigned long) &_end)) >> PAGE_SHIFT;
	map_pfn = kernel_end_pfn;

	max_pfn = phys_end >> PAGE_SHIFT;
	max_low_pfn = max_pfn;
	highend_pfn = max_pfn;
	highstart_pfn = max_pfn;

	if (max_low_pfn > lowmem_limit_pfn) {
		highstart_pfn = lowmem_limit_pfn;
		max_low_pfn = calc_max_low_pfn();
		printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
		       calc_highpages() >> (20 - PAGE_SHIFT));
	}

#ifdef CONFIG_BLK_DEV_INITRD
	/* Check the initial ramdisk so that the bootmap does not overwrite it. */
	if (sparc_ramdisk_image) {
		const unsigned long kernel_end = kernel_end_pfn << PAGE_SHIFT;

		/*
		 * NOTE(review): an image address this high is presumably a
		 * kernel virtual address that needs rebasing -- confirm.
		 */
		if (sparc_ramdisk_image >= (unsigned long)&_end - 2 * PAGE_SIZE)
			sparc_ramdisk_image -= KERNBASE;
		initrd_start = sparc_ramdisk_image + phys_base;
		initrd_end = initrd_start + sparc_ramdisk_size;
		if (initrd_end > phys_end) {
			printk(KERN_CRIT "initrd extends beyond end of memory "
			       "(0x%016lx > 0x%016lx)\ndisabling initrd\n",
			       initrd_end, phys_end);
			initrd_start = 0;
		}

		/*
		 * An initrd starting within two pages of the kernel's end
		 * pushes the bootmap behind the initrd.
		 */
		if (initrd_start &&
		    initrd_start >= kernel_end &&
		    initrd_start < kernel_end + 2 * PAGE_SIZE)
			map_pfn = PAGE_ALIGN (initrd_end) >> PAGE_SHIFT;
	}
#endif
	/* Initialize the boot-time allocator. */
	map_bytes = init_bootmem_node(NODE_DATA(0), map_pfn, pfn_base,
				      max_low_pfn);

	/* Register the available low physical memory with the allocator. */
	*pages_avail = 0;
	for (bank = 0; sp_banks[bank].num_bytes != 0; bank++) {
		unsigned long first_pfn, end_pfn;

		first_pfn = sp_banks[bank].base_addr >> PAGE_SHIFT;
		if (first_pfn >= max_low_pfn)
			break;

		end_pfn = (sp_banks[bank].base_addr + sp_banks[bank].num_bytes) >> PAGE_SHIFT;
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;

		/* Rounding and clamping may leave nothing of this bank. */
		if (end_pfn > first_pfn) {
			bytes = (end_pfn - first_pfn) << PAGE_SHIFT;
			*pages_avail += end_pfn - first_pfn;

			free_bootmem(sp_banks[bank].base_addr, bytes);
		}
	}

#ifdef CONFIG_BLK_DEV_INITRD
	if (initrd_start) {
		/* Reserve the initrd image area. */
		bytes = initrd_end - initrd_start;
		reserve_bootmem(initrd_start, bytes, BOOTMEM_DEFAULT);
		*pages_avail -= PAGE_ALIGN(bytes) >> PAGE_SHIFT;

		/* From here on the initrd bounds are kept as virtual addresses. */
		initrd_start = (initrd_start - phys_base) + PAGE_OFFSET;
		initrd_end = (initrd_end - phys_base) + PAGE_OFFSET;
	}
#endif
	/* Reserve the kernel text/data/bss. */
	bytes = (kernel_end_pfn << PAGE_SHIFT) - phys_base;
	reserve_bootmem(phys_base, bytes, BOOTMEM_DEFAULT);
	*pages_avail -= PAGE_ALIGN(bytes) >> PAGE_SHIFT;

	/*
	 * Reserve the bootmem map.
	 *
	 * NOTE(review): the map's pages are subtracted from *pages_avail
	 * here, although the comment historically attached to this code said
	 * they were not accounted because free_all_bootmem() releases them.
	 * Behaviour is kept as is -- confirm which one is intended.
	 */
	reserve_bootmem((map_pfn << PAGE_SHIFT), map_bytes, BOOTMEM_DEFAULT);
	*pages_avail -= PAGE_ALIGN(map_bytes) >> PAGE_SHIFT;

	return max_pfn;
}
|
|
|
|
/*
|
|
* paging_init() sets up the page tables: We call the MMU specific
|
|
* init routine based upon the Sun model type on the Sparc.
|
|
*
|
|
*/
|
|
extern void srmmu_paging_init(void);
|
|
extern void device_scan(void);
|
|
|
|
void __init paging_init(void)
|
|
{
|
|
srmmu_paging_init();
|
|
prom_build_devicetree();
|
|
of_fill_in_cpu_data();
|
|
device_scan();
|
|
}
|
|
|
|
/*
 * taint_real_pages() - mark all populated memory in sparc_valid_addr_bitmap.
 *
 * For every bank in sp_banks[] (the list ends at num_bytes == 0), steps
 * through the bank one page at a time and sets the bit for the megabyte
 * that contains the page (bit index is the physical address >> 20).
 */
static void __init taint_real_pages(void)
{
	int bank;

	for (bank = 0; sp_banks[bank].num_bytes; bank++) {
		unsigned long addr = sp_banks[bank].base_addr;
		const unsigned long bank_end = addr + sp_banks[bank].num_bytes;

		for (; addr < bank_end; addr += PAGE_SIZE)
			set_bit(addr >> 20, sparc_valid_addr_bitmap);
	}
}
|
|
|
|
static void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
|
|
{
|
|
unsigned long tmp;
|
|
|
|
#ifdef CONFIG_DEBUG_HIGHMEM
|
|
printk("mapping high region %08lx - %08lx\n", start_pfn, end_pfn);
|
|
#endif
|
|
|
|
for (tmp = start_pfn; tmp < end_pfn; tmp++) {
|
|
struct page *page = pfn_to_page(tmp);
|
|
|
|
ClearPageReserved(page);
|
|
init_page_count(page);
|
|
__free_page(page);
|
|
totalhigh_pages++;
|
|
}
|
|
}
|
|
|
|
/*
 * mem_init() - finish memory initialisation and report the totals.
 *
 * Halts via the PROM if the pkmap and fixmap areas overlap or if the
 * valid-address bitmap cannot be allocated.  Otherwise it zeroes
 * empty_zero_page, builds sparc_valid_addr_bitmap, releases bootmem,
 * frees the high-memory part of every bank, and prints a one-line
 * summary of the memory layout.
 */
void __init mem_init(void)
{
	int codepages, datapages, initpages;
	int reservedpages = 0;
	int bitmap_words, bitmap_bytes;
	int bank, pfn;

	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
		prom_printf("BUG: fixmap and pkmap areas overlap\n");
		prom_printf("pkbase: 0x%lx pkend: 0x%lx fixstart 0x%lx\n",
		       PKMAP_BASE,
		       (unsigned long)PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		       FIXADDR_START);
		prom_printf("Please mail sparclinux@vger.kernel.org.\n");
		prom_halt();
	}

	/* Saves us work later. */
	memset((void *)&empty_zero_page, 0, PAGE_SIZE);

	/*
	 * One bit per megabyte, 32 bits per word: shifting by an extra 5
	 * gives the word count (plus one spare word), and << 2 turns words
	 * into bytes.
	 */
	bitmap_words = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5);
	bitmap_words += 1;
	bitmap_bytes = bitmap_words << 2;

	sparc_valid_addr_bitmap = (unsigned long *)
		__alloc_bootmem(bitmap_bytes, SMP_CACHE_BYTES, 0UL);
	if (!sparc_valid_addr_bitmap) {
		prom_printf("mem_init: Cannot alloc valid_addr_bitmap.\n");
		prom_halt();
	}
	memset(sparc_valid_addr_bitmap, 0, bitmap_bytes);

	taint_real_pages();

	max_mapnr = last_valid_pfn - pfn_base;
	high_memory = __va(max_low_pfn << PAGE_SHIFT);

	totalram_pages = free_all_bootmem();

	for (bank = 0; sp_banks[bank].num_bytes != 0; bank++) {
		unsigned long first_pfn = sp_banks[bank].base_addr >> PAGE_SHIFT;
		unsigned long last_pfn = (sp_banks[bank].base_addr + sp_banks[bank].num_bytes) >> PAGE_SHIFT;

		num_physpages += sp_banks[bank].num_bytes >> PAGE_SHIFT;

		/* Only the part of a bank above highstart_pfn is high memory. */
		if (last_pfn > highstart_pfn)
			map_high_region(first_pfn < highstart_pfn ?
					highstart_pfn : first_pfn, last_pfn);
	}

	totalram_pages += totalhigh_pages;

	codepages = (((unsigned long) &_etext) - ((unsigned long)&_start));
	codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;

	datapages = (((unsigned long) &_edata) - ((unsigned long)&_etext));
	datapages = PAGE_ALIGN(datapages) >> PAGE_SHIFT;

	initpages = (((unsigned long) &__init_end) - ((unsigned long) &__init_begin));
	initpages = PAGE_ALIGN(initpages) >> PAGE_SHIFT;

	/* Ignore memory holes for the purpose of counting reserved pages */
	for (pfn = 0; pfn < max_low_pfn; pfn++) {
		if (!test_bit(pfn >> (20 - PAGE_SHIFT), sparc_valid_addr_bitmap))
			continue;
		if (PageReserved(pfn_to_page(pfn)))
			reservedpages++;
	}

	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
	       nr_free_pages() << (PAGE_SHIFT-10),
	       num_physpages << (PAGE_SHIFT - 10),
	       codepages << (PAGE_SHIFT-10),
	       reservedpages << (PAGE_SHIFT - 10),
	       datapages << (PAGE_SHIFT-10),
	       initpages << (PAGE_SHIFT-10),
	       totalhigh_pages << (PAGE_SHIFT-10));
}
|
|
|
|
void free_initmem (void)
|
|
{
|
|
unsigned long addr;
|
|
unsigned long freed;
|
|
|
|
addr = (unsigned long)(&__init_begin);
|
|
freed = (unsigned long)(&__init_end) - addr;
|
|
for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
|
|
struct page *p;
|
|
|
|
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
|
|
p = virt_to_page(addr);
|
|
|
|
ClearPageReserved(p);
|
|
init_page_count(p);
|
|
__free_page(p);
|
|
totalram_pages++;
|
|
num_physpages++;
|
|
}
|
|
printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
|
|
freed >> 10);
|
|
}
|
|
|
|
#ifdef CONFIG_BLK_DEV_INITRD
|
|
void free_initrd_mem(unsigned long start, unsigned long end)
|
|
{
|
|
if (start < end)
|
|
printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
|
|
(end - start) >> 10);
|
|
for (; start < end; start += PAGE_SIZE) {
|
|
struct page *p;
|
|
|
|
memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
|
|
p = virt_to_page(start);
|
|
|
|
ClearPageReserved(p);
|
|
init_page_count(p);
|
|
__free_page(p);
|
|
totalram_pages++;
|
|
num_physpages++;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/*
 * sparc_flush_page_to_ram() - flush one page through its kernel address.
 * @page: the page to flush.
 *
 * Calls __flush_page_to_ram() on the address returned by page_address().
 * Does nothing when that address is 0 (presumably a page that has no
 * kernel mapping at the moment -- confirm).
 */
void sparc_flush_page_to_ram(struct page *page)
{
	unsigned long kaddr = (unsigned long)page_address(page);

	if (!kaddr)
		return;

	__flush_page_to_ram(kaddr);
}
EXPORT_SYMBOL(sparc_flush_page_to_ram);
|