From fa62aafea9e415cd1efd8c4054106112fe809f19 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:38 -0800 Subject: [PATCH 01/82] x86, mm: Add global page_size_mask and probe one time only We currently pass use_gbpages and use_pse around for calculating the page table size. Later we will need to call init_memory_mapping for every ram range one by one, which means those calculations would be done several times. That information is the same for all ram ranges, so it can be stored in page_size_mask and probed one time only. Move the probing code out of init_memory_mapping into a separate function, probe_page_size_mask(), and call it before all init_memory_mapping calls. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-2-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup.c | 1 + arch/x86/mm/init.c | 55 ++++++++++++++++------------------ 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f76..98ac76dc4eae 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,6 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; +void probe_page_size_mask(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696f30fb..01fb5f9baf90 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,6 +913,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); init_gbpages(); + probe_page_size_mask(); /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Fri, 16 Nov 2012 19:38:39 -0800 Subject: [PATCH 02/82] x86, mm: Split out split_mem_range from init_memory_mapping Make init_memory_mapping smaller and more readable. -v2: use 0 instead of nr_range as the input parameter, as found by Yasuaki Ishimatsu. Suggested-by: Ingo Molnar Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-3-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Cc: Yasuaki Ishimatsu Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index aa5b0da03f6d..6368b86b84e2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -146,25 +146,13 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } -/* - * Setup the direct mapping of the physical memory at PAGE_OFFSET. - * This runs before bootmem is initialized and gets pages directly from - * the physical memory. To access them they are temporarily mapped. - */ -unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) { unsigned long start_pfn, end_pfn; - unsigned long ret = 0; unsigned long pos; - struct map_range mr[NR_RANGE_MR]; - int nr_range, i; - - printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", - start, end - 1); - - memset(mr, 0, sizeof(mr)); - nr_range = 0; + int i; /* head if not big page alignment ?
*/ start_pfn = start >> PAGE_SHIFT; @@ -258,6 +246,27 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Fri, 16 Nov 2012 19:38:40 -0800 Subject: [PATCH 03/82] x86, mm: Move down find_early_table_space() It will need to call split_mem_range(). Move it down after that to avoid extra declaration. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-4-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 117 +++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6368b86b84e2..701abbc24735 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -36,64 +36,6 @@ struct map_range { }; static int page_size_mask; -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. - */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; - phys_addr_t base; - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - good_end = max_pfn_mapped << PAGE_SHIFT; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); -} void probe_page_size_mask(void) { @@ -249,6 +191,65 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. 
+ */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) +{ + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; + phys_addr_t base; + + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; + + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; + + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } + + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + good_end = max_pfn_mapped << PAGE_SHIFT; + + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); + if (!base) + panic("Cannot find space for the kernel page tables"); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_top << PAGE_SHIFT) - 1); +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from From 22ddfcaa0dbae992332381d41b8a1fbc72269a13 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:41 -0800 Subject: [PATCH 04/82] x86, mm: Move init_memory_mapping calling out of setup.c Now init_memory_mapping is called two times, later will be called for every ram ranges. Could put all related init_mem calling together and out of setup.c. Actually, it reverts commit 1bbbbe7 x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct mapping. will address that later with complete solution include handling hole under 4g. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-5-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/init.h | 1 - arch/x86/include/asm/pgtable.h | 2 +- arch/x86/kernel/setup.c | 27 +-------------------------- arch/x86/mm/init.c | 19 ++++++++++++++++++- 4 files changed, 20 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index adcc0ae73d09..4f13998be59a 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,7 +12,6 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); - extern unsigned long __initdata pgt_buf_start; extern unsigned long __meminitdata pgt_buf_end; extern unsigned long __meminitdata pgt_buf_top; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 98ac76dc4eae..dd1a88832d25 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -602,7 +602,7 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ extern int direct_gbpages; -void probe_page_size_mask(void); +void init_mem_mapping(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 01fb5f9baf90..23b079fb93fc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -913,34 +913,9 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); init_gbpages(); - probe_page_size_mask(); - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - int i; - unsigned long start, end; - unsigned long start_pfn, end_pfn; - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, - NULL) { - - end = PFN_PHYS(end_pfn); - if (end <= (1UL<<32)) - continue; - - start = PFN_PHYS(start_pfn); - max_pfn_mapped = init_memory_mapping( - max((1UL<<32), start), end); - } - - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif memblock.current_limit = get_max_mapped(); dma_contiguous_reserve(0); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 701abbc24735..9e17f9e18a21 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -37,7 +37,7 @@ struct map_range { static int page_size_mask; -void probe_page_size_mask(void) +static void __init probe_page_size_mask(void) { #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* @@ -315,6 +315,23 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, return ret >> PAGE_SHIFT; } +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* max_pfn_mapped is updated here */ + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< Date: Fri, 16 Nov 2012 19:38:42 -0800 Subject: [PATCH 05/82] x86, mm: Revert back good_end setting for 64bit After | commit 8548c84da2f47e71bbbe300f55edb768492575f7 | Author: Takashi Iwai | Date: Sun Oct 23 23:19:12 2011 +0200 | | x86: Fix S4 regression | | Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4 | regression since 2.6.39, namely the machine reboots occasionally at S4 | resume. It doesn't happen always, overall rate is about 1/20. But, | like other bugs, once when this happens, it continues to happen. | | This patch fixes the problem by essentially reverting the memory | assignment in the older way. Have some page table around 512M again, that will prevent kdump to find 512M under 768M. We need revert that reverting, so we could put page table high again for 64bit. 
Takashi agreed that S4 regression could be something else. https://lkml.org/lkml/2012/6/15/182 Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-6-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9e17f9e18a21..dbef4ffe8d31 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -234,8 +234,8 @@ static void __init find_early_table_space(struct map_range *mr, int nr_range) #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif good_end = max_pfn_mapped << PAGE_SHIFT; +#endif base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base) From 84f1ae30bb68d8da98bca7ff2c2b825b2ac8c9a5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:43 -0800 Subject: [PATCH 06/82] x86, mm: Change find_early_table_space() paramters call split_mem_range inside the function. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-7-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dbef4ffe8d31..51f919febf64 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -196,12 +196,18 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB * pages. Then find enough contiguous space for those page tables. */ -static void __init find_early_table_space(struct map_range *mr, int nr_range) +static void __init find_early_table_space(unsigned long start, unsigned long end) { int i; unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long start = 0, good_end; + unsigned long good_end; phys_addr_t base; + struct map_range mr[NR_RANGE_MR]; + int nr_range; + + memset(mr, 0, sizeof(mr)); + nr_range = 0; + nr_range = split_mem_range(mr, nr_range, start, end); for (i = 0; i < nr_range; i++) { unsigned long range, extra; @@ -276,7 +282,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(mr, nr_range); + find_early_table_space(start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, From c14fa0b63b5b4234667c03fdc3314c0881caa514 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:44 -0800 Subject: [PATCH 07/82] x86, mm: Find early page table buffer together We should not do that in every calling of init_memory_mapping. At the same time need to move down early_memtest, and could remove after_bootmem checking. -v2: fix one early_memtest with 32bit by passing max_pfn_mapped instead. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-8-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 66 ++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 51f919febf64..1ce0d033fafc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -274,16 +274,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = split_mem_range(mr, 0, start, end); - /* - * Find space for the kernel direct mapping tables. 
- * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ - if (!after_bootmem) - find_early_table_space(start, end); - for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); @@ -296,6 +286,36 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + return ret >> PAGE_SHIFT; +} + +void __init init_mem_mapping(void) +{ + probe_page_size_mask(); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ +#ifdef CONFIG_X86_64 + find_early_table_space(0, max_pfn< max_low_pfn) { + max_pfn_mapped = init_memory_mapping(1UL<<32, + max_pfn< pgt_buf_start) + if (pgt_buf_end > pgt_buf_start) x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), PFN_PHYS(pgt_buf_end)); - if (!after_bootmem) - early_memtest(start, end); + /* stop the wrong using */ + pgt_buf_top = 0; - return ret >> PAGE_SHIFT; -} - -void __init init_mem_mapping(void) -{ - probe_page_size_mask(); - - /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn< Date: Fri, 16 Nov 2012 19:38:45 -0800 Subject: [PATCH 08/82] x86, mm: Separate out calculate_table_space_size() It should take physical address range that will need to be mapped. find_early_table_space should take range that pgt buff should be in. Separating page table size calculating and finding early page table to reduce confusing. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-9-git-send-email-yinghai@kernel.org Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1ce0d033fafc..7b961d0b1389 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -196,12 +196,10 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB * pages. Then find enough contiguous space for those page tables. 
*/ -static void __init find_early_table_space(unsigned long start, unsigned long end) +static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) { int i; unsigned long puds = 0, pmds = 0, ptes = 0, tables; - unsigned long good_end; - phys_addr_t base; struct map_range mr[NR_RANGE_MR]; int nr_range; @@ -240,9 +238,17 @@ static void __init find_early_table_space(unsigned long start, unsigned long end #ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); - good_end = max_pfn_mapped << PAGE_SHIFT; #endif + return tables; +} + +static void __init find_early_table_space(unsigned long start, + unsigned long good_end, + unsigned long tables) +{ + phys_addr_t base; + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base) panic("Cannot find space for the kernel page tables"); @@ -250,10 +256,6 @@ static void __init find_early_table_space(unsigned long start, unsigned long end pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); } /* @@ -291,6 +293,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, void __init init_mem_mapping(void) { + unsigned long tables, good_end, end; + probe_page_size_mask(); /* @@ -301,10 +305,18 @@ void __init init_mem_mapping(void) * nodes are discovered. */ #ifdef CONFIG_X86_64 - find_early_table_space(0, max_pfn< pgt_buf_start) + if (pgt_buf_end > pgt_buf_start) { + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", + end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_end << PAGE_SHIFT) - 1); x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), PFN_PHYS(pgt_buf_end)); + } /* stop the wrong using */ pgt_buf_top = 0; From dd7dfad7fb297b1746bcdbebbdc970d723a635bd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:46 -0800 Subject: [PATCH 09/82] x86, mm: Set memblock initial limit to 1M memblock_x86_fill() could double memory array. If we set memblock.current_limit to 512M, so memory array could be around 512M. So kdump will not get big range (like 512M) under 1024M. Try to put it down under 1M, it would use about 4k or so, and that is limited. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-10-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 23b079fb93fc..4bd89218cbc3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -890,7 +890,7 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock.current_limit = get_max_mapped(); + memblock.current_limit = ISA_END_ADDRESS; memblock_x86_fill(); /* From 4eea6aa581abfeb2695ebe9f9d4672597e1bdd4b Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Fri, 16 Nov 2012 19:38:47 -0800 Subject: [PATCH 10/82] x86, mm: if kernel .text .data .bss are not marked as E820_RAM, complain and fix There could be cases where user supplied memmap=exactmap memory mappings do not mark the region where the kernel .text .data and .bss reside as E820_RAM, as reported here: https://lkml.org/lkml/2012/8/14/86 Handle it by complaining, and adding the range back into the e820. 
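As a purely hypothetical illustration (the kernel load address and memmap values below are made up for this example, not taken from the report): a kernel image loaded around 16MB that is booted with

  memmap=exactmap memmap=640K@0 memmap=128M@64M

would leave the range from the start of the kernel text up to __brk_limit outside every E820_RAM entry; with this change the kernel warns about it and re-adds that range as E820_RAM instead of silently running on top of "non-RAM".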
Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1353123563-3103-11-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4bd89218cbc3..d85cbd96525d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -832,6 +832,20 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have incorrectly supplied it via memmap=exactmap. If + * we really are running on top non-RAM, we will crash later anyways. + */ + if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) { + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + + e820_add_region(code_resource.start, + __pa(__brk_limit) - code_resource.start + 1, + E820_RAM); + } + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { From dda56e134059b840631fdfd034784056b627c2a6 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Fri, 16 Nov 2012 19:38:48 -0800 Subject: [PATCH 11/82] x86, mm: Fixup code testing if a pfn is direct mapped Update code that previously assumed pfns [ 0 - max_low_pfn_mapped ) and [ 4GB - max_pfn_mapped ) were always direct mapped, to now look up pfn_mapped ranges instead. -v2: change applying sequence to keep git bisecting working. so add dummy pfn_range_is_mapped(). - Yinghai Lu Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1353123563-3103-12-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 8 ++++++++ arch/x86/kernel/cpu/amd.c | 8 +++----- arch/x86/platform/efi/efi.c | 7 +++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index e21fdd10479f..45aae6e5f649 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,6 +51,14 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } +static inline bool pfn_range_is_mapped(unsigned long start_pfn, + unsigned long end_pfn) +{ + return end_pfn <= max_low_pfn_mapped || + (end_pfn > (1UL << (32 - PAGE_SHIFT)) && + end_pfn <= max_pfn_mapped); +} + extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d12..9619ba6528ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -676,12 +676,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. 
*/ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad4439145f85..36e53f0a9ce3 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, end_pfn; + u64 end, systab, start_pfn, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; + start_pfn = PFN_DOWN(md->phys_addr); end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) { + if (pfn_range_is_mapped(start_pfn, end_pfn)) { va = __va(md->phys_addr); if (!(md->attribute & EFI_MEMORY_WB)) From 8eb5779f6b9c7e390c92f451edaafc039e06e743 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:49 -0800 Subject: [PATCH 12/82] x86, mm: use pfn_range_is_mapped() with CPA We are going to map ram only, so under max_low_pfn_mapped, between 4g and max_pfn_mapped does not mean mapped at all. Use pfn_range_is_mapped() directly. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-13-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a718e0d23503..44acfcd6c16f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -551,16 +551,10 @@ static int split_large_page(pte_t *kpte, unsigned long address) for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); - if (address >= (unsigned long)__va(0) && - address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), + PFN_DOWN(__pa(address)) + 1)) split_page_count(level); -#ifdef CONFIG_X86_64 - if (address >= (unsigned long)__va(1UL<<32) && - address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) - split_page_count(level); -#endif - /* * Install the new, split up pagetable. * @@ -729,13 +723,9 @@ static int cpa_process_alias(struct cpa_data *cpa) unsigned long vaddr; int ret; - if (cpa->pfn >= max_pfn_mapped) + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; -#ifdef CONFIG_X86_64 - if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) - return 0; -#endif /* * No need to redo, when the primary call touched the direct * mapping already: From 5101730cb0613b91d40b9bb7be6bb023d2f6aa24 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:50 -0800 Subject: [PATCH 13/82] x86, mm: use pfn_range_is_mapped() with gart We are going to map ram only, so under max_low_pfn_mapped, between 4g and max_pfn_mapped does not mean mapped at all. Use pfn_range_is_mapped() directly. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-14-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/amd_gart_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index e66311200cbd..b574b295a2f9 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -768,10 +768,9 @@ int __init gart_iommu_init(void) aper_base = info.aper_base; end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); + start_pfn = PFN_DOWN(aper_base); + if (!pfn_range_is_mapped(start_pfn, end_pfn)) init_memory_mapping(start_pfn< Date: Fri, 16 Nov 2012 19:38:51 -0800 Subject: [PATCH 14/82] x86, mm: use pfn_range_is_mapped() with reserve_initrd We are going to map ram only, so under max_low_pfn_mapped, between 4g and max_pfn_mapped does not mean mapped at all. Use pfn_range_is_mapped() to find out if range is mapped for initrd. That could happen bootloader put initrd in range but user could use memmap to carve some of range out. Also during copying need to use early_memmap to map original initrd for accessing. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-15-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 52 ++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d85cbd96525d..bd52f9da17cc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -317,20 +317,19 @@ static void __init relocate_initrd(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; - /* We need to move the initrd down into lowmem */ - ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, - PAGE_SIZE); + /* We need to move the initrd down into directly mapped mem */ + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_low_pfn_mapped), + area_size, PAGE_SIZE); if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the lowmem currently occupied by + /* Note: this includes all the mem currently occupied by the initrd, we rely on that fact to keep the data intact. 
*/ memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; @@ -340,17 +339,7 @@ static void __init relocate_initrd(void) q = (char *)initrd_start; - /* Copy any lowmem portion of the initrd */ - if (ramdisk_image < end_of_lowmem) { - clen = end_of_lowmem - ramdisk_image; - p = (char *)__va(ramdisk_image); - memcpy(q, p, clen); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } - - /* Copy the highmem portion of the initrd */ + /* Copy the initrd */ while (ramdisk_size) { slop = ramdisk_image & ~PAGE_MASK; clen = ramdisk_size; @@ -364,7 +353,7 @@ static void __init relocate_initrd(void) ramdisk_image += clen; ramdisk_size -= clen; } - /* high pages is not converted by early_res_to_bootmem */ + ramdisk_image = boot_params.hdr.ramdisk_image; ramdisk_size = boot_params.hdr.ramdisk_size; printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" @@ -373,13 +362,27 @@ static void __init relocate_initrd(void) ramdisk_here, ramdisk_here + ramdisk_size - 1); } +static u64 __init get_mem_size(unsigned long limit_pfn) +{ + int i; + u64 mapped_pages = 0; + unsigned long start_pfn, end_pfn; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + mapped_pages += end_pfn - start_pfn; + } + + return mapped_pages << PAGE_SHIFT; +} static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -387,18 +390,19 @@ static void __init reserve_initrd(void) initrd_start = 0; - if (ramdisk_size >= (end_of_lowmem>>1)) { + mapped_size = get_mem_size(max_low_pfn_mapped); + if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, end_of_lowmem>>1); - } + ramdisk_size, mapped_size>>1); printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - - if (ramdisk_end <= end_of_lowmem) { - /* All in lowmem, easy case */ + if (ramdisk_end <= (max_low_pfn_mapped< Date: Fri, 16 Nov 2012 19:38:52 -0800 Subject: [PATCH 15/82] x86, mm: Only direct map addresses that are marked as E820_RAM Currently direct mappings are created for [ 0 to max_low_pfn< Link: http://lkml.kernel.org/r/1353123563-3103-16-git-send-email-yinghai@kernel.org Signed-off-by: Yinghai Lu Reviewed-by: Pekka Enberg Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 8 +- arch/x86/kernel/setup.c | 8 +- arch/x86/mm/init.c | 120 +++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 6 +- 4 files changed, 117 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 45aae6e5f649..54c97879195e 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void) return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; } -static inline bool pfn_range_is_mapped(unsigned long start_pfn, - unsigned long end_pfn) -{ - return end_pfn <= max_low_pfn_mapped || - (end_pfn > (1UL << (32 - PAGE_SHIFT)) && - end_pfn <= max_pfn_mapped); -} +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bd52f9da17cc..68dffeceb193 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -116,9 +116,11 @@ #include /* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7b961d0b1389..bb44e9f2cc49 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -243,6 +243,38 @@ static unsigned long __init calculate_table_space_size(unsigned long start, unsi return tables; } +static unsigned long __init calculate_all_table_space_size(void) +{ + unsigned long start_pfn, end_pfn; + unsigned long tables; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + tables = calculate_table_space_size(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = start_pfn << PAGE_SHIFT; + u64 end = end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + tables += calculate_table_space_size(start, end); + } + + return tables; +} + static void __init find_early_table_space(unsigned long start, unsigned long good_end, unsigned long tables) @@ -258,6 +290,34 @@ static void __init find_early_table_space(unsigned long start, pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } +static struct range pfn_mapped[E820_X_MAX]; +static int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + 
+ for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -288,9 +348,55 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + return ret >> PAGE_SHIFT; } +/* + * Iterate through E820 memory map and create direct mappings for only E820_RAM + * regions. We cannot simply create direct mappings for all pfns from + * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in + * high addresses that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result in using + * smaller size (i.e. 4K instead of 2M or 1G) page tables. + */ +static void __init init_all_memory_mapping(void) +{ + unsigned long start_pfn, end_pfn; + int i; + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = (u64)start_pfn << PAGE_SHIFT; + u64 end = (u64)end_pfn << PAGE_SHIFT; + + if (end <= ISA_END_ADDRESS) + continue; + + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; +#ifdef CONFIG_X86_32 + /* on 32 bit, we only map up to max_low_pfn */ + if ((start >> PAGE_SHIFT) >= max_low_pfn) + continue; + + if ((end >> PAGE_SHIFT) > max_low_pfn) + end = max_low_pfn << PAGE_SHIFT; +#endif + init_memory_mapping(start, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif +} + void __init init_mem_mapping(void) { unsigned long tables, good_end, end; @@ -311,23 +417,15 @@ void __init init_mem_mapping(void) end = max_low_pfn << PAGE_SHIFT; good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_table_space_size(0, end); + tables = calculate_all_table_space_size(); find_early_table_space(0, good_end, tables); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<node_zones + ZONE_NORMAL; - unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - last_mapped_pfn = init_memory_mapping(start, start + size); - if (last_mapped_pfn > max_pfn_mapped) - max_pfn_mapped = last_mapped_pfn; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); From 74f27655dda84604d8bab47872020dcce5c88731 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:53 -0800 Subject: [PATCH 16/82] x86, mm: relocate initrd under all mem for 64bit instead of under 4g. For 64bit, we can use any mapped mem instead of low mem. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-17-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/setup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 68dffeceb193..94f922a73c54 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -324,7 +324,7 @@ static void __init relocate_initrd(void) char *p, *q; /* We need to move the initrd down into directly mapped mem */ - ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_low_pfn_mapped), + ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), area_size, PAGE_SIZE); if (!ramdisk_here) @@ -392,7 +392,7 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = get_mem_size(max_low_pfn_mapped); + mapped_size = get_mem_size(max_pfn_mapped); if (ramdisk_size >= (mapped_size>>1)) panic("initrd too large to handle, " "disabling initrd (%lld needed, %lld available)\n", @@ -401,8 +401,7 @@ static void __init reserve_initrd(void) printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); - if (ramdisk_end <= (max_low_pfn_mapped< Date: Fri, 16 Nov 2012 19:38:54 -0800 Subject: [PATCH 17/82] x86, mm: Align start address to correct big page size We are going to use buffer in BRK to map small range just under memory top, and use those new mapped ram to map ram range under it. The ram range that will be mapped at first could be only page aligned, but ranges around it are ram too, we could use bigger page to map it to avoid small page size. We will adjust page_size_mask in following patch: x86, mm: Use big page size for small memory range to use big page size for small ram range. Before that patch, this patch will make sure start address to be aligned down according to bigger page size, otherwise entry in page page will not have correct value. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-18-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_32.c | 1 + arch/x86/mm/init_64.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 11a58001b4ce..27f7fc69cf8a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -310,6 +310,7 @@ repeat: __pgprot(PTE_IDENT_ATTR | _PAGE_PSE); + pfn &= PMD_MASK >> PAGE_SHIFT; addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32c7e3847cf6..869372a5d3cf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -464,7 +464,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pmd, - pfn_pte(address >> PAGE_SHIFT, + pfn_pte((address & PMD_MASK) >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; @@ -541,7 +541,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pages++; spin_lock(&init_mm.page_table_lock); set_pte((pte_t *)pud, - pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); last_map_addr = next; continue; From aeebe84cc96cde4181807bc67c300c550d0ef123 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:55 -0800 Subject: [PATCH 18/82] x86, mm: Use big page size for small memory range We could map small range in the middle of big range at first, so should use big page size at first to avoid using small page size to break down page table. Only can set big page bit when that range has ram area around it. -v2: fix 32bit boundary checking. We can not count ram above max_low_pfn for 32 bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-19-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bb44e9f2cc49..da591ebc8d12 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -88,6 +88,40 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } +/* + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. + */ +static void __init_refok adjust_range_page_size_mask(struct map_range *mr, + int nr_range) +{ + int i; + + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<> PAGE_SHIFT) > max_low_pfn) + continue; +#endif + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1< Date: Fri, 16 Nov 2012 19:38:56 -0800 Subject: [PATCH 19/82] x86, mm: Don't clear page table if range is ram After we add code use buffer in BRK to pre-map buf for page table in following patch: x86, mm: setup page table in top-down it should be safe to remove early_memmap for page table accessing. Instead we get panic with that. It turns out that we clear the initial page table wrongly for next range that is separated by holes. And it only happens when we are trying to map ram range one by one. We need to check if the range is ram before clearing page table. We change the loop structure to remove the extra little loop and use one loop only, and in that loop will caculate next at first, and check if [addr,next) is covered by E820_RAM. -v2: E820_RESERVED_KERN is treated as E820_RAM. 
EFI one change some E820_RAM to that, so next kernel by kexec will know that range is used already. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-20-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 869372a5d3cf..fa28e3e29741 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -363,20 +363,20 @@ static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) { - unsigned pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i; pte_t *pte = pte_page + pte_index(addr); - for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { - + for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) { + next = (addr & PAGE_MASK) + PAGE_SIZE; if (addr >= end) { - if (!after_bootmem) { - for(; i < PTRS_PER_PTE; i++, pte++) - set_pte(pte, __pte(0)); - } - break; + if (!after_bootmem && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; } /* @@ -419,15 +419,14 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pte_t *pte; pgprot_t new_prot = prot; - if (address >= end) { - if (!after_bootmem) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); - } - break; - } - next = (address & PMD_MASK) + PMD_SIZE; + if (address >= end) { + if (!after_bootmem && + !e820_any_mapped(address & PMD_MASK, next, E820_RAM) && + !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; + } if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { @@ -497,13 +496,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; - if (addr >= end) - break; - next = (addr & PUD_MASK) + PUD_SIZE; - - if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { - set_pud(pud, __pud(0)); + if (addr >= end) { + if (!after_bootmem && + !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) && + !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN)) + set_pud(pud, __pud(0)); continue; } From f763ad1d3870abb811ec7520b4c1adc56471a3a4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:57 -0800 Subject: [PATCH 20/82] x86, mm: Break down init_all_memory_mapping Will replace that with top-down page table initialization. New API need to take range: init_range_memory_mapping() Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-21-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index da591ebc8d12..c688ea3887f2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -398,40 +398,30 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * Depending on the alignment of E820 ranges, this may possibly result in using * smaller size (i.e. 4K instead of 2M or 1G) page tables. 
*/ -static void __init init_all_memory_mapping(void) +static void __init init_range_memory_mapping(unsigned long range_start, + unsigned long range_end) { unsigned long start_pfn, end_pfn; int i; - /* the ISA range is always mapped regardless of memory holes */ - init_memory_mapping(0, ISA_END_ADDRESS); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { u64 start = (u64)start_pfn << PAGE_SHIFT; u64 end = (u64)end_pfn << PAGE_SHIFT; - if (end <= ISA_END_ADDRESS) + if (end <= range_start) continue; - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) + if (start < range_start) + start = range_start; + + if (start >= range_end) continue; - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif + if (end > range_end) + end = range_end; + init_memory_mapping(start, end); } - -#ifdef CONFIG_X86_64 - if (max_pfn > max_low_pfn) { - /* can we preseve max_low_pfn ?*/ - max_low_pfn = max_pfn; - } -#endif } void __init init_mem_mapping(void) @@ -461,8 +451,15 @@ void __init init_mem_mapping(void) (pgt_buf_top << PAGE_SHIFT) - 1); max_pfn_mapped = 0; /* will get exact value next */ - init_all_memory_mapping(); - + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + init_range_memory_mapping(ISA_END_ADDRESS, end); +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#endif /* * Reserve the kernel pagetable pages we used (pgt_buf_start - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) From 8d57470d8f859635deffe3919d7d4867b488b85a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:58 -0800 Subject: [PATCH 21/82] x86, mm: setup page table in top-down Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first. Then use mapped pages to map more ranges below, and keep looping until all pages get mapped. alloc_low_page will use page from BRK at first, after that buffer is used up, will use memblock to find and reserve pages for page table usage. Introduce min_pfn_mapped to make sure find new pages from mapped ranges, that will be updated when lower pages get mapped. Also add step_size to make sure that don't try to map too big range with limited mapped pages initially, and increase the step_size when we have more mapped pages on hand. We don't need to call pagetable_reserve anymore, reserve work is done in alloc_low_page() directly. At last we can get rid of calculation and find early pgt related code. -v2: update to after fix_xen change, also use MACRO for initial pgt_buf size and add comments with it. -v3: skip big reserved range in memblock.reserved near end. -v4: don't need fix_xen change now. -v5: add changelog about moving about reserving pagetable to alloc_low_page. Suggested-by: "H. Peter Anvin" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-22-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 1 + arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup.c | 3 + arch/x86/mm/init.c | 210 +++++++++--------------------- arch/x86/mm/init_32.c | 17 ++- arch/x86/mm/init_64.c | 17 ++- 6 files changed, 94 insertions(+), 155 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 54c97879195e..9f6f3e66e84d 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dd1a88832d25..6991a3e1bf81 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd) extern int direct_gbpages; void init_mem_mapping(void); +void early_alloc_pgt_buf(void); /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 94f922a73c54..f7634092931b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,6 +124,7 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); @@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p) reserve_ibft_region(); + early_alloc_pgt_buf(); + /* * Need to conclude brk, before memblock_x86_fill() * it could use memblock_find_in_range, could overlap with diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c688ea3887f2..2393d0099e7f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ +#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; + + base = __pa(extend_brk(tables, PAGE_SIZE)); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + int after_bootmem; int direct_gbpages @@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, return nr_range; } -/* - * First calculate space needed for kernel direct mapping page tables to cover - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB - * pages. Then find enough contiguous space for those page tables. 
- */ -static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end) -{ - int i; - unsigned long puds = 0, pmds = 0, ptes = 0, tables; - struct map_range mr[NR_RANGE_MR]; - int nr_range; - - memset(mr, 0, sizeof(mr)); - nr_range = 0; - nr_range = split_mem_range(mr, nr_range, start, end); - - for (i = 0; i < nr_range; i++) { - unsigned long range, extra; - - range = mr[i].end - mr[i].start; - puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - - if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { - extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); - pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else { - pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; - } - - if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { - extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else { - ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - } - - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - return tables; -} - -static unsigned long __init calculate_all_table_space_size(void) -{ - unsigned long start_pfn, end_pfn; - unsigned long tables; - int i; - - /* the ISA range is always mapped regardless of memory holes */ - tables = calculate_table_space_size(0, ISA_END_ADDRESS); - - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = start_pfn << PAGE_SHIFT; - u64 end = end_pfn << PAGE_SHIFT; - - if (end <= ISA_END_ADDRESS) - continue; - - if (start < ISA_END_ADDRESS) - start = ISA_END_ADDRESS; -#ifdef CONFIG_X86_32 - /* on 32 bit, we only map up to max_low_pfn */ - if ((start >> PAGE_SHIFT) >= max_low_pfn) - continue; - - if ((end >> PAGE_SHIFT) > max_low_pfn) - end = max_low_pfn << PAGE_SHIFT; -#endif - tables += calculate_table_space_size(start, end); - } - - return tables; -} - -static void __init find_early_table_space(unsigned long start, - unsigned long good_end, - unsigned long tables) -{ - phys_addr_t base; - - base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (!base) - panic("Cannot find space for the kernel page tables"); - - pgt_buf_start = base >> PAGE_SHIFT; - pgt_buf_end = pgt_buf_start; - pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); -} - static struct range pfn_mapped[E820_X_MAX]; static int nr_pfn_mapped; @@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } /* - * Iterate through E820 memory map and create direct mappings for only E820_RAM - * regions. We cannot simply create direct mappings for all pfns from - * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in - * high addresses that cannot be marked as UC by fixed/variable range MTRRs. - * Depending on the alignment of E820 ranges, this may possibly result in using - * smaller size (i.e. 4K instead of 2M or 1G) page tables. + * would have hole in the middle or ends, and only ram parts will be mapped. 
*/ -static void __init init_range_memory_mapping(unsigned long range_start, +static unsigned long __init init_range_memory_mapping( + unsigned long range_start, unsigned long range_end) { unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { @@ -421,71 +334,70 @@ static void __init init_range_memory_mapping(unsigned long range_start, end = range_end; init_memory_mapping(start, end); + + mapped_ram_size += end - start; } + + return mapped_ram_size; } +/* (PUD_SHIFT-PMD_SHIFT)/2 */ +#define STEP_SIZE_SHIFT 5 void __init init_mem_mapping(void) { - unsigned long tables, good_end, end; + unsigned long end, real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + unsigned long new_mapped_ram_size; probe_page_size_mask(); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; - good_end = end; #else end = max_low_pfn << PAGE_SHIFT; - good_end = max_pfn_mapped << PAGE_SHIFT; #endif - tables = calculate_all_table_space_size(); - find_early_table_space(0, good_end, tables); - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_top << PAGE_SHIFT) - 1); - max_pfn_mapped = 0; /* will get exact value next */ /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); - init_range_memory_mapping(ISA_END_ADDRESS, end); + + /* xen has big range in reserved near end of ram, skip it at first */ + addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, + PAGE_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + while (last_start > ISA_END_ADDRESS) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < ISA_END_ADDRESS) + start = ISA_END_ADDRESS; + } else + start = ISA_END_ADDRESS; + new_mapped_ram_size = init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + /* only increase step_size after big range get mapped */ + if (new_mapped_ram_size > mapped_ram_size) + step_size <<= STEP_SIZE_SHIFT; + mapped_ram_size += new_mapped_ram_size; + } + + if (real_end < end) + init_range_memory_mapping(real_end, end); + #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } #endif - /* - * Reserve the kernel pagetable pages we used (pgt_buf_start - - * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) - * so that they can be reused for other purposes. - * - * On native it just means calling memblock_reserve, on Xen it also - * means marking RW the pagetable pages that we allocated before - * but that haven't been used. - * - * In fact on xen we mark RO the whole range pgt_buf_start - - * pgt_buf_top, because we have to make sure that when - * init_memory_mapping reaches the pagetable pages area, it maps - * RO all the pagetable pages, including the ones that are beyond - * pgt_buf_end at that time. 
- */ - if (pgt_buf_end > pgt_buf_start) { - printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n", - end - 1, pgt_buf_start << PAGE_SHIFT, - (pgt_buf_end << PAGE_SHIFT) - 1); - x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), - PFN_PHYS(pgt_buf_end)); - } - - /* stop the wrong using */ - pgt_buf_top = 0; - early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27f7fc69cf8a..7bb11064a9e1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false; static __init void *alloc_low_page(void) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = __va(pfn * PAGE_SIZE); clear_page(adr); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index fa28e3e29741..eefaea637b3d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -316,7 +316,7 @@ void __init cleanup_highmap(void) static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = pgt_buf_end++; + unsigned long pfn; void *adr; if (after_bootmem) { @@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= pgt_buf_top) - panic("alloc_low_page: ran out of memory"); + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); clear_page(adr); From 973dc4f3fad5890bc7b694148ad4c825b9af6dc1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:38:59 -0800 Subject: [PATCH 22/82] x86, mm: Remove early_memremap workaround for page table accessing on 64bit We try to put page table high to make room for kdump, and at that time those ranges are not mapped yet, and have to use ioremap to access it. Now after patch that pre-map page table top down. x86, mm: setup page table in top-down We do not need that workaround anymore. Just use __va to return directly mapping address. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-23-git-send-email-yinghai@kernel.org Acked-by: Stefano Stabellini Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index eefaea637b3d..5ee924237689 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -340,36 +340,12 @@ static __ref void *alloc_low_page(unsigned long *phys) } else pfn = pgt_buf_end++; - adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); + adr = __va(pfn * PAGE_SIZE); clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } -static __ref void *map_low_page(void *virt) -{ - void *adr; - unsigned long phys, left; - - if (after_bootmem) - return virt; - - phys = __pa(virt); - left = phys & (PAGE_SIZE - 1); - adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); - adr = (void *)(((unsigned long)adr) | left); - - return adr; -} - -static __ref void unmap_low_page(void *adr) -{ - if (after_bootmem) - return; - - early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) @@ -442,10 +418,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); - pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + pte = (pte_t *)pmd_page_vaddr(*pmd); last_map_addr = phys_pte_init(pte, address, end, prot); - unmap_low_page(pte); spin_unlock(&init_mm.page_table_lock); continue; } @@ -483,7 +458,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pte = alloc_low_page(&pte_phys); last_map_addr = phys_pte_init(pte, address, end, new_prot); - unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); @@ -518,10 +492,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { - pmd = map_low_page(pmd_offset(pud, 0)); + pmd = pmd_offset(pud, 0); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); __flush_tlb_all(); continue; } @@ -560,7 +533,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, pmd = alloc_low_page(&pmd_phys); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); - unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); pud_populate(&init_mm, pud, __va(pmd_phys)); @@ -596,17 +568,15 @@ kernel_physical_mapping_init(unsigned long start, next = end; if (pgd_val(*pgd)) { - pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + pud = (pud_t *)pgd_page_vaddr(*pgd); last_map_addr = phys_pud_init(pud, __pa(start), __pa(end), page_size_mask); - unmap_low_page(pud); continue; } pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); - unmap_low_page(pud); spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); From 868bf4d6b94c980d3ad87f892a5e528b8ee2c320 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:00 -0800 Subject: [PATCH 23/82] x86, mm: Remove parameter in alloc_low_page for 64bit Now all page table buf are pre-mapped, and could use virtual address directly. So don't need to remember physical address anymore. Remove that phys pointer in alloc_low_page(), and that will allow us to merge alloc_low_page between 64bit and 32bit. 
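A minimal userspace sketch of why the out-parameter becomes redundant (the PAGE_OFFSET value and the sim_va()/sim_pa() helpers below are illustrative stand-ins for the kernel's direct-mapping __va()/__pa(), not real kernel interfaces): once alloc_low_page() returns a pointer inside the direct mapping, the physical address is only a constant offset away, so any caller that still needs it can recompute it instead of receiving it through *phys.

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE   4096UL
    #define PAGE_OFFSET 0xffff880000000000UL   /* illustrative direct-map base */

    /* stand-ins for __va()/__pa() on the kernel direct mapping */
    static uint64_t sim_va(uint64_t phys) { return phys + PAGE_OFFSET; }
    static uint64_t sim_pa(uint64_t virt) { return virt - PAGE_OFFSET; }

    int main(void)
    {
        uint64_t pfn  = 0x1234;                  /* page handed out from pgt_buf */
        uint64_t virt = sim_va(pfn * PAGE_SIZE); /* what alloc_low_page() returns */

        /* the old interface stored this via the *phys out-parameter;
         * with a directly mapped pointer it can always be recomputed */
        printf("recomputed phys %#llx, expected %#llx\n",
               (unsigned long long)sim_pa(virt),
               (unsigned long long)(pfn * PAGE_SIZE));
        return 0;
    }
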
Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-24-git-send-email-yinghai@kernel.org Acked-by: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_64.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5ee924237689..19608203fa5d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -314,14 +314,13 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(unsigned long *phys) +static __ref void *alloc_low_page(void) { unsigned long pfn; void *adr; if (after_bootmem) { adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - *phys = __pa(adr); return adr; } @@ -342,7 +341,6 @@ static __ref void *alloc_low_page(unsigned long *phys) adr = __va(pfn * PAGE_SIZE); clear_page(adr); - *phys = pfn * PAGE_SIZE; return adr; } @@ -401,7 +399,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address = next) { - unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; pgprot_t new_prot = prot; @@ -456,11 +453,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, continue; } - pte = alloc_low_page(&pte_phys); + pte = alloc_low_page(); last_map_addr = phys_pte_init(pte, address, end, new_prot); spin_lock(&init_mm.page_table_lock); - pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + pmd_populate_kernel(&init_mm, pmd, pte); spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); @@ -476,7 +473,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, int i = pud_index(addr); for (; i < PTRS_PER_PUD; i++, addr = next) { - unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; pgprot_t prot = PAGE_KERNEL; @@ -530,12 +526,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, continue; } - pmd = alloc_low_page(&pmd_phys); + pmd = alloc_low_page(); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, prot); spin_lock(&init_mm.page_table_lock); - pud_populate(&init_mm, pud, __va(pmd_phys)); + pud_populate(&init_mm, pud, pmd); spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); @@ -560,7 +556,6 @@ kernel_physical_mapping_init(unsigned long start, for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); - unsigned long pud_phys; pud_t *pud; next = (start + PGDIR_SIZE) & PGDIR_MASK; @@ -574,12 +569,12 @@ kernel_physical_mapping_init(unsigned long start, continue; } - pud = alloc_low_page(&pud_phys); + pud = alloc_low_page(); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, __va(pud_phys)); + pgd_populate(&init_mm, pgd, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } From 5c51bdbe4c74dce7996d0bbfa39974775cc3f13c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:01 -0800 Subject: [PATCH 24/82] x86, mm: Merge alloc_low_page between 64bit and 32bit They are almost same except 64 bit need to handle after_bootmem case. Add mm_internal.h to make that alloc_low_page() only to be accessible from arch/x86/mm/init*.c Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-25-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 26 ++------------------------ arch/x86/mm/init_64.c | 32 ++------------------------------ arch/x86/mm/mm_internal.h | 6 ++++++ 4 files changed, 44 insertions(+), 54 deletions(-) create mode 100644 arch/x86/mm/mm_internal.h diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2393d0099e7f..848189229b2d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -17,10 +17,44 @@ #include #include /* for MAX_DMA_PFN */ +#include "mm_internal.h" + unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +__ref void *alloc_low_page(void) +{ + unsigned long pfn; + void *adr; + +#ifdef CONFIG_X86_64 + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + + return adr; + } +#endif + + if ((pgt_buf_end + 1) >= pgt_buf_top) { + unsigned long ret; + if (min_pfn_mapped >= max_pfn_mapped) + panic("alloc_low_page: ran out of memory"); + ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE, PAGE_SIZE); + if (!ret) + panic("alloc_low_page: can not alloc memory"); + memblock_reserve(ret, PAGE_SIZE); + pfn = ret >> PAGE_SHIFT; + } else + pfn = pgt_buf_end++; + + adr = __va(pfn * PAGE_SIZE); + clear_page(adr); + return adr; +} + /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ #define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7bb11064a9e1..a7f2df1cdcfd 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -53,36 +53,14 @@ #include #include +#include "mm_internal.h" + unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); bool __read_mostly __vmalloc_start_set = false; -static __init void *alloc_low_page(void) -{ - unsigned long pfn; - void *adr; - - if ((pgt_buf_end + 1) >= pgt_buf_top) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); - if (!ret) - panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); - pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - /* * Creates a middle page table and puts a pointer to it in the * given global directory entry. 
This only returns the gd entry diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 19608203fa5d..1d53defc99c9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,8 @@ #include #include +#include "mm_internal.h" + static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -314,36 +316,6 @@ void __init cleanup_highmap(void) } } -static __ref void *alloc_low_page(void) -{ - unsigned long pfn; - void *adr; - - if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); - - return adr; - } - - if ((pgt_buf_end + 1) >= pgt_buf_top) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_page: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); - if (!ret) - panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); - pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; - - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; -} - static unsigned long __meminit phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, pgprot_t prot) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 000000000000..b3f993a2555e --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,6 @@ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_page(void); + +#endif /* __X86_MM_INTERNAL_H */ From 9985b4c6fa7d660f685918a58282275e9e35d8e0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:02 -0800 Subject: [PATCH 25/82] x86, mm: Move min_pfn_mapped back to mm/init.c Also change it to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-26-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/page_types.h | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/mm/init.c | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 9f6f3e66e84d..54c97879195e 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -45,7 +45,6 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; -extern unsigned long min_pfn_mapped; static inline phys_addr_t get_max_mapped(void) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7634092931b..20151941cce8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -124,7 +124,6 @@ */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; -unsigned long min_pfn_mapped; #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 848189229b2d..6392bf9a3947 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -23,6 +23,8 @@ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; +static unsigned long min_pfn_mapped; + __ref void *alloc_low_page(void) { unsigned long pfn; From 6f80b68e9e515547edbacb0c37491730bf766db5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:03 -0800 Subject: [PATCH 26/82] x86, mm, Xen: Remove mapping_pagetable_reserve() Page table area are pre-mapped now after x86, mm: setup page table in top-down x86, mm: Remove early_memremap workaround for page table accessing on 64bit mapping_pagetable_reserve is not used anymore, so remove it. Also remove operation in mask_rw_pte(), as modified allow_low_page always return pages that are already mapped, moreover xen_alloc_pte_init, xen_alloc_pmd_init, etc, will mark the page RO before hooking it into the pagetable automatically. -v2: add changelog about mask_rw_pte() from Stefano. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-27-git-send-email-yinghai@kernel.org Cc: Stefano Stabellini Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_types.h | 1 - arch/x86/include/asm/x86_init.h | 12 ------------ arch/x86/kernel/x86_init.c | 4 ---- arch/x86/mm/init.c | 4 ---- arch/x86/xen/mmu.c | 28 ---------------------------- 5 files changed, 49 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505d..79738f20aaf5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -301,7 +301,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 57693498519c..3b2ce8fc995a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,17 +68,6 @@ struct x86_init_oem { void (*banner)(void); }; -/** - * struct x86_init_mapping - platform specific initial kernel pagetable setup - * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage - * - * For more details on the purpose of this hook, look in - * init_memory_mapping and the commit that added it. 
- */ -struct x86_init_mapping { - void (*pagetable_reserve)(u64 start, u64 end); -}; - /** * struct x86_init_paging - platform specific paging functions * @pagetable_init: platform specific paging initialization call to setup @@ -136,7 +125,6 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; - struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7a3d075a814a..50cf83ecd32e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, - .mapping = { - .pagetable_reserve = native_pagetable_reserve, - }, - .paging = { .pagetable_init = native_pagetable_init, }, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6392bf9a3947..21173fcdb4a1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -112,10 +112,6 @@ static void __init probe_page_size_mask(void) __supported_pte_mask |= _PAGE_GLOBAL; } } -void __init native_pagetable_reserve(u64 start, u64 end) -{ - memblock_reserve(start, end - start); -} #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dcf5f2dd91ec..bbb883f58bc4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); -static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) -{ - /* reserve the range used */ - native_pagetable_reserve(start, end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, - PFN_PHYS(pgt_buf_top)); - while (end < PFN_PHYS(pgt_buf_top)) { - make_lowmem_page_readwrite(__va(end)); - end += PAGE_SIZE; - } -} - #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - /* - * If the new pfn is within the range of the newly allocated - * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot as a freshly allocated page, make sure - * it is RO. - */ - if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_top)) || - (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) - pte = pte_wrprotect(pte); - return pte; } #endif /* CONFIG_X86_64 */ @@ -2197,7 +2170,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { - x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_init = xen_pagetable_init; pv_mmu_ops = xen_mmu_ops; From 22c8ca2ac256bb681be791858b35502b5d37e73b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:04 -0800 Subject: [PATCH 27/82] x86, mm: Add alloc_low_pages(num) 32bit kmap mapping needs pages to be used for low to high. At this point those pages are still from pgt_buf_* from BRK, so it is ok now. But we want to move early_ioremap_page_table_range_init() out of init_memory_mapping() and only call it one time later, that will make page_table_range_init/page_table_kmap_check/alloc_low_page to use memblock to get page. memblock allocation for pages are from high to low. 
So will get panic from page_table_kmap_check() that has BUG_ON to do ordering checking. This patch add alloc_low_pages to make it possible to allocate serveral pages at first, and hand out pages one by one from low to high. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-28-git-send-email-yinghai@kernel.org Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 33 +++++++++++++++++++++------------ arch/x86/mm/mm_internal.h | 6 +++++- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 21173fcdb4a1..02cea14c6d0c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,36 +25,45 @@ unsigned long __meminitdata pgt_buf_top; static unsigned long min_pfn_mapped; -__ref void *alloc_low_page(void) +__ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; - void *adr; + int i; #ifdef CONFIG_X86_64 if (after_bootmem) { - adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + unsigned int order; - return adr; + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | + __GFP_ZERO, order); } #endif - if ((pgt_buf_end + 1) >= pgt_buf_top) { + if ((pgt_buf_end + num) >= pgt_buf_top) { unsigned long ret; if (min_pfn_mapped >= max_pfn_mapped) panic("alloc_low_page: ran out of memory"); ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE, PAGE_SIZE); + PAGE_SIZE * num , PAGE_SIZE); if (!ret) panic("alloc_low_page: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE); + memblock_reserve(ret, PAGE_SIZE * num); pfn = ret >> PAGE_SHIFT; - } else - pfn = pgt_buf_end++; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + } - adr = __va(pfn * PAGE_SIZE); - clear_page(adr); - return adr; + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); + } + + return __va(pfn << PAGE_SHIFT); } /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */ diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index b3f993a2555e..7e3b88ee078a 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -1,6 +1,10 @@ #ifndef __X86_MM_INTERNAL_H #define __X86_MM_INTERNAL_H -void *alloc_low_page(void); +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} #endif /* __X86_MM_INTERNAL_H */ From ddd3509df8f8d4f1cf4784f559d702ce00dc8846 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 16 Nov 2012 19:39:05 -0800 Subject: [PATCH 28/82] x86, mm: Add pointer about Xen mmu requirement for alloc_low_pages Add link for more information 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve -v2: updated to commets from hpa to include commit name. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-29-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 02cea14c6d0c..cb4f8ba70ecc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,15 @@ unsigned long __meminitdata pgt_buf_top; static unsigned long min_pfn_mapped; +/* + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. 
+ */ __ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; From 719272c45b821d38608fc333700bde1a89c56c59 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:06 -0800 Subject: [PATCH 29/82] x86, mm: only call early_ioremap_page_table_range_init() once On 32bit, before patcheset that only set page table for ram, we only call that one time. Now, we are calling that during every init_memory_mapping if we have holes under max_low_pfn. We should only call it one time after all ranges under max_low_page get mapped just like we did before. Also that could avoid the risk to run out of pgt_buf in BRK. Need to update page_table_range_init() to count the pages for kmap page table at first, and use new added alloc_low_pages() to get pages in sequence. That will conform to the requirement that pages need to be in low to high order. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-30-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 13 +++++------- arch/x86/mm/init_32.c | 47 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cb4f8ba70ecc..bed4888c6f4f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -343,14 +343,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); -#ifdef CONFIG_X86_32 - early_ioremap_page_table_range_init(); - - load_cr3(swapper_pg_dir); -#endif - - __flush_tlb_all(); - add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); return ret >> PAGE_SHIFT; @@ -447,7 +439,12 @@ void __init init_mem_mapping(void) /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } +#else + early_ioremap_page_table_range_init(); + load_cr3(swapper_pg_dir); + __flush_tlb_all(); #endif + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a7f2df1cdcfd..0ae1ba8bc1b9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -135,8 +135,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr) return one_page_table_init(pmd) + pte_idx; } +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, - unsigned long vaddr, pte_t *lastpte) + unsigned long vaddr, pte_t *lastpte, + void **adr) { #ifdef CONFIG_HIGHMEM /* @@ -150,16 +181,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin - && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start - || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { 
pte_t *newpte; int i; BUG_ON(after_bootmem); - newpte = alloc_low_page(); + newpte = *adr; for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); @@ -193,6 +223,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); vaddr = start; pgd_idx = pgd_index(vaddr); @@ -205,7 +240,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { pte = page_table_kmap_check(one_page_table_init(pmd), - pmd, vaddr, pte); + pmd, vaddr, pte, &adr); vaddr += PMD_SIZE; } From cf47065961b48727b4e47bc3e2e67f4996878437 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:07 -0800 Subject: [PATCH 30/82] x86, mm: Move back pgt_buf_* to mm/init.c Also change them to static. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-31-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/init.h | 4 ---- arch/x86/mm/init.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 4f13998be59a..626ea8d31b6d 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -12,8 +12,4 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); -extern unsigned long __initdata pgt_buf_start; -extern unsigned long __meminitdata pgt_buf_end; -extern unsigned long __meminitdata pgt_buf_top; - #endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bed4888c6f4f..3cadf1013cea 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -19,9 +19,9 @@ #include "mm_internal.h" -unsigned long __initdata pgt_buf_start; -unsigned long __meminitdata pgt_buf_end; -unsigned long __meminitdata pgt_buf_top; +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; From 148b20989e0b83cb301e1fcd9e987c7abde05333 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:08 -0800 Subject: [PATCH 31/82] x86, mm: Move init_gbpages() out of setup.c Put it in mm/init.c, and call it from probe_page_mask(). init_mem_mapping is calling probe_page_mask at first. So calling sequence is not changed. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-32-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/setup.c | 15 +-------------- arch/x86/mm/init.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 20151941cce8..85b62f1c8071 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -282,18 +282,7 @@ void * __init extend_brk(size_t size, size_t align) return ret; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} +#ifdef CONFIG_X86_32 static void __init cleanup_highmap(void) { } @@ -933,8 +922,6 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); - init_gbpages(); - init_mem_mapping(); memblock.current_limit = get_max_mapped(); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3cadf1013cea..8168bf8fcda7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -98,6 +98,16 @@ int direct_gbpages #endif ; +static void __init init_gbpages(void) +{ +#ifdef CONFIG_X86_64 + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +#endif +} + struct map_range { unsigned long start; unsigned long end; @@ -108,6 +118,8 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { + init_gbpages(); + #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. From f836e35a98ab3b2f0d4c8730610e4a4a7f533505 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:09 -0800 Subject: [PATCH 32/82] x86, mm: change low/hignmem_pfn_init to static on 32bit Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-33-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0ae1ba8bc1b9..322ee56ea1fe 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -575,7 +575,7 @@ early_param("highmem", parse_highmem); * artificially via the highmem=x boot parameter then create * it: */ -void __init lowmem_pfn_init(void) +static void __init lowmem_pfn_init(void) { /* max_low_pfn is 0, we already have early_res support */ max_low_pfn = max_pfn; @@ -611,7 +611,7 @@ void __init lowmem_pfn_init(void) * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ -void __init highmem_pfn_init(void) +static void __init highmem_pfn_init(void) { max_low_pfn = MAXMEM_PFN; From c8dcdb9ce463ad4a660099a74a850f4f6fc81c41 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:10 -0800 Subject: [PATCH 33/82] x86, mm: Move function declaration into mm_internal.h They are only for mm/init*.c. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-34-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/init.h | 16 +++------------- arch/x86/mm/mm_internal.h | 7 +++++++ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 626ea8d31b6d..bac770b7cbc4 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,15 +1,5 @@ -#ifndef _ASM_X86_INIT_32_H -#define _ASM_X86_INIT_32_H +#ifndef _ASM_X86_INIT_H +#define _ASM_X86_INIT_H -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif -extern void __init zone_sizes_init(void); - -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - -#endif /* _ASM_X86_INIT_32_H */ +#endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 7e3b88ee078a..dc79ac121774 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -7,4 +7,11 @@ static inline void *alloc_low_page(void) return alloc_low_pages(1); } +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + #endif /* __X86_MM_INTERNAL_H */ From 11ed9e927d573d78beda6e6a166612666ae97064 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:11 -0800 Subject: [PATCH 34/82] x86, mm: Add check before clear pte above max_low_pfn on 32bit During test patch that adjust page_size_mask to map small range ram with big page size, found page table is setup wrongly for 32bit. And native_pagetable_init wrong clear pte for pmd with large page support. 1. add more comments about why we are expecting pte. 2. add BUG checking, so next time we could find problem earlier when we mess up page table setup again. 3. max_low_pfn is not included boundary for low memory mapping. We should check from max_low_pfn instead of +1. 4. add print out when some pte really get cleared, or we should use WARN() to find out why above max_low_pfn get mapped? so we could fix it. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-35-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 322ee56ea1fe..19ef9f018012 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -480,9 +480,14 @@ void __init native_pagetable_init(void) /* * Remove any mappings which extend past the end of physical - * memory from the boot time page table: + * memory from the boot time page table. + * In virtual address space, we should have at least two pages + * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END + * definition. And max_low_pfn is set to VMALLOC_END physical + * address. If initial memory mapping is doing right job, we + * should have pte used near max_low_pfn or one pmd is not present. */ - for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { va = PAGE_OFFSET + (pfn<> PAGE_SHIFT); From 5a0d3aeeeffbd1534a510fc10c4ab7c99c45afce Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:12 -0800 Subject: [PATCH 35/82] x86, mm: use round_up/down in split_mem_range() to replace own inline version for those roundup and rounddown. 
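A small standalone check of the identity this conversion relies on (the round_up()/round_down() macros below are simplified, power-of-two-only stand-ins for the kernel helpers, and the PMD/PAGE constants assume the usual x86 2M/4K sizes): rounding a byte address with round_up()/round_down() and then shifting by PAGE_SHIFT gives the same pfn as the old open-coded double shift.

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define PMD_SHIFT   21
    #define PAGE_SIZE   (1UL << PAGE_SHIFT)
    #define PMD_SIZE    (1UL << PMD_SHIFT)

    /* power-of-two-only stand-ins for the kernel's helpers */
    #define round_up(x, y)   ((((x) - 1) | ((y) - 1)) + 1)
    #define round_down(x, y) ((x) & ~((y) - 1))

    int main(void)
    {
        unsigned long tests[] = { 0, PAGE_SIZE, PMD_SIZE - PAGE_SIZE,
                                  PMD_SIZE, PMD_SIZE + PAGE_SIZE,
                                  5 * PMD_SIZE + 17 * PAGE_SIZE };
        unsigned int i;

        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
            unsigned long pos = tests[i];
            /* open-coded shifting used before this patch */
            unsigned long old_up = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
                                    << (PMD_SHIFT - PAGE_SHIFT);
            unsigned long old_dn = (pos >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
            /* form used after this patch */
            unsigned long new_up = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
            unsigned long new_dn = round_down(pos, PMD_SIZE) >> PAGE_SHIFT;

            printf("pos %#lx: up %lu/%lu down %lu/%lu %s\n", pos,
                   old_up, new_up, old_dn, new_dn,
                   (old_up == new_up && old_dn == new_dn) ?
                   "(agree)" : "(MISMATCH)");
        }
        return 0;
    }

The two forms print identical pfns for every test address, which is why the conversion is a pure readability change.
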
Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-36-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8168bf8fcda7..0e625e606e5d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -218,13 +218,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * slowdowns. */ if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + end_pfn = PMD_SIZE >> PAGE_SHIFT; else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; @@ -234,15 +232,13 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; #ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; #else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); + end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; + if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT)) + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; #endif if (start_pfn < end_pfn) { @@ -253,9 +249,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #ifdef CONFIG_X86_64 /* big page (1G) range */ - start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; + end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & @@ -264,9 +259,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } /* tail is not big page (1G) alignment */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:13 -0800 Subject: [PATCH 36/82] x86, mm: use PFN_DOWN in split_mem_range() to replace own inline version for shifting. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-37-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0e625e606e5d..1cca052b2cbd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -208,8 +208,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, int i; /* head if not big page alignment ? 
*/ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; + start_pfn = PFN_DOWN(start); + pos = PFN_PHYS(start_pfn); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -218,59 +218,59 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * slowdowns. */ if (pos == 0) - end_pfn = PMD_SIZE >> PAGE_SHIFT; + end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; + if (end_pfn > PFN_DOWN(end)) + end_pfn = PFN_DOWN(end); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; + pos = PFN_PHYS(end_pfn); } /* big page (2M) range */ - start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT; - if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT)) - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<> PAGE_SHIFT; - end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + end_pfn = PFN_DOWN(round_down(end, PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<> PAGE_SHIFT; - end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT; + start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; + start_pfn = PFN_DOWN(pos); + end_pfn = PFN_DOWN(end); nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ From 1829ae9ad7380bf17333ab9ad1610631d9cb8664 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:14 -0800 Subject: [PATCH 37/82] x86, mm: use pfn instead of pos in split_mem_range could save some bit shifting operations. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-38-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1cca052b2cbd..4bf1c5374928 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -204,12 +204,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long end) { unsigned long start_pfn, end_pfn; - unsigned long pos; + unsigned long pfn; int i; /* head if not big page alignment ? */ - start_pfn = PFN_DOWN(start); - pos = PFN_PHYS(start_pfn); + pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory @@ -217,26 +216,26 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, * and overlapping MTRRs into large pages can cause * slowdowns. 
*/ - if (pos == 0) + if (pfn == 0) end_pfn = PFN_DOWN(PMD_SIZE); else - end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif if (end_pfn > PFN_DOWN(end)) end_pfn = PFN_DOWN(end); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = PFN_PHYS(end_pfn); + pfn = end_pfn; } /* big page (2M) range */ - start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE)); + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #else /* CONFIG_X86_64 */ - end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE)); + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); #endif @@ -244,32 +243,32 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:15 -0800 Subject: [PATCH 38/82] x86, mm: use limit_pfn for end pfn instead of shifting end to get that. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-39-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4bf1c5374928..f410dc6f843e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -203,10 +203,12 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) { - unsigned long start_pfn, end_pfn; + unsigned long start_pfn, end_pfn, limit_pfn; unsigned long pfn; int i; + limit_pfn = PFN_DOWN(end); + /* head if not big page alignment ? 
*/ pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 @@ -223,8 +225,8 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif - if (end_pfn > PFN_DOWN(end)) - end_pfn = PFN_DOWN(end); + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pfn = end_pfn; @@ -233,11 +235,11 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, /* big page (2M) range */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); - if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE))) - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { @@ -249,7 +251,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, #ifdef CONFIG_X86_64 /* big page (1G) range */ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); - end_pfn = PFN_DOWN(round_down(end, PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & @@ -259,7 +261,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, /* tail is not big page (1G) alignment */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); - end_pfn = PFN_DOWN(round_down(end, PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1< Date: Fri, 16 Nov 2012 19:39:16 -0800 Subject: [PATCH 39/82] x86, mm: Unifying after_bootmem for 32bit and 64bit after_bootmem has different meaning in 32bit and 64bit. 32bit: after bootmem is ready 64bit: after bootmem is distroyed Let's merget them make 32bit the same as 64bit. for 32bit, it is mixing alloc_bootmem_pages, and alloc_low_page under after_bootmem is set or not set. alloc_bootmem is just wrapper for memblock for x86. Now we have alloc_low_page() with memblock too. We can drop bootmem path now, and only alloc_low_page only. At the same time, we make alloc_low_page could handle real after_bootmem for 32bit, because alloc_bootmem_pages could fallback to use slab too. At last move after_bootmem set position for 32bit the same as 64bit. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-40-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init.c | 2 -- arch/x86/mm/init_32.c | 21 ++++----------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f410dc6f843e..2a27e5ac1e3c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -39,7 +39,6 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long pfn; int i; -#ifdef CONFIG_X86_64 if (after_bootmem) { unsigned int order; @@ -47,7 +46,6 @@ __ref void *alloc_low_pages(unsigned int num) return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO, order); } -#endif if ((pgt_buf_end + num) >= pgt_buf_top) { unsigned long ret; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 19ef9f018012..f4fc4a28393a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -73,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); - else - pmd_table = (pmd_t *)alloc_low_page(); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); @@ -98,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = NULL; - - if (after_bootmem) { -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) - page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); -#endif - if (!page_table) - page_table = - (pte_t *)alloc_bootmem_pages(PAGE_SIZE); - } else - page_table = (pte_t *)alloc_low_page(); + pte_t *page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -708,8 +695,6 @@ void __init setup_bootmem_allocator(void) printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped< Date: Fri, 16 Nov 2012 19:39:17 -0800 Subject: [PATCH 40/82] x86, mm: Move after_bootmem to mm_internel.h it is only used in arch/x86/mm/init*.c Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-41-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/mm_internal.h | 2 ++ include/linux/mm.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index dc79ac121774..6b563a118891 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -14,4 +14,6 @@ unsigned long kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask); void zone_sizes_init(void); +extern int after_bootmem; + #endif /* __X86_MM_INTERNAL_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index bcaab4e6fe91..64d5271a3d36 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1355,7 +1355,6 @@ extern void __init mmap_init(void); extern void show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -extern int after_bootmem; extern __printf(3, 4) void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); From b8fd39c036ab982aa087b7ee671f86e2574d31f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:18 -0800 Subject: [PATCH 41/82] x86, mm: Use clamp_t() in init_range_memory_mapping save some lines, and make code more readable. 
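A standalone sketch of the clipping being replaced (clamp_val() below is a simplified stand-in for the kernel macro and the sample ranges are made up): clamping both ends of each memblock range to [r_start, r_end) and skipping empty results selects the same sub-ranges as the old explicit if-chain.

    #include <stdio.h>

    /* simplified stand-in for the kernel's clamp_val() */
    #define clamp_val(val, lo, hi) \
        ((val) < (lo) ? (lo) : ((val) > (hi) ? (hi) : (val)))

    /* the open-coded clipping removed by this patch; returns 0 to skip */
    static int old_clip(unsigned long s, unsigned long e,
                        unsigned long r_start, unsigned long r_end,
                        unsigned long *out_s, unsigned long *out_e)
    {
        if (e <= r_start)
            return 0;
        if (s < r_start)
            s = r_start;
        if (s >= r_end)
            return 0;
        if (e > r_end)
            e = r_end;
        *out_s = s;
        *out_e = e;
        return 1;
    }

    int main(void)
    {
        unsigned long r_start = 0x100000, r_end = 0x800000;
        unsigned long ranges[][2] = {
            { 0x000000, 0x080000 },  /* entirely below  -> skipped   */
            { 0x080000, 0x200000 },  /* straddles start -> clipped   */
            { 0x200000, 0x400000 },  /* fully inside    -> unchanged */
            { 0x700000, 0x900000 },  /* straddles end   -> clipped   */
            { 0x900000, 0xa00000 },  /* entirely above  -> skipped   */
        };
        unsigned int i;

        for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
            unsigned long os = 0, oe = 0;
            int old_mapped = old_clip(ranges[i][0], ranges[i][1],
                                      r_start, r_end, &os, &oe);

            /* new form: clamp both ends, skip empty results */
            unsigned long ns = clamp_val(ranges[i][0], r_start, r_end);
            unsigned long ne = clamp_val(ranges[i][1], r_start, r_end);
            int new_mapped = ns < ne;

            printf("[%#07lx-%#07lx) old %d [%#lx-%#lx)  new %d [%#lx-%#lx)\n",
                   ranges[i][0], ranges[i][1],
                   old_mapped, os, oe, new_mapped, ns, ne);
        }
        return 0;
    }
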
Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-42-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2a27e5ac1e3c..6f85de8a1f28 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -357,31 +357,20 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * would have hole in the middle or ends, and only ram parts will be mapped. */ static unsigned long __init init_range_memory_mapping( - unsigned long range_start, - unsigned long range_end) + unsigned long r_start, + unsigned long r_end) { unsigned long start_pfn, end_pfn; unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - u64 start = (u64)start_pfn << PAGE_SHIFT; - u64 end = (u64)end_pfn << PAGE_SHIFT; - - if (end <= range_start) + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) continue; - if (start < range_start) - start = range_start; - - if (start >= range_end) - continue; - - if (end > range_end) - end = range_end; - init_memory_mapping(start, end); - mapped_ram_size += end - start; } From 94b43c3d86dddf95064fc83e9087448b35f985ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:19 -0800 Subject: [PATCH 42/82] x86, mm: kill numa_free_all_bootmem() Now NO_BOOTMEM version free_all_bootmem_node() does not really do free_bootmem at all, and it only call register_page_bootmem_info_node instead. That is confusing, try to kill that free_all_bootmem_node(). Before that, this patch will remove numa_free_all_bootmem(). That function could be replaced with register_page_bootmem_info() and free_all_bootmem(); Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-43-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/numa_64.h | 2 -- arch/x86/mm/init_64.c | 15 +++++++++++---- arch/x86/mm/numa_64.c | 13 ------------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 0c05f7ae46e8..fe4d2d410ad2 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,6 +1,4 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -extern unsigned long numa_free_all_bootmem(void); - #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1d53defc99c9..41785305f645 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -629,6 +629,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static struct kcore_list kcore_vsyscall; +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; @@ -641,11 +651,8 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_NUMA - totalram_pages = numa_free_all_bootmem(); -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif absent_pages = absent_pages_in_range(0, max_pfn); reservedpages = max_pfn - totalram_pages - absent_pages; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 92e27119ee1a..9405ffc91502 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -10,16 +10,3 @@ void __init initmem_init(void) { x86_numa_init(); } - -unsigned long __init numa_free_all_bootmem(void) -{ - unsigned long pages = 0; - int i; - - for_each_online_node(i) - pages += free_all_bootmem_node(NODE_DATA(i)); - - pages += free_low_memory_core_early(MAX_NUMNODES); - - return pages; -} From c074eaac2ab264c94520efff7e896b771de885ae Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:20 -0800 Subject: [PATCH 43/82] x86, mm: kill numa_64.h Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-44-git-send-email-yinghai@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/numa.h | 2 -- arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/cpu/amd.c | 1 - arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/setup.c | 3 --- 6 files changed, 12 deletions(-) delete mode 100644 arch/x86/include/asm/numa_64.h diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 49119fcea2dc..52560a2038e1 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_X86_32 # include -#else -# include #endif #ifdef CONFIG_NUMA diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h deleted file mode 100644 index fe4d2d410ad2..000000000000 --- a/arch/x86/include/asm/numa_64.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _ASM_X86_NUMA_64_H -#define _ASM_X86_NUMA_64_H - -#endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589ac..4b23aa18518d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include -# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9619ba6528ca..913f94f9e8d9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -12,7 +12,6 @@ #include #ifdef CONFIG_X86_64 -# include # include # include #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531a..3b547cc4bd03 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -17,7 +17,6 @@ #ifdef CONFIG_X86_64 #include -#include #endif #include "cpu.h" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 85b62f1c8071..6d29d1fcf068 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,9 +108,6 @@ #include #include #include -#ifdef CONFIG_X86_64 -#include -#endif #include #include #include From 961f8fa04b1c3dfe35c2d3ee7ccba28f9b0e0e09 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:21 -0800 Subject: [PATCH 44/82] sparc, mm: Remove calling of free_all_bootmem_node() The NO_BOOTMEM version of free_all_bootmem_node() does not really free bootmem at all; it only calls register_page_bootmem_info_node() instead. That is confusing, so we want to kill free_all_bootmem_node(). Before that, this patch removes the call to free_all_bootmem_node(): we add register_page_bootmem_info() to call register_page_bootmem_info_node() directly. We can also use free_all_bootmem() for the NUMA case, since it is just the same as free_low_memory_core_early(). Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-45-git-send-email-yinghai@kernel.org Cc: "David S. Miller" Cc: Andrew Morton Cc: sparclinux@vger.kernel.org Acked-by: "David S. Miller" Signed-off-by: H.
Peter Anvin --- arch/sparc/mm/init_64.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 9e28a118e6a4..b24bac238e34 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void) flushi(&valid_addr_bitmap_insn[0]); } +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + int i; + + for_each_online_node(i) + if (NODE_DATA(i)->node_spanned_pages) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} void __init mem_init(void) { unsigned long codepages, datapages, initpages; @@ -2038,20 +2048,8 @@ void __init mem_init(void) high_memory = __va(last_valid_pfn << PAGE_SHIFT); -#ifdef CONFIG_NEED_MULTIPLE_NODES - { - int i; - for_each_online_node(i) { - if (NODE_DATA(i)->node_spanned_pages != 0) { - totalram_pages += - free_all_bootmem_node(NODE_DATA(i)); - } - } - totalram_pages += free_low_memory_core_early(MAX_NUMNODES); - } -#else + register_page_bootmem_info(); totalram_pages = free_all_bootmem(); -#endif /* We subtract one to account for the mem_map_zero page * allocated below. From 600cc5b7f6371706679490d7ee108015ae57ac2f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:22 -0800 Subject: [PATCH 45/82] mm: Kill NO_BOOTMEM version free_all_bootmem_node() The NO_BOOTMEM version of free_all_bootmem_node() does not really free bootmem at all; it only calls register_page_bootmem_info_node() for online nodes instead. That is confusing. We can kill free_all_bootmem_node() now that its two callers in x86 and sparc have been removed. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-46-git-send-email-yinghai@kernel.org Signed-off-by: H. Peter Anvin --- mm/nobootmem.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b31411..ecc2f13d557d 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,20 +137,6 @@ unsigned long __init free_low_memory_core_early(int nodeid) return count; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. - */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) -{ - register_page_bootmem_info_node(pgdat); - - /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -} - /** * free_all_bootmem - release free pages to the buddy allocator * From 9710f581bb4c35589ac046b0cfc0deb7f369fc85 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 16 Nov 2012 19:39:23 -0800 Subject: [PATCH 46/82] x86, mm: Let "memmap=" take more entries one time Currently "memmap=" can only take one entry at a time. When we have more entries, we have to pass a separate memmap= for each of them. For PXE booting we have a command line length limitation, and those extra "memmap=" strings waste too much space. This patch lets memmap= take several entries at one time, with the entries separated by ',' (for example: memmap=100M@2G,100M$3G). Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1353123563-3103-47-git-send-email-yinghai@kernel.org Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/e820.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df06ade26bef..d32abeabbda5 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -835,7 +835,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_opt(char *p) +static int __init parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p) return *p == '\0' ? 0 : -EINVAL; } +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) From 65315d4889d403ea025081d8ca85ddf7b9c10f39 Mon Sep 17 00:00:00 2001 From: Cong Ding Date: Mon, 14 Jan 2013 17:13:35 +0000 Subject: [PATCH 47/82] x86/boot: Fix minor fd leakage in tools/relocs.c The opened file should be closed. Signed-off-by: Cong Ding Cc: Kusanagi Kouichi Cc: Jarkko Sakkinen Cc: Jiri Kosina Cc: Matt Fleming Link: http://lkml.kernel.org/r/1358183628-27784-1-git-send-email-dinggnu@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/tools/relocs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 5a1847d61930..79d67bd507fa 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -814,12 +814,14 @@ int main(int argc, char **argv) read_relocs(fp); if (show_absolute_syms) { print_absolute_symbols(); - return 0; + goto out; } if (show_absolute_relocs) { print_absolute_relocs(); - return 0; + goto out; } emit_relocs(as_text, use_real_mode); +out: + fclose(fp); return 0; } From 09c205afde70c15f20ca76ba0a57409dad175fd0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 27 Jan 2013 10:43:28 -0800 Subject: [PATCH 48/82] x86, boot: Define the 2.12 bzImage boot protocol Define the 2.12 bzImage boot protocol: add xloadflags and additional fields to allow the command line, initramfs and struct boot_params to live above the 4 GiB mark. The xloadflags now communicates if this is a 64-bit kernel with the legacy 64-bit entry point and which of the EFI handover entry points are supported. Avoid adding new read flags to loadflags because of claimed bootloaders testing the whole byte for == 1 to determine bzImageness at least until the issue can be researched further. This is based on patches by Yinghai Lu and David Woodhouse. Originally-by: Yinghai Lu Originally-by: David Woodhouse Acked-by: Yinghai Lu Acked-by: David Woodhouse Acked-by: Matt Fleming Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1359058816-7615-26-git-send-email-yinghai@kernel.org Cc: Rob Landley Cc: Gokul Caushik Cc: Josh Triplett Cc: Joe Millenbach --- Documentation/x86/boot.txt | 27 +++++++++++- Documentation/x86/zero-page.txt | 4 ++ arch/x86/boot/header.S | 39 ++++++++++++----- arch/x86/boot/setup.ld | 2 +- arch/x86/include/uapi/asm/bootparam.h | 63 +++++++++++++++++++-------- 5 files changed, 106 insertions(+), 29 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 406d82d5d2bb..3edb4c2887a1 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -57,6 +57,10 @@ Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment Protocol 2.11: (Kernel 3.6) Added a field for offset of EFI handover protocol entry point. 
+Protocol 2.12: (Kernel 3.9) Added the xloadflags field and extension fields + to struct boot_params for for loading bzImage and ramdisk + above 4G in 64bit. + **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -182,7 +186,7 @@ Offset Proto Name Meaning 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 0235/1 2.10+ min_alignment Minimum alignment, as a power of two -0236/2 N/A pad3 Unused +0236/2 2.12+ xloadflags Boot protocol option flags 0238/4 2.06+ cmdline_size Maximum size of the kernel command line 023C/4 2.07+ hardware_subarch Hardware subarchitecture 0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data @@ -582,6 +586,27 @@ Protocol: 2.10+ misaligned kernel. Therefore, a loader should typically try each power-of-two alignment from kernel_alignment down to this alignment. +Field name: xloadflags +Type: read +Offset/size: 0x236/2 +Protocol: 2.12+ + + This field is a bitmask. + + Bit 0 (read): XLF_KERNEL_64 + - If 1, this kernel has the legacy 64-bit entry point at 0x200. + + Bit 1 (read): XLF_CAN_BE_LOADED_ABOVE_4G + - If 1, kernel/boot_params/cmdline/ramdisk can be above 4G. + + Bit 2 (read): XLF_EFI_HANDOVER_32 + - If 1, the kernel supports the 32-bit EFI handoff entry point + given at handover_offset. + + Bit 3 (read): XLF_EFI_HANDOVER_64 + - If 1, the kernel supports the 64-bit EFI handoff entry point + given at handover_offset + 0x200. + Field name: cmdline_size Type: read Offset/size: 0x238/4 diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index cf5437deda81..199f453cb4de 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -19,6 +19,9 @@ Offset Proto Name Meaning 090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!! 0A0/010 ALL sys_desc_table System description table (struct sys_desc_table) 0B0/010 ALL olpc_ofw_header OLPC's OpenFirmware CIF and friends +0C0/004 ALL ext_ramdisk_image ramdisk_image high 32bits +0C4/004 ALL ext_ramdisk_size ramdisk_size high 32bits +0C8/004 ALL ext_cmd_line_ptr cmd_line_ptr high 32bits 140/080 ALL edid_info Video mode setup (struct edid_info) 1C0/020 ALL efi_info EFI 32 information (struct efi_info) 1E0/004 ALL alk_mem_k Alternative mem check, in KB @@ -27,6 +30,7 @@ Offset Proto Name Meaning 1E9/001 ALL eddbuf_entries Number of entries in eddbuf (below) 1EA/001 ALL edd_mbr_sig_buf_entries Number of entries in edd_mbr_sig_buffer (below) +1EF/001 ALL sentinel Used to detect broken bootloaders 290/040 ALL edd_mbr_sig_buffer EDD MBR signatures 2D0/A00 ALL e820_map E820 memory map table (array of struct e820entry) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8c132a625b94..944ce595f767 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -21,6 +21,7 @@ #include #include #include +#include #include "boot.h" #include "voffset.h" #include "zoffset.h" @@ -255,6 +256,9 @@ section_table: # header, from the old boot sector. 
.section ".header", "a" + .globl sentinel +sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */ + .globl hdr hdr: setup_sects: .byte 0 /* Filled in by build.c */ @@ -279,7 +283,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020b # header version number (>= 0x0105) + .word 0x020c # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -297,13 +301,7 @@ type_of_loader: .byte 0 # 0 means ancient bootloader, newer # flags, unused bits must be zero (RFU) bit within loadflags loadflags: -LOADED_HIGH = 1 # If set, the kernel is loaded high -CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free - .byte LOADED_HIGH + .byte LOADED_HIGH # The kernel is to be loaded high setup_move_size: .word 0x8000 # size to move, when setup is not # loaded at 0x90000. We will move setup @@ -369,7 +367,23 @@ relocatable_kernel: .byte 1 relocatable_kernel: .byte 0 #endif min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment -pad3: .word 0 + +xloadflags: +#ifdef CONFIG_X86_64 +# define XLF0 XLF_KERNEL_64 /* 64-bit kernel */ +#else +# define XLF0 0 +#endif +#ifdef CONFIG_EFI_STUB +# ifdef CONFIG_X86_64 +# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */ +# else +# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */ +# endif +#else +# define XLF23 0 +#endif + .word XLF0 | XLF23 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol @@ -397,8 +411,13 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr #define INIT_SIZE VO_INIT_SIZE #endif init_size: .long INIT_SIZE # kernel initialization size -handover_offset: .long 0x30 # offset to the handover +handover_offset: +#ifdef CONFIG_EFI_STUB + .long 0x30 # offset to the handover # protocol entry point +#else + .long 0 +#endif # End of setup header ##################################################### diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 03c0683636b6..96a6c7563538 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -13,7 +13,7 @@ SECTIONS .bstext : { *(.bstext) } .bsdata : { *(.bsdata) } - . = 497; + . = 495; .header : { *(.header) } .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 92862cd90201..c15ddaf90710 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -1,6 +1,31 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H +/* setup_data types */ +#define SETUP_NONE 0 +#define SETUP_E820_EXT 1 +#define SETUP_DTB 2 +#define SETUP_PCI 3 + +/* ram_size flags */ +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +/* loadflags */ +#define LOADED_HIGH (1<<0) +#define QUIET_FLAG (1<<5) +#define KEEP_SEGMENTS (1<<6) +#define CAN_USE_HEAP (1<<7) + +/* xloadflags */ +#define XLF_KERNEL_64 (1<<0) +#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) +#define XLF_EFI_HANDOVER_32 (1<<2) +#define XLF_EFI_HANDOVER_64 (1<<3) + +#ifndef __ASSEMBLY__ + #include #include #include @@ -9,12 +34,6 @@ #include #include