feee6b2989
We currently try to shrink a single zone when removing memory. We use the zone of the first page of the memory we are removing. If that memmap was never initialized (e.g., memory was never onlined), we will read garbage and can trigger kernel BUGs (due to a stale pointer): BUG: unable to handle page fault for address: 000000000000353d #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 7 Comm: kworker/u8:0 Not tainted 5.3.0-rc5-next-20190820+ #317 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.4 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:clear_zone_contiguous+0x5/0x10 Code: 48 89 c6 48 89 c3 e8 2a fe ff ff 48 85 c0 75 cf 5b 5d c3 c6 85 fd 05 00 00 01 5b 5d c3 0f 1f 840 RSP: 0018:ffffad2400043c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000200000000 RCX: 0000000000000000 RDX: 0000000000200000 RSI: 0000000000140000 RDI: 0000000000002f40 RBP: 0000000140000000 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000140000 R13: 0000000000140000 R14: 0000000000002f40 R15: ffff9e3e7aff3680 FS: 0000000000000000(0000) GS:ffff9e3e7bb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000353d CR3: 0000000058610000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __remove_pages+0x4b/0x640 arch_remove_memory+0x63/0x8d try_remove_memory+0xdb/0x130 __remove_memory+0xa/0x11 acpi_memory_device_remove+0x70/0x100 acpi_bus_trim+0x55/0x90 acpi_device_hotplug+0x227/0x3a0 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x221/0x550 worker_thread+0x50/0x3b0 kthread+0x105/0x140 ret_from_fork+0x3a/0x50 Modules linked in: CR2: 000000000000353d Instead, shrink the zones when offlining memory or when onlining failed. 
Introduce and use remove_pfn_range_from_zone() for that. We now properly shrink the zones, even if we have DIMMs whereby - Some memory blocks fall into no zone (never onlined) - Some memory blocks fall into multiple zones (offlined+re-onlined) - Multiple memory blocks that fall into different zones Drop the zone parameter (with a potential dubious value) from __remove_pages() and __remove_section(). Link: http://lkml.kernel.org/r/20191006085646.5768-6-david@redhat.com Fixes: f1dd2cd13c
("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b
] Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Cc: Michal Hocko <mhocko@suse.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Logan Gunthorpe <logang@deltatee.com> Cc: <stable@vger.kernel.org> [5.0+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
441 lines
10 KiB
C
441 lines
10 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* linux/arch/sh/mm/init.c
|
|
*
|
|
* Copyright (C) 1999 Niibe Yutaka
|
|
* Copyright (C) 2002 - 2011 Paul Mundt
|
|
*
|
|
* Based on linux/arch/i386/mm/init.c:
|
|
* Copyright (C) 1995 Linus Torvalds
|
|
*/
|
|
#include <linux/mm.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/init.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/io.h>
|
|
#include <linux/dma-mapping.h>
|
|
#include <linux/export.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/mmzone.h>
|
|
#include <asm/kexec.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/setup.h>
|
|
#include <asm/cache.h>
|
|
#include <linux/sizes.h>
|
|
|
|
/*
 * Top-level kernel page directory.  paging_init() zeroes it, installs it
 * via set_TTB() and then populates the fixmap range in it.
 */
pgd_t swapper_pg_dir[PTRS_PER_PGD];
|
|
|
|
/*
 * Generic memory setup: hand the single range described by
 * __MEMORY_START / __MEMORY_SIZE to memblock.
 */
void __init generic_mem_init(void)
{
	memblock_add(__MEMORY_START, __MEMORY_SIZE);
}
|
|
|
|
/*
 * Weak default for the platform memory setup hook; called from
 * do_init_bootmem().  A non-weak definition elsewhere replaces it.
 */
void __init __weak plat_mem_setup(void)
{
	/* Intentionally empty. */
}
|
|
|
|
#ifdef CONFIG_MMU
|
|
static pte_t *__get_pte_phys(unsigned long addr)
|
|
{
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
|
|
pgd = pgd_offset_k(addr);
|
|
if (pgd_none(*pgd)) {
|
|
pgd_ERROR(*pgd);
|
|
return NULL;
|
|
}
|
|
|
|
pud = pud_alloc(NULL, pgd, addr);
|
|
if (unlikely(!pud)) {
|
|
pud_ERROR(*pud);
|
|
return NULL;
|
|
}
|
|
|
|
pmd = pmd_alloc(NULL, pud, addr);
|
|
if (unlikely(!pmd)) {
|
|
pmd_ERROR(*pmd);
|
|
return NULL;
|
|
}
|
|
|
|
return pte_offset_kernel(pmd, addr);
|
|
}
|
|
|
|
/*
 * Map physical address @phys at kernel virtual address @addr with
 * protection @prot.
 *
 * Nothing is changed when the page table walk fails, or when a PTE is
 * already present at @addr (that case is reported via pte_ERROR()).
 * After the PTE is written the local TLB entry for @addr is flushed and,
 * if @prot has _PAGE_WIRED set, the entry is wired with tlb_wire_entry().
 */
static void set_pte_phys(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	pte_t *pte;

	pte = __get_pte_phys(addr);
	/* __get_pte_phys() returns NULL when the walk fails. */
	if (!pte)
		return;

	if (!pte_none(*pte)) {
		pte_ERROR(*pte);
		return;
	}

	set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, prot));
	local_flush_tlb_one(get_asid(), addr);

	if (pgprot_val(prot) & _PAGE_WIRED)
		tlb_wire_entry(NULL, addr, *pte);
}
|
|
|
|
/*
 * Remove the kernel mapping at virtual address @addr.
 *
 * If @prot has _PAGE_WIRED set, tlb_unwire_entry() is called before the PTE
 * is overwritten with an empty entry; the local TLB entry for @addr is
 * flushed afterwards.  Nothing is done when the page table walk fails.
 */
static void clear_pte_phys(unsigned long addr, pgprot_t prot)
{
	pte_t *pte;

	pte = __get_pte_phys(addr);
	/* __get_pte_phys() returns NULL when the walk fails. */
	if (!pte)
		return;

	if (pgprot_val(prot) & _PAGE_WIRED)
		tlb_unwire_entry();

	set_pte(pte, pfn_pte(0, __pgprot(0)));
	local_flush_tlb_one(get_asid(), addr);
}
|
|
|
|
/*
 * Map @phys with protection @prot into fixmap slot @idx.
 * An @idx at or beyond __end_of_fixed_addresses triggers BUG().
 */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long vaddr;

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}

	vaddr = __fix_to_virt(idx);
	set_pte_phys(vaddr, phys, prot);
}
|
|
|
|
/*
 * Tear down the mapping in fixmap slot @idx; @prot is forwarded to
 * clear_pte_phys().  An @idx at or beyond __end_of_fixed_addresses
 * triggers BUG().
 */
void __clear_fixmap(enum fixed_addresses idx, pgprot_t prot)
{
	unsigned long vaddr;

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}

	vaddr = __fix_to_virt(idx);
	clear_pte_phys(vaddr, prot);
}
|
|
|
|
/*
 * Return the first pmd entry below @pud, allocating and installing a
 * page-sized pmd table from memblock first if @pud is still empty.
 * Panics if the allocation fails.
 */
static pmd_t * __init one_md_table_init(pud_t *pud)
{
	pmd_t *table;

	/* Already populated: nothing to allocate. */
	if (!pud_none(*pud))
		return pmd_offset(pud, 0);

	table = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!table)
		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
		      __func__, PAGE_SIZE, PAGE_SIZE);

	pud_populate(&init_mm, pud, table);
	/* The freshly installed table must be what the walk now finds. */
	BUG_ON(table != pmd_offset(pud, 0));

	return pmd_offset(pud, 0);
}
|
|
|
|
/*
 * Return the first pte entry below @pmd, allocating and installing a
 * page-sized pte table from memblock first if @pmd is still empty.
 * Panics if the allocation fails.
 */
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
	pte_t *table;

	/* Already populated: nothing to allocate. */
	if (!pmd_none(*pmd))
		return pte_offset_kernel(pmd, 0);

	table = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!table)
		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
		      __func__, PAGE_SIZE, PAGE_SIZE);

	pmd_populate_kernel(&init_mm, pmd, table);
	/* The freshly installed table must be what the walk now finds. */
	BUG_ON(table != pte_offset_kernel(pmd, 0));

	return pte_offset_kernel(pmd, 0);
}
|
|
|
|
/*
 * Pass-through hook used by page_table_range_init(): returns @pte as is.
 * @pmd, @vaddr and @lastpte are accepted but unused here.
 */
static pte_t * __init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
					    unsigned long vaddr, pte_t *lastpte)
{
	return pte;
}
|
|
|
|
/*
 * Make sure pmd and pte tables exist for the virtual range [@start, @end)
 * below @pgd_base, stepping one PMD_SIZE at a time.
 *
 * The loops stop only when vaddr compares equal to @end (or the tables run
 * out), so both bounds are presumably expected to be PMD_SIZE aligned --
 * the caller in this file masks them with PMD_MASK.
 */
void __init page_table_range_init(unsigned long start, unsigned long end,
					 pgd_t *pgd_base)
{
	unsigned long vaddr = start;
	int pgd_idx = __pgd_offset(vaddr);
	int pud_idx = __pud_offset(vaddr);
	int pmd_idx = __pmd_offset(vaddr);
	pgd_t *pgd = pgd_base + pgd_idx;
	pte_t *pte = NULL;

	while (pgd_idx < PTRS_PER_PGD && vaddr != end) {
		pud_t *pud = (pud_t *)pgd;

		while (pud_idx < PTRS_PER_PUD && vaddr != end) {
			pmd_t *pmd = one_md_table_init(pud);

#ifndef __PAGETABLE_PMD_FOLDED
			/* Skip ahead to the entry covering vaddr. */
			pmd += pmd_idx;
#endif
			while (pmd_idx < PTRS_PER_PMD && vaddr != end) {
				pte = page_table_kmap_check(one_page_table_init(pmd),
							    pmd, vaddr, pte);
				vaddr += PMD_SIZE;
				pmd++;
				pmd_idx++;
			}

			/* Later pud entries start at their first pmd. */
			pmd_idx = 0;
			pud++;
			pud_idx++;
		}

		/* Later pgd entries start at their first pud. */
		pud_idx = 0;
		pgd++;
		pgd_idx++;
	}
}
|
|
#endif /* CONFIG_MMU */
|
|
|
|
/*
 * Set up the pglist_data of node @nid: with CONFIG_NEED_MULTIPLE_NODES the
 * structure is first allocated from memblock (panicking on failure), then
 * its start PFN and spanned page count are filled in from the node's PFN
 * range.
 */
void __init allocate_pgdat(unsigned int nid)
{
	unsigned long first_pfn, limit_pfn;
	struct pglist_data *pgdat;

	get_pfn_range_for_nid(nid, &first_pfn, &limit_pfn);

#ifdef CONFIG_NEED_MULTIPLE_NODES
	NODE_DATA(nid) = memblock_alloc_try_nid(
				sizeof(struct pglist_data),
				SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!NODE_DATA(nid))
		panic("Can't allocate pgdat for node %d\n", nid);
#endif

	pgdat = NODE_DATA(nid);
	pgdat->node_start_pfn = first_pfn;
	pgdat->node_spanned_pages = limit_pfn - first_pfn;
}
|
|
|
|
/*
 * Early node/sparsemem bring-up: register all memblock memory as active
 * ranges of node 0, set up and online node 0, give the platform its
 * plat_mem_setup() hook, then mark every region present and initialise
 * sparsemem.
 */
static void __init do_init_bootmem(void)
{
	struct memblock_region *region;

	/* Every memblock memory region becomes an active range of node 0. */
	for_each_memblock(memory, region) {
		unsigned long first_pfn = memblock_region_memory_base_pfn(region);
		unsigned long limit_pfn = memblock_region_memory_end_pfn(region);

		__add_active_range(0, first_pfn, limit_pfn);
	}

	/* In the non-NUMA case all of system RAM lives in node 0. */
	allocate_pgdat(0);
	node_set_online(0);

	plat_mem_setup();

	/* Second pass: mark each region present under its memblock node id. */
	for_each_memblock(memory, region) {
		int nid = memblock_get_region_node(region);

		memory_present(nid, memblock_region_memory_base_pfn(region),
			       memblock_region_memory_end_pfn(region));
	}

	sparse_init();
}
|
|
|
|
/*
 * Early memblock reservations: the area from the zero page offset up to the
 * end of the kernel image, the pages below CONFIG_ZERO_PAGE_OFFSET (if any),
 * and whatever check_for_initrd() / reserve_crashkernel() add.
 */
static void __init early_reserve_mem(void)
{
	u32 ram_base = (u32)__MEMORY_START + (u32)PHYSICAL_OFFSET;
	u32 reserve_start = ram_base + (u32)CONFIG_ZERO_PAGE_OFFSET;
	unsigned long kernel_end_pfn;

	/* A partially used page is unusable, so round the kernel end upwards. */
	kernel_end_pfn = PFN_UP(__pa(_end));

	/*
	 * Reserve everything from reserve_start up to the rounded-up end of
	 * the kernel image.  The size carries an extra PAGE_SIZE - 1 bytes
	 * beyond that page boundary.
	 */
	memblock_reserve(reserve_start,
			 (PFN_PHYS(kernel_end_pfn) + PAGE_SIZE - 1) - reserve_start);

	/* Physical pages below CONFIG_ZERO_PAGE_OFFSET are reserved too. */
	if (CONFIG_ZERO_PAGE_OFFSET != 0)
		memblock_reserve(ram_base, CONFIG_ZERO_PAGE_OFFSET);

	/* Remaining early reservations. */
	check_for_initrd();
	reserve_crashkernel();
}
|
|
|
|
/*
 * Boot-time memory and paging setup: run the machine vector's memory init,
 * make the early reservations, establish the PFN limits, bring up node 0
 * and sparsemem, install an empty swapper_pg_dir, pre-populate the fixmap
 * page tables and finally initialise the zones.
 */
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	unsigned long fix_start, fix_end;

	sh_mv.mv_mem_init();

	early_reserve_mem();

	/*
	 * With the early reservations done, let the platform carve out
	 * memory of its own if it provides a hook for that.
	 */
	if (sh_mv.mv_mem_reserve)
		sh_mv.mv_mem_reserve();

	memblock_enforce_memory_limit(memory_limit);
	memblock_allow_resize();

	memblock_dump_all();

	/* Low and high memory limits; max_low_pfn is set equal to max_pfn. */
	max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_low_pfn = max_pfn;
	min_low_pfn = __MEMORY_START >> PAGE_SHIFT;

	nodes_clear(node_online_map);

	memory_start = (unsigned long)__va(__MEMORY_START);
	memory_end = memory_start +
		     (memory_limit ? memory_limit : memblock_phys_mem_size());

	uncached_init();
	pmb_init();
	do_init_bootmem();
	ioremap_fixed_init();

	/*
	 * We don't need to map the kernel through the TLB, as it is
	 * permanently mapped using P1.  So clear the entire pgd.
	 */
	memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));

	/*
	 * Give MMU.TTB an initial value so that we never have to check
	 * it for NULL.
	 */
	set_TTB(swapper_pg_dir);

	/*
	 * Populate the relevant portions of swapper_pg_dir so that the
	 * fixmap entries can be used without calling kmalloc.  The ptes
	 * themselves are filled in later by __set_fixmap().
	 */
	fix_start = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
	fix_end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
	page_table_range_init(fix_start, fix_end, swapper_pg_dir);

	kmap_coherent_init();

	/* Only ZONE_NORMAL gets a limit; all other zones stay at zero. */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
	free_area_init_nodes(max_zone_pfns);
}
|
|
|
|
/* Zero until mem_init() has finished; mem_init() sets it to 1 as its last step. */
unsigned int mem_init_done = 0;
|
|
|
|
/*
 * Late memory init: compute high_memory, release memblock memory through
 * memblock_free_all(), initialise the CPU caches, clear the zero page, set
 * up the vsyscall page and print the kernel's virtual memory layout.
 * Sets mem_init_done once everything is finished.
 */
void __init mem_init(void)
{
	pg_data_t *node;

	/* high_memory becomes the highest end address of any online node. */
	high_memory = NULL;
	for_each_online_pgdat(node) {
		void *node_end = __va(pgdat_end_pfn(node) << PAGE_SHIFT);

		high_memory = max_t(void *, high_memory, node_end);
	}

	memblock_free_all();

	/* Caches have to be up before the zero page is written back below. */
	cpu_cache_init();

	/* Zero the zero page and write it back. */
	memset(empty_zero_page, 0, PAGE_SIZE);
	__flush_wback_region(empty_zero_page, PAGE_SIZE);

	vsyscall_init();

	mem_init_print_info(NULL);
	pr_info("virtual kernel memory layout:\n"
		" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
		" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
		" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" lowmem : 0x%08lx - 0x%08lx (%4ld MB) (cached)\n"
#ifdef CONFIG_UNCACHED_MAPPING
		" : 0x%08lx - 0x%08lx (%4ld MB) (uncached)\n"
#endif
		" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
		/* fixmap */
		FIXADDR_START, FIXADDR_TOP,
		(FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
		/* pkmap */
		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

		/* vmalloc */
		(unsigned long)VMALLOC_START, VMALLOC_END,
		(VMALLOC_END - VMALLOC_START) >> 20,

		/* lowmem (cached) */
		(unsigned long)memory_start, (unsigned long)high_memory,
		((unsigned long)high_memory - (unsigned long)memory_start) >> 20,

#ifdef CONFIG_UNCACHED_MAPPING
		/* lowmem (uncached) */
		uncached_start, uncached_end, uncached_size >> 20,
#endif

		/* .init */
		(unsigned long)&__init_begin, (unsigned long)&__init_end,
		((unsigned long)&__init_end -
		 (unsigned long)&__init_begin) >> 10,

		/* .data */
		(unsigned long)&_etext, (unsigned long)&_edata,
		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

		/* .text */
		(unsigned long)&_text, (unsigned long)&_etext,
		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

	mem_init_done = 1;
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
/*
 * Hot-add the physical range [@start, @start + @size) to node @nid by
 * handing the corresponding PFN range to __add_pages().
 *
 * Returns the result of __add_pages(); a non-zero result is also logged.
 */
int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_restrictions *restrictions)
{
	unsigned long first_pfn = PFN_DOWN(start);
	unsigned long page_count = size >> PAGE_SHIFT;
	int err;

	/* There is only ZONE_NORMAL, so no zone selection is needed. */
	err = __add_pages(nid, first_pfn, page_count, restrictions);
	if (unlikely(err))
		printk("%s: Failed, __add_pages() == %d\n", __func__, err);

	return err;
}
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/*
 * Map a physical address to the node that hot-added memory should join.
 * Every address currently resolves to node 0; @addr is not examined.
 */
int memory_add_physaddr_to_nid(u64 addr)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
|
#endif
|
|
|
|
/*
 * Hot-remove the physical range [@start, @start + @size): convert it to a
 * PFN range and pass it, together with @altmap, to __remove_pages().
 * @nid is not used here.
 */
void arch_remove_memory(int nid, u64 start, u64 size,
			struct vmem_altmap *altmap)
{
	unsigned long first_pfn = PFN_DOWN(start);
	unsigned long page_count = size >> PAGE_SHIFT;

	__remove_pages(first_pfn, page_count, altmap);
}
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|