forked from Minki/linux
192f0f8e9d
Notable changes: - Removal of the NPU DMA code, used by the out-of-tree Nvidia driver, as well as some other functions only used by drivers that haven't (yet?) made it upstream. - A fix for a bug in our handling of hardware watchpoints (eg. perf record -e mem: ...) which could lead to register corruption and kernel crashes. - Enable HAVE_ARCH_HUGE_VMAP, which allows us to use large pages for vmalloc when using the Radix MMU. - A large but incremental rewrite of our exception handling code to use gas macros rather than multiple levels of nested CPP macros. And the usual small fixes, cleanups and improvements. Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Andreas Schwab, Aneesh Kumar K.V, Anju T Sudhakar, Anton Blanchard, Arnd Bergmann, Athira Rajeev, Cédric Le Goater, Christian Lamparter, Christophe Leroy, Christophe Lombard, Christoph Hellwig, Daniel Axtens, Denis Efremov, Enrico Weigelt, Frederic Barrat, Gautham R. Shenoy, Geert Uytterhoeven, Geliang Tang, Gen Zhang, Greg Kroah-Hartman, Greg Kurz, Gustavo Romero, Krzysztof Kozlowski, Madhavan Srinivasan, Masahiro Yamada, Mathieu Malaterre, Michael Neuling, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Nishad Kamdar, Oliver O'Halloran, Qian Cai, Ravi Bangoria, Sachin Sant, Sam Bobroff, Satheesh Rajendran, Segher Boessenkool, Shaokun Zhang, Shawn Anastasio, Stewart Smith, Suraj Jitindar Singh, Thiago Jung Bauermann, YueHaibing. 
-----BEGIN PGP SIGNATURE----- iQIcBAABAgAGBQJdKVoLAAoJEFHr6jzI4aWA0kIP/A6shIbbE7H5W2hFrqt/PPPK 3+VrvPKbOFF+W6hcE/RgSZmEnUo0svdNjHUd/eMfFS1vb/uRt2QDdrsHUNNwURQL M2mcLXFwYpnjSjb/XMgDbHpAQxjeGfTdYLonUIejN7Rk8KQUeLyKQ3SBn6kfMc46 DnUUcPcjuRGaETUmVuZZ4e40ZWbJp8PKDrSJOuUrTPXMaK5ciNbZk5mCWXGbYl6G BMQAyv4ld/417rNTjBEP/T2foMJtioAt4W6mtlgdkOTdIEZnFU67nNxDBthNSu2c 95+I+/sML4KOp1R4yhqLSLIDDbc3bg3c99hLGij0d948z3bkSZ8bwnPaUuy70C4v U8rvl/+N6C6H3DgSsPE/Gnkd8DnudqWY8nULc+8p3fXljGwww6/Qgt+6yCUn8BdW WgixkSjKgjDmzTw8trIUNEqORrTVle7cM2hIyIK2Q5T4kWzNQxrLZ/x/3wgoYjUa 1KwIzaRo5JKZ9D3pJnJ5U+knE2/90rJIyfcp0W6ygyJsWKi2GNmq1eN3sKOw0IxH Tg86RENIA/rEMErNOfP45sLteMuTR7of7peCG3yumIOZqsDVYAzerpvtSgip2cvK aG+9HcYlBFOOOF9Dabi8GXsTBLXLfwiyjjLSpA9eXPwW8KObgiNfTZa7ujjTPvis 4mk9oukFTFUpfhsMmI3T =3dBZ -----END PGP SIGNATURE----- Merge tag 'powerpc-5.3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux Pull powerpc updates from Michael Ellerman: "Notable changes: - Removal of the NPU DMA code, used by the out-of-tree Nvidia driver, as well as some other functions only used by drivers that haven't (yet?) made it upstream. - A fix for a bug in our handling of hardware watchpoints (eg. perf record -e mem: ...) which could lead to register corruption and kernel crashes. - Enable HAVE_ARCH_HUGE_VMAP, which allows us to use large pages for vmalloc when using the Radix MMU. - A large but incremental rewrite of our exception handling code to use gas macros rather than multiple levels of nested CPP macros. And the usual small fixes, cleanups and improvements. Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Andreas Schwab, Aneesh Kumar K.V, Anju T Sudhakar, Anton Blanchard, Arnd Bergmann, Athira Rajeev, Cédric Le Goater, Christian Lamparter, Christophe Leroy, Christophe Lombard, Christoph Hellwig, Daniel Axtens, Denis Efremov, Enrico Weigelt, Frederic Barrat, Gautham R. 
Shenoy, Geert Uytterhoeven, Geliang Tang, Gen Zhang, Greg Kroah-Hartman, Greg Kurz, Gustavo Romero, Krzysztof Kozlowski, Madhavan Srinivasan, Masahiro Yamada, Mathieu Malaterre, Michael Neuling, Nathan Lynch, Naveen N. Rao, Nicholas Piggin, Nishad Kamdar, Oliver O'Halloran, Qian Cai, Ravi Bangoria, Sachin Sant, Sam Bobroff, Satheesh Rajendran, Segher Boessenkool, Shaokun Zhang, Shawn Anastasio, Stewart Smith, Suraj Jitindar Singh, Thiago Jung Bauermann, YueHaibing" * tag 'powerpc-5.3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (163 commits) powerpc/powernv/idle: Fix restore of SPRN_LDBAR for POWER9 stop state. powerpc/eeh: Handle hugepages in ioremap space ocxl: Update for AFU descriptor template version 1.1 powerpc/boot: pass CONFIG options in a simpler and more robust way powerpc/boot: add {get, put}_unaligned_be32 to xz_config.h powerpc/irq: Don't WARN continuously in arch_local_irq_restore() powerpc/module64: Use symbolic instructions names. powerpc/module32: Use symbolic instructions names. powerpc: Move PPC_HA() PPC_HI() and PPC_LO() to ppc-opcode.h powerpc/module64: Fix comment in R_PPC64_ENTRY handling powerpc/boot: Add lzo support for uImage powerpc/boot: Add lzma support for uImage powerpc/boot: don't force gzipped uImage powerpc/8xx: Add microcode patch to move SMC parameter RAM. powerpc/8xx: Use IO accessors in microcode programming. powerpc/8xx: replace #ifdefs by IS_ENABLED() in microcode.c powerpc/8xx: refactor programming of microcode CPM params. powerpc/8xx: refactor printing of microcode patch name. powerpc/8xx: Refactor microcode write powerpc/8xx: refactor writing of CPM microcode arrays ...
680 lines
16 KiB
C
680 lines
16 KiB
C
/*
|
|
* PPC Huge TLB Page Support for Kernel.
|
|
*
|
|
* Copyright (C) 2003 David Gibson, IBM Corporation.
|
|
* Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
|
|
*
|
|
* Based on the IA-32 version:
|
|
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/io.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/export.h>
|
|
#include <linux/of_fdt.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/moduleparam.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/swapops.h>
|
|
#include <linux/kmemleak.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/setup.h>
|
|
#include <asm/hugetlb.h>
|
|
#include <asm/pte-walk.h>
|
|
|
|
/* When set, hugetlbpage_init() logs a message and configures nothing. */
bool hugetlb_disabled;

/* A hugepd entry whose raw value is zero is empty. */
#define hugepd_none(hpd)	(0 == hpd_val(hpd))

/*
 * Difference between the ffs() positions of sizeof(pte_t) and
 * sizeof(void *); used in this file as a PGT_CACHE() index.
 */
#define PTE_T_ORDER	(__builtin_ffs(sizeof(pte_t)) - \
			 __builtin_ffs(sizeof(void *)))
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
|
|
{
|
|
/*
|
|
* Only called for hugetlbfs pages, hence can ignore THP and the
|
|
* irq disabled walk.
|
|
*/
|
|
return __find_linux_pte(mm->pgd, addr, NULL, NULL);
|
|
}
|
|
|
|
/*
 * Allocate a hugepte table and install it in the page directory.
 *
 * @mm:      address space the table belongs to (selects the GFP flags)
 * @hpdp:    first directory entry to populate
 * @address: not used by this function
 * @pdshift: shift of the directory level @hpdp lives in
 * @pshift:  shift of the huge page size
 * @ptl:     lock held while the directory entries are written
 *
 * Returns -ENOMEM when no matching page table cache exists or the
 * allocation fails.  Returns 0 otherwise, including the case where one of
 * the directory entries was found already populated; the new table is then
 * unhooked and freed again.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		/* One table shared by 2^(pshift - pdshift) directory entries */
		cachep = PGT_CACHE(PTE_T_ORDER);
		num_hugepd = 1 << (pshift - pdshift);
	} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
		cachep = PGT_CACHE(PTE_INDEX_SIZE);
		num_hugepd = 1;
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	if (!cachep) {
		WARN_ONCE(1, "No page table cache created for hugetlb tables");
		return -ENOMEM;
	}

	new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
	if (!new)
		return -ENOMEM;

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		hugepd_populate(hpdp, new, pshift);
	}

	if (i < num_hugepd) {
		/*
		 * The loop stopped with hpdp on the entry that was already in
		 * use.  Step back before clearing, so that exactly the i
		 * entries written above are undone and the foreign entry is
		 * left untouched.
		 */
		while (i-- > 0)
			*--hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	} else {
		kmemleak_ignore(new);
	}
	spin_unlock(ptl);
	return 0;
}
|
|
|
|
/*
|
|
* At this point we do the placement change only for BOOK3S 64. This would
|
|
* possibly work on other subarchs.
|
|
*/
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
|
|
{
|
|
pgd_t *pg;
|
|
pud_t *pu;
|
|
pmd_t *pm;
|
|
hugepd_t *hpdp = NULL;
|
|
unsigned pshift = __ffs(sz);
|
|
unsigned pdshift = PGDIR_SHIFT;
|
|
spinlock_t *ptl;
|
|
|
|
addr &= ~(sz-1);
|
|
pg = pgd_offset(mm, addr);
|
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
|
if (pshift == PGDIR_SHIFT)
|
|
/* 16GB huge page */
|
|
return (pte_t *) pg;
|
|
else if (pshift > PUD_SHIFT) {
|
|
/*
|
|
* We need to use hugepd table
|
|
*/
|
|
ptl = &mm->page_table_lock;
|
|
hpdp = (hugepd_t *)pg;
|
|
} else {
|
|
pdshift = PUD_SHIFT;
|
|
pu = pud_alloc(mm, pg, addr);
|
|
if (!pu)
|
|
return NULL;
|
|
if (pshift == PUD_SHIFT)
|
|
return (pte_t *)pu;
|
|
else if (pshift > PMD_SHIFT) {
|
|
ptl = pud_lockptr(mm, pu);
|
|
hpdp = (hugepd_t *)pu;
|
|
} else {
|
|
pdshift = PMD_SHIFT;
|
|
pm = pmd_alloc(mm, pu, addr);
|
|
if (!pm)
|
|
return NULL;
|
|
if (pshift == PMD_SHIFT)
|
|
/* 16MB hugepage */
|
|
return (pte_t *)pm;
|
|
else {
|
|
ptl = pmd_lockptr(mm, pm);
|
|
hpdp = (hugepd_t *)pm;
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
if (pshift >= PGDIR_SHIFT) {
|
|
ptl = &mm->page_table_lock;
|
|
hpdp = (hugepd_t *)pg;
|
|
} else {
|
|
pdshift = PUD_SHIFT;
|
|
pu = pud_alloc(mm, pg, addr);
|
|
if (!pu)
|
|
return NULL;
|
|
if (pshift >= PUD_SHIFT) {
|
|
ptl = pud_lockptr(mm, pu);
|
|
hpdp = (hugepd_t *)pu;
|
|
} else {
|
|
pdshift = PMD_SHIFT;
|
|
pm = pmd_alloc(mm, pu, addr);
|
|
if (!pm)
|
|
return NULL;
|
|
ptl = pmd_lockptr(mm, pm);
|
|
hpdp = (hugepd_t *)pm;
|
|
}
|
|
}
|
|
#endif
|
|
if (!hpdp)
|
|
return NULL;
|
|
|
|
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
|
|
|
|
if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
|
|
pdshift, pshift, ptl))
|
|
return NULL;
|
|
|
|
return hugepte_offset(*hpdp, addr, pdshift);
|
|
}
|
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
|
/*
|
|
* Tracks gpages after the device tree is scanned and before the
|
|
* huge_boot_pages list is ready on pseries.
|
|
*/
|
|
#define MAX_NUMBER_GPAGES 1024
|
|
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
|
|
__initdata static unsigned nr_gpages;
|
|
|
|
/*
|
|
* Build list of addresses of gigantic pages. This function is used in early
|
|
* boot before the buddy allocator is setup.
|
|
*/
|
|
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
|
|
{
|
|
if (!addr)
|
|
return;
|
|
while (number_of_pages > 0) {
|
|
gpage_freearray[nr_gpages] = addr;
|
|
nr_gpages++;
|
|
number_of_pages--;
|
|
addr += page_size;
|
|
}
|
|
}
|
|
|
|
/*
 * Pop the most recently recorded gpage and queue it on huge_boot_pages for
 * @hstate.  Returns 1 when a page was queued, 0 when none are left.
 */
int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *page;
	unsigned int slot;

	if (!nr_gpages)
		return 0;

	slot = nr_gpages - 1;
	nr_gpages = slot;

	page = phys_to_virt(gpage_freearray[slot]);
	gpage_freearray[slot] = 0;

	list_add(&page->list, &huge_boot_pages);
	page->hstate = hstate;

	return 1;
}
|
|
#endif
|
|
|
|
|
|
int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	/* An LPAR that does not run with radix uses the pseries gpage list. */
	bool use_pseries_list = firmware_has_feature(FW_FEATURE_LPAR) &&
				!radix_enabled();

	if (use_pseries_list)
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}
|
|
|
|
#ifndef CONFIG_PPC_BOOK3S_64
|
|
#define HUGEPD_FREELIST_SIZE \
|
|
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
|
|
|
|
struct hugepd_freelist {
|
|
struct rcu_head rcu;
|
|
unsigned int index;
|
|
void *ptes[0];
|
|
};
|
|
|
|
static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
|
|
|
|
static void hugepd_free_rcu_callback(struct rcu_head *head)
|
|
{
|
|
struct hugepd_freelist *batch =
|
|
container_of(head, struct hugepd_freelist, rcu);
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < batch->index; i++)
|
|
kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);
|
|
|
|
free_page((unsigned long)batch);
|
|
}
|
|
|
|
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
|
|
{
|
|
struct hugepd_freelist **batchp;
|
|
|
|
batchp = &get_cpu_var(hugepd_freelist_cur);
|
|
|
|
if (atomic_read(&tlb->mm->mm_users) < 2 ||
|
|
mm_is_thread_local(tlb->mm)) {
|
|
kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
|
|
put_cpu_var(hugepd_freelist_cur);
|
|
return;
|
|
}
|
|
|
|
if (*batchp == NULL) {
|
|
*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
|
|
(*batchp)->index = 0;
|
|
}
|
|
|
|
(*batchp)->ptes[(*batchp)->index++] = hugepte;
|
|
if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
|
|
call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
|
|
*batchp = NULL;
|
|
}
|
|
put_cpu_var(hugepd_freelist_cur);
|
|
}
|
|
#else
|
|
/* Empty stub used when CONFIG_PPC_BOOK3S_64 is set. */
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
}
|
|
#endif
|
|
|
|
/*
 * Clear the hugepd entry (or group of entries) starting at @hpdp and free
 * the hugepte table it points to, provided the range aligned to the
 * directory level lies within @floor and @ceiling.
 */
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *table = hugepd_page(*hpdp);
	unsigned int shift = hugepd_shift(*hpdp);
	unsigned long level_mask = ~((1UL << pdshift) - 1);
	unsigned int nr_entries = 1;

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		nr_entries = 1 << (shift - pdshift);

	start &= level_mask;
	if (start < floor)
		return;

	if (ceiling) {
		ceiling &= level_mask;
		if (!ceiling)
			return;
	}

	if (end - 1 > ceiling - 1)
		return;

	while (nr_entries--)
		*hpdp++ = __hugepd(0);

	if (shift >= pdshift) {
		hugepd_free(tlb, table);
		return;
	}

	if (IS_ENABLED(CONFIG_PPC_8xx)) {
		pgtable_free_tlb(tlb, table,
				 get_hugepd_cache_index(PTE_INDEX_SIZE));
		return;
	}

	pgtable_free_tlb(tlb, table, get_hugepd_cache_index(pdshift - shift));
}
|
|
|
|
/*
 * Walk the PMD entries covering [addr, end) below @pud and release every
 * hugepd found.  Afterwards the PMD table itself is freed and @pud cleared,
 * unless the PUD-aligned range falls outside @floor / @ceiling.
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * if it is not hugepd pointer, we should already find
			 * it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 *
		 * The shift is applied to an unsigned long so that large
		 * hugepd shifts do not overflow int arithmetic.
		 */
		more = addr + (1UL << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}
|
|
|
|
/*
 * Walk the PUD entries covering [addr, end) below @pgd.  Entries that are
 * hugepd pointers are released directly; other populated entries are
 * handed to hugetlb_free_pmd_range().  Afterwards the PUD table itself is
 * freed and @pgd cleared, unless the PGDIR-aligned range falls outside
 * @floor / @ceiling.
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 *
			 * The shift is applied to an unsigned long so that
			 * large hugepd shifts do not overflow int arithmetic.
			 */
			more = addr + (1UL << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}
|
|
|
|
/*
|
|
* This function frees user-level page tables of a process.
|
|
*/
|
|
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
|
|
unsigned long addr, unsigned long end,
|
|
unsigned long floor, unsigned long ceiling)
|
|
{
|
|
pgd_t *pgd;
|
|
unsigned long next;
|
|
|
|
/*
|
|
* Because there are a number of different possible pagetable
|
|
* layouts for hugepage ranges, we limit knowledge of how
|
|
* things should be laid out to the allocation path
|
|
* (huge_pte_alloc(), above). Everything else works out the
|
|
* structure as it goes from information in the hugepd
|
|
* pointers. That means that we can't here use the
|
|
* optimization used in the normal page free_pgd_range(), of
|
|
* checking whether we're actually covering a large enough
|
|
* range to have to do anything at the top level of the walk
|
|
* instead of at the bottom.
|
|
*
|
|
* To make sense of this, you should probably go read the big
|
|
* block comment at the top of the normal free_pgd_range(),
|
|
* too.
|
|
*/
|
|
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
pgd = pgd_offset(tlb->mm, addr);
|
|
if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
continue;
|
|
hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
|
|
} else {
|
|
unsigned long more;
|
|
/*
|
|
* Increment next by the size of the huge mapping since
|
|
* there may be more than one entry at the pgd level
|
|
* for a single hugepage, but all of them point to the
|
|
* same kmem cache that holds the hugepte.
|
|
*/
|
|
more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
|
|
if (more > next)
|
|
next = more;
|
|
|
|
free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
|
|
addr, next, floor, ceiling);
|
|
}
|
|
} while (addr = next, addr != end);
|
|
}
|
|
|
|
/*
 * Look up the page backing @address through the hugepd @hpd.
 *
 * Returns the sub-page for @address when the pte is present, taking a
 * reference if FOLL_GET is set in @flags, and NULL otherwise.  If the pte
 * is a hugetlb migration entry, waits for the migration and looks again.
 */
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	struct mm_struct *mm = vma->vm_mm;
	int shift = hugepd_shift(hpd);
	struct page *page = NULL;
	spinlock_t *ptl;
	pte_t *ptep;

	for (;;) {
		/*
		 * hugepage directory entries are protected by
		 * mm->page_table_lock.  Use this instead of huge_pte_lockptr.
		 */
		ptl = &mm->page_table_lock;
		spin_lock(ptl);

		ptep = hugepte_offset(hpd, address, pdshift);
		if (pte_present(*ptep)) {
			unsigned long mask = (1UL << shift) - 1;

			page = pte_page(*ptep);
			page += ((address & mask) >> PAGE_SHIFT);
			if (flags & FOLL_GET)
				get_page(page);
			break;
		}

		if (!is_hugetlb_entry_migration(*ptep))
			break;

		/* Drop the lock, wait for the migration, then start over. */
		spin_unlock(ptl);
		__migration_entry_wait(mm, ptep, ptl);
	}

	spin_unlock(ptl);
	return page;
}
|
|
|
|
#ifdef CONFIG_PPC_MM_SLICES
|
|
/*
 * Pick an unmapped area for a hugetlbfs mapping of @file: delegated to the
 * radix helper when radix is enabled, otherwise to the slice allocator
 * using the MMU page size that matches the file's hstate.
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *h;
	int psize;

	h = hstate_file(file);
	psize = shift_to_mmu_psize(huge_page_shift(h));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, psize, 1);
}
|
|
#endif
|
|
|
|
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
|
|
{
|
|
/* With radix we don't use slice, so derive it from vma*/
|
|
if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) {
|
|
unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
|
|
|
|
return 1UL << mmu_psize_to_shift(psize);
|
|
}
|
|
return vma_kernel_pagesize(vma);
|
|
}
|
|
|
|
/*
 * Register a huge page size with hugetlb.
 *
 * @size: huge page size in bytes
 *
 * Returns -EINVAL if @size is not a power of two larger than PAGE_SIZE or
 * if check_and_get_huge_psize() rejects it.  Returns 0 when the size was
 * added or had already been set up.
 */
static int __init add_huge_page_size(unsigned long long size)
{
	int shift;
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE || !is_power_of_2(size))
		return -EINVAL;

	/*
	 * Only look for the set bit once size is known to be a non-zero
	 * power of two: __ffs() is undefined when no bit is set.
	 */
	shift = __ffs(size);

	mmu_psize = check_and_get_huge_psize(shift);
	if (mmu_psize < 0)
		return -EINVAL;

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}
|
|
|
|
static int __init hugepage_setup_sz(char *str)
|
|
{
|
|
unsigned long long size;
|
|
|
|
size = memparse(str, &str);
|
|
|
|
if (add_huge_page_size(size) != 0) {
|
|
hugetlb_bad_size();
|
|
pr_err("Invalid huge page size specified(%llu)\n", size);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
__setup("hugepagesz=", hugepage_setup_sz);
|
|
|
|
/*
 * Register every MMU page size that can be used as a huge page and create
 * the page table caches the hugepd tables need.
 *
 * Returns 0 when hugetlb is disabled or after all page sizes have been
 * tried, and -ENODEV on Book3S 64 when radix is not enabled and the MMU
 * lacks MMU_FTR_16M_PAGE.
 */
static int __init hugetlbpage_init(void)
{
	bool configured = false;
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
	    !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		/* Skip page sizes that have no shift defined. */
		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		/* Pick the directory level a page of this size hangs off. */
#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PGDIR_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * if we have pdshift and shift value same, we don't
		 * use pgt cache for hugepd.
		 */
		if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx))
			pgtable_cache_add(PTE_INDEX_SIZE);
		else if (pdshift > shift)
			pgtable_cache_add(pdshift - shift);
		else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
			pgtable_cache_add(PTE_T_ORDER);

		configured = true;
	}

	if (configured) {
		if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
			hugetlbpage_init_default();
	} else {
		/* Terminate the message with a newline like the other logs. */
		pr_info("Failed to initialize. Disabling HugeTLB\n");
	}

	return 0;
}

arch_initcall(hugetlbpage_init);
|
|
|
|
void flush_dcache_icache_hugepage(struct page *page)
|
|
{
|
|
int i;
|
|
void *start;
|
|
|
|
BUG_ON(!PageCompound(page));
|
|
|
|
for (i = 0; i < (1UL << compound_order(page)); i++) {
|
|
if (!PageHighMem(page)) {
|
|
__flush_dcache_icache(page_address(page+i));
|
|
} else {
|
|
start = kmap_atomic(page+i);
|
|
__flush_dcache_icache(start);
|
|
kunmap_atomic(start);
|
|
}
|
|
}
|
|
}
|