mirror of
https://github.com/torvalds/linux.git
synced 2024-12-27 05:11:48 +00:00
588ab3f9af
- Initial page table creation reworked to avoid breaking large block mappings (huge pages) into smaller ones. The ARM architecture requires break-before-make in such cases to avoid TLB conflicts but that's not always possible on live page tables - Kernel virtual memory layout: the kernel image is no longer linked to the bottom of the linear mapping (PAGE_OFFSET) but at the bottom of the vmalloc space, allowing the kernel to be loaded (nearly) anywhere in physical RAM - Kernel ASLR: position independent kernel Image and modules being randomly mapped in the vmalloc space with the randomness is provided by UEFI (efi_get_random_bytes() patches merged via the arm64 tree, acked by Matt Fleming) - Implement relative exception tables for arm64, required by KASLR (initial code for ARCH_HAS_RELATIVE_EXTABLE added to lib/extable.c but actual x86 conversion to deferred to 4.7 because of the merge dependencies) - Support for the User Access Override feature of ARMv8.2: this allows uaccess functions (get_user etc.) to be implemented using LDTR/STTR instructions. Such instructions, when run by the kernel, perform unprivileged accesses adding an extra level of protection. The set_fs() macro is used to "upgrade" such instruction to privileged accesses via the UAO bit - Half-precision floating point support (part of ARMv8.2) - Optimisations for CPUs with or without a hardware prefetcher (using run-time code patching) - copy_page performance improvement to deal with 128 bytes at a time - Sanity checks on the CPU capabilities (via CPUID) to prevent incompatible secondary CPUs from being brought up (e.g. 
weird big.LITTLE configurations) - valid_user_regs() reworked for better sanity check of the sigcontext information (restored pstate information) - ACPI parking protocol implementation - CONFIG_DEBUG_RODATA enabled by default - VDSO code marked as read-only - DEBUG_PAGEALLOC support - ARCH_HAS_UBSAN_SANITIZE_ALL enabled - Erratum workaround Cavium ThunderX SoC - set_pte_at() fix for PROT_NONE mappings - Code clean-ups -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABAgAGBQJW6u95AAoJEGvWsS0AyF7xMyoP/3x2O6bgreSQ84BdO4JChN4+ RQ9OVdX8u2ItO9sgaCY2AA6KoiBuEjGmPl/XRuK0I7DpODTtRjEXQHuNNhz8AelC hn4AEVqamY6Z5BzHFIjs8G9ydEbq+OXcKWEdwSsBhP/cMvI7ss3dps1f5iNPT5Vv 50E/kUz+aWYy7pKlB18VDV7TUOA3SuYuGknWV8+bOY5uPb8hNT3Y3fHOg/EuNNN3 DIuYH1V7XQkXtF+oNVIGxzzJCXULBE7egMcWAm1ydSOHK0JwkZAiL7OhI7ceVD0x YlDxBnqmi4cgzfBzTxITAhn3OParwN6udQprdF1WGtFF6fuY2eRDSH/L/iZoE4DY OulL951OsBtF8YC3+RKLk908/0bA2Uw8ftjCOFJTYbSnZBj1gWK41VkCYMEXiHQk EaN8+2Iw206iYIoyvdjGCLw7Y0oakDoVD9vmv12SOaHeQljTkjoN8oIlfjjKTeP7 3AXj5v9BDMDVh40nkVayysRNvqe48Kwt9Wn0rhVTLxwdJEiFG/OIU6HLuTkretdN dcCNFSQrRieSFHpBK9G0vKIpIss1ZwLm8gjocVXH7VK4Mo/TNQe4p2/wAF29mq4r xu1UiXmtU3uWxiqZnt72LOYFCarQ0sFA5+pMEvF5W+NrVB0wGpXhcwm+pGsIi4IM LepccTgykiUBqW5TRzPz =/oS+ -----END PGP SIGNATURE----- Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux Pull arm64 updates from Catalin Marinas: "Here are the main arm64 updates for 4.6. There are some relatively intrusive changes to support KASLR, the reworking of the kernel virtual memory layout and initial page table creation. Summary: - Initial page table creation reworked to avoid breaking large block mappings (huge pages) into smaller ones. 
The ARM architecture requires break-before-make in such cases to avoid TLB conflicts but that's not always possible on live page tables - Kernel virtual memory layout: the kernel image is no longer linked to the bottom of the linear mapping (PAGE_OFFSET) but at the bottom of the vmalloc space, allowing the kernel to be loaded (nearly) anywhere in physical RAM - Kernel ASLR: position independent kernel Image and modules being randomly mapped in the vmalloc space with the randomness is provided by UEFI (efi_get_random_bytes() patches merged via the arm64 tree, acked by Matt Fleming) - Implement relative exception tables for arm64, required by KASLR (initial code for ARCH_HAS_RELATIVE_EXTABLE added to lib/extable.c but actual x86 conversion to deferred to 4.7 because of the merge dependencies) - Support for the User Access Override feature of ARMv8.2: this allows uaccess functions (get_user etc.) to be implemented using LDTR/STTR instructions. Such instructions, when run by the kernel, perform unprivileged accesses adding an extra level of protection. The set_fs() macro is used to "upgrade" such instruction to privileged accesses via the UAO bit - Half-precision floating point support (part of ARMv8.2) - Optimisations for CPUs with or without a hardware prefetcher (using run-time code patching) - copy_page performance improvement to deal with 128 bytes at a time - Sanity checks on the CPU capabilities (via CPUID) to prevent incompatible secondary CPUs from being brought up (e.g. 
weird big.LITTLE configurations) - valid_user_regs() reworked for better sanity check of the sigcontext information (restored pstate information) - ACPI parking protocol implementation - CONFIG_DEBUG_RODATA enabled by default - VDSO code marked as read-only - DEBUG_PAGEALLOC support - ARCH_HAS_UBSAN_SANITIZE_ALL enabled - Erratum workaround Cavium ThunderX SoC - set_pte_at() fix for PROT_NONE mappings - Code clean-ups" * tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (99 commits) arm64: kasan: Fix zero shadow mapping overriding kernel image shadow arm64: kasan: Use actual memory node when populating the kernel image shadow arm64: Update PTE_RDONLY in set_pte_at() for PROT_NONE permission arm64: Fix misspellings in comments. arm64: efi: add missing frame pointer assignment arm64: make mrs_s prefixing implicit in read_cpuid arm64: enable CONFIG_DEBUG_RODATA by default arm64: Rework valid_user_regs arm64: mm: check at build time that PAGE_OFFSET divides the VA space evenly arm64: KVM: Move kvm_call_hyp back to its original localtion arm64: mm: treat memstart_addr as a signed quantity arm64: mm: list kernel sections in order arm64: lse: deal with clobbered IP registers after branch via PLT arm64: mm: dump: Use VA_START directly instead of private LOWEST_ADDR arm64: kconfig: add submenu for 8.2 architectural features arm64: kernel: acpi: fix ioremap in ACPI parking protocol cpu_postboot arm64: Add support for Half precision floating point arm64: Remove fixmap include fragility arm64: Add workaround for Cavium erratum 27456 arm64: mm: Mark .rodata as RO ...
325 lines
9.2 KiB
C
325 lines
9.2 KiB
C
/*
|
|
* Copyright (C) 2012,2013 - ARM Ltd
|
|
* Author: Marc Zyngier <marc.zyngier@arm.com>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#ifndef __ARM64_KVM_MMU_H__
|
|
#define __ARM64_KVM_MMU_H__
|
|
|
|
#include <asm/page.h>
|
|
#include <asm/memory.h>
|
|
#include <asm/cpufeature.h>
|
|
|
|
/*
|
|
* As ARMv8.0 only has the TTBR0_EL2 register, we cannot express
|
|
* "negative" addresses. This makes it impossible to directly share
|
|
* mappings with the kernel.
|
|
*
|
|
* Instead, give the HYP mode its own VA region at a fixed offset from
|
|
* the kernel by just masking the top bits (which are all ones for a
|
|
* kernel address).
|
|
*
|
|
* ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
|
|
* macros (the entire kernel runs at EL2).
|
|
*/
|
|
/* Number of low VA bits that survive the kernel-VA -> HYP-VA conversion. */
#define HYP_PAGE_OFFSET_SHIFT	VA_BITS
/* Mask with the low HYP_PAGE_OFFSET_SHIFT bits set. */
#define HYP_PAGE_OFFSET_MASK	((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
/* PAGE_OFFSET with everything above the mask cleared. */
#define HYP_PAGE_OFFSET		(HYP_PAGE_OFFSET_MASK & PAGE_OFFSET)
|
|
|
|
/*
|
|
* Our virtual mapping for the idmap-ed MMU-enable code. Must be
|
|
* shared across all the page-tables. Conveniently, we use the last
|
|
* possible page, where no kernel mapping will ever exist.
|
|
*/
|
|
/* HYP_PAGE_OFFSET_MASK restricted to the bits kept by PAGE_MASK. */
#define TRAMPOLINE_VA		(PAGE_MASK & HYP_PAGE_OFFSET_MASK)
|
|
|
|
/*
|
|
* KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
|
|
* levels in addition to the PGD and potentially the PUD which are
|
|
* pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
|
|
* tables use one level of tables less than the kernel).
|
|
*/
|
|
#ifndef CONFIG_ARM64_64K_PAGES
#define KVM_MMU_CACHE_MIN_PAGES	2	/* every page size other than 64K */
#else
#define KVM_MMU_CACHE_MIN_PAGES	1	/* 64K pages */
#endif
|
|
|
|
#ifdef __ASSEMBLY__
|
|
|
|
#include <asm/alternative.h>
|
|
#include <asm/cpufeature.h>
|
|
|
|
/*
|
|
* Convert a kernel VA into a HYP VA.
|
|
* reg: VA to be converted.
|
|
*/
|
|
.macro kern_hyp_va	reg
	/*
	 * When the ARM64_HAS_VIRT_HOST_EXTN capability is absent, clear the
	 * bits of \reg above HYP_PAGE_OFFSET_MASK in place.
	 */
alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
	and	\reg, \reg, #HYP_PAGE_OFFSET_MASK
alternative_else
	/* Capability present: the address is left untouched. */
	nop
alternative_endif
.endm
|
|
|
|
#else
|
|
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/cachetype.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/pgtable.h>
|
|
|
|
/*
 * Translate a kernel VA into the matching HYP VA by rebasing it from
 * PAGE_OFFSET to HYP_PAGE_OFFSET. The argument is parenthesised so that an
 * expression such as "ptr + 1" is converted as a whole rather than having
 * the cast bind to its first operand only.
 */
#define KERN_TO_HYP(kva)	((unsigned long)(kva) - PAGE_OFFSET + HYP_PAGE_OFFSET)
|
|
|
|
/*
|
|
* We currently only support a 40bit IPA.
|
|
*/
|
|
#define KVM_PHYS_SHIFT	(40)				/* IPA width in bits */
#define KVM_PHYS_SIZE	(1UL << KVM_PHYS_SHIFT)		/* size of the IPA space */
#define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1UL)		/* all valid IPA bits set */
|
|
|
|
int create_hyp_mappings(void *from, void *to);
|
|
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
|
|
void free_boot_hyp_pgd(void);
|
|
void free_hyp_pgds(void);
|
|
|
|
void stage2_unmap_vm(struct kvm *kvm);
|
|
int kvm_alloc_stage2_pgd(struct kvm *kvm);
|
|
void kvm_free_stage2_pgd(struct kvm *kvm);
|
|
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
|
phys_addr_t pa, unsigned long size, bool writable);
|
|
|
|
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
|
|
|
|
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
|
|
|
|
phys_addr_t kvm_mmu_get_httbr(void);
|
|
phys_addr_t kvm_mmu_get_boot_httbr(void);
|
|
phys_addr_t kvm_get_idmap_vector(void);
|
|
int kvm_mmu_init(void);
|
|
void kvm_clear_hyp_idmap(void);
|
|
|
|
/* Stage-2 entries are written with the regular kernel setters. */
#define kvm_set_pte(ptep, pte)		set_pte(ptep, pte)
#define kvm_set_pmd(pmdp, pmd)		set_pmd(pmdp, pmd)
|
|
|
|
static inline void kvm_clean_pgd(pgd_t *pgd) {}
|
|
static inline void kvm_clean_pmd(pmd_t *pmd) {}
|
|
static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
|
|
static inline void kvm_clean_pte(pte_t *pte) {}
|
|
static inline void kvm_clean_pte_entry(pte_t *pte) {}
|
|
|
|
/* Set the PTE_S2_RDWR bits in the stage-2 PTE pointed to by @pte. */
static inline void kvm_set_s2pte_writable(pte_t *pte)
{
	pte_val(*pte) = pte_val(*pte) | PTE_S2_RDWR;
}
|
|
|
|
/* Set the PMD_S2_RDWR bits in the stage-2 PMD pointed to by @pmd. */
static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
{
	pmd_val(*pmd) = pmd_val(*pmd) | PMD_S2_RDWR;
}
|
|
|
|
/*
 * Replace the bits covered by PTE_S2_RDWR in *@pte with PTE_S2_RDONLY.
 * The new value is computed first and written with a single assignment.
 */
static inline void kvm_set_s2pte_readonly(pte_t *pte)
{
	pte_val(*pte) = PTE_S2_RDONLY | (pte_val(*pte) & ~PTE_S2_RDWR);
}
|
|
|
|
/*
 * Return true when the bits of *@pte covered by PTE_S2_RDWR are exactly
 * PTE_S2_RDONLY.
 */
static inline bool kvm_s2pte_readonly(pte_t *pte)
{
	if ((pte_val(*pte) & PTE_S2_RDWR) != PTE_S2_RDONLY)
		return false;

	return true;
}
|
|
|
|
/*
 * Replace the bits covered by PMD_S2_RDWR in *@pmd with PMD_S2_RDONLY.
 * The new value is computed first and written with a single assignment.
 */
static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
{
	pmd_val(*pmd) = PMD_S2_RDONLY | (pmd_val(*pmd) & ~PMD_S2_RDWR);
}
|
|
|
|
/*
 * Return true when the bits of *@pmd covered by PMD_S2_RDWR are exactly
 * PMD_S2_RDONLY.
 */
static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
{
	if ((pmd_val(*pmd) & PMD_S2_RDWR) != PMD_S2_RDONLY)
		return false;

	return true;
}
|
|
|
|
|
|
/* Range-walk helpers: each one forwards to the kernel macro of the same level. */
#define kvm_pgd_addr_end(addr, end)	pgd_addr_end((addr), (end))
#define kvm_pud_addr_end(addr, end)	pud_addr_end((addr), (end))
#define kvm_pmd_addr_end(addr, end)	pmd_addr_end((addr), (end))
|
|
|
|
/*
|
|
* In the case where PGDIR_SHIFT is larger than KVM_PHYS_SHIFT, we can address
|
|
* the entire IPA input range with a single pgd entry, and we would only need
|
|
* one pgd entry. Note that in this case, the pgd is actually not used by
|
|
* the MMU for Stage-2 translations, but is merely a fake pgd used as a data
|
|
* structure for the kernel pgtable macros to work.
|
|
*/
|
|
#if PGDIR_SHIFT <= KVM_PHYS_SHIFT
/* The IPA space spans 2^(KVM_PHYS_SHIFT - PGDIR_SHIFT) pgd entries. */
#define PTRS_PER_S2_PGD_SHIFT	(KVM_PHYS_SHIFT - PGDIR_SHIFT)
#else
/* One pgd entry already covers the whole IPA space. */
#define PTRS_PER_S2_PGD_SHIFT	0
#endif
#define PTRS_PER_S2_PGD		(1 << PTRS_PER_S2_PGD_SHIFT)
|
|
|
|
/* Index of @addr within the stage-2 pgd, wrapped to PTRS_PER_S2_PGD entries. */
#define kvm_pgd_index(addr)	((PTRS_PER_S2_PGD - 1) & ((addr) >> PGDIR_SHIFT))
|
|
|
|
/*
|
|
* If we are concatenating first level stage-2 page tables, we would have less
|
|
* than or equal to 16 pointers in the fake PGD, because that's what the
|
|
* architecture allows. In this case, (4 - CONFIG_PGTABLE_LEVELS)
|
|
* represents the first level for the host, and we add 1 to go to the next
|
|
* level (which uses concatenation) for the stage-2 tables.
|
|
*/
|
|
#if PTRS_PER_S2_PGD > 16
/* More than 16 pgd entries: no pre-allocated level. */
#define KVM_PREALLOC_LEVEL	(0)
#else
/* At most 16 pgd entries: one level below the host's first level. */
#define KVM_PREALLOC_LEVEL	(4 - CONFIG_PGTABLE_LEVELS + 1)
#endif
|
|
|
|
static inline void *kvm_get_hwpgd(struct kvm *kvm)
|
|
{
|
|
pgd_t *pgd = kvm->arch.pgd;
|
|
pud_t *pud;
|
|
|
|
if (KVM_PREALLOC_LEVEL == 0)
|
|
return pgd;
|
|
|
|
pud = pud_offset(pgd, 0);
|
|
if (KVM_PREALLOC_LEVEL == 1)
|
|
return pud;
|
|
|
|
BUG_ON(KVM_PREALLOC_LEVEL != 2);
|
|
return pmd_offset(pud, 0);
|
|
}
|
|
|
|
static inline unsigned int kvm_get_hwpgd_size(void)
|
|
{
|
|
if (KVM_PREALLOC_LEVEL > 0)
|
|
return PTRS_PER_S2_PGD * PAGE_SIZE;
|
|
return PTRS_PER_S2_PGD * sizeof(pgd_t);
|
|
}
|
|
|
|
/*
 * Return true when the page backing @ptr has a reference count of exactly 1.
 * NOTE(review): presumably the remaining reference is the table page's own
 * allocation — confirm against the code that takes the references.
 */
static inline bool kvm_page_empty(void *ptr)
{
	return page_count(virt_to_page(ptr)) == 1;
}
|
|
|
|
/* A pte table is empty when its page is; @kvm is accepted but not used. */
#define kvm_pte_table_empty(kvm, ptep)	kvm_page_empty(ptep)
|
|
|
|
#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Empty when the page is empty and either no kvm was given or the
 * pre-allocated level is below 2.
 */
#define kvm_pmd_table_empty(kvm, pmdp)				\
	(kvm_page_empty(pmdp) && (!(kvm) || KVM_PREALLOC_LEVEL < 2))
#else
/* Folded pmd level: never reported as empty. */
#define kvm_pmd_table_empty(kvm, pmdp)	(0)
#endif
|
|
|
|
#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Empty when the page is empty and either no kvm was given or the
 * pre-allocated level is below 1.
 */
#define kvm_pud_table_empty(kvm, pudp)				\
	(kvm_page_empty(pudp) && (!(kvm) || KVM_PREALLOC_LEVEL < 1))
#else
/* Folded pud level: never reported as empty. */
#define kvm_pud_table_empty(kvm, pudp)	(0)
#endif
|
|
|
|
|
|
struct kvm;

/* Forward @l bytes starting at @a to __flush_dcache_area(). */
#define kvm_flush_dcache_to_poc(a, l)	__flush_dcache_area((a), (l))
|
|
|
|
static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
|
|
{
|
|
return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
|
|
}
|
|
|
|
static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
|
|
kvm_pfn_t pfn,
|
|
unsigned long size,
|
|
bool ipa_uncached)
|
|
{
|
|
void *va = page_address(pfn_to_page(pfn));
|
|
|
|
if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
|
|
kvm_flush_dcache_to_poc(va, size);
|
|
|
|
if (!icache_is_aliasing()) { /* PIPT */
|
|
flush_icache_range((unsigned long)va,
|
|
(unsigned long)va + size);
|
|
} else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */
|
|
/* any kind of VIPT cache */
|
|
__flush_icache_all();
|
|
}
|
|
}
|
|
|
|
/* Flush PAGE_SIZE bytes of data cache for the page that @pte maps. */
static inline void __kvm_flush_dcache_pte(pte_t pte)
{
	void *va = page_address(pte_page(pte));

	kvm_flush_dcache_to_poc(va, PAGE_SIZE);
}
|
|
|
|
/* Flush PMD_SIZE bytes of data cache starting at the page that @pmd maps. */
static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
{
	void *va = page_address(pmd_page(pmd));

	kvm_flush_dcache_to_poc(va, PMD_SIZE);
}
|
|
|
|
/* Flush PUD_SIZE bytes of data cache starting at the page that @pud maps. */
static inline void __kvm_flush_dcache_pud(pud_t pud)
{
	void *va = page_address(pud_page(pud));

	kvm_flush_dcache_to_poc(va, PUD_SIZE);
}
|
|
|
|
/* Cast @x to unsigned long and hand it to __virt_to_phys(). */
#define kvm_virt_to_phys(x)		__virt_to_phys((unsigned long)(x))
|
|
|
|
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
|
|
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
|
|
|
|
/* KVM-side name for __cpu_uses_extended_idmap(); returns its result as is. */
static inline bool __kvm_cpu_uses_extended_idmap(void)
{
	return __cpu_uses_extended_idmap();
}
|
|
|
|
/*
 * Populate the extra translation level used with an extended idmap.
 *
 * @merged_hyp_pgd receives two table entries: slot 0 points at the runtime
 * HYP tables (@hyp_pgd), and the slot indexed by the bits of
 * @hyp_idmap_start above VA_BITS points at the boot HYP tables
 * (@boot_hyp_pgd), which hold the ID mapping of the HYP init code. Each
 * slot must be empty before it is written.
 */
static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
				       pgd_t *hyp_pgd,
				       pgd_t *merged_hyp_pgd,
				       unsigned long hyp_idmap_start)
{
	const int boot_slot = hyp_idmap_start >> VA_BITS;

	/*
	 * Slot 0 gives access to the runtime HYP mappings. It has to be
	 * free, or an extended idmap would not be in use at all.
	 */
	VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
	merged_hyp_pgd[0] = __pgd(__pa(hyp_pgd) | PMD_TYPE_TABLE);

	/*
	 * The boot HYP map is hooked in next to it. This merges the boot and
	 * runtime maps, which is fine because they do not overlap.
	 */
	VM_BUG_ON(pgd_val(merged_hyp_pgd[boot_slot]));
	merged_hyp_pgd[boot_slot] = __pgd(__pa(boot_hyp_pgd) | PMD_TYPE_TABLE);
}
|
|
|
|
static inline unsigned int kvm_get_vmid_bits(void)
|
|
{
|
|
int reg = read_system_reg(SYS_ID_AA64MMFR1_EL1);
|
|
|
|
return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
|
|
}
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
#endif /* __ARM64_KVM_MMU_H__ */
|