From b3f0907c71e006e12fde74ea9a745b6096b6f90f Mon Sep 17 00:00:00 2001 From: Brijesh Singh <brijesh.singh@amd.com> Date: Fri, 14 Sep 2018 08:45:58 -0500 Subject: [PATCH 01/16] x86/mm: Add .bss..decrypted section to hold shared variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvmclock defines few static variables which are shared with the hypervisor during the kvmclock initialization. When SEV is active, memory is encrypted with a guest-specific key, and if the guest OS wants to share the memory region with the hypervisor then it must clear the C-bit before sharing it. Currently, we use kernel_physical_mapping_init() to split large pages before clearing the C-bit on shared pages. But it fails when called from the kvmclock initialization (mainly because the memblock allocator is not ready that early during boot). Add a __bss_decrypted section attribute which can be used when defining such shared variable. The so-defined variables will be placed in the .bss..decrypted section. This section will be mapped with C=0 early during boot. The .bss..decrypted section has a big chunk of memory that may be unused when memory encryption is not active, free it when memory encryption is not active. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Borislav Petkov <bp@suse.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Sean Christopherson <sean.j.christopherson@intel.com> Cc: Radim Krčmář<rkrcmar@redhat.com> Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/1536932759-12905-2-git-send-email-brijesh.singh@amd.com --- arch/x86/include/asm/mem_encrypt.h | 7 +++++++ arch/x86/kernel/head64.c | 16 ++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 19 +++++++++++++++++++ arch/x86/mm/init.c | 4 ++++ arch/x86/mm/mem_encrypt.c | 24 ++++++++++++++++++++++++ 5 files changed, 70 insertions(+) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c0643831706e..616f8e637bc3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); +#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __bss_decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x) (__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) +extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..c16af27eb23f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -112,6 +112,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -234,6 +235,21 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp); + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a419f86..5dd3317d761f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,23 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@ -74,6 +91,7 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@ -355,6 +373,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26c1115..faca978ebf9d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak mem_encrypt_free_decrypted_mem(void) { } + void __ref free_initmem(void) { e820__reallocate_tables(); + mem_encrypt_free_decrypted_mem(); + free_kernel_image_pages(&__init_begin, &__init_end); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b2de398d1fd3..006f373f54ab 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -348,6 +348,30 @@ bool sev_active(void) EXPORT_SYMBOL(sev_active); /* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + void __init mem_encrypt_init(void) { if (!sme_me_mask) From 6a1cac56f41f9ea94e440dfcc1cac44b41a1b194 Mon Sep 17 00:00:00 2001 From: Brijesh Singh <brijesh.singh@amd.com> Date: Fri, 14 Sep 2018 08:45:59 -0500 Subject: [PATCH 02/16] x86/kvm: Use __bss_decrypted attribute in shared variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent removal of the memblock dependency from kvmclock caused a SEV guest regression because the wall_clock and hv_clock_boot variables are no longer mapped decrypted when SEV is active. Use the __bss_decrypted attribute to put the static wall_clock and hv_clock_boot in the .bss..decrypted section so that they are mapped decrypted during boot. In the preparatory stage of CPU hotplug, the per-cpu pvclock data pointer assigns either an element of the static array or dynamically allocated memory for the pvclock data pointer. The static array are now mapped decrypted but the dynamically allocated memory is not mapped decrypted. However, when SEV is active this memory range must be mapped decrypted. Add a function which is called after the page allocator is up, and allocate memory for the pvclock data pointers for the all possible cpus. Map this memory range as decrypted when SEV is active. Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency") Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Borislav Petkov <bp@suse.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Sean Christopherson <sean.j.christopherson@intel.com> Cc: "Radim Krčmář" <rkrcmar@redhat.com> Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/1536932759-12905-3-git-send-email-brijesh.singh@amd.com --- arch/x86/kernel/kvmclock.c | 52 +++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e6764648af3..013fe3d21dbb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <linux/sched/clock.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/set_memory.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> @@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +static struct pvclock_vsyscall_time_info *hvclock_mem; static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { @@ -236,6 +238,45 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + static int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 @@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif + + kvmclock_init_mem(); + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); @@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; else - p = kzalloc(sizeof(*p), GFP_KERNEL); + return -ENOMEM; per_cpu(hv_clock_per_cpu, cpu) = p; return p ? 0 : -ENOMEM; From 753694a8df318f204a0ac1303de136def16f2e9c Mon Sep 17 00:00:00 2001 From: Xiaochen Shen <xiaochen.shen@intel.com> Date: Sat, 15 Sep 2018 14:58:19 -0700 Subject: [PATCH 03/16] x86/intel_rdt: Fix data type in parsing callbacks Each resource is associated with a parsing callback to parse the data provided from user space when writing schemata file. The 'data' parameter in the callbacks is defined as a void pointer which is error prone due to lack of type check. parse_bw() processes the 'data' parameter as a string while its caller actually passes the parameter as a pointer to struct rdt_cbm_parse_data. Thus, parse_bw() takes wrong data and causes failure of parsing MBA throttle value. To fix the issue, the 'data' parameter in all parsing callbacks is defined and handled as a pointer to struct rdt_parse_data (renamed from struct rdt_cbm_parse_data). Fixes: 7604df6e16ae ("x86/intel_rdt: Support flexible data to parsing callbacks") Fixes: 9ab9aa15c309 ("x86/intel_rdt: Ensure requested schemata respects mode") Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-2-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 16 ++++++++++++---- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 21 ++++++++------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4e588f36228f..78266c798280 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e) e <= QOS_L3_MBM_LOCAL_EVENT_ID); } +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -423,16 +428,19 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (void *data, struct rdt_resource *r, - struct rdt_domain *d); + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; unsigned long fflags; }; -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index af358ca05160..edd5761f7336 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - unsigned long data; - char *buf = _buf; + unsigned long bw_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if (!bw_validate(buf, &data, r)) + if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = data; + d->new_ctrl = bw_val; d->have_new_ctrl = true; return 0; @@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -struct rdt_cbm_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - struct rdt_cbm_parse_data *data = _data; struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; @@ -195,7 +190,7 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) static int parse_line(char *line, struct rdt_resource *r, struct rdtgroup *rdtgrp) { - struct rdt_cbm_parse_data data; + struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; From f968dc119a159a95628a20de2a2dcc913d0a82d7 Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:20 -0700 Subject: [PATCH 04/16] x86/intel_rdt: Fix size reporting of MBA resource Chen Yu reported a divide-by-zero error when accessing the 'size' resctrl file when a MBA resource is enabled. divide error: 0000 [#1] SMP PTI CPU: 93 PID: 1929 Comm: cat Not tainted 4.19.0-rc2-debug-rdt+ #25 RIP: 0010:rdtgroup_cbm_to_size+0x7e/0xa0 Call Trace: rdtgroup_size_show+0x11a/0x1d0 seq_read+0xd8/0x3b0 Quoting Chen Yu's report: This is because for MB resource, the r->cache.cbm_len is zero, thus calculating size in rdtgroup_cbm_to_size() will trigger the exception. Fix this issue in the 'size' file by getting correct memory bandwidth value which is in MBps when MBA software controller is enabled or in percentage when MBA software controller is disabled. Fixes: d9b48c86eb38 ("x86/intel_rdt: Display resource groups' allocations in bytes") Reported-by: Chen Yu <yu.c.chen@intel.com> Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Chen Yu <yu.c.chen@intel.com> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Link: https://lkml.kernel.org/r/20180904174614.26682-1-yu.c.chen@intel.com Link: https://lkml.kernel.org/r/1537048707-76280-3-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b799c00bef09..32e8bbdf2400 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1155,8 +1155,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; - bool sep = false; - u32 cbm; + bool sep; + u32 ctrl; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -1174,6 +1174,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } for_each_alloc_enabled_rdt_resource(r) { + sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { if (sep) @@ -1181,8 +1182,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); } seq_printf(s, "%d=%u", d->id, size); sep = true; From c793da8e4c62d2c002a79c47f44efead450cbcae Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:21 -0700 Subject: [PATCH 05/16] x86/intel_rdt: Global closid helper to support future fixes The number of CLOSIDs supported by a system is the minimum number of CLOSIDs supported by any of its resources. Care should be taken when iterating over the CLOSIDs of a resource since it may be that the number of CLOSIDs supported on the system is less than the number of CLOSIDs supported by the resource. Introduce a helper function that can be used to query the number of CLOSIDs that is supported by all resources, irrespective of how many CLOSIDs are supported by a particular resource. Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-4-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 1 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 78266c798280..285eb3ec4200 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -544,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 32e8bbdf2400..b372923eb209 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...) * limited as the number of resources grows. */ static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} static void closid_init(void) { @@ -111,6 +117,7 @@ static void closid_init(void) /* CLOSID 0 is always reserved for the default group */ closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) From 47d53b184aee983ab9492503da11b0a81b19145b Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:22 -0700 Subject: [PATCH 06/16] x86/intel_rdt: Fix invalid mode warning when multiple resources are managed When multiple resources are managed by RDT, the number of CLOSIDs used is the minimum of the CLOSIDs supported by each resource. In the function rdt_bit_usage_show(), the annotated bitmask is created to depict how the CAT supporting caches are being used. During this annotated bitmask creation, each resource group is queried for its mode that is used as a label in the annotated bitmask. The maximum number of resource groups is currently assumed to be the number of CLOSIDs supported by the resource for which the information is being displayed. This is incorrect since the number of active CLOSIDs is the minimum across all resources. If information for a cache instance with more CLOSIDs than another is being generated we thus encounter a warning like: invalid mode for closid 8 WARNING: CPU: 88 PID: 1791 at [SNIP]/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c :827 rdt_bit_usage_show+0x221/0x2b0 Fix this by ensuring that only the number of supported CLOSIDs are considered. Fixes: e651901187ab8 ("x86/intel_rdt: Introduce "bit_usage" to display cache allocations details") Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-5-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b372923eb209..ea91750ba27f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -809,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (!closid_allocated(i)) continue; mode = rdtgroup_mode_by_closid(i); From 70479c012b67b89e219c40eddc5dc338b7c447a3 Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:23 -0700 Subject: [PATCH 07/16] x86/intel_rdt: Fix unchecked MSR access When a new resource group is created, it is initialized with sane defaults that currently assume the resource being initialized is a CAT resource. This code path is also followed by a MBA resource that is not allocated the same as a CAT resource and as a result we encounter the following unchecked MSR access error: unchecked MSR access error: WRMSR to 0xd51 (tried to write 0x0000 000000000064) at rIP: 0xffffffffae059994 (native_write_msr+0x4/0x20) Call Trace: mba_wrmsr+0x41/0x80 update_domains+0x125/0x130 rdtgroup_mkdir+0x270/0x500 Fix the above by ensuring the initial allocation is only attempted on a CAT resource. Fixes: 95f0b77ef ("x86/intel_rdt: Initialize new resource group with sane defaults") Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-6-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index ea91750ba27f..74821bc457c0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2349,6 +2349,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; list_for_each_entry(d, &r->domains, list) { d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; @@ -2386,6 +2392,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("failed to initialize allocations\n"); From 32d736abed4febff4b6bf85d5d240ee24d254322 Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:24 -0700 Subject: [PATCH 08/16] x86/intel_rdt: Do not allow pseudo-locking of MBA resource A system supporting pseudo-locking may have MBA as well as CAT resources of which only the CAT resources could support cache pseudo-locking. When the schemata to be pseudo-locked is provided it should be checked that that schemata does not attempt to pseudo-lock a MBA resource. Fixes: e0bdfe8e3 ("x86/intel_rdt: Support creation/removal of pseudo-locked region") Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-7-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index edd5761f7336..0f53049719cd 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -195,6 +195,12 @@ static int parse_line(char *line, struct rdt_resource *r, struct rdt_domain *d; unsigned long dom_id; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; From f0df4e1acf3d721958dcafb2c9c0bdf25189068d Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:25 -0700 Subject: [PATCH 09/16] x86/intel_rdt: Fix incorrect loop end condition A loop is used to check if a CAT resource's CBM of one CLOSID overlaps with the CBM of another CLOSID of the same resource. The loop is run over all CLOSIDs supported by the resource. The problem with running the loop over all CLOSIDs supported by the resource is that its number of supported CLOSIDs may be more than the number of supported CLOSIDs on the system, which is the minimum number of CLOSIDs supported across all resources. Fix the loop to only consider the number of system supported CLOSIDs, not all that are supported by the resource. Fixes: 49f7b4efa ("x86/intel_rdt: Enable setting of exclusive mode") Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-8-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 74821bc457c0..afd93d45e21b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -996,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { ctrl_b = (unsigned long *)ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && From 939b90b20bc87e199b6b53942764b987289b87ce Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:26 -0700 Subject: [PATCH 10/16] x86/intel_rdt: Fix exclusive mode handling of MBA resource It is possible for a resource group to consist out of MBA as well as CAT/CDP resources. The "exclusive" resource mode only applies to the CAT/CDP resources since MBA allocations cannot be specified to overlap or not. When a user requests a resource group to become "exclusive" then it can only be successful if there are CAT/CDP resources in the group and none of their CBMs associated with the group's CLOSID overlaps with any other resource group. Fix the "exclusive" mode setting by failing if there isn't any CAT/CDP resource in the group and ensuring that the CBM checking is only done on CAT/CDP resources. Fixes: 49f7b4efa ("x86/intel_rdt: Enable setting of exclusive mode") Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-9-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index afd93d45e21b..f3231f78d69b 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1031,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; struct rdt_resource *r; + bool has_cache = false; struct rdt_domain *d; for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; list_for_each_entry(d, &r->domains, list) { if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) + rdtgrp->closid, false)) { + rdt_last_cmd_puts("schemata overlaps\n"); return false; + } } } + if (!has_cache) { + rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n"); + return false; + } + return true; } @@ -1092,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - rdt_last_cmd_printf("schemata overlaps\n"); ret = -EINVAL; goto out; } From ffb2315fd22c2568747402eecdc581a245a2f5ba Mon Sep 17 00:00:00 2001 From: Reinette Chatre <reinette.chatre@intel.com> Date: Sat, 15 Sep 2018 14:58:27 -0700 Subject: [PATCH 11/16] x86/intel_rdt: Fix incorrect loop end condition In order to determine a sane default cache allocation for a new CAT/CDP resource group, all resource groups are checked to determine which cache portions are available to share. At this time all possible CLOSIDs that can be supported by the resource is checked. This is problematic if the resource supports more CLOSIDs than another CAT/CDP resource. In this case, the number of CLOSIDs that could be allocated are fewer than the number of CLOSIDs that can be supported by the resource. Limit the check of closids to that what is supported by the system based on the minimum across all resources. Fixes: 95f0b77ef ("x86/intel_rdt: Initialize new resource group with sane defaults") Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Cc: "Xiaochen Shen" <xiaochen.shen@intel.com> Cc: "Chen Yu" <yu.c.chen@intel.com> Link: https://lkml.kernel.org/r/1537048707-76280-10-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index f3231f78d69b..1b8e86a5d5e1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -2370,7 +2370,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) From 571d0563c8881595f4ab027aef9ed1c55e3e7b7c Mon Sep 17 00:00:00 2001 From: Dan Carpenter <dan.carpenter@oracle.com> Date: Wed, 19 Sep 2018 13:35:53 +0300 Subject: [PATCH 12/16] x86/paravirt: Fix some warning messages The first argument to WARN_ONCE() is a condition. Fixes: 5800dc5c19f3 ("x86/paravirt: Fix spectre-v2 mitigations for paravirt guests") Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Juergen Gross <jgross@suse.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Alok Kataria <akataria@vmware.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: virtualization@lists.linux-foundation.org Cc: kernel-janitors@vger.kernel.org Link: https://lkml.kernel.org/r/20180919103553.GD9238@mwanda --- arch/x86/kernel/paravirt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index afdb303285f8..8dc69d82567e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } From 336b08088d4da009108d4418e241ca95c9152237 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Wed, 19 Sep 2018 13:48:41 +0200 Subject: [PATCH 13/16] MAINTAINERS: Add Borislav to the x86 maintainers Borislav is effectivly maintaining parts of X86 already, make it official. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Ingo Molnar <mingo@kernel.org> Acked-by: Borislav Petkov <bp@alien8.de> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4ece30f15777..091e66b60cd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15913,6 +15913,7 @@ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner <tglx@linutronix.de> M: Ingo Molnar <mingo@redhat.com> +M: Borislav Petkov <bp@alien8.de> R: "H. Peter Anvin" <hpa@zytor.com> M: x86@kernel.org L: linux-kernel@vger.kernel.org From a8b3bb338e4ee4cc84a2b9a6fdf27049b84baa59 Mon Sep 17 00:00:00 2001 From: Fenghua Yu <fenghua.yu@intel.com> Date: Thu, 20 Sep 2018 12:37:08 -0700 Subject: [PATCH 14/16] x86/intel_rdt: Add Reinette as co-maintainer for RDT Reinette Chatre is doing great job on enabling pseudo-locking and other features in RDT. Add her as co-maintainer for RDT. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Ingo Molnar <mingo@kernel.org> Acked-by: Reinette Chatre <reinette.chatre@intel.com> Cc: "H Peter Anvin" <hpa@zytor.com> Cc: "Tony Luck" <tony.luck@intel.com> Link: https://lkml.kernel.org/r/1537472228-221799-1-git-send-email-fenghua.yu@intel.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 091e66b60cd2..140ea6ee3ac8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12260,6 +12260,7 @@ F: Documentation/networking/rds.txt RDT - RESOURCE ALLOCATION M: Fenghua Yu <fenghua.yu@intel.com> +M: Reinette Chatre <reinette.chatre@intel.com> L: linux-kernel@vger.kernel.org S: Supported F: arch/x86/kernel/cpu/intel_rdt* From 9068a427ee0beb1365a0925e8c33f338f09f5e97 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Wed, 19 Sep 2018 14:33:14 +0200 Subject: [PATCH 15/16] MAINTAINERS: Add X86 MM entry Dave, Andy and Peter are de facto overseing the mm parts of X86. Add an explicit maintainers entry. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Andy Lutomirski <luto@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Acked-by: Ingo Molnar <mingo@kernel.org> --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 140ea6ee3ac8..80a311252b04 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15943,6 +15943,15 @@ M: Borislav Petkov <bp@alien8.de> S: Maintained F: arch/x86/kernel/cpu/microcode/* +X86 MM +M: Dave Hansen <dave.hansen@linux.intel.com> +M: Andy Lutomirski <luto@kernel.org> +M: Peter Zijlstra <peterz@infradead.org> +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm +S: Maintained +F: arch/x86/mm/ + X86 PLATFORM DRIVERS M: Darren Hart <dvhart@infradead.org> M: Andy Shevchenko <andy@infradead.org> From 05ab1d8a4b36ee912b7087c6da127439ed0a903e Mon Sep 17 00:00:00 2001 From: Feng Tang <feng.tang@intel.com> Date: Thu, 20 Sep 2018 10:58:28 +0800 Subject: [PATCH 16/16] x86/mm: Expand static page table for fixmap space We met a kernel panic when enabling earlycon, which is due to the fixmap address of earlycon is not statically setup. Currently the static fixmap setup in head_64.S only covers 2M virtual address space, while it actually could be in 4M space with different kernel configurations, e.g. when VSYSCALL emulation is disabled. So increase the static space to 4M for now by defining FIXMAP_PMD_NUM to 2, and add a build time check to ensure that the fixmap is covered by the initial static page tables. Fixes: 1ad83c858c7d ("x86_64,vsyscall: Make vsyscall emulation configurable") Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Feng Tang <feng.tang@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: kernel test robot <rong.a.chen@intel.com> Reviewed-by: Juergen Gross <jgross@suse.com> (Xen parts) Cc: H Peter Anvin <hpa@linux.intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andy Lutomirsky <luto@kernel.org> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180920025828.23699-1-feng.tang@intel.com --- arch/x86/include/asm/fixmap.h | 10 ++++++++++ arch/x86/include/asm/pgtable_64.h | 3 ++- arch/x86/kernel/head64.c | 4 +++- arch/x86/kernel/head_64.S | 16 ++++++++++++---- arch/x86/mm/pgtable.c | 9 +++++++++ arch/x86/xen/mmu_pv.c | 8 ++++++-- 6 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e203169931c7..6390bd8c141b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,6 +14,16 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +/* + * Exposed to assembly code for setting up initial page tables. Cannot be + * calculated in assembly code (fixmap entries are an enum), but is sanity + * checked in the actual fixmap C code to make sure that the fixmap is + * covered fully. + */ +#define FIXMAP_PMD_NUM 2 +/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ +#define FIXMAP_PMD_TOP 507 + #ifndef __ASSEMBLY__ #include <linux/kernel.h> #include <asm/acpi.h> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index ce2b59047cb8..9c85b54bf03c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,6 +14,7 @@ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> +#include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; @@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; -extern pte_t level1_fixmap_pgt[512]; +extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c16af27eb23f..ddee1f0870c4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -35,6 +35,7 @@ #include <asm/bootparam_utils.h> #include <asm/microcode.h> #include <asm/kasan.h> +#include <asm/fixmap.h> /* * Manage page tables very early on. @@ -166,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[511] += load_delta; pmd = fixup_pointer(level2_fixmap_pgt, physaddr); - pmd[506] += load_delta; + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; /* * Set up the identity mapping for the switchover. These diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 15ebc2fc166e..a3618cf04cf6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,6 +24,7 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/fixmap.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt) KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 NEXT_PAGE(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) .fill 512,8,0 + .endr #undef PMDS diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ae394552fb94..089e78c4effd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2fe5c9b1816b..dd461c0167ef 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ + /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */ convert_pfn_mfn(level2_fixmap_pgt); /* We get [511][511] and have Xen's version of level2_kernel_pgt */ @@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + for (i = 0; i < FIXMAP_PMD_NUM; i++) { + set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE, + PAGE_KERNEL_RO); + } /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,