From 0444ad93ea2449963132d68753020a6a24d69895 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:56 -0400 Subject: [PATCH 01/14] x86, iommu: Add IOMMU_INIT macros, .iommu_table section, and iommu_table_entry structure This patch set adds a mechanism to "modularize" the IOMMUs we have on X86. Currently the count of IOMMUs is up to six and they have a complex relationship that requires careful execution order. 'pci_iommu_alloc' does that today, but most folks are unhappy with how it does it. This patch set addresses this and also paves the way for a mechanism to jettison unused IOMMUs during run-time. For details that sparked this, please refer to: http://lkml.org/lkml/2010/8/2/282 The first solution that comes to mind is to convert wholesale the IOMMU detection routines to be called during initcall time frame. Unfortunately that misses the dependency relationship that some of the IOMMUs have (for example: for AMD-Vi IOMMU to work, GART detection MUST run first, and before all of that SWIOTLB MUST run). The second solution would be to introduce a registration call wherein the IOMMU would provide its detection/init routines as well as what MUST run before it. That would work, except that the 'pci_iommu_alloc' which would run through this list, is called during mem_init. This means we don't have any memory allocator, and it is so early that we haven't yet started running through the initcall_t list. This solution borrows concepts from the 2nd idea and from how MODULE_INIT works. A macro is provided that each IOMMU uses to define its detect function and early_init (before the memory allocator is active), as well as what other IOMMU MUST run before us. Since most IOMMUs depend on having SWIOTLB run first ("pci_swiotlb_detect") a convenience macro to depend on that is also provided. 
This macro is similar in design to the MODULE_PARAM macro wherein we setup a .iommu_table section in which we populate it with the values that match a struct iommu_table_entry. During bootup we will sort through the array so that the IOMMUs that MUST run before us are first elements in the array. And then we just iterate through them calling the detection routine and if appropriate, the init routines. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-2-git-send-email-konrad.wilk@oracle.com> CC: H. Peter Anvin CC: Fujita Tomonori CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iommu_table.h | 95 ++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 7 +++ 2 files changed, 102 insertions(+) create mode 100644 arch/x86/include/asm/iommu_table.h diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h new file mode 100644 index 000000000000..435176f96a56 --- /dev/null +++ b/arch/x86/include/asm/iommu_table.h @@ -0,0 +1,95 @@ + +#ifndef _ASM_X86_IOMMU_TABLE_H +#define _ASM_X86_IOMMU_TABLE_H + +#include + +/* + * History lesson: + * The execution chain of IOMMUs in 2.6.36 looks as so: + * + * [xen-swiotlb] + * | + * +----[swiotlb *]--+ + * / | \ + * / | \ + * [GART] [Calgary] [Intel VT-d] + * / + * / + * [AMD-Vi] + * + * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip + * over the rest of IOMMUs and unconditionally initialize the SWIOTLB. + * Also it would surreptitiously set swiotlb=1 if there were + * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb + * flag would be turned off by all IOMMUs except the Calgary one. + * + * The IOMMU_INIT* macros allow a similar tree (or more complex if desired) + * to be built by defining who we depend on. + * + * And all that needs to be done is to use one of the macros in the IOMMU + * and the pci-dma.c will take care of the rest. 
+ */ + +struct iommu_table_entry { + initcall_t detect; + initcall_t depend; + void (*early_init)(void); /* No memory allocate available. */ + void (*late_init)(void); /* Yes, can allocate memory. */ +#define IOMMU_FINISH_IF_DETECTED (1<<0) +#define IOMMU_DETECTED (1<<1) + int flags; +}; +/* + * Macro fills out an entry in the .iommu_table that is equivalent + * to the fields that 'struct iommu_table_entry' has. The entries + * that are put in the .iommu_table section are not put in any order + * hence during boot-time we will have to resort them based on + * dependency. */ + + +#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ + static const struct iommu_table_entry const \ + __iommu_entry_##_detect __used \ + __attribute__ ((unused, __section__(".iommu_table"), \ + aligned((sizeof(void *))))) \ + = {_detect, _depend, _early_init, _late_init, \ + _finish ? IOMMU_FINISH_IF_DETECTED : 0} +/* + * The simplest IOMMU definition. Provide the detection routine + * and it will be run after the SWIOTLB and the other IOMMUs + * that utilize this macro. If the IOMMU is detected (ie, the + * detect routine returns a positive value), the other IOMMUs + * are also checked. You can use IOMMU_INIT_FINISH if you prefer + * to stop detecting the other IOMMUs after yours has been detected. + */ +#define IOMMU_INIT_POST(_detect) \ + __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 0) + +#define IOMMU_INIT_POST_FINISH(detect) \ + __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 1) + +/* + * A more sophisticated version of IOMMU_INIT. This variant requires: + * a). A detection routine function. + * b). The name of the detection routine we depend on to get called + * before us. + * c). The init routine which gets called if the detection routine + * returns a positive value from the pci_iommu_alloc. This means + * no presence of a memory allocator. + * d). 
Similar to the 'init', except that this gets called from pci_iommu_init + * where we do have a memory allocator. + * + * IOMMU_INIT differs from IOMMU_INIT_FINISH in that IOMMU_INIT will + * continue detecting other IOMMUs in the call list after the + * detection routine returns a positive number. IOMMU_INIT_FINISH will + * stop the execution chain. Both will still call the 'init' and + * 'late_init' functions if they are set. + */ +#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ + __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) + +#define IOMMU_INIT(_detect, _depend, _init, _late_init) \ + __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) + +#endif /* _ASM_X86_IOMMU_TABLE_H */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d0bb52296fa3..b92e040466c1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -260,6 +260,13 @@ SECTIONS *(.altinstr_replacement) } + .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { + __iommu_table = .; + *(.iommu_table) + . = ALIGN(8); + __iommu_table_end = .; + } + /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame From 480125ba49ba62be93beea37770f266846e077ab Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:57 -0400 Subject: [PATCH 02/14] x86, iommu: Make all IOMMU's detection routines return a value. We return 1 if the IOMMU has been detected. Zero or an error number if we failed to find it. This is in preparation for using the IOMMU_INIT so that we can detect whether an IOMMU is present. I have not tested this for regression on Calgary, nor on AMD Vi chipsets as I don't have that hardware. CC: Muli Ben-Yehuda CC: "Jon D. Mason" CC: "Darrick J. Wong" CC: Jesse Barnes CC: David Woodhouse CC: Chris Wright CC: Yinghai Lu CC: Joerg Roedel CC: H. 
Peter Anvin CC: Fujita Tomonori Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-3-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/amd_iommu.h | 4 ++-- arch/x86/include/asm/calgary.h | 4 ++-- arch/x86/include/asm/gart.h | 5 +++-- arch/x86/kernel/amd_iommu_init.c | 8 +++++--- arch/x86/kernel/aperture_64.c | 11 +++++++---- arch/x86/kernel/pci-calgary_64.c | 15 ++++++++------- drivers/pci/dmar.c | 4 +++- include/linux/dmar.h | 6 +++--- 8 files changed, 33 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b5..2798142cdb49 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -24,11 +24,11 @@ #ifdef CONFIG_AMD_IOMMU -extern void amd_iommu_detect(void); +extern int amd_iommu_detect(void); #else -static inline void amd_iommu_detect(void) { } +static inline int amd_iommu_detect(void) { return -ENODEV; } #endif diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index 0918654305af..0d467b338835 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,9 +62,9 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern void detect_calgary(void); +extern int detect_calgary(void); #else -static inline void detect_calgary(void) { return; } +static inline int detect_calgary(void) { return -ENODEV; } #endif #endif /* _ASM_X86_CALGARY_H */ diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc1..d7d1d4c438a4 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); -extern void gart_iommu_hole_init(void); +extern int gart_iommu_hole_init(void); #else #define gart_iommu_aperture 0 @@ -50,8 +50,9 
@@ static inline void early_gart_iommu_check(void) static inline void gart_parse_options(char *options) { } -static inline void gart_iommu_hole_init(void) +static inline int gart_iommu_hole_init(void) { + return -ENODEV; } #endif diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3cc63e2b8dd4..0b9e2dc4fc9a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1382,13 +1382,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) return 0; } -void __init amd_iommu_detect(void) +int __init amd_iommu_detect(void) { if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return; + return -ENODEV; if (amd_iommu_disabled) - return; + return -ENODEV; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; @@ -1397,7 +1397,9 @@ void __init amd_iommu_detect(void) /* Make sure ACS will be enabled */ pci_request_acs(); + return 1; } + return -ENODEV; } /**************************************************************************** diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e17..afa0dab3302f 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -371,7 +371,7 @@ void __init early_gart_iommu_check(void) static int __initdata printed_gart_size_msg; -void __init gart_iommu_hole_init(void) +int __init gart_iommu_hole_init(void) { u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; @@ -381,7 +381,7 @@ void __init gart_iommu_hole_init(void) if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return; + return -ENODEV; printk(KERN_INFO "Checking aperture...\n"); @@ -463,8 +463,9 @@ out: unsigned long n = (32 * 1024 * 1024) << last_aper_order; insert_aperture_resource((u32)last_aper_base, n); + return 1; } - return; + return 0; } if (!fallback_aper_force) { @@ -500,7 +501,7 @@ out: panic("Not enough memory for 
aperture"); } } else { - return; + return 0; } /* Fix up the north bridges */ @@ -524,4 +525,6 @@ out: } set_up_gart_resume(aper_order, aper_alloc); + + return 1; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 078d4ec1a9d9..28c6b389fee6 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1364,7 +1364,7 @@ static int __init calgary_iommu_init(void) return 0; } -void __init detect_calgary(void) +int __init detect_calgary(void) { int bus; void *tbl; @@ -1378,13 +1378,13 @@ void __init detect_calgary(void) * another HW IOMMU already, bail out. */ if (no_iommu || iommu_detected) - return; + return -ENODEV; if (!use_calgary) - return; + return -ENODEV; if (!early_pci_allowed()) - return; + return -ENODEV; printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); @@ -1410,13 +1410,13 @@ void __init detect_calgary(void) if (!rio_table_hdr) { printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " "in EBDA - bailing!\n"); - return; + return -ENODEV; } ret = build_detail_arrays(); if (ret) { printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return; + return -ENOMEM; } specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 
@@ -1464,7 +1464,7 @@ void __init detect_calgary(void) x86_init.iommu.iommu_init = calgary_iommu_init; } - return; + return calgary_found; cleanup: for (--bus; bus >= 0; --bus) { @@ -1473,6 +1473,7 @@ cleanup: if (info->tce_space) free_tce_table(info->tce_space); } + return -ENOMEM; } static int __init calgary_parse_options(char *p) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 0a19708074c2..5fa64ea5416f 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -687,7 +687,7 @@ failed: return 0; } -void __init detect_intel_iommu(void) +int __init detect_intel_iommu(void) { int ret; @@ -723,6 +723,8 @@ void __init detect_intel_iommu(void) } early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); dmar_tbl = NULL; + + return (ret ? 1 : -ENODEV); } diff --git a/include/linux/dmar.h b/include/linux/dmar.h index d7cecc90ed34..a20602041511 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -57,15 +57,15 @@ extern int dmar_table_init(void); extern int dmar_dev_scope_init(void); /* Intel IOMMU detection */ -extern void detect_intel_iommu(void); +extern int detect_intel_iommu(void); extern int enable_drhd_fault_handling(void); extern int parse_ioapics_under_ir(void); extern int alloc_iommu(struct dmar_drhd_unit *); #else -static inline void detect_intel_iommu(void) +static inline int detect_intel_iommu(void) { - return; + return -ENODEV; } static inline int dmar_table_init(void) From 5bef80a4b826b9cee1c6aec7ecc371ec395260cc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:58 -0400 Subject: [PATCH 03/14] x86, iommu: Add proper dependency sort routine (and sanity check). We are using a very simple sort routine which sorts the .iommu_table array in the order of dependencies. Specifically each structure of iommu_table_entry has a field 'depend' which contains the function pointer to the IOMMU that MUST be run before us. 
We sort the array of structures so that the struct iommu_table_entry entries with no 'depend' field come first, and then the subsequent ones are the ones for which the 'depend' function has already been invoked (in other words, precede us). Using the kernel's version of 'sort', which is a heapsort, is feasible, but would require making the comparison operator recursively scan the array to satisfy the "heapify" process: setting the levels properly. The end result would be much more complex than it should be, and it is just much simpler to utilize this simple sort routine. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-4-git-send-email-konrad.wilk@oracle.com> CC: H. Peter Anvin CC: Fujita Tomonori Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iommu_table.h | 6 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pci-iommu_table.c | 89 ++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 arch/x86/kernel/pci-iommu_table.c diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index 435176f96a56..2124e3ef6f98 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -92,4 +92,10 @@ struct iommu_table_entry { #define IOMMU_INIT(_detect, _depend, _init, _late_init) \ __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) +void sort_iommu_table(struct iommu_table_entry *start, + struct iommu_table_entry *finish); + +void check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish); + #endif /* _ASM_X86_IOMMU_TABLE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266bd..6817546595ef 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,6 +42,7 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o +obj-y += pci-iommu_table.o obj-$(CONFIG_X86_TRAMPOLINE) += 
trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c new file mode 100644 index 000000000000..55d745ec1181 --- /dev/null +++ b/arch/x86/kernel/pci-iommu_table.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include + + +#define DEBUG 1 + +static struct iommu_table_entry * __init +find_dependents_of(struct iommu_table_entry *start, + struct iommu_table_entry *finish, + struct iommu_table_entry *q) +{ + struct iommu_table_entry *p; + + if (!q) + return NULL; + + for (p = start; p < finish; p++) + if (p->detect == q->depend) + return p; + + return NULL; +} + + +void __init sort_iommu_table(struct iommu_table_entry *start, + struct iommu_table_entry *finish) { + + struct iommu_table_entry *p, *q, tmp; + + for (p = start; p < finish; p++) { +again: + q = find_dependents_of(start, finish, p); + /* We are bit sneaky here. We use the memory address to figure + * out if the node we depend on is past our point, if so, swap. + */ + if (q > p) { + tmp = *p; + memmove(p, q, sizeof(*p)); + *q = tmp; + goto again; + } + } + +} + +#ifdef DEBUG +void __init check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ + struct iommu_table_entry *p, *q, *x; + char sym_p[KSYM_SYMBOL_LEN]; + char sym_q[KSYM_SYMBOL_LEN]; + + /* Simple cyclic dependency checker. */ + for (p = start; p < finish; p++) { + q = find_dependents_of(start, finish, p); + x = find_dependents_of(start, finish, q); + if (p == x) { + sprint_symbol(sym_p, (unsigned long)p->detect); + sprint_symbol(sym_q, (unsigned long)q->detect); + + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \ + " on %s and vice-versa. 
BREAKING IT.\n", + sym_p, sym_q); + /* Heavy handed way..*/ + x->depend = 0; + } + } + + for (p = start; p < finish; p++) { + q = find_dependents_of(p, finish, p); + if (q && q > p) { + sprint_symbol(sym_p, (unsigned long)p->detect); + sprint_symbol(sym_q, (unsigned long)q->detect); + + printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\ + "should be called before %s!\n", + sym_p, sym_q); + } + } +} +#else +inline void check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ +} +#endif From efa631c26d3bb1162b8f95008801db602217f52b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:59 -0400 Subject: [PATCH 04/14] x86, swiotlb: Simplify SWIOTLB pci_swiotlb_detect routine. In 'pci_swiotlb_detect' we used to do two different things: a). If user provided 'iommu=soft' or 'swiotlb=force' we would set swiotlb=1 and return 1 (and forcing pci-dma.c to call pci_swiotlb_init() immediately). b). If 4GB or more would be detected and if user did not specify iommu=off, we would set 'swiotlb=1' and return whatever 'a)' figured out. We simplify this by splitting a) and b) in two different routines. CC: Fujita Tomonori Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-5-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iommu_table.h | 4 ++-- arch/x86/include/asm/swiotlb.h | 13 ++++++++++-- arch/x86/kernel/pci-dma.c | 4 +++- arch/x86/kernel/pci-swiotlb.c | 33 ++++++++++++++++++++++++------ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index 2124e3ef6f98..df55a78888e3 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -64,10 +64,10 @@ struct iommu_table_entry { * to stop detecting the other IOMMUs after yours has been detected. 
*/ #define IOMMU_INIT_POST(_detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 0) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0) #define IOMMU_INIT_POST_FINISH(detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 1) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1) /* * A more sophisticated version of IOMMU_INIT. This variant requires: diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 8085277e1b8b..977f1761a25d 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -5,17 +5,26 @@ #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern int __init pci_swiotlb_detect(void); +extern int __init pci_swiotlb_detect_override(void); +extern int __init pci_swiotlb_detect_4gb(void); extern void __init pci_swiotlb_init(void); +extern void __init pci_swiotlb_late_init(void); #else #define swiotlb 0 -static inline int pci_swiotlb_detect(void) +static inline int pci_swiotlb_detect_override(void) +{ + return 0; +} +static inline int pci_swiotlb_detect_4gb(void) { return 0; } static inline void pci_swiotlb_init(void) { } +static inline void pci_swiotlb_late_init(void) +{ +} #endif static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 9f07cfcbd3a5..1b3beb5075e6 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -133,9 +133,11 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect_override()) goto out; + pci_swiotlb_detect_4gb(); + gart_iommu_hole_init(); detect_calgary(); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a5bc528d4328..c7a72faeb146 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -41,24 +41,33 @@ static struct dma_map_ops swiotlb_dma_ops = { 
}; /* - * pci_swiotlb_detect - set swiotlb to 1 if necessary + * pci_swiotlb_detect_override - set swiotlb to 1 if necessary * * This returns non-zero if we are forced to use swiotlb (by the boot * option). */ -int __init pci_swiotlb_detect(void) +int __init pci_swiotlb_detect_override(void) { int use_swiotlb = swiotlb | swiotlb_force; + if (swiotlb_force) + swiotlb = 1; + + return use_swiotlb; +} + +/* + * if 4GB or more detected (and iommu=off not set) return 1 + * and set swiotlb to 1. + */ +int __init pci_swiotlb_detect_4gb(void) +{ /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif - if (swiotlb_force) - swiotlb = 1; - - return use_swiotlb; + return swiotlb; } void __init pci_swiotlb_init(void) @@ -68,3 +77,15 @@ void __init pci_swiotlb_init(void) dma_ops = &swiotlb_dma_ops; } } + +void __init pci_swiotlb_late_init(void) +{ + /* An IOMMU turned us off. */ + if (!swiotlb) + swiotlb_free(); + else { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } +} From c116c5457c46edb767df6f4e36d4905e3514ad37 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:00 -0400 Subject: [PATCH 05/14] x86, swiotlb: Make SWIOTLB use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] And set the SWIOTLB IOMMU_INIT to utilize 'pci_swiotlb_init' for .init and 'pci_swiotlb_late_init' for .late_init. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-6-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/pci-swiotlb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index c7a72faeb146..8f972cbddef0 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -10,7 +10,8 @@ #include #include #include - +#include +#include int swiotlb __read_mostly; static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -55,6 +56,10 @@ int __init pci_swiotlb_detect_override(void) return use_swiotlb; } +IOMMU_INIT_FINISH(pci_swiotlb_detect_override, + pci_xen_swiotlb_detect, + pci_swiotlb_init, + pci_swiotlb_late_init); /* * if 4GB or more detected (and iommu=off not set) return 1 @@ -69,6 +74,10 @@ int __init pci_swiotlb_detect_4gb(void) #endif return swiotlb; } +IOMMU_INIT(pci_swiotlb_detect_4gb, + pci_swiotlb_detect_override, + pci_swiotlb_init, + pci_swiotlb_late_init); void __init pci_swiotlb_init(void) { From 5cb3a267939a223eb84692d229569d2ef493d7ca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:01 -0400 Subject: [PATCH 06/14] x86, xen-swiotlb: Make Xen-SWIOTLB use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [null] | [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] In other words, we set 'pci_xen_swiotlb_detect' to be the first detection to be run during start. CC: Fujita Tomonori Cc: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-7-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/xen/pci-swiotlb-xen.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a013ec9d0c54..22471001b74c 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -5,6 +5,7 @@ #include #include +#include int xen_swiotlb __read_mostly; @@ -56,3 +57,7 @@ void __init pci_xen_swiotlb_init(void) dma_ops = &xen_swiotlb_dma_ops; } } +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, + 0, + pci_xen_swiotlb_init, + 0); From d2aa232f3d0b5a3e22f91b736fe68eddcf0d5ea3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:02 -0400 Subject: [PATCH 07/14] x86, calgary: Make Calgary IOMMU use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] | [detect_calgary] Meaning that 'detect_calgary' is going to be called after 'pci_swiotlb_detect'. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-8-git-send-email-konrad.wilk@oracle.com> CC: Muli Ben-Yehuda CC: "Jon D. Mason" CC: "Darrick J. Wong" CC: Fujita Tomonori Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-calgary_64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 28c6b389fee6..f56a117cef68 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -1595,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void) * and before device_initcall. 
*/ rootfs_initcall(calgary_fixup_tce_spaces); + +IOMMU_INIT_POST(detect_calgary); From 22e6daf41ba28ddc06295e42859b266f737b3e99 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:03 -0400 Subject: [PATCH 08/14] x86, GART/AMD-VI: Make AMD GART and IOMMU use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [null] | [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] | +-------+--------+ / \ [detect_calgary] [gart_iommu_hole_init] | [amd_iommu_detect] Meaning that 'amd_iommu_detect' will be called after 'gart_iommu_hole_init'. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-9-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori CC: Joerg Roedel CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/amd_iommu_init.c | 7 ++++++- arch/x86/kernel/pci-gart_64.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0b9e2dc4fc9a..26a5e4385210 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -31,7 +31,7 @@ #include #include #include - +#include /* * definitions for the ACPI scanning code */ @@ -1430,3 +1430,8 @@ static int __init parse_amd_iommu_options(char *str) __setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); + +IOMMU_INIT_FINISH(amd_iommu_detect, + gart_iommu_hole_init, + 0, + 0); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..de9734b100a4 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -41,6 +41,7 @@ #include #include #include +#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -896,3 +897,4 @@ void __init gart_parse_options(char *p) } } } 
+IOMMU_INIT_POST(gart_iommu_hole_init); From 4db77ff3237a88ea74f691dd776e92b2f86a8f3f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:04 -0400 Subject: [PATCH 09/14] x86, VT-d: Make Intel VT-d IOMMU use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [null] | [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] | +-------+--------+---------------------+ / \ \ [detect_calgary] [gart_iommu_hole_init] [detect_intel_iommu] | [amd_iommu_detect] Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-10-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori CC: Jesse Barnes CC: David Woodhouse CC: Len Brown CC: Chris Wright CC: Yinghai Lu Signed-off-by: H. Peter Anvin --- drivers/pci/dmar.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 5fa64ea5416f..4ef56a0920a7 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -36,6 +36,7 @@ #include #include #include +#include #define PREFIX "DMAR: " @@ -724,7 +725,7 @@ int __init detect_intel_iommu(void) early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); dmar_tbl = NULL; - return (ret ? 1 : -ENODEV); + return ret ? 1 : -ENODEV; } @@ -1457,3 +1458,4 @@ int __init dmar_ir_support(void) return 0; return dmar->flags & 0x1; } +IOMMU_INIT_POST(detect_intel_iommu); From ee1f284f38c8dfcbc7b656915a039dde016de7d3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:05 -0400 Subject: [PATCH 10/14] x86, iommu: Utilize the IOMMU_INIT macros functionality. We remove all of the sub-platform detection/init routines and instead use on the .iommu_table array of structs to call the .early_init if .detect returned a positive value. Also we can stop detecting other IOMMUs if the IOMMU used the _FINISH type macro. During the 'pci_iommu_init' stage, we call .init for the second-stage initialization if it was defined. 
Currently only SWIOTLB has this defined and it used to de-allocate the SWIOTLB if the other detected IOMMUs have deemed it unnecessary to use SWIOTLB. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-11-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-dma.c | 46 ++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1b3beb5075e6..9ea999a4dcc1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,9 +11,8 @@ #include #include #include -#include #include -#include +#include static int forbid_dac __read_mostly; @@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + /* Dummy device used for NULL arguments (normally ISA). 
*/ struct device x86_dma_fallback_dev = { .init_name = "fallback device", @@ -130,28 +131,24 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + struct iommu_table_entry *p; + /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_xen_swiotlb_detect() || pci_swiotlb_detect_override()) - goto out; + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); - pci_swiotlb_detect_4gb(); - - gart_iommu_hole_init(); - - detect_calgary(); - - detect_intel_iommu(); - - /* needs to be called after gart_iommu_hole_init */ - amd_iommu_detect(); -out: - pci_xen_swiotlb_init(); - - pci_swiotlb_init(); + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } } - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -294,6 +291,7 @@ EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { + struct iommu_table_entry *p; dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); #ifdef CONFIG_PCI @@ -301,12 +299,10 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb || xen_swiotlb) { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } else - swiotlb_free(); + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } return 0; } From 6f44d0337cc54a46e83b4c8a6195607e78fff71d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Aug 2010 14:19:33 -0400 Subject: [PATCH 11/14] x86, doc: Adding comments about .iommu_table and its neighbors. Updating the linker section with comments about .iommu_table and some other ones that I know of. CC: Sam Ravnborg CC: H. 
Peter Anvin CC: Fujita Tomonori CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282933173-19960-1-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b92e040466c1..3f07c370f761 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -242,6 +242,12 @@ SECTIONS __x86_cpu_dev_end = .; } + /* + * start address and size of operations which during runtime + * can be patched with virtualization friendly instructions or + * baremetal native ones. Think page table operations. + * Details in paravirt_types.h + */ . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -249,6 +255,11 @@ SECTIONS __parainstructions_end = .; } + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + */ . = ALIGN(8); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { __alt_instructions = .; @@ -256,10 +267,21 @@ SECTIONS __alt_instructions_end = .; } + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + */ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } + /* + * struct iommu_table_entry entries are injected in this section. + * It is an array of IOMMUs which during run time gets sorted depending + * on its dependency order. After rootfs_initcall is complete + * this section can be safely removed. 
+ */ .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { __iommu_table = .; *(.iommu_table) From 7ac41ccf47d82569d26f34beab1dec92cc3b6347 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 30 Aug 2010 14:10:02 -0400 Subject: [PATCH 12/14] x86, iommu: Fix IOMMU_INIT alignment rules This boot crash was observed: DMA-API: preallocated 32768 debug entries DMA-API: debugging enabled by kernel config BUG: unable to handle kernel paging request at 19da8955 IP: [] 0xf4ffffff *pde = 00000000 The crux of the failure was that even if we did not use any of the .iommu_table section, the linker would still insert it in the vmlinux file. This patch fixes that and also fixes the runtime crash where we would try to access the array. Reported-by: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk Cc: Joerg Roedel Cc: FUJITA Tomonori LKML-Reference: <1283191802-25086-1-git-send-email-konrad.wilk@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3f07c370f761..38e2b67807e1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -285,10 +285,9 @@ SECTIONS .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { __iommu_table = .; *(.iommu_table) - . = ALIGN(8); __iommu_table_end = .; } - + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame From fc6a2f37d084173de57fe75f73cbe4bb296b9e8a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 8 Oct 2010 14:53:47 -0400 Subject: [PATCH 13/14] ia64, iommu: Add a dummy iommu_table.h file in IA64. We don't need a complex IOMMU dependency list on IA64 so we just define the IOMMU_* macro which is used by the DMAR driver, as a dummy.
Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1286564028-2352-2-git-send-email-konrad.wilk@oracle.com> Reported-by: Tony Luck Tested-by: Tony Luck Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin --- arch/ia64/include/asm/iommu_table.h | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 arch/ia64/include/asm/iommu_table.h diff --git a/arch/ia64/include/asm/iommu_table.h b/arch/ia64/include/asm/iommu_table.h new file mode 100644 index 000000000000..92c8d36ae5ae --- /dev/null +++ b/arch/ia64/include/asm/iommu_table.h @@ -0,0 +1,6 @@ +#ifndef _ASM_IA64_IOMMU_TABLE_H +#define _ASM_IA64_IOMMU_TABLE_H + +#define IOMMU_INIT_POST(_detect) + +#endif /* _ASM_IA64_IOMMU_TABLE_H */ From 6e9636693373d938aa3b13427be3d212f172ac06 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 8 Oct 2010 14:53:48 -0400 Subject: [PATCH 14/14] x86, iommu: Update header comments with appropriate naming The header comments diverged a bit from the implementation. Let's re-sync them. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1286564028-2352-3-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iommu_table.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index df55a78888e3..f229b13a5f30 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -1,4 +1,3 @@ - #ifndef _ASM_X86_IOMMU_TABLE_H #define _ASM_X86_IOMMU_TABLE_H @@ -60,7 +59,7 @@ struct iommu_table_entry { * and it will be run after the SWIOTLB and the other IOMMUs * that utilize this macro. If the IOMMU is detected (ie, the * detect routine returns a positive value), the other IOMMUs - * are also checked. You can use IOMMU_INIT_FINISH if you prefer + * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer * to stop detecting the other IOMMUs after yours has been detected.
*/ #define IOMMU_INIT_POST(_detect) \ @@ -80,9 +79,9 @@ struct iommu_table_entry { * d). Similar to the 'init', except that this gets called from pci_iommu_init * where we do have a memory allocator. * - * The _CONT vs the _EXIT differs in that the _CONT variant will + * The standard vs the _FINISH differs in that the standard variant will * continue detecting other IOMMUs in the call list after the - * the detection routine returns a positive number. The _EXIT will + * detection routine returns a positive number. The _FINISH will * stop the execution chain. Both will still call the 'init' and * 'late_init' functions if they are set. */