s390/boot: move dma sections from decompressor to decompressed kernel

This change simplifies the task of making the decompressor relocatable.

The decompressor's image contains special DMA sections between _sdma and
_edma. This DMA segment is loaded at boot as part of the decompressor and
then simply handed over to the decompressed kernel. The decompressor itself
never uses it in any way. The primary reason for this is the need to keep
the aforementioned DMA segment below 2GB which is required by architecture,
and because the decompressor is always loaded at a fixed low physical
address, it is guaranteed that the DMA region will not cross the 2GB
memory limit. If the DMA region had been placed in the decompressed kernel,
then KASLR would make this guarantee impossible to fulfill, or KASLR itself
would have to be restricted to the first 2GB of the memory address space.

This commit moves all DMA sections between _sdma and _edma from
the decompressor's image to the decompressed kernel's image. The complete
DMA region is placed in the init section of the decompressed kernel and
immediately relocated below 2GB at start-up before it is needed by other
parts of the decompressed kernel. The relocation of the DMA region happens
even if the decompressed kernel is already located below 2GB in order
to keep the first implementation simple. The relocation should not have
any noticeable impact on boot time because the DMA segment is only a couple
of pages.

After relocating the DMA sections, the kernel has to fix all references
which point into it. In order to automate this, place all variables
pointing into the DMA sections in a special .dma.refs section. All such
variables must be defined using the new __dma_ref macro. Only variables
containing addresses within the DMA sections must be placed in the new
.dma.refs section.

Furthermore, move the initialization of control registers from
the decompressor to the decompressed kernel because some control registers
reference tables that must be placed in the DMA data section to
guarantee that their addresses are below 2G. Because the decompressed
kernel relocates the DMA sections at startup, the content of control
registers CR2, CR5 and CR15 must be updated with new addresses after
the relocation. The decompressed kernel initializes all control registers
early at boot and then updates the content of CR2, CR5 and CR15
as soon as the DMA relocation has occurred. This practically reverts
the commit a80313ff91 ("s390/kernel: introduce .dma sections").

Signed-off-by: Alexander Egorenkov <egorenar@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
This commit is contained in:
Alexander Egorenkov 2021-06-15 19:17:36 +02:00 committed by Heiko Carstens
parent 97dd89e901
commit 6bda667037
13 changed files with 183 additions and 99 deletions

View File

@ -36,7 +36,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
obj-y += version.o pgm_check_info.o ctype.o text_dma.o obj-y += version.o pgm_check_info.o ctype.o
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o

View File

@ -26,10 +26,6 @@ extern int vmalloc_size_set;
extern int kaslr_enabled; extern int kaslr_enabled;
extern char __boot_data_start[], __boot_data_end[]; extern char __boot_data_start[], __boot_data_end[];
extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
extern char _sdma[], _edma[];
extern char _stext_dma[], _etext_dma[];
extern struct exception_table_entry _start_dma_ex_table[];
extern struct exception_table_entry _stop_dma_ex_table[];
extern char _decompressor_syms_start[], _decompressor_syms_end[]; extern char _decompressor_syms_start[], _decompressor_syms_end[];
extern char _stack_start[], _stack_end[]; extern char _stack_start[], _stack_end[];

View File

@ -37,29 +37,6 @@ SECTIONS
*(.data.*) *(.data.*)
_edata = . ; _edata = . ;
} }
/*
* .dma section for code, data, ex_table that need to stay below 2 GB,
* even when the kernel is relocated above 2 GB.
*/
. = ALIGN(PAGE_SIZE);
_sdma = .;
.dma.text : {
_stext_dma = .;
*(.dma.text)
. = ALIGN(PAGE_SIZE);
_etext_dma = .;
}
. = ALIGN(16);
.dma.ex_table : {
_start_dma_ex_table = .;
KEEP(*(.dma.ex_table))
_stop_dma_ex_table = .;
}
.dma.data : {
*(.dma.data)
}
. = ALIGN(PAGE_SIZE);
_edma = .;
BOOT_DATA BOOT_DATA
BOOT_DATA_PRESERVED BOOT_DATA_PRESERVED

View File

@ -317,7 +317,6 @@ SYM_CODE_START_LOCAL(startup_normal)
xc 0x300(256),0x300 xc 0x300(256),0x300
xc 0xe00(256),0xe00 xc 0xe00(256),0xe00
xc 0xf00(256),0xf00 xc 0xf00(256),0xf00
lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers
stcke __LC_BOOT_CLOCK stcke __LC_BOOT_CLOCK
mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1
spt 6f-.LPG0(%r13) spt 6f-.LPG0(%r13)
@ -336,35 +335,6 @@ SYM_CODE_END(startup_normal)
.quad 0x0000000180000000,startup_pgm_check_handler .quad 0x0000000180000000,startup_pgm_check_handler
.Lio_new_psw: .Lio_new_psw:
.quad 0x0002000180000000,0x1f0 # disabled wait .quad 0x0002000180000000,0x1f0 # disabled wait
.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space
.quad 0 # cr1: primary space segment table
.quad .Lduct # cr2: dispatchable unit control table
.quad 0 # cr3: instruction authorization
.quad 0xffff # cr4: instruction authorization
.quad .Lduct # cr5: primary-aste origin
.quad 0 # cr6: I/O interrupts
.quad 0 # cr7: secondary space segment table
.quad 0x0000000000008000 # cr8: access registers translation
.quad 0 # cr9: tracing off
.quad 0 # cr10: tracing off
.quad 0 # cr11: tracing off
.quad 0 # cr12: tracing off
.quad 0 # cr13: home space segment table
.quad 0xc0000000 # cr14: machine check handling off
.quad .Llinkage_stack # cr15: linkage stack operations
.section .dma.data,"aw",@progbits
.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0
.long 0,0,0,0,0,0,0,0
.Llinkage_stack:
.long 0,0,0x89000000,0,0,0,0x8a000000,0
.align 64
.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0
.align 128
.Lduald:.rept 8
.long 0x80000000,0,0,0 # invalid access-list entries
.endr
.previous
#include "head_kdump.S" #include "head_kdump.S"

View File

@ -29,37 +29,6 @@ u64 __bootdata_preserved(stfle_fac_list[16]);
u64 __bootdata_preserved(alt_stfle_fac_list[16]); u64 __bootdata_preserved(alt_stfle_fac_list[16]);
struct oldmem_data __bootdata_preserved(oldmem_data); struct oldmem_data __bootdata_preserved(oldmem_data);
/*
* Some code and data needs to stay below 2 GB, even when the kernel would be
* relocated above 2 GB, because it has to use 31 bit addresses.
* Such code and data is part of the .dma section, and its location is passed
* over to the decompressed / relocated kernel via the .boot.preserved.data
* section.
*/
unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma);
unsigned long __bootdata_preserved(__edma) = __pa(&_edma);
unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma);
unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma);
struct exception_table_entry *
__bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table;
struct exception_table_entry *
__bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table;
int _diag210_dma(struct diag210 *addr);
int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode);
int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode);
void _diag0c_dma(struct hypfs_diag0c_entry *entry);
void _diag308_reset_dma(void);
struct diag_ops __bootdata_preserved(diag_dma_ops) = {
.diag210 = _diag210_dma,
.diag26c = _diag26c_dma,
.diag14 = _diag14_dma,
.diag0c = _diag0c_dma,
.diag308_reset = _diag308_reset_dma
};
static struct diag210 _diag210_tmp_dma __section(".dma.data");
struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma;
void error(char *x) void error(char *x)
{ {
sclp_early_printk("\n\n"); sclp_early_printk("\n\n");

View File

@ -309,6 +309,7 @@ int diag26c(void *req, void *resp, enum diag26c_sc subcode);
struct hypfs_diag0c_entry; struct hypfs_diag0c_entry;
/* This struct must contain only pointers/references into the text DMA section. */
struct diag_ops { struct diag_ops {
int (*diag210)(struct diag210 *addr); int (*diag210)(struct diag210 *addr);
int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode);
@ -319,4 +320,11 @@ struct diag_ops {
extern struct diag_ops diag_dma_ops; extern struct diag_ops diag_dma_ops;
extern struct diag210 *__diag210_tmp_dma; extern struct diag210 *__diag210_tmp_dma;
int _diag210_dma(struct diag210 *addr);
int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode);
int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode);
void _diag0c_dma(struct hypfs_diag0c_entry *entry);
void _diag308_reset_dma(void);
#endif /* _ASM_S390_DIAG_H */ #endif /* _ASM_S390_DIAG_H */

View File

@ -40,7 +40,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
obj-y += smp.o obj-y += smp.o text_dma.o
extra-y += head64.o vmlinux.lds extra-y += head64.o vmlinux.lds

View File

@ -14,6 +14,7 @@
#include <asm/diag.h> #include <asm/diag.h>
#include <asm/trace/diag.h> #include <asm/trace/diag.h>
#include <asm/sections.h> #include <asm/sections.h>
#include "entry.h"
struct diag_stat { struct diag_stat {
unsigned int counter[NR_DIAG_STAT]; unsigned int counter[NR_DIAG_STAT];
@ -50,8 +51,16 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
}; };
struct diag_ops __bootdata_preserved(diag_dma_ops); struct diag_ops __dma_ref diag_dma_ops = {
struct diag210 *__bootdata_preserved(__diag210_tmp_dma); .diag210 = _diag210_dma,
.diag26c = _diag26c_dma,
.diag14 = _diag14_dma,
.diag0c = _diag0c_dma,
.diag308_reset = _diag308_reset_dma
};
static struct diag210 _diag210_tmp_dma __section(".dma.data");
struct diag210 __dma_ref *__diag210_tmp_dma = &_diag210_tmp_dma;
static int show_diag_stat(struct seq_file *m, void *v) static int show_diag_stat(struct seq_file *m, void *v)
{ {

View File

@ -64,4 +64,13 @@ void stack_free(unsigned long stack);
extern char kprobes_insn_page[]; extern char kprobes_insn_page[];
extern char _sdma[], _edma[];
extern char _stext_dma[], _etext_dma[];
extern struct exception_table_entry _start_dma_ex_table[];
extern struct exception_table_entry _stop_dma_ex_table[];
#define __dma_data __section(".dma.data")
#define __dma_ref __section(".dma.refs")
extern long _start_dma_refs[], _end_dma_refs[];
#endif /* _ENTRY_H */ #endif /* _ENTRY_H */

View File

@ -21,6 +21,7 @@ ENTRY(startup_continue)
larl %r1,tod_clock_base larl %r1,tod_clock_base
mvc 0(16,%r1),__LC_BOOT_CLOCK mvc 0(16,%r1),__LC_BOOT_CLOCK
larl %r13,.LPG1 # get base larl %r13,.LPG1 # get base
lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers
# #
# Setup stack # Setup stack
# #
@ -41,3 +42,19 @@ ENTRY(startup_continue)
.align 16 .align 16
.LPG1: .LPG1:
.Ldw: .quad 0x0002000180000000,0x0000000000000000 .Ldw: .quad 0x0002000180000000,0x0000000000000000
.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space
.quad 0 # cr1: primary space segment table
.quad 0 # cr2: dispatchable unit control table
.quad 0 # cr3: instruction authorization
.quad 0xffff # cr4: instruction authorization
.quad 0 # cr5: primary-aste origin
.quad 0 # cr6: I/O interrupts
.quad 0 # cr7: secondary space segment table
.quad 0x0000000000008000 # cr8: access registers translation
.quad 0 # cr9: tracing off
.quad 0 # cr10: tracing off
.quad 0 # cr11: tracing off
.quad 0 # cr12: tracing off
.quad 0 # cr13: home space segment table
.quad 0xc0000000 # cr14: machine check handling off
.quad 0 # cr15: linkage stack operations

View File

@ -94,17 +94,64 @@ char elf_platform[ELF_PLATFORM_SIZE];
unsigned long int_hwcap = 0; unsigned long int_hwcap = 0;
/*
* Some code and data needs to stay below 2 GB, even when the kernel would be
* relocated above 2 GB, because it has to use 31 bit addresses.
* Such code and data is part of the .dma section.
*/
unsigned long __dma_ref __sdma = __pa(&_sdma);
unsigned long __dma_ref __edma = __pa(&_edma);
unsigned long __dma_ref __stext_dma = __pa(&_stext_dma);
unsigned long __dma_ref __etext_dma = __pa(&_etext_dma);
struct exception_table_entry __dma_ref *__start_dma_ex_table = _start_dma_ex_table;
struct exception_table_entry __dma_ref *__stop_dma_ex_table = _stop_dma_ex_table;
/*
* Control registers CR2, CR5 and CR15 are initialized with addresses
* of tables that must be placed below 2G which is handled by the DMA
* sections.
* Because the DMA sections are relocated below 2G at startup,
* the content of control registers CR2, CR5 and CR15 must be updated
* with new addresses after the relocation. The initial initialization of
* control registers occurs in head64.S and then gets updated again after DMA
* relocation. We must access the relevant DMA tables indirectly via
* pointers placed in the .dma.refs linker section. Those pointers get
* updated automatically during DMA relocation and always contain a valid
* address within DMA sections.
*/
/*
 * Dispatchable unit control table. It starts out zeroed; setup_cr() fills
 * words 1, 2 and 4 with the (relocated) addresses of the ASTE and the DUALD
 * and loads the table's address into CR2 and CR5.
 */
static __dma_data u32 __ctl_duct_dma[16] __aligned(64);
/* ASTE that DUCT words 1 and 2 point to; only doubleword 1 is non-zero. */
static __dma_data u64 __ctl_aste_dma[8] __aligned(64) = {
[1] = 0xffffffffffffffff
};
/*
 * DUALD that DUCT word 4 points to: eight 4-word entries, each one an
 * invalid access-list entry (first word 0x80000000).
 */
static __dma_data u32 __ctl_duald_dma[32] __aligned(128) = {
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0,
0x80000000, 0, 0, 0
};
/* Initial linkage stack; setup_cr() loads its address into CR15. */
static __dma_data u32 __ctl_linkage_stack_dma[8] __aligned(64) = {
0, 0, 0x89000000, 0,
0, 0, 0x8a000000, 0
};
/*
 * Pointers through which the tables above must be accessed. They are placed
 * in .dma.refs, so relocate_dma_section() adds the relocation offset to each
 * of them and they keep pointing into the DMA sections after the move.
 */
static u64 __dma_ref *__ctl_aste = __ctl_aste_dma;
static u32 __dma_ref *__ctl_duald = __ctl_duald_dma;
static u32 __dma_ref *__ctl_linkage_stack = __ctl_linkage_stack_dma;
static u32 __dma_ref *__ctl_duct = __ctl_duct_dma;
int __bootdata(noexec_disabled); int __bootdata(noexec_disabled);
unsigned long __bootdata(ident_map_size); unsigned long __bootdata(ident_map_size);
struct mem_detect_info __bootdata(mem_detect); struct mem_detect_info __bootdata(mem_detect);
struct initrd_data __bootdata(initrd_data); struct initrd_data __bootdata(initrd_data);
struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table);
struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table);
unsigned long __bootdata_preserved(__stext_dma);
unsigned long __bootdata_preserved(__etext_dma);
unsigned long __bootdata_preserved(__sdma);
unsigned long __bootdata_preserved(__edma);
unsigned long __bootdata_preserved(__kaslr_offset); unsigned long __bootdata_preserved(__kaslr_offset);
unsigned int __bootdata_preserved(zlib_dfltcc_support); unsigned int __bootdata_preserved(zlib_dfltcc_support);
EXPORT_SYMBOL(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support);
@ -753,7 +800,6 @@ static void __init reserve_kernel(void)
memblock_reserve(0, HEAD_END); memblock_reserve(0, HEAD_END);
memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn)
- (unsigned long)_stext); - (unsigned long)_stext);
memblock_reserve(__sdma, __edma - __sdma);
} }
static void __init setup_memory(void) static void __init setup_memory(void)
@ -773,6 +819,53 @@ static void __init setup_memory(void)
memblock_enforce_memory_limit(memblock_end_of_DRAM()); memblock_enforce_memory_limit(memblock_end_of_DRAM());
} }
/*
 * Move the DMA sections (the range from __sdma up to __edma) into a region
 * obtained from memblock_alloc_low() and fix up every reference into them.
 *
 * Each long-sized slot between _start_dma_refs and _end_dma_refs is shifted
 * by the distance between the old and the new location. The old range is
 * zeroed afterwards. Panics when no memory can be allocated.
 */
static void __init relocate_dma_section(void)
{
	unsigned long old_base = __sdma;
	unsigned long nbytes = __edma - old_base;
	unsigned long new_base;
	long delta;
	long *ref;

	pr_info("Relocating DMA section of size 0x%08lx\n", nbytes);

	/* Obtain the replacement region for the DMA sections */
	new_base = (unsigned long)memblock_alloc_low(nbytes, PAGE_SIZE);
	if (!new_base)
		panic("Failed to allocate memory for DMA section\n");
	delta = new_base - old_base;

	/* Copy the contents over ... */
	memmove((void *)new_base, (void *)old_base, nbytes);
	/* ... and wipe the source so that stale accesses to it stand out */
	memset((void *)old_base, 0, nbytes);

	/*
	 * Shift every recorded reference by the relocation distance. This
	 * has to come last: the wipe above still relies on the old base.
	 */
	ref = _start_dma_refs;
	while (ref != _end_dma_refs) {
		*ref += delta;
		ref++;
	}
}
/*
 * Point control registers CR2, CR5 and CR15 at the control tables that live
 * in the DMA data section.
 *
 * This must be called after DMA relocation: the tables are reached through
 * the __ctl_* pointers from .dma.refs, and those only hold the final
 * addresses once relocate_dma_section() has adjusted them.
 */
static void __init setup_cr(void)
{
union ctlreg2 cr2;
union ctlreg5 cr5;
union ctlreg15 cr15;
/*
 * Link the DUCT to the ASTE (words 1 and 2) and to the DUALD (word 4).
 * The slots are 32 bits wide, so only the low 32 bits of each address
 * are kept.
 */
__ctl_duct[1] = (unsigned long)__ctl_aste;
__ctl_duct[2] = (unsigned long)__ctl_aste;
__ctl_duct[4] = (unsigned long)__ctl_duald;
/*
 * Update control registers CR2, CR5 and CR15: fetch the current values
 * (presumably what __ctl_store does -- confirm against its definition),
 * replace only the table-origin fields and load the result back.
 */
__ctl_store(cr2.val, 2, 2);
__ctl_store(cr5.val, 5, 5);
__ctl_store(cr15.val, 15, 15);
/* CR2 and CR5 both take the DUCT address shifted right by 6 bits */
cr2.ducto = (unsigned long)__ctl_duct >> 6;
cr5.pasteo = (unsigned long)__ctl_duct >> 6;
/* CR15 takes the linkage stack address shifted right by 3 bits */
cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3;
__ctl_load(cr2.val, 2, 2);
__ctl_load(cr5.val, 5, 5);
__ctl_load(cr15.val, 15, 15);
}
/* /*
* Setup hardware capabilities. * Setup hardware capabilities.
*/ */
@ -1061,6 +1154,9 @@ void __init setup_arch(char **cmdline_p)
free_mem_detect_info(); free_mem_detect_info();
relocate_dma_section();
setup_cr();
setup_uv(); setup_uv();
setup_memory_end(); setup_memory_end();
setup_memory(); setup_memory();

View File

@ -71,6 +71,13 @@ SECTIONS
RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE)
BOOT_DATA_PRESERVED BOOT_DATA_PRESERVED
. = ALIGN(8);
.dma.refs : {
_start_dma_refs = .;
*(.dma.refs)
_end_dma_refs = .;
}
_edata = .; /* End of data section */ _edata = .; /* End of data section */
/* will be freed after init */ /* will be freed after init */
@ -136,6 +143,32 @@ SECTIONS
BOOT_DATA BOOT_DATA
/*
* .dma section for code, data, ex_table that need to stay below 2 GB,
* even when the kernel is relocated above 2 GB.
*/
. = ALIGN(PAGE_SIZE);
_sdma = .;
.dma.text : {
_stext_dma = .;
*(.dma.text)
*(.dma.text.*_indirect_*)
. = ALIGN(PAGE_SIZE);
_etext_dma = .;
}
. = ALIGN(16);
.dma.ex_table : {
_start_dma_ex_table = .;
KEEP(*(.dma.ex_table))
_stop_dma_ex_table = .;
}
. = ALIGN(PAGE_SIZE);
.dma.data : {
*(.dma.data)
}
. = ALIGN(PAGE_SIZE);
_edma = .;
/* early.c uses stsi, which requires page aligned data. */ /* early.c uses stsi, which requires page aligned data. */
. = ALIGN(PAGE_SIZE); . = ALIGN(PAGE_SIZE);
INIT_DATA_SECTION(0x100) INIT_DATA_SECTION(0x100)