Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD

This commit is contained in:
Paolo Bonzini 2016-07-11 18:10:06 +02:00
commit 6d5315b3a6
13 changed files with 407 additions and 9 deletions

View File

@ -0,0 +1,45 @@
/*
* Hypervisor Maintenance Interrupt header file.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
*
* Copyright 2015 IBM Corporation
* Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
*/
#ifndef __ASM_PPC64_HMI_H__
#define __ASM_PPC64_HMI_H__
#ifdef CONFIG_PPC_BOOK3S_64
#define CORE_TB_RESYNC_REQ_BIT 63
#define MAX_SUBCORE_PER_CORE 4
/*
* sibling_subcore_state structure is used to co-ordinate all threads
* during HMI to avoid TB corruption. This structure is allocated once
* per each core and shared by all threads on that core.
*/
struct sibling_subcore_state {
unsigned long flags;
u8 in_guest[MAX_SUBCORE_PER_CORE];
};
extern void wait_for_subcore_guest_exit(void);
extern void wait_for_tb_resync(void);
#else
static inline void wait_for_subcore_guest_exit(void) { }
static inline void wait_for_tb_resync(void) { }
#endif
#endif /* __ASM_PPC64_HMI_H__ */

View File

@ -25,6 +25,7 @@
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
#include <asm/kvm_book3s_asm.h>
#endif
#include <asm/hmi.h>
register struct paca_struct *local_paca asm("r13");
@ -181,6 +182,11 @@ struct paca_struct {
*/
u16 in_mce;
u8 hmi_event_available; /* HMI event is available */
/*
* Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
* more details
*/
struct sibling_subcore_state *sibling_subcore_state;
#endif
/* Stuff for accurate time accounting */

View File

@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o
obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o
obj64-$(CONFIG_RELOCATABLE) += reloc_64.o
obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o
obj-$(CONFIG_PPC64) += vdso64/

View File

@ -680,6 +680,8 @@ _GLOBAL(__replay_interrupt)
BEGIN_FTR_SECTION
cmpwi r3,0xe80
beq h_doorbell_common
cmpwi r3,0xe60
beq hmi_exception_common
FTR_SECTION_ELSE
cmpwi r3,0xa00
beq doorbell_super_common
@ -1172,7 +1174,7 @@ fwnmi_data_area:
.globl hmi_exception_early
hmi_exception_early:
EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
mr r10,r1 /* Save r1 */
ld r1,PACAEMERGSP(r13) /* Use emergency stack */
subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */

56
arch/powerpc/kernel/hmi.c Normal file
View File

@ -0,0 +1,56 @@
/*
* Hypervisor Maintenance Interrupt (HMI) handling.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
*
* Copyright 2015 IBM Corporation
* Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
*/
#undef DEBUG
#include <linux/types.h>
#include <linux/compiler.h>
#include <asm/paca.h>
#include <asm/hmi.h>
void wait_for_subcore_guest_exit(void)
{
int i;
/*
* NULL bitmap pointer indicates that KVM module hasn't
* been loaded yet and hence no guests are running.
* If no KVM is in use, no need to co-ordinate among threads
* as all of them will always be in host and no one is going
* to modify TB other than the opal hmi handler.
* Hence, just return from here.
*/
if (!local_paca->sibling_subcore_state)
return;
for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
while (local_paca->sibling_subcore_state->in_guest[i])
cpu_relax();
}
void wait_for_tb_resync(void)
{
if (!local_paca->sibling_subcore_state)
return;
while (test_bit(CORE_TB_RESYNC_REQ_BIT,
&local_paca->sibling_subcore_state->flags))
cpu_relax();
}

View File

@ -270,8 +270,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
ld r2,PACATOC(r13); \
ld r1,PACAR1(r13); \
std r3,ORIG_GPR3(r1); /* Save original r3 */ \
li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \
bl opal_call_realmode; \
li r3,0; /* NULL argument */ \
bl hmi_exception_realmode; \
nop; \
ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \
20: nop;

View File

@ -60,6 +60,7 @@
#include <asm/switch_to.h>
#include <asm/tm.h>
#include <asm/debug.h>
#include <asm/hmi.h>
#include <sysdev/fsl_pci.h>
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@ -307,9 +308,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
{
__this_cpu_inc(irq_stat.hmi_exceptions);
wait_for_subcore_guest_exit();
if (ppc_md.hmi_exception_early)
ppc_md.hmi_exception_early(regs);
wait_for_tb_resync();
return 0;
}

View File

@ -52,6 +52,7 @@
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = {
.hcall_implemented = kvmppc_hcall_impl_hv,
};
static int kvm_init_subcore_bitmap(void)
{
int i, j;
int nr_cores = cpu_nr_cores();
struct sibling_subcore_state *sibling_subcore_state;
for (i = 0; i < nr_cores; i++) {
int first_cpu = i * threads_per_core;
int node = cpu_to_node(first_cpu);
/* Ignore if it is already allocated. */
if (paca[first_cpu].sibling_subcore_state)
continue;
sibling_subcore_state =
kmalloc_node(sizeof(struct sibling_subcore_state),
GFP_KERNEL, node);
if (!sibling_subcore_state)
return -ENOMEM;
memset(sibling_subcore_state, 0,
sizeof(struct sibling_subcore_state));
for (j = 0; j < threads_per_core; j++) {
int cpu = first_cpu + j;
paca[cpu].sibling_subcore_state = sibling_subcore_state;
}
}
return 0;
}
static int kvmppc_book3s_init_hv(void)
{
int r;
@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void)
if (r < 0)
return -ENODEV;
r = kvm_init_subcore_bitmap();
if (r)
return r;
kvm_ops_hv.owner = THIS_MODULE;
kvmppc_hv_ops = &kvm_ops_hv;

View File

@ -13,6 +13,9 @@
#include <linux/kernel.h>
#include <asm/opal.h>
#include <asm/mce.h>
#include <asm/machdep.h>
#include <asm/cputhreads.h>
#include <asm/hmi.h>
/* SRR1 bits for machine check on POWER7 */
#define SRR1_MC_LDSTERR (1ul << (63-42))
@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
{
return kvmppc_realmode_mc_power7(vcpu);
}
/* Check if dynamic split is in force and return subcore size accordingly. */
static inline int kvmppc_cur_subcore_size(void)
{
if (local_paca->kvm_hstate.kvm_split_mode)
return local_paca->kvm_hstate.kvm_split_mode->subcore_size;
return threads_per_subcore;
}
void kvmppc_subcore_enter_guest(void)
{
int thread_id, subcore_id;
thread_id = cpu_thread_in_core(local_paca->paca_index);
subcore_id = thread_id / kvmppc_cur_subcore_size();
local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
}
void kvmppc_subcore_exit_guest(void)
{
int thread_id, subcore_id;
thread_id = cpu_thread_in_core(local_paca->paca_index);
subcore_id = thread_id / kvmppc_cur_subcore_size();
local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
}
static bool kvmppc_tb_resync_required(void)
{
if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
&local_paca->sibling_subcore_state->flags))
return false;
return true;
}
static void kvmppc_tb_resync_done(void)
{
clear_bit(CORE_TB_RESYNC_REQ_BIT,
&local_paca->sibling_subcore_state->flags);
}
/*
* kvmppc_realmode_hmi_handler() is called only by primary thread during
* guest exit path.
*
* There are multiple reasons why HMI could occur, one of them is
* Timebase (TB) error. If this HMI is due to TB error, then TB would
* have been in stopped state. The opal hmi handler Will fix it and
* restore the TB value with host timebase value. For HMI caused due
* to non-TB errors, opal hmi handler will not touch/restore TB register
* and hence there won't be any change in TB value.
*
* Since we are not sure about the cause of this HMI, we can't be sure
* about the content of TB register whether it holds guest or host timebase
* value. Hence the idea is to resync the TB on every HMI, so that we
* know about the exact state of the TB value. Resync TB call will
* restore TB to host timebase.
*
* Things to consider:
* - On TB error, HMI interrupt is reported on all the threads of the core
* that has encountered TB error irrespective of split-core mode.
* - The very first thread on the core that get chance to fix TB error
* would rsync the TB with local chipTOD value.
* - The resync TB is a core level action i.e. it will sync all the TBs
* in that core independent of split-core mode. This means if we trigger
* TB sync from a thread from one subcore, it would affect TB values of
* sibling subcores of the same core.
*
* All threads need to co-ordinate before making opal hmi handler.
* All threads will use sibling_subcore_state->in_guest[] (shared by all
* threads in the core) in paca which holds information about whether
* sibling subcores are in Guest mode or host mode. The in_guest[] array
* is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
* subcore status. Only primary threads from each subcore is responsible
* to set/unset its designated array element while entering/exiting the
* guset.
*
* After invoking opal hmi handler call, one of the thread (of entire core)
* will need to resync the TB. Bit 63 from subcore state bitmap flags
* (sibling_subcore_state->flags) will be used to co-ordinate between
* primary threads to decide who takes up the responsibility.
*
* This is what we do:
* - Primary thread from each subcore tries to set resync required bit[63]
* of paca->sibling_subcore_state->flags.
* - The first primary thread that is able to set the flag takes the
* responsibility of TB resync. (Let us call it as thread leader)
* - All other threads which are in host will call
* wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
* paca->sibling_subcore_state to get cleared.
* - All the primary thread will clear its subcore status from subcore
* state in_guest[] array respectively.
* - Once all primary threads clear in_guest[0-3], all of them will invoke
* opal hmi handler.
* - Now all threads will wait for TB resync to complete by invoking
* wait_for_tb_resync() except the thread leader.
* - Thread leader will do a TB resync by invoking opal_resync_timebase()
* call and the it will clear the resync required bit.
* - All other threads will now come out of resync wait loop and proceed
* with individual execution.
* - On return of this function, primary thread will signal all
* secondary threads to proceed.
* - All secondary threads will eventually call opal hmi handler on
* their exit path.
*/
long kvmppc_realmode_hmi_handler(void)
{
int ptid = local_paca->kvm_hstate.ptid;
bool resync_req;
/* This is only called on primary thread. */
BUG_ON(ptid != 0);
__this_cpu_inc(irq_stat.hmi_exceptions);
/*
* By now primary thread has already completed guest->host
* partition switch but haven't signaled secondaries yet.
* All the secondary threads on this subcore is waiting
* for primary thread to signal them to go ahead.
*
* For threads from subcore which isn't in guest, they all will
* wait until all other subcores on this core exit the guest.
*
* Now set the resync required bit. If you are the first to
* set this bit then kvmppc_tb_resync_required() function will
* return true. For rest all other subcores
* kvmppc_tb_resync_required() will return false.
*
* If resync_req == true, then this thread is responsible to
* initiate TB resync after hmi handler has completed.
* All other threads on this core will wait until this thread
* clears the resync required bit flag.
*/
resync_req = kvmppc_tb_resync_required();
/* Reset the subcore status to indicate it has exited guest */
kvmppc_subcore_exit_guest();
/*
* Wait for other subcores on this core to exit the guest.
* All the primary threads and threads from subcore that are
* not in guest will wait here until all subcores are out
* of guest context.
*/
wait_for_subcore_guest_exit();
/*
* At this point we are sure that primary threads from each
* subcore on this core have completed guest->host partition
* switch. Now it is safe to call HMI handler.
*/
if (ppc_md.hmi_exception_early)
ppc_md.hmi_exception_early(NULL);
/*
* Check if this thread is responsible to resync TB.
* All other threads will wait until this thread completes the
* TB resync.
*/
if (resync_req) {
opal_resync_timebase();
/* Reset TB resync req bit */
kvmppc_tb_resync_done();
} else {
wait_for_tb_resync();
}
return 0;
}

View File

@ -29,6 +29,7 @@
#include <asm/kvm_book3s_asm.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/tm.h>
#include <asm/opal.h>
#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
@ -373,6 +374,18 @@ kvm_secondary_got_guest:
lwsync
std r0, HSTATE_KVM_VCORE(r13)
/*
* All secondaries exiting guest will fall through this path.
* Before proceeding, just check for HMI interrupt and
* invoke opal hmi handler. By now we are sure that the
* primary thread on this core/subcore has already made partition
* switch/TB resync and we are good to call opal hmi handler.
*/
cmpwi r12, BOOK3S_INTERRUPT_HMI
bne kvm_no_guest
li r3,0 /* NULL argument */
bl hmi_exception_realmode
/*
* At this point we have finished executing in the guest.
* We need to wait for hwthread_req to become zero, since
@ -427,6 +440,22 @@ kvm_no_guest:
* whole-core mode, so we need to nap.
*/
kvm_unsplit_nap:
/*
* When secondaries are napping in kvm_unsplit_nap() with
* hwthread_req = 1, HMI goes ignored even though subcores are
* already exited the guest. Hence HMI keeps waking up secondaries
* from nap in a loop and secondaries always go back to nap since
* no vcore is assigned to them. This makes impossible for primary
* thread to get hold of secondary threads resulting into a soft
* lockup in KVM path.
*
* Let us check if HMI is pending and handle it before we go to nap.
*/
cmpwi r12, BOOK3S_INTERRUPT_HMI
bne 55f
li r3, 0 /* NULL argument */
bl hmi_exception_realmode
55:
/*
* Ensure that secondary doesn't nap when it has
* its vcore pointer set.
@ -601,6 +630,11 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Mark the subcore state as inside guest */
bl kvmppc_subcore_enter_guest
nop
ld r5, HSTATE_KVM_VCORE(r13)
ld r4, HSTATE_KVM_VCPU(r13)
li r0,1
stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
@ -1683,6 +1717,23 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* If HMI, call kvmppc_realmode_hmi_handler() */
cmpwi r12, BOOK3S_INTERRUPT_HMI
bne 27f
bl kvmppc_realmode_hmi_handler
nop
li r12, BOOK3S_INTERRUPT_HMI
/*
* At this point kvmppc_realmode_hmi_handler would have resync-ed
* the TB. Hence it is not required to subtract guest timebase
* offset from timebase. So, skip it.
*
* Also, do not call kvmppc_subcore_exit_guest() because it has
* been invoked as part of kvmppc_realmode_hmi_handler().
*/
b 30f
27:
/* Subtract timebase offset from timebase */
ld r8,VCORE_TB_OFFSET(r5)
cmpdi r8,0
@ -1698,8 +1749,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addis r8,r8,0x100 /* if so, increment upper 40 bits */
mtspr SPRN_TBU40,r8
17: bl kvmppc_subcore_exit_guest
nop
30: ld r5,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
/* Reset PCR */
17: ld r0, VCORE_PCR(r5)
ld r0, VCORE_PCR(r5)
cmpdi r0, 0
beq 18f
li r0, 0
@ -2461,6 +2517,8 @@ BEGIN_FTR_SECTION
cmpwi r6, 3 /* hypervisor doorbell? */
beq 3f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
cmpwi r6, 0xa /* Hypervisor maintenance ? */
beq 4f
li r3, 1 /* anything else, return 1 */
0: blr
@ -2482,6 +2540,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, -1
blr
/* Woken up due to Hypervisor maintenance interrupt */
4: li r12, BOOK3S_INTERRUPT_HMI
li r3, 1
blr
/*
* Determine what sort of external interrupt is pending (if any).
* Returns:

View File

@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
int emul;
program_interrupt:
flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
/*
* shadow_srr1 only contains valid flags if we came here via
* a program exception. The other exceptions (emulation assist,
* FP unavailable, etc.) do not provide flags in SRR1, so use
* an illegal-instruction exception when injecting a program
* interrupt into the guest.
*/
if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
else
flags = SRR1_PROGILL;
emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
if (emul != EMULATE_DONE) {

View File

@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
advance = 0;
printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
"(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
kvmppc_core_queue_program(vcpu, 0);
}
}

View File

@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1); \
OPAL_BRANCH(opal_tracepoint_entry) \
mfcr r12; \
stw r12,8(r1); \
std r1,PACAR1(r13); \
li r11,0; \
mfmsr r12; \
ori r11,r11,MSR_EE; \
@ -127,7 +126,6 @@ opal_tracepoint_entry:
mfcr r12
std r11,16(r1)
stw r12,8(r1)
std r1,PACAR1(r13)
li r11,0
mfmsr r12
ori r11,r11,MSR_EE