linux/arch/powerpc/kvm/trace.h
Paul Mackerras de56a948b9 KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode.  Using hypervisor mode means
that the guest can use the processor's supervisor mode.  That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host.  This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.

This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses.  That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification.  In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.

Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.

This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.

With the guest running in supervisor mode, most exceptions go straight
to the guest.  We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.

We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.

In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount.  Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.

The POWER7 processor has a restriction that all threads in a core have
to be in the same partition.  MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest.  At present we require the host and guest to run
in single-thread mode because of this hardware restriction.

This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management.  This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.

This also adds a few new exports needed by the book3s_hv code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-07-12 13:16:54 +03:00

344 lines
8.3 KiB
C

#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_H
#include <linux/tracepoint.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
/*
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_ppc_instr,
TP_PROTO(unsigned int inst, unsigned long _pc, unsigned int emulate),
TP_ARGS(inst, _pc, emulate),
TP_STRUCT__entry(
__field( unsigned int, inst )
__field( unsigned long, pc )
__field( unsigned int, emulate )
),
TP_fast_assign(
__entry->inst = inst;
__entry->pc = _pc;
__entry->emulate = emulate;
),
TP_printk("inst %u pc 0x%lx emulate %u\n",
__entry->inst, __entry->pc, __entry->emulate)
);
TRACE_EVENT(kvm_stlb_inval,
TP_PROTO(unsigned int stlb_index),
TP_ARGS(stlb_index),
TP_STRUCT__entry(
__field( unsigned int, stlb_index )
),
TP_fast_assign(
__entry->stlb_index = stlb_index;
),
TP_printk("stlb_index %u", __entry->stlb_index)
);
TRACE_EVENT(kvm_stlb_write,
TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0,
unsigned int word1, unsigned int word2),
TP_ARGS(victim, tid, word0, word1, word2),
TP_STRUCT__entry(
__field( unsigned int, victim )
__field( unsigned int, tid )
__field( unsigned int, word0 )
__field( unsigned int, word1 )
__field( unsigned int, word2 )
),
TP_fast_assign(
__entry->victim = victim;
__entry->tid = tid;
__entry->word0 = word0;
__entry->word1 = word1;
__entry->word2 = word2;
),
TP_printk("victim %u tid %u w0 %u w1 %u w2 %u",
__entry->victim, __entry->tid, __entry->word0,
__entry->word1, __entry->word2)
);
TRACE_EVENT(kvm_gtlb_write,
TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0,
unsigned int word1, unsigned int word2),
TP_ARGS(gtlb_index, tid, word0, word1, word2),
TP_STRUCT__entry(
__field( unsigned int, gtlb_index )
__field( unsigned int, tid )
__field( unsigned int, word0 )
__field( unsigned int, word1 )
__field( unsigned int, word2 )
),
TP_fast_assign(
__entry->gtlb_index = gtlb_index;
__entry->tid = tid;
__entry->word0 = word0;
__entry->word1 = word1;
__entry->word2 = word2;
),
TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u",
__entry->gtlb_index, __entry->tid, __entry->word0,
__entry->word1, __entry->word2)
);
/*************************************************************************
* Book3S trace points *
*************************************************************************/
#ifdef CONFIG_KVM_BOOK3S_PR
TRACE_EVENT(kvm_book3s_exit,
TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
TP_ARGS(exit_nr, vcpu),
TP_STRUCT__entry(
__field( unsigned int, exit_nr )
__field( unsigned long, pc )
__field( unsigned long, msr )
__field( unsigned long, dar )
__field( unsigned long, srr1 )
),
TP_fast_assign(
__entry->exit_nr = exit_nr;
__entry->pc = kvmppc_get_pc(vcpu);
__entry->dar = kvmppc_get_fault_dar(vcpu);
__entry->msr = vcpu->arch.shared->msr;
__entry->srr1 = to_svcpu(vcpu)->shadow_srr1;
),
TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
__entry->exit_nr, __entry->pc, __entry->msr, __entry->dar,
__entry->srr1)
);
TRACE_EVENT(kvm_book3s_reenter,
TP_PROTO(int r, struct kvm_vcpu *vcpu),
TP_ARGS(r, vcpu),
TP_STRUCT__entry(
__field( unsigned int, r )
__field( unsigned long, pc )
),
TP_fast_assign(
__entry->r = r;
__entry->pc = kvmppc_get_pc(vcpu);
),
TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
);
#ifdef CONFIG_PPC_BOOK3S_64
TRACE_EVENT(kvm_book3s_64_mmu_map,
TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
struct kvmppc_pte *orig_pte),
TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
TP_STRUCT__entry(
__field( unsigned char, flag_w )
__field( unsigned char, flag_x )
__field( unsigned long, eaddr )
__field( unsigned long, hpteg )
__field( unsigned long, va )
__field( unsigned long long, vpage )
__field( unsigned long, hpaddr )
),
TP_fast_assign(
__entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
__entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x';
__entry->eaddr = orig_pte->eaddr;
__entry->hpteg = hpteg;
__entry->va = va;
__entry->vpage = orig_pte->vpage;
__entry->hpaddr = hpaddr;
),
TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
__entry->flag_w, __entry->flag_x, __entry->eaddr,
__entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
);
#endif /* CONFIG_PPC_BOOK3S_64 */
TRACE_EVENT(kvm_book3s_mmu_map,
TP_PROTO(struct hpte_cache *pte),
TP_ARGS(pte),
TP_STRUCT__entry(
__field( u64, host_va )
__field( u64, pfn )
__field( ulong, eaddr )
__field( u64, vpage )
__field( ulong, raddr )
__field( int, flags )
),
TP_fast_assign(
__entry->host_va = pte->host_va;
__entry->pfn = pte->pfn;
__entry->eaddr = pte->pte.eaddr;
__entry->vpage = pte->pte.vpage;
__entry->raddr = pte->pte.raddr;
__entry->flags = (pte->pte.may_read ? 0x4 : 0) |
(pte->pte.may_write ? 0x2 : 0) |
(pte->pte.may_execute ? 0x1 : 0);
),
TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
__entry->host_va, __entry->pfn, __entry->eaddr,
__entry->vpage, __entry->raddr, __entry->flags)
);
TRACE_EVENT(kvm_book3s_mmu_invalidate,
TP_PROTO(struct hpte_cache *pte),
TP_ARGS(pte),
TP_STRUCT__entry(
__field( u64, host_va )
__field( u64, pfn )
__field( ulong, eaddr )
__field( u64, vpage )
__field( ulong, raddr )
__field( int, flags )
),
TP_fast_assign(
__entry->host_va = pte->host_va;
__entry->pfn = pte->pfn;
__entry->eaddr = pte->pte.eaddr;
__entry->vpage = pte->pte.vpage;
__entry->raddr = pte->pte.raddr;
__entry->flags = (pte->pte.may_read ? 0x4 : 0) |
(pte->pte.may_write ? 0x2 : 0) |
(pte->pte.may_execute ? 0x1 : 0);
),
TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
__entry->host_va, __entry->pfn, __entry->eaddr,
__entry->vpage, __entry->raddr, __entry->flags)
);
TRACE_EVENT(kvm_book3s_mmu_flush,
TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
unsigned long long p2),
TP_ARGS(type, vcpu, p1, p2),
TP_STRUCT__entry(
__field( int, count )
__field( unsigned long long, p1 )
__field( unsigned long long, p2 )
__field( const char *, type )
),
TP_fast_assign(
__entry->count = to_book3s(vcpu)->hpte_cache_count;
__entry->p1 = p1;
__entry->p2 = p2;
__entry->type = type;
),
TP_printk("Flush %d %sPTEs: %llx - %llx",
__entry->count, __entry->type, __entry->p1, __entry->p2)
);
TRACE_EVENT(kvm_book3s_slb_found,
TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
TP_ARGS(gvsid, hvsid),
TP_STRUCT__entry(
__field( unsigned long long, gvsid )
__field( unsigned long long, hvsid )
),
TP_fast_assign(
__entry->gvsid = gvsid;
__entry->hvsid = hvsid;
),
TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
);
TRACE_EVENT(kvm_book3s_slb_fail,
TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
TP_ARGS(sid_map_mask, gvsid),
TP_STRUCT__entry(
__field( unsigned short, sid_map_mask )
__field( unsigned long long, gvsid )
),
TP_fast_assign(
__entry->sid_map_mask = sid_map_mask;
__entry->gvsid = gvsid;
),
TP_printk("%x/%x: %llx", __entry->sid_map_mask,
SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
);
TRACE_EVENT(kvm_book3s_slb_map,
TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
unsigned long long hvsid),
TP_ARGS(sid_map_mask, gvsid, hvsid),
TP_STRUCT__entry(
__field( unsigned short, sid_map_mask )
__field( unsigned long long, guest_vsid )
__field( unsigned long long, host_vsid )
),
TP_fast_assign(
__entry->sid_map_mask = sid_map_mask;
__entry->guest_vsid = gvsid;
__entry->host_vsid = hvsid;
),
TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
__entry->guest_vsid, __entry->host_vsid)
);
TRACE_EVENT(kvm_book3s_slbmte,
TP_PROTO(u64 slb_vsid, u64 slb_esid),
TP_ARGS(slb_vsid, slb_esid),
TP_STRUCT__entry(
__field( u64, slb_vsid )
__field( u64, slb_esid )
),
TP_fast_assign(
__entry->slb_vsid = slb_vsid;
__entry->slb_esid = slb_esid;
),
TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
);
#endif /* CONFIG_PPC_BOOK3S */
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
#include <trace/define_trace.h>