From 1a52e051368a0e29b26e8790bacd8d1d759e3287 Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Tue, 18 Sep 2007 16:34:25 -0700 Subject: [PATCH 01/13] KVM: x86 emulator: fix merge screwup due to emulator split This code ended up in the wrong place in the file. Move it back to the right location. Signed-off-by: Nitin A Kamble Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 51 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 9737c3b2f48c..b1026d2c8aec 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -1083,31 +1083,6 @@ push: case 0xd2 ... 0xd3: /* Grp2 */ src.val = _regs[VCPU_REGS_RCX]; goto grp2; - case 0xe8: /* call (near) */ { - long int rel; - switch (op_bytes) { - case 2: - rel = insn_fetch(s16, 2, _eip); - break; - case 4: - rel = insn_fetch(s32, 4, _eip); - break; - case 8: - rel = insn_fetch(s64, 8, _eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } - src.val = (unsigned long) _eip; - JMP_REL(rel); - goto push; - } - case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ - JMP_REL(src.val); - no_wb = 1; /* Disable writeback. */ - break; case 0xf6 ... 0xf7: /* Grp3 */ switch (modrm_reg) { case 0 ... 1: /* test */ @@ -1350,6 +1325,32 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; + case 0xe8: /* call (near) */ { + long int rel; + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("Call: Invalid op_bytes\n"); + goto cannot_emulate; + } + src.val = (unsigned long) _eip; + JMP_REL(rel); + goto push; + } + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ + JMP_REL(src.val); + no_wb = 1; /* Disable writeback. 
*/ + break; + } goto writeback; From ae6200baea4175cac684ea76f78082b31afbdefa Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 20 Sep 2007 11:17:24 +0200 Subject: [PATCH 02/13] KVM: x86 emulator: fix repne/repnz decoding The repnz/repne instructions must set rep_prefix to 1 like rep/repe/repz. This patch corrects the disk probe problem encountered with OpenBSD. This issue appears with commit e70669abd4e60dfea3ac1639848e20e2b8dd1255 because before it, the decoding was done internally to kvm and after it is done by x86_emulate.c (which doesn't do it correctly). Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index b1026d2c8aec..80b1758e2d33 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -596,11 +596,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case 0xf0: /* LOCK */ lock_prefix = 1; break; + case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP/REPE/REPZ */ rep_prefix = 1; break; - case 0xf2: /* REPNE/REPNZ */ - break; default: goto done_prefixes; } From 7f2145ad6f3e7060147a2a4c4db35c641ff61b5c Mon Sep 17 00:00:00 2001 From: Izik Eidus Date: Sun, 23 Sep 2007 12:30:19 +0200 Subject: [PATCH 03/13] KVM: MMU: Set shadow pte atomically in mmu_pte_write_zap_pte() The shadow page table entry should be set atomically using set_shadow_pte(). 
Signed-off-by: Izik Eidus Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 6d84d30f5ed0..71716182d59d 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1088,7 +1088,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, mmu_page_remove_parent_pte(child, spte); } } - *spte = 0; + set_shadow_pte(spte, 0); kvm_flush_remote_tlbs(vcpu->kvm); } From 1b6269db3f83396c2fd2c8d0f3e0f37ac0e6ba05 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 9 Oct 2007 12:12:19 +0200 Subject: [PATCH 04/13] KVM: VMX: Handle NMIs before enabling interrupts and preemption This makes sure we handle NMI on the current cpu, and that we don't service maskable interrupts before non-maskable ones. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 4f115a8e45ef..bcc1e398a976 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1760,10 +1760,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ - asm ("int $2"); - return 1; - } + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { vmx_fpu_activate(vcpu); @@ -2196,6 +2194,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info; /* * Loading guest fpu may have cleared host cr0.ts @@ -2322,6 +2321,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; + + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* We need to handle NMIs before interrupts 
are enabled */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + asm("int $2"); } static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, From 8668a3c468ed55d19514117a5a959d91d3d03823 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Wed, 10 Oct 2007 14:26:45 +0800 Subject: [PATCH 05/13] KVM: VMX: Reset mmu context when entering real mode Resetting an SMP guest will force an AP to enter real mode (RESET) from protected mode with paging enabled. However, the current enter_rmode() can only handle a mode switch from nonpaging mode to real mode, which leads to SMP reboot failure. Fix by reloading the mmu context on entering real mode. Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Qing He Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 1 + drivers/kvm/vmx.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 71716182d59d..feb5ac986c5d 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1049,6 +1049,7 @@ int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) destroy_kvm_mmu(vcpu); return init_kvm_mmu(vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index bcc1e398a976..f130c01422cf 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1128,6 +1128,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); + kvm_mmu_reset_context(vcpu); init_rmode_tss(vcpu->kvm); } From a012e65aee48379a7a87eadafa74f878b61522b9 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 15 Oct 2007 14:24:20 +0800 Subject: [PATCH 06/13] KVM: x86 emulator: implement 'movnti mem, reg' Implement emulation of instruction: movnti m32/m64, r32/r64 opcode: 0x0f 0xc3 Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 
80b1758e2d33..0a8696d9b82c 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -212,7 +212,8 @@ static u16 twobyte_table[256] = { 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xEF */ @@ -1501,6 +1502,10 @@ twobyte_insn: dst.bytes = op_bytes; dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; break; + case 0xc3: /* movnti */ + dst.bytes = op_bytes; + dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; + break; } goto writeback; From 78f7826868da8e27d097802139a3fec39f47f3b8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 16 Oct 2007 19:06:15 +0200 Subject: [PATCH 07/13] KVM: VMX: Force vm86 mode if setting flags during real mode When resetting from userspace, we need to handle the flags being cleared even after we are in real mode. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index f130c01422cf..bb56ae3f89b6 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -523,6 +523,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (vcpu->rmode.active) + rflags |= IOPL_MASK | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } From 4e62417bf317504c0b85e0d7abd236f334f54eaf Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Wed, 17 Oct 2007 19:30:41 +0200 Subject: [PATCH 08/13] KVM: x86 emulator: fix access registers for instructions with ModR/M byte and Mod = 3 The patch below changes the access type to register from memory for instructions that are declared as SrcMem or DstMem, but have a ModR/M byte with Mod = 3. 
It fixes (at least) the lmsw and smsw instructions on an AMD64 CPU, which are needed for FreeBSD. Signed-off-by: Aurelien Jarno Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 0a8696d9b82c..a6ace302e0cd 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -825,6 +825,14 @@ done_prefixes: if (twobyte && b == 0x01 && modrm_reg == 7) break; srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((d & ModRM) && modrm_mod == 3) { + src.type = OP_REG; + break; + } src.type = OP_MEM; src.ptr = (unsigned long *)cr2; src.val = 0; @@ -893,6 +901,14 @@ done_prefixes: dst.ptr = (unsigned long *)cr2; dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.val = 0; + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((d & ModRM) && modrm_mod == 3) { + dst.type = OP_REG; + break; + } if (d & BitOp) { unsigned long mask = ~(dst.bytes * 8 - 1); From 0552f73b9a81d39d50b71a5c06cf36efff80b6fd Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 18 Oct 2007 15:19:01 +0200 Subject: [PATCH 09/13] KVM: Move kvm_guest_exit() after local_irq_enable() We need to make sure that the timer interrupt happens before we clear PF_VCPU, so the accounting code actually sees guest mode. 
http://lkml.org/lkml/2007/10/15/114 Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index af2d288c881d..8c458f262872 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2054,12 +2054,21 @@ again: kvm_x86_ops->run(vcpu, kvm_run); - kvm_guest_exit(); vcpu->guest_mode = 0; local_irq_enable(); ++vcpu->stat.exits; + /* + * We must have an instruction between local_irq_enable() and + * kvm_guest_exit(), so the timer interrupt isn't delayed by + * the interrupt shadow. The stat.exits increment will do nicely. + * But we need to prevent reordering, hence this barrier(): + */ + barrier(); + + kvm_guest_exit(); + preempt_enable(); /* From b33ac88b4c23330043acad930517282eb486db1d Mon Sep 17 00:00:00 2001 From: Kevin Pedretti Date: Sun, 21 Oct 2007 08:54:53 +0200 Subject: [PATCH 10/13] KVM: Fix local apic timer divide by zero kvm_lapic_reset() was initializing apic->timer.divide_count to 0, which could potentially lead to a divide by zero error in apic_get_tmcct(). Any guest that reads the APIC's CCR (current count) register before setting DCR (divide configuration) would trigger a divide by zero exception in the host kernel, leading to a host-OS crash. This patch results in apic->timer.divide_count being initialized to 2 at reset, eliminating the bug (DCR=0 at reset, meaning divide by 2). 
Signed-off-by: Kevin Pedretti Signed-off-by: Avi Kivity --- drivers/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c index a190587cf6a5..443730e689e3 100644 --- a/drivers/kvm/lapic.c +++ b/drivers/kvm/lapic.c @@ -853,7 +853,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->timer.divide_count = 0; + update_divide_count(apic); atomic_set(&apic->timer.pending, 0); if (vcpu->vcpu_id == 0) vcpu->apic_base |= MSR_IA32_APICBASE_BSP; From 9da8f4e83a824dabf3fb7ad0890549257ae614a0 Mon Sep 17 00:00:00 2001 From: Kevin Pedretti Date: Sun, 21 Oct 2007 08:55:50 +0200 Subject: [PATCH 11/13] KVM: Improve local apic timer wraparound handling Better handle wrap-around cases when reading the APIC CCR (current count register). Also, if ICR is 0, CCR should also be 0... previously reading CCR before setting ICR would result in a large kinda-random number. 
Signed-off-by: Kevin Pedretti Signed-off-by: Avi Kivity --- drivers/kvm/lapic.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c index 443730e689e3..238fcad3cece 100644 --- a/drivers/kvm/lapic.c +++ b/drivers/kvm/lapic.c @@ -494,12 +494,19 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - u32 counter_passed; - ktime_t passed, now = apic->timer.dev.base->get_time(); - u32 tmcct = apic_get_reg(apic, APIC_TMICT); + u64 counter_passed; + ktime_t passed, now; + u32 tmcct; ASSERT(apic != NULL); + now = apic->timer.dev.base->get_time(); + tmcct = apic_get_reg(apic, APIC_TMICT); + + /* if initial count is 0, current count should also be 0 */ + if (tmcct == 0) + return 0; + if (unlikely(ktime_to_ns(now) <= ktime_to_ns(apic->timer.last_update))) { /* Wrap around */ @@ -514,15 +521,24 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) counter_passed = div64_64(ktime_to_ns(passed), (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); - tmcct -= counter_passed; - if (tmcct <= 0) { - if (unlikely(!apic_lvtt_period(apic))) + if (counter_passed > tmcct) { + if (unlikely(!apic_lvtt_period(apic))) { + /* one-shot timers stick at 0 until reset */ tmcct = 0; - else - do { - tmcct += apic_get_reg(apic, APIC_TMICT); - } while (tmcct <= 0); + } else { + /* + * periodic timers reset to APIC_TMICT when they + * hit 0. The while loop simulates this happening N + * times. (counter_passed %= tmcct) would also work, + * but might be slower or not work on 32-bit?? 
+ */ + while (counter_passed > tmcct) + counter_passed -= tmcct; + tmcct -= counter_passed; + } + } else { + tmcct -= counter_passed; } return tmcct; From 83d87d167367ae2cc2c6810399aefac33a2ced41 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 18 Oct 2007 15:19:01 +0200 Subject: [PATCH 12/13] sched: don't clear PF_VCPU in scheduler KVM clears it by itself now, and for s390 this is plain wrong. Signed-off-by: Laurent Vivier Acked-by: Ingo Molnar Signed-off-by: Avi Kivity --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7581e331b139..2810e562a991 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3375,7 +3375,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, if (p->flags & PF_VCPU) { account_guest_time(p, cputime); - p->flags &= ~PF_VCPU; return; } From 49d3bd7e2b990e717aa66e229410b8f5096c4956 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 22 Oct 2007 16:33:07 +0200 Subject: [PATCH 13/13] KVM: Use new smp_call_function_mask() in kvm_flush_remote_tlbs() In kvm_flush_remote_tlbs(), replace a loop using smp_call_function_single() by a single call to smp_call_function_mask() (which is new for x86_64). 
Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 8c458f262872..07ae280e8fe5 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -198,21 +198,15 @@ static void vcpu_put(struct kvm_vcpu *vcpu) static void ack_flush(void *_completed) { - atomic_t *completed = _completed; - - atomic_inc(completed); } void kvm_flush_remote_tlbs(struct kvm *kvm) { - int i, cpu, needed; + int i, cpu; cpumask_t cpus; struct kvm_vcpu *vcpu; - atomic_t completed; - atomic_set(&completed, 0); cpus_clear(cpus); - needed = 0; for (i = 0; i < KVM_MAX_VCPUS; ++i) { vcpu = kvm->vcpus[i]; if (!vcpu) @@ -221,23 +215,9 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) continue; cpu = vcpu->cpu; if (cpu != -1 && cpu != raw_smp_processor_id()) - if (!cpu_isset(cpu, cpus)) { - cpu_set(cpu, cpus); - ++needed; - } - } - - /* - * We really want smp_call_function_mask() here. But that's not - * available, so ipi all cpus in parallel and wait for them - * to complete. - */ - for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus)) - smp_call_function_single(cpu, ack_flush, &completed, 1, 0); - while (atomic_read(&completed) != needed) { - cpu_relax(); - barrier(); + cpu_set(cpu, cpus); } + smp_call_function_mask(cpus, ack_flush, NULL, 1); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)