From ed52870f4676489124d8697fd00e6ae6c504e586 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 4 Dec 2017 22:21:30 -0800 Subject: [PATCH 01/22] KVM: MMU: Fix infinite loop when there is no available mmu page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The below test case can cause infinite loop in kvm when ept=0. #include #include #include #include #include #include #include long r[5]; int main() { r[2] = open("/dev/kvm", O_RDONLY); r[3] = ioctl(r[2], KVM_CREATE_VM, 0); r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); ioctl(r[4], KVM_RUN, 0); } It doesn't setup the memory regions, mmu_alloc_shadow/direct_roots() in kvm return 1 when kvm fails to allocate root page table which can result in beblow infinite loop: vcpu_run() { for (;;) { r = vcpu_enter_guest()::kvm_mmu_reload() returns 1 if (r <= 0) break; if (need_resched()) cond_resched(); } } This patch fixes it by returning -ENOSPC when there is no available kvm mmu page for root page table. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: stable@vger.kernel.org Fixes: 26eeb53cf0f (KVM: MMU: Bail out immediately if there is no available mmu page) Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5e66e5c6640..c4deb1f34faa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, 0, 0, vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); @@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); @@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, 0, vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); @@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL); From d73235d17ba63b53dc0e1051dbc10a1f1be91b71 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 7 Dec 2017 00:30:08 -0800 Subject: [PATCH 02/22] KVM: X86: Fix load RFLAGS w/o the fixed bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit *** Guest State *** CR0: actual=0x0000000000000030, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 CR4: actual=0x0000000000002050, shadow=0x0000000000000000, gh_mask=ffffffffffffe871 CR3 = 0x00000000fffbc000 RSP = 0x0000000000000000 RIP = 0x0000000000000000 RFLAGS=0x00000000 DR7 = 0x0000000000000400 ^^^^^^^^^^ The failed vmentry is triggered by the following testcase when ept=Y: #include #include #include #include #include #include #include long r[5]; int main() { r[2] = open("/dev/kvm", O_RDONLY); r[3] = ioctl(r[2], KVM_CREATE_VM, 0); r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); struct kvm_regs regs = { .rflags = 0, }; ioctl(r[4], KVM_SET_REGS, ®s); ioctl(r[4], KVM_RUN, 0); } X86 RFLAGS bit 1 is fixed set, userspace can simply clearing bit 1 of RFLAGS with KVM_SET_REGS ioctl which results in vmentry fails. This patch fixes it by oring X86_EFLAGS_FIXED during ioctl. Cc: stable@vger.kernel.org Suggested-by: Jim Mattson Reviewed-by: David Hildenbrand Reviewed-by: Quan Xu Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index faf843c9b916..154ea27746e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7384,7 +7384,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) #endif kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); + kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; From 5663d8f9bbe4bf15488f7351efb61ea20fa6de06 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 12 Dec 2017 17:15:02 +0100 Subject: [PATCH 03/22] kvm: x86: fix WARN due to uninitialized guest FPU state ------------[ cut here ]------------ Bad FPU state detected at kvm_put_guest_fpu+0xd8/0x2d0 [kvm], reinitializing FPU registers. WARNING: CPU: 1 PID: 4594 at arch/x86/mm/extable.c:103 ex_handler_fprestore+0x88/0x90 CPU: 1 PID: 4594 Comm: qemu-system-x86 Tainted: G B OE 4.15.0-rc2+ #10 RIP: 0010:ex_handler_fprestore+0x88/0x90 Call Trace: fixup_exception+0x4e/0x60 do_general_protection+0xff/0x270 general_protection+0x22/0x30 RIP: 0010:kvm_put_guest_fpu+0xd8/0x2d0 [kvm] RSP: 0018:ffff8803d5627810 EFLAGS: 00010246 kvm_vcpu_reset+0x3b4/0x3c0 [kvm] kvm_apic_accept_events+0x1c0/0x240 [kvm] kvm_arch_vcpu_ioctl_run+0x1658/0x2fb0 [kvm] kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 do_syscall_64+0x15f/0x600 where kvm_put_guest_fpu is called without a prior kvm_load_guest_fpu. To fix it, move kvm_load_guest_fpu to the very beginning of kvm_arch_vcpu_ioctl_run. Cc: stable@vger.kernel.org Fixes: f775b13eedee2f7f3c6fdd4e90fb79090ce5d339 Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 154ea27746e9..56d036b9ad75 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7264,13 +7264,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct fpu *fpu = ¤t->thread.fpu; int r; - fpu__initialize(fpu); - kvm_sigset_activate(vcpu); + kvm_load_guest_fpu(vcpu); + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; @@ -7296,14 +7295,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - kvm_load_guest_fpu(vcpu); - if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; r = cui(vcpu); if (r <= 0) - goto out_fpu; + goto out; } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); @@ -7312,9 +7309,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) else r = vcpu_run(vcpu); -out_fpu: - kvm_put_guest_fpu(vcpu); out: + kvm_put_guest_fpu(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); From 19e8e54f4309eaa438237aa1973fe40c331903d4 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:19 +0100 Subject: [PATCH 04/22] tools/kvm_stat: fix command line option '-g' Specifying a guest via '-g foo' always results in an error: $ kvm_stat -g foo Usage: kvm_stat [options] kvm_stat: error: Error while searching for guest "foo", use "-p" to specify a pid instead Reason is that Tui.get_pid_from_gname() is not static, as it is supposed to be. Signed-off-by: Stefan Raspl Tested-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 217cf6f95c36..884a74b8ca87 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -950,7 +950,8 @@ class Tui(object): curses.nocbreak() curses.endwin() - def get_all_gnames(self): + @staticmethod + def get_all_gnames(): """Returns a list of (pid, gname) tuples of all running guests""" res = [] try: @@ -963,7 +964,7 @@ class Tui(object): # perform a sanity check before calling the more expensive # function to possibly extract the guest name if ' -name ' in line[1]: - res.append((line[0], self.get_gname_from_pid(line[0]))) + res.append((line[0], Tui.get_gname_from_pid(line[0]))) child.stdout.close() return res @@ -984,7 +985,8 @@ class Tui(object): except Exception: self.screen.addstr(row + 1, 2, 'Not available') - def get_pid_from_gname(self, gname): + @staticmethod + def get_pid_from_gname(gname): """Fuzzy function to convert guest name to QEMU process pid. Returns a list of potential pids, can be empty if no match found. @@ -992,7 +994,7 @@ class Tui(object): """ pids = [] - for line in self.get_all_gnames(): + for line in Tui.get_all_gnames(): if gname == line[1]: pids.append(int(line[0])) From faa06650418bf28d07426fcfdc5213782fb131f6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:20 +0100 Subject: [PATCH 05/22] tools/kvm_stat: fix drilldown in events-by-guests mode When displaying debugfs events listed by guests, an attempt to switch to reporting of stats for individual child trace events results in garbled output. Reason is that when toggling drilldown, the update of the stats doesn't honor when events are displayed by guests, as indicated by Tui._display_guests. To reproduce, run 'kvm_stat -d' and press 'b' followed by 'x'. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 884a74b8ca87..6347ad5d0d35 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1360,7 +1360,7 @@ class Tui(object): if char == 'x': self.update_drilldown() # prevents display of current values on next refresh - self.stats.get() + self.stats.get(self._display_guests) except KeyboardInterrupt: break except curses.error: From 67c162b0892ac481e47bef06d9c6231ee993843a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:21 +0100 Subject: [PATCH 06/22] tools/kvm_stat: fix missing field update after filter change When updating the fields filter, tracepoint events of fields previously not visible were not enabled, as TracepointProvider.update_fields() updated the member variable directly instead of using the setter, which triggers the event enable/disable. To reproduce, run 'kvm_stat -f kvm_exit', press 'c' to remove the filter, and notice that no add'l fields that do not match the regex 'kvm_exit' will appear. This issue was introduced by commit c469117df059 ("tools/kvm_stat: simplify initializers"). Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 6347ad5d0d35..f133755fdde2 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -549,8 +549,8 @@ class TracepointProvider(Provider): def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" - self._fields = [field for field in self.get_available_fields() - if self.is_field_wanted(fields_filter, field)] + self.fields = [field for field in self.get_available_fields() + if self.is_field_wanted(fields_filter, field)] @staticmethod def get_online_cpus(): From b74faa930deb2e37ed5caa0abfc687c8c532e946 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:22 +0100 Subject: [PATCH 07/22] tools/kvm_stat: fix extra handling of 'help' with fields filter Commit 67fbcd62f54d ("tools/kvm_stat: add '-f help' to get the available event list") added support for '-f help'. However, the extra handling of 'help' will also take effect when 'help' is specified as a regex in interactive mode via 'f'. This results in display of all events while only those matching this regex should be shown. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index f133755fdde2..4faf9f85a00e 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -478,7 +478,7 @@ class Provider(object): @staticmethod def is_field_wanted(fields_filter, field): """Indicate whether field is valid according to fields_filter.""" - if not fields_filter or fields_filter == "help": + if not fields_filter: return True return re.match(fields_filter, field) is not None @@ -1567,6 +1567,7 @@ def main(): stats = Stats(options) if options.fields == "help": + stats.fields_filter = None event_list = "\n" s = stats.get() for key in s.keys(): From fff8c9eb48aa58259071b5df0e6d4c1c0bc1ba51 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:23 +0100 Subject: [PATCH 08/22] tools/kvm_stat: fix child trace events accounting Child trace events were included in calculation of the overall total, which is used for calculation of the percentages of the '%Total' column. However, the parent trace envents' stats summarize the child trace events, hence we'd incorrectly account for them twice, leading to slightly wrong stats. With this fix, we use the correct total. Consequently, the sum of the child trace events' '%Total' column values is identical to the respective value of the respective parent event. However, this also means that the sum of the '%Total' column values will aggregate to more than 100 percent. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 6 +++--- tools/kvm/kvm_stat/kvm_stat.txt | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 4faf9f85a00e..90f0445d7808 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1092,14 +1092,14 @@ class Tui(object): # sort by totals return (0, -stats[x][0]) total = 0. - for val in stats.values(): - total += val[0] + for key in stats.keys(): + if key.find('(') is -1: + total += stats[key][0] if self._sorting == SORT_DEFAULT: sortkey = sortCurAvg else: sortkey = sortTotal for key in sorted(stats.keys(), key=sortkey): - if row >= self.screen.getmaxyx()[0]: break values = stats[key] diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index e5cf836be8a1..75368a3c285f 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -50,6 +50,8 @@ INTERACTIVE COMMANDS *s*:: set update interval *x*:: toggle reporting of stats for child trace events + :: *Note*: The stats for the parents summarize the respective child trace + events Press any other key to refresh statistics immediately. From f3d11b0e8619bbb053d3e13f2271819fb01c1e2a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:24 +0100 Subject: [PATCH 09/22] tools/kvm_stat: add hint on '-f help' to man page The man page update for this new functionality was omitted. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index 75368a3c285f..b5b3810c9e94 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -88,7 +88,7 @@ OPTIONS -f:: --fields=:: - fields to display (regex) + fields to display (regex), "-f help" for a list of available events -h:: --help:: From 08e20a6300e106d5feb89c9e47ea479533fec46f Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:25 +0100 Subject: [PATCH 10/22] tools/kvm_stat: handle invalid regular expressions Passing an invalid regular expression on the command line results in a traceback. Note that interactive specification of invalid regular expressions is not affected To reproduce, run "kvm_stat -f '*'". Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 90f0445d7808..29c56f3a05dc 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1521,6 +1521,13 @@ Press any other key to refresh statistics immediately. callback=cb_guest_to_pid, ) (options, _) = optparser.parse_args(sys.argv) + try: + # verify that we were passed a valid regex up front + re.compile(options.fields) + except re.error: + sys.exit('Error: "' + options.fields + '" is not a valid regular ' + 'expression') + return options From 822cfe3e4813c8f52199362b0e689fba9459ddc9 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:26 +0100 Subject: [PATCH 11/22] tools/kvm_stat: suppress usage information on command line errors Errors while parsing the '-g' command line argument result in display of usage information prior to the error message. This is a bit confusing, as the command line is syntactically correct. To reproduce, run 'kvm_stat -g' and specify a non-existing or inactive guest. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 29c56f3a05dc..bf65531570f5 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1453,16 +1453,13 @@ Press any other key to refresh statistics immediately. try: pids = Tui.get_pid_from_gname(val) except: - raise optparse.OptionValueError('Error while searching for guest ' - '"{}", use "-p" to specify a pid ' - 'instead'.format(val)) + sys.exit('Error while searching for guest "{}". Use "-p" to ' + 'specify a pid instead?'.format(val)) if len(pids) == 0: - raise optparse.OptionValueError('No guest by the name "{}" ' - 'found'.format(val)) + sys.exit('Error: No guest by the name "{}" found'.format(val)) if len(pids) > 1: - raise optparse.OptionValueError('Multiple processes found (pids: ' - '{}) - use "-p" to specify a pid ' - 'instead'.format(" ".join(pids))) + sys.exit('Error: Multiple processes found (pids: {}). Use "-p" ' + 'to specify the desired pid'.format(" ".join(pids))) parser.values.pid = pids[0] optparser = optparse.OptionParser(description=description_text, From 73fab6ffbd83795e38974bb438e7afce0242c61a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:27 +0100 Subject: [PATCH 12/22] tools/kvm_stat: stop ignoring unhandled arguments Unhandled arguments, which could easily include typos, are simply ignored. We should be strict to avoid undetected typos. To reproduce start kvm_stat with an extra argument, e.g. 'kvm_stat -d bnuh5ol' and note that this will actually work. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index bf65531570f5..aa3bc47af1d0 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1517,7 +1517,9 @@ Press any other key to refresh statistics immediately. help='restrict statistics to guest by name', callback=cb_guest_to_pid, ) - (options, _) = optparser.parse_args(sys.argv) + options, unkn = optparser.parse_args(sys.argv) + if len(unkn) != 1: + sys.exit('Error: Extra argument(s): ' + ' '.join(unkn[1:])) try: # verify that we were passed a valid regex up front re.compile(options.fields) From cf656c76614c6ec5b016233cac29738881c83c08 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 11 Dec 2017 12:25:29 +0100 Subject: [PATCH 13/22] tools/kvm_stat: add line for totals Add a line for the total number of events and current average at the bottom of the body. Note that both values exclude child trace events. I.e. if drilldown is activated via interactive command 'x', only the totals are accounted, or we'd be counting these twice (see previous commit "tools/kvm_stat: fix child trace events accounting"). Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index aa3bc47af1d0..566a70ddd005 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1099,8 +1099,9 @@ class Tui(object): sortkey = sortCurAvg else: sortkey = sortTotal + tavg = 0 for key in sorted(stats.keys(), key=sortkey): - if row >= self.screen.getmaxyx()[0]: + if row >= self.screen.getmaxyx()[0] - 1: break values = stats[key] if not values[0] and not values[1]: @@ -1112,9 +1113,15 @@ class Tui(object): self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key, values[0], values[0] * 100 / total, cur)) + if cur is not '' and key.find('(') is -1: + tavg += cur row += 1 if row == 3: self.screen.addstr(4, 1, 'No matching events reported yet') + else: + self.screen.addstr(row, 1, '%-40s %10d %8s' % + ('Total', total, tavg if tavg else ''), + curses.A_BOLD) self.screen.refresh() def show_msg(self, text): From f29810335965ac1f7bcb501ee2af5f039f792416 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 14 Dec 2017 03:01:52 -0500 Subject: [PATCH 14/22] KVM/x86: Check input paging mode when cs.l is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: WARNING: CPU: 0 PID: 27962 at arch/x86/kvm/emulate.c:5631 x86_emulate_insn+0x557/0x15f0 [kvm] Modules linked in: kvm_intel kvm [last unloaded: kvm] CPU: 0 PID: 27962 Comm: syz-executor Tainted: G B W 4.15.0-rc2-next-20171208+ #32 Hardware name: Intel Corporation S1200SP/S1200SP, BIOS S1200SP.86B.01.03.0006.040720161253 04/07/2016 RIP: 0010:x86_emulate_insn+0x557/0x15f0 [kvm] RSP: 0018:ffff8807234476d0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88072d0237a0 RCX: ffffffffa0065c4d RDX: 1ffff100e5a046f9 RSI: 0000000000000003 RDI: ffff88072d0237c8 RBP: ffff880723447728 R08: ffff88072d020000 R09: ffffffffa008d240 R10: 0000000000000002 R11: ffffed00e7d87db3 R12: ffff88072d0237c8 R13: ffff88072d023870 R14: ffff88072d0238c2 R15: ffffffffa008d080 FS: 00007f8a68666700(0000) GS:ffff880802200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000002009506c CR3: 000000071fec4005 CR4: 00000000003626f0 Call Trace: x86_emulate_instruction+0x3bc/0xb70 [kvm] ? reexecute_instruction.part.162+0x130/0x130 [kvm] vmx_handle_exit+0x46d/0x14f0 [kvm_intel] ? trace_event_raw_event_kvm_entry+0xe7/0x150 [kvm] ? handle_vmfunc+0x2f0/0x2f0 [kvm_intel] ? wait_lapic_expire+0x25/0x270 [kvm] vcpu_enter_guest+0x720/0x1ef0 [kvm] ... When CS.L is set, vcpu should run in the 64 bit paging mode. Current kvm set_sregs function doesn't have such check when userspace inputs sreg values. This will lead unexpected behavior. This patch is to add checks for CS.L, EFER.LME, EFER.LMA and CR4.PAE when get SREG inputs from userspace in order to avoid unexpected behavior. Suggested-by: Paolo Bonzini Reported-by: Dmitry Vyukov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Dmitry Vyukov Cc: Jim Mattson Signed-off-by: Tianyu Lan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 56d036b9ad75..3a82f2d4333b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7494,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); +int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) { + /* + * When EFER.LME and CR0.PG are set, the processor is in + * 64-bit mode (though maybe in a 32-bit code segment). + * CR4.PAE and EFER.LMA must be set. + */ + if (!(sregs->cr4 & X86_CR4_PAE_BIT) + || !(sregs->efer & EFER_LMA)) + return -EINVAL; + } else { + /* + * Not in 64-bit mode: EFER.LMA is clear and the code + * segment cannot be 64-bit. + */ + if (sregs->efer & EFER_LMA || sregs->cs.l) + return -EINVAL; + } + + return 0; +} + int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -7506,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, (sregs->cr4 & X86_CR4_OSXSAVE)) return -EINVAL; + if (kvm_valid_sregs(vcpu, sregs)) + return -EINVAL; + apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) From bfe766cf65fb65e68c4764f76158718560bdcee5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 6 Dec 2017 17:09:49 +0000 Subject: [PATCH 15/22] arm64: kvm: Prevent restoring stale PMSCR_EL1 for vcpu When VHE is not present, KVM needs to save and restores PMSCR_EL1 when possible. If SPE is used by the host, value of PMSCR_EL1 cannot be saved for the guest. If the host starts using SPE between two save+restore on the same vcpu, restore will write the value of PMSCR_EL1 read during the first save. Make sure __debug_save_spe_nvhe clears the value of the saved PMSCR_EL1 when the guest cannot use SPE. Signed-off-by: Julien Thierry Cc: Christoffer Dall Cc: Marc Zyngier Cc: Catalin Marinas Cc: Reviewed-by: Will Deacon Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/debug-sr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 321c9c05dd9e..f4363d40e2cd 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1) { u64 reg; + /* Clear pmscr in case of early return */ + *pmscr_el1 = 0; + /* SPE present on this CPU? */ if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1), ID_AA64DFR0_PMSVER_SHIFT)) From 7839c672e58bf62da8f2f0197fefb442c02ba1dd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 7 Dec 2017 11:45:45 +0000 Subject: [PATCH 16/22] KVM: arm/arm64: Fix HYP unmapping going off limits When we unmap the HYP memory, we try to be clever and unmap one PGD at a time. If we start with a non-PGD aligned address and try to unmap a whole PGD, things go horribly wrong in unmap_hyp_range (addr and end can never match, and it all goes really badly as we keep incrementing pgd and parse random memory as page tables...). The obvious fix is to let unmap_hyp_range do what it does best, which is to iterate over a range. The size of the linear mapping, which begins at PAGE_OFFSET, can be easily calculated by subtracting PAGE_OFFSET form high_memory, because high_memory is defined as the linear map address of the last byte of DRAM, plus one. The size of the vmalloc region is given trivially by VMALLOC_END - VMALLOC_START. Cc: stable@vger.kernel.org Reported-by: Andre Przywara Tested-by: Andre Przywara Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index b36945d49986..b4b69c2d1012 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -509,8 +509,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) */ void free_hyp_pgds(void) { - unsigned long addr; - mutex_lock(&kvm_hyp_pgd_mutex); if (boot_hyp_pgd) { @@ -521,10 +519,10 @@ void free_hyp_pgds(void) if (hyp_pgd) { unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); - for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); - for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), + (uintptr_t)high_memory - PAGE_OFFSET); + unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START), + VMALLOC_END - VMALLOC_START); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; From f384dcfe4d918c1d80477d290c22ce0093823771 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 7 Dec 2017 11:46:15 +0000 Subject: [PATCH 17/22] KVM: arm/arm64: timer: Don't set irq as forwarded if no usable GIC If we don't have a usable GIC, do not try to set the vcpu affinity as this is guaranteed to fail. Reported-by: Andre Przywara Reviewed-by: Andre Przywara Tested-by: Andre Przywara Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_arch_timer.h | 2 +- virt/kvm/arm/arch_timer.c | 13 ++++++++----- virt/kvm/arm/arm.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6e45608b2399..9da6ce22803f 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -62,7 +62,7 @@ struct arch_timer_cpu { bool enabled; }; -int kvm_timer_hyp_init(void); +int kvm_timer_hyp_init(bool); int kvm_timer_enable(struct kvm_vcpu *vcpu); int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index f9555b1e7f15..aa9adfafe12b 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -720,7 +720,7 @@ static int kvm_timer_dying_cpu(unsigned int cpu) return 0; } -int kvm_timer_hyp_init(void) +int kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; @@ -756,10 +756,13 @@ int kvm_timer_hyp_init(void) return err; } - err = irq_set_vcpu_affinity(host_vtimer_irq, kvm_get_running_vcpus()); - if (err) { - kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; + if (has_gic) { + err = irq_set_vcpu_affinity(host_vtimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } } kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 6b60c98a6e22..2e43f9d42bd5 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1326,7 +1326,7 @@ static int init_subsystems(void) /* * Init HYP architected timer support */ - err = kvm_timer_hyp_init(); + err = kvm_timer_hyp_init(vgic_present); if (err) goto out; From 36e5cfd410ad6060b527e51d1b4bc174a8068cfd Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 14 Dec 2017 19:54:50 +0100 Subject: [PATCH 18/22] KVM: arm/arm64: Properly handle arch-timer IRQs after vtimer_save_state The recent timer rework was assuming that once the timer was disabled, we should no longer see any interrupts from the timer. This assumption turns out to not be true, and instead we have to handle the case when the timer ISR runs even after the timer has been disabled. This requires a couple of changes: First, we should never overwrite the cached guest state of the timer control register when the ISR runs, because KVM may have disabled its timers when doing vcpu_put(), even though the guest still had the timer enabled. Second, we shouldn't assume that the timer is actually firing just because we see an interrupt, but we should check the actual state of the timer in the timer control register to understand if the hardware timer is really firing or not. We also add an ISB to vtimer_save_state() to ensure the timer is actually disabled once we enable interrupts, which should clarify the intention of the implementation, and reduce the risk of unwanted interrupts. Fixes: b103cc3f10c0 ("KVM: arm/arm64: Avoid timer save/restore in vcpu entry/exit") Reported-by: Marc Zyngier Reported-by: Jia He Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index aa9adfafe12b..14c018f990a7 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -92,16 +92,23 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; struct arch_timer_context *vtimer; + u32 cnt_ctl; + + /* + * We may see a timer interrupt after vcpu_put() has been called which + * sets the CPU's vcpu pointer to NULL, because even though the timer + * has been disabled in vtimer_save_state(), the hardware interrupt + * signal may not have been retired from the interrupt controller yet. + */ + if (!vcpu) + return IRQ_HANDLED; - if (!vcpu) { - pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n"); - return IRQ_NONE; - } vtimer = vcpu_vtimer(vcpu); - if (!vtimer->irq.level) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - if (kvm_timer_irq_can_fire(vtimer)) + cnt_ctl = read_sysreg_el0(cntv_ctl); + cnt_ctl &= ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT | + ARCH_TIMER_CTRL_IT_MASK; + if (cnt_ctl == (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT)) kvm_timer_update_irq(vcpu, true, vtimer); } @@ -355,6 +362,7 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu) /* Disable the virtual timer */ write_sysreg_el0(0, cntv_ctl); + isb(); vtimer->loaded = false; out: From 0eb7c33cadf6b2f1a94e58ded8b0eb89b4eba382 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 15 Dec 2017 00:30:12 +0100 Subject: [PATCH 19/22] KVM: arm/arm64: Fix timer enable flow When enabling the timer on the first run, we fail to ever restore the state and mark it as loaded. That means, that in the initial entry to the VCPU ioctl, unless we exit to userspace for some reason such as a pending signal, if the guest programs a timer and blocks, we will wait forever, because we never read back the hardware state (the loaded flag is not set), and so we think the timer is disabled, and we never schedule a background soft timer. The end result? The VCPU blocks forever, and the only solution is to kill the thread. Fixes: 4a2c4da1250d ("arm/arm64: KVM: Load the timer state when enabling the timer") Reported-by: Marc Zyngier Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 14c018f990a7..cc29a8148328 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -846,10 +846,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) no_vgic: preempt_disable(); timer->enabled = 1; - if (!irqchip_in_kernel(vcpu->kvm)) - kvm_timer_vcpu_load_user(vcpu); - else - kvm_timer_vcpu_load_vgic(vcpu); + kvm_timer_vcpu_load(vcpu); preempt_enable(); return 0; From e39d200fa5bf5b94a0948db0dae44c1b73b84a56 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Dec 2017 17:40:50 -0800 Subject: [PATCH 20/22] KVM: Fix stack-out-of-bounds read in write_mmio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm] Read of size 8 at addr ffff8803259df7f8 by task syz-executor/32298 CPU: 6 PID: 32298 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #18 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0xab/0xe1 print_address_description+0x6b/0x290 kasan_report+0x28a/0x370 write_mmio+0x11e/0x270 [kvm] emulator_read_write_onepage+0x311/0x600 [kvm] emulator_read_write+0xef/0x240 [kvm] emulator_fix_hypercall+0x105/0x150 [kvm] em_hypercall+0x2b/0x80 [kvm] x86_emulate_insn+0x2b1/0x1640 [kvm] x86_emulate_instruction+0x39a/0xb90 [kvm] handle_exception+0x1b4/0x4d0 [kvm_intel] vcpu_enter_guest+0x15a0/0x2640 [kvm] kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm] kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The path of patched vmmcall will patch 3 bytes opcode 0F 01 C1(vmcall) to the guest memory, however, write_mmio tracepoint always prints 8 bytes through *(u64 *)val since kvm splits the mmio access into 8 bytes. This leaks 5 bytes from the kernel stack (CVE-2017-17741). This patch fixes it by just accessing the bytes which we operate on. Before patch: syz-executor-5567 [007] .... 51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x1ffff10077c1010f After patch: syz-executor-13416 [002] .... 51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f Reported-by: Dmitry Vyukov Reviewed-by: Darren Kenny Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- include/trace/events/kvm.h | 7 +++++-- virt/kvm/arm/mmio.c | 6 +++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a82f2d4333b..1cec2c62a0b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4384,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); handled += n; addr += n; len -= n; @@ -4643,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, val); vcpu->mmio_read_completed = 0; return 1; } @@ -4665,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); return X86EMUL_IO_NEEDED; } diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index e4b0b8e09932..2c735a3e6613 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -211,7 +211,7 @@ TRACE_EVENT(kvm_ack_irq, { KVM_TRACE_MMIO_WRITE, "write" } TRACE_EVENT(kvm_mmio, - TP_PROTO(int type, int len, u64 gpa, u64 val), + TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( @@ -225,7 +225,10 @@ TRACE_EVENT(kvm_mmio, __entry->type = type; __entry->len = len; __entry->gpa = gpa; - __entry->val = val; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); ), TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index b6e715fd3c90..dac7ceb1a677 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -112,7 +112,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) } trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - data); + &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } @@ -182,14 +182,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, - fault_ipa, 0); + fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); From fae1a3e775cca8c3a9e0eb34443b310871a15a92 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Dec 2017 00:49:14 +0100 Subject: [PATCH 21/22] kvm: x86: fix RSM when PCID is non-zero rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then CR4 & ~PCIDE, then CR0, then CR4. However, setting CR4.PCIDE fails if CR3[11:0] != 0. It's probably easier in the long run to replace rsm_enter_protected_mode() with an emulator callback that sets all the special registers (like KVM_SET_SREGS would do). For now, set the PCID field of CR3 only after CR4.PCIDE is 1. Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abe74f779f9d..b514b2b2845a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) } static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr4) + u64 cr0, u64 cr3, u64 cr4) { int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = ctxt->ops->set_cr(ctxt, 3, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; /* * First enable PAE, long mode needs it before CR0.PG = 1 is set. @@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, bad = ctxt->ops->set_cr(ctxt, 4, cr4); if (bad) return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + } return X86EMUL_CONTINUE; @@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) struct desc_struct desc; struct desc_ptr dt; u16 selector; - u32 val, cr0, cr4; + u32 val, cr0, cr3, cr4; int i; cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) { struct desc_struct desc; struct desc_ptr dt; - u64 val, cr0, cr4; + u64 val, cr0, cr3, cr4; u32 base3; u16 selector; int i, r; @@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr3 = GET_SMSTATE(u64, smbase, 0x7f50); cr4 = GET_SMSTATE(u64, smbase, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); val = GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); - r = rsm_enter_protected_mode(ctxt, cr0, cr4); + r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); if (r != X86EMUL_CONTINUE) return r; From aa12f594f97efe50223611dbd13ecca4e8dafee6 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 21 Dec 2017 13:03:27 +0100 Subject: [PATCH 22/22] tools/kvm_stat: sort '-f help' output Sort the fields returned by specifying '-f help' on the command line. While at it, simplify the code a bit, indent the output and eliminate an extra blank line at the beginning. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 566a70ddd005..a5684d0968b4 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1579,17 +1579,13 @@ def main(): stats = Stats(options) - if options.fields == "help": + if options.fields == 'help': stats.fields_filter = None - event_list = "\n" - s = stats.get() - for key in s.keys(): - if key.find('(') != -1: - key = key[0:key.find('(')] - if event_list.find('\n' + key + '\n') == -1: - event_list += key + '\n' - sys.stdout.write(event_list) - return "" + event_list = [] + for key in stats.get().keys(): + event_list.append(key.split('(', 1)[0]) + sys.stdout.write(' ' + '\n '.join(sorted(set(event_list))) + '\n') + sys.exit(0) if options.log: log(stats)