From f53884b1bf28497e9596cac8b44ef1d41bd6dfc5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:39 +1000
Subject: [PATCH 001/240] powerpc/64s: Remove WORT SPR from POWER9/10 (take 2)

This removes a missed remnant of the WORT SPR.

Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-2-npiggin@gmail.com
---
 arch/powerpc/platforms/powernv/idle.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index e3ffdc8e8567..86e787502e42 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -589,7 +589,6 @@ struct p9_sprs {
 	u64 purr;
 	u64 spurr;
 	u64 dscr;
-	u64 wort;
 	u64 ciabr;
 
 	u64 mmcra;

From 736df58fd5bcd02f811f7d474bbe02a35ffaa8f0 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:40 +1000
Subject: [PATCH 002/240] powerpc/64s: guard optional TIDR SPR with CPU ftr test

The TIDR SPR only exists on POWER9. Avoid accessing it when the feature
bit for it is not set.

Signed-off-by: Nicholas Piggin
Reviewed-by: Fabiano Rosas
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-3-npiggin@gmail.com
---
 arch/powerpc/kvm/book3s_hv.c | 12 ++++++++----
 arch/powerpc/xmon/xmon.c     | 10 ++++++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7b74fc0a986b..2777f66001a8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3780,7 +3780,8 @@ static void load_spr_state(struct kvm_vcpu *vcpu)
 	mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
 	mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
 	mtspr(SPRN_BESCR, vcpu->arch.bescr);
-	mtspr(SPRN_TIDR, vcpu->arch.tid);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		mtspr(SPRN_TIDR, vcpu->arch.tid);
 	mtspr(SPRN_AMR, vcpu->arch.amr);
 	mtspr(SPRN_UAMOR, vcpu->arch.uamor);
@@ -3806,7 +3807,8 @@ static void store_spr_state(struct kvm_vcpu *vcpu)
 	vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
 	vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
 	vcpu->arch.bescr = mfspr(SPRN_BESCR);
-	vcpu->arch.tid = mfspr(SPRN_TIDR);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		vcpu->arch.tid = mfspr(SPRN_TIDR);
 	vcpu->arch.amr = mfspr(SPRN_AMR);
 	vcpu->arch.uamor = mfspr(SPRN_UAMOR);
 	vcpu->arch.dscr = mfspr(SPRN_DSCR);
@@ -3826,7 +3828,8 @@ struct p9_host_os_sprs {
 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
 {
 	host_os_sprs->dscr = mfspr(SPRN_DSCR);
-	host_os_sprs->tidr = mfspr(SPRN_TIDR);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		host_os_sprs->tidr = mfspr(SPRN_TIDR);
 	host_os_sprs->iamr = mfspr(SPRN_IAMR);
 	host_os_sprs->amr = mfspr(SPRN_AMR);
 	host_os_sprs->fscr = mfspr(SPRN_FSCR);
@@ -3840,7 +3843,8 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
 	mtspr(SPRN_UAMOR, 0);
 
 	mtspr(SPRN_DSCR, host_os_sprs->dscr);
-	mtspr(SPRN_TIDR, host_os_sprs->tidr);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		mtspr(SPRN_TIDR, host_os_sprs->tidr);
 	mtspr(SPRN_IAMR, host_os_sprs->iamr);
 
 	if (host_os_sprs->amr != vcpu->arch.amr)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 8b28ff9d98d1..83100c6524cc 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2107,8 +2107,14 @@ static void dump_300_sprs(void)
 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
 		return;
 
-	printf("pidr = %.16lx tidr = %.16lx\n",
-		mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+	if (cpu_has_feature(CPU_FTR_P9_TIDR)) {
+		printf("pidr = %.16lx tidr = %.16lx\n",
+			mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+	} else {
+		printf("pidr = %.16lx\n",
+			mfspr(SPRN_PID));
+	}
+
 	printf("psscr = %.16lx\n",
 		hv ? mfspr(SPRN_PSSCR) : mfspr(SPRN_PSSCR_PR));

From 5955c7469a73033f607ebd6d418058943fe13dd3 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:41 +1000
Subject: [PATCH 003/240] KVM: PPC: Book3S HV P9: Use set_dec to set decrementer to host

The host Linux timer code arms the decrementer with the value
'decrementers_next_tb - current_tb' using set_dec(), which stores
val - 1 on Book3S-64, which is not quite the same as what KVM does to
re-arm the host decrementer when exiting the guest.

This shouldn't be a significant change, but it makes the logic match
and avoids this small extra change being brought into the next patch.

Suggested-by: Alexey Kardashevskiy
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-4-npiggin@gmail.com
---
 arch/powerpc/kvm/book3s_hv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2777f66001a8..c7dbdec183b9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4063,7 +4063,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+	set_dec(local_paca->kvm_hstate.dec_expires - mftb());
 	/* We may have raced with new irq work */
 	if (test_irq_work_pending())
 		set_dec(1);

From 4ebbd075bcde7884e078d4360510b989f559bfec Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:42 +1000
Subject: [PATCH 004/240] KVM: PPC: Book3S HV P9: Use host timer accounting to avoid decrementer read

There is no need to save away the host DEC value, as it is derived from
the host timer subsystem which maintains the next timer time, so it can
be restored from there.
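For illustration, the idea can be sketched roughly as follows, in terms
of the helper this patch introduces (a sketch, not the literal diff
below):

	/* Host DEC need not be saved at guest entry: the per-CPU
	 * decrementers_next_tb already holds the next host timer expiry,
	 * so the exit path can simply recompute the value. */
	u64 next_timer = timer_get_next_tb();	/* reads decrementers_next_tb */
	set_dec(next_timer - mftb());		/* stores val - 1 on Book3S-64 */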
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-5-npiggin@gmail.com
---
 arch/powerpc/include/asm/time.h | 5 +++++
 arch/powerpc/kernel/time.c      | 1 +
 arch/powerpc/kvm/book3s_hv.c    | 14 +++++++-------
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 8c2c3dd4ddba..fd09b4797fd7 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -111,6 +111,11 @@ static inline unsigned long test_irq_work_pending(void)
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+static inline u64 timer_get_next_tb(void)
+{
+	return __this_cpu_read(decrementers_next_tb);
+}
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index cae8f03a44fe..374950afec2f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -107,6 +107,7 @@ struct clock_event_device decrementer_clockevent = {
 EXPORT_SYMBOL(decrementer_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
+EXPORT_SYMBOL_GPL(decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
 
 #define XSEC_PER_SEC (1024*1024)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index c7dbdec183b9..3322edbafc64 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3873,18 +3873,17 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 	struct p9_host_os_sprs host_os_sprs;
 	s64 dec;
-	u64 tb;
+	u64 tb, next_timer;
 	int trap, save_pmu;
 
 	WARN_ON_ONCE(vcpu->arch.ceded);
 
-	dec = mfspr(SPRN_DEC);
 	tb = mftb();
-	if (dec < 0)
+	next_timer = timer_get_next_tb();
+	if (tb >= next_timer)
 		return BOOK3S_INTERRUPT_HV_DECREMENTER;
-	local_paca->kvm_hstate.dec_expires = dec + tb;
-	if (local_paca->kvm_hstate.dec_expires < time_limit)
-		time_limit = local_paca->kvm_hstate.dec_expires;
+	if (next_timer < time_limit)
+		time_limit = next_timer;
 
 	save_p9_host_os_sprs(&host_os_sprs);
@@ -4063,7 +4062,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	set_dec(local_paca->kvm_hstate.dec_expires - mftb());
+	next_timer = timer_get_next_tb();
+	set_dec(next_timer - mftb());
 	/* We may have raced with new irq work */
 	if (test_irq_work_pending())
 		set_dec(1);

From 9581991a60817abe311c2581ae4554b28bfa32f1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:43 +1000
Subject: [PATCH 005/240] KVM: PPC: Book3S HV P9: Use large decrementer for HDEC

On processors that don't suppress the HDEC exceptions when LPCR[HDICE]=0,
this could help reduce needless guest exits due to leftover exceptions on
entering the guest.
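The change itself is one line; a sketch of the effect, assuming a
platform where decrementer_max has been raised above the 31-bit default
(as with the large decrementer on recent processors):

	/* Old: HDEC re-armed with the 31-bit maximum on every exit */
	mtspr(SPRN_HDEC, 0x7fffffff);
	/* New: use the full negotiated decrementer width, pushing the next
	 * (unsuppressed) HDEC exception much further into the future */
	mtspr(SPRN_HDEC, decrementer_max);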
Signed-off-by: Nicholas Piggin
Reviewed-by: Alexey Kardashevskiy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-6-npiggin@gmail.com
---
 arch/powerpc/include/asm/time.h       | 2 ++
 arch/powerpc/kernel/time.c            | 1 +
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index fd09b4797fd7..69b6be617772 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -18,6 +18,8 @@
 #include
 
 /* time.c */
+extern u64 decrementer_max;
+
 extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 374950afec2f..2769d565f842 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -88,6 +88,7 @@ static struct clocksource clocksource_timebase = {
 
 #define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
 u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
+EXPORT_SYMBOL_GPL(decrementer_max); /* for KVM HDEC */
 
 static int decrementer_set_next_event(unsigned long evt,
 				      struct clock_event_device *dev);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 961b3d70483c..0ff9ddb5e7ca 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -504,7 +504,8 @@ tm_return_to_guest:
 		vc->tb_offset_applied = 0;
 	}
 
-	mtspr(SPRN_HDEC, 0x7fffffff);
+	/* HDEC must be at least as large as DEC, so decrementer_max fits */
+	mtspr(SPRN_HDEC, decrementer_max);
 
 	save_clear_guest_mmu(kvm, vcpu);
 	switch_mmu_to_host(kvm, host_pidr);

From 34bf08a2079fffc7206a1ae93086ab8167e0afb6 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:44 +1000
Subject: [PATCH 006/240] KVM: PPC: Book3S HV P9: Reduce mftb per guest entry/exit

mftb is serialising (dispatch next-to-complete) so it is heavyweight for
a mfspr. Avoid reading it multiple times in the entry or exit paths. A
small number of cycles delay to timers is tolerable.

Signed-off-by: Nicholas Piggin
Reviewed-by: Fabiano Rosas
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-7-npiggin@gmail.com
---
 arch/powerpc/kvm/book3s_hv.c          | 4 ++--
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3322edbafc64..5fc0c168a39a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3940,7 +3940,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	 *
 	 * XXX: Another day's problem.
 	 */
-	mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+	mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb);
 
 	if (kvmhv_on_pseries()) {
 		/*
@@ -4063,7 +4063,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->in_guest = 0;
 
 	next_timer = timer_get_next_tb();
-	set_dec(next_timer - mftb());
+	set_dec(next_timer - tb);
 	/* We may have raced with new irq work */
 	if (test_irq_work_pending())
 		set_dec(1);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 0ff9ddb5e7ca..bd8cf0a65ce8 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -203,7 +203,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	unsigned long host_dawr1;
 	unsigned long host_dawrx1;
 
-	hdec = time_limit - mftb();
+	tb = mftb();
+	hdec = time_limit - tb;
 	if (hdec < 0)
 		return BOOK3S_INTERRUPT_HV_DECREMENTER;
 
@@ -215,7 +216,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	vcpu->arch.ceded = 0;
 
 	if (vc->tb_offset) {
-		u64 new_tb = mftb() + vc->tb_offset;
+		u64 new_tb = tb + vc->tb_offset;
 		mtspr(SPRN_TBU40, new_tb);
 		tb = mftb();
 		if ((tb & 0xffffff) < (new_tb & 0xffffff))

From 25aa145856cd0d94864bf501218be84a7c8062ae Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:45 +1000
Subject: [PATCH 007/240] powerpc/time: add API for KVM to re-arm the host timer/decrementer

Rather than have KVM look up the host timer and fiddle with the
irq-work internal details, have the powerpc/time.c code provide a
function for KVM to re-arm the Linux timer code when exiting a guest.

This implementation improves on the existing code by marking a
decrementer interrupt as soft-pending if a timer has expired, rather
than setting DEC to a negative value, which tended to cause host timers
to take two interrupts (first the HDEC to exit the guest, then the
immediate DEC).

Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-8-npiggin@gmail.com
---
 arch/powerpc/include/asm/time.h | 16 +++-------
 arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
 arch/powerpc/kvm/book3s_hv.c    |  7 ++---
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 69b6be617772..924b2157882f 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 extern void secondary_cpu_time_init(void);
 extern void __init time_init(void);
 
-#ifdef CONFIG_PPC64
-static inline unsigned long test_irq_work_pending(void)
-{
-	unsigned long x;
-
-	asm volatile("lbz %0,%1(13)"
-	: "=r" (x)
-	: "i" (offsetof(struct paca_struct, irq_work_pending)));
-	return x;
-}
-#endif
-
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
 static inline u64 timer_get_next_tb(void)
@@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
 	return __this_cpu_read(decrementers_next_tb);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now);
+#endif
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2769d565f842..f7cddb82938f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -498,6 +498,16 @@ EXPORT_SYMBOL(profile_pc);
 * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
 */
#ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+	: "=r" (x)
+	: "i" (offsetof(struct paca_struct, irq_work_pending)));
+	return x;
+}
+
 static inline void set_irq_work_pending_flag(void)
 {
 	asm volatile("stb %0,%1(13)" : :
@@ -541,13 +551,44 @@ void arch_irq_work_raise(void)
 	preempt_enable();
 }
 
+static void set_dec_or_work(u64 val)
+{
+	set_dec(val);
+	/* We may have raced with new irq work */
+	if (unlikely(test_irq_work_pending()))
+		set_dec(1);
+}
+
 #else /* CONFIG_IRQ_WORK */
 
 #define test_irq_work_pending()	0
 #define clear_irq_work_pending()
 
+static void set_dec_or_work(u64 val)
+{
+	set_dec(val);
+}
 #endif /* CONFIG_IRQ_WORK */
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now)
+{
+	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+
+	WARN_ON_ONCE(!arch_irqs_disabled());
+	WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+	if (now >= *next_tb) {
+		local_paca->irq_happened |= PACA_IRQ_DEC;
+	} else {
+		now = *next_tb - now;
+		if (now <= decrementer_max)
+			set_dec_or_work(now);
+	}
+}
+EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
+#endif
+
 /*
 * timer_interrupt - gets called when the decrementer overflows,
 * with interrupts disabled.
@@ -608,10 +649,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
 	} else {
 		now = *next_tb - now;
 		if (now <= decrementer_max)
-			set_dec(now);
-		/* We may have raced with new irq work */
-		if (test_irq_work_pending())
-			set_dec(1);
+			set_dec_or_work(now);
 		__this_cpu_inc(irq_stat.timer_irqs_others);
 	}
 
@@ -845,11 +883,7 @@ static int decrementer_set_next_event(unsigned long evt,
 				      struct clock_event_device *dev)
 {
 	__this_cpu_write(decrementers_next_tb, get_tb() + evt);
-	set_dec(evt);
-
-	/* We may have raced with new irq work */
-	if (test_irq_work_pending())
-		set_dec(1);
+	set_dec_or_work(evt);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5fc0c168a39a..1b556dbfcfc8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4062,11 +4062,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	next_timer = timer_get_next_tb();
-	set_dec(next_timer - tb);
-	/* We may have raced with new irq work */
-	if (test_irq_work_pending())
-		set_dec(1);
+	timer_rearm_host_dec(tb);
+
 	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
 	kvmhv_load_host_pmu();

From eacc818864bb01828280f4d64334c4e5ae6a4daf Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:46 +1000
Subject: [PATCH 008/240] KVM: PPC: Book3S HV: POWER10 enable HAIL when running radix guests

HV interrupts may be taken with the MMU enabled when radix guests are
running. Enable LPCR[HAIL] on ISA v3.1 processors for radix guests.

Make this depend on the host LPCR[HAIL] being enabled. Currently that is
always enabled, but having this test means any issue that might require
LPCR[HAIL] to be disabled in the host will not have to be duplicated in
KVM.

This optimisation takes 1380 cycles off a NULL hcall entry+exit micro
benchmark on a POWER10.
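The gating condition, common to the paths touched in the diff below, is
roughly:

	/* HAIL only for radix guests, on ISA v3.1 HV-mode hosts that
	 * themselves run with LPCR[HAIL] set */
	if (cpu_has_feature(CPU_FTR_ARCH_31) &&
	    cpu_has_feature(CPU_FTR_HVMODE) &&
	    (kvm->arch.host_lpcr & LPCR_HAIL))
		lpcr |= LPCR_HAIL;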
Signed-off-by: Nicholas Piggin
Reviewed-by: Fabiano Rosas
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-9-npiggin@gmail.com
---
 arch/powerpc/kvm/book3s_hv.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1b556dbfcfc8..a683ee5f420a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5073,6 +5073,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+	unsigned long lpcr, lpcr_mask;
+
 	if (nesting_enabled(kvm))
 		kvmhv_release_all_nested(kvm);
 	kvmppc_rmap_reset(kvm);
@@ -5082,8 +5084,13 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 	kvm->arch.radix = 0;
 	spin_unlock(&kvm->mmu_lock);
 	kvmppc_free_radix(kvm);
-	kvmppc_update_lpcr(kvm, LPCR_VPM1,
-			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+
+	lpcr = LPCR_VPM1;
+	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		lpcr_mask |= LPCR_HAIL;
+	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
 	return 0;
 }
 
@@ -5093,6 +5100,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 */
 int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 {
+	unsigned long lpcr, lpcr_mask;
 	int err;
 
 	err = kvmppc_init_vm_radix(kvm);
@@ -5104,8 +5112,17 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 	kvm->arch.radix = 1;
 	spin_unlock(&kvm->mmu_lock);
 	kvmppc_free_hpt(&kvm->arch.hpt);
-	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
-			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+
+	lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+	lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		lpcr_mask |= LPCR_HAIL;
+		if (cpu_has_feature(CPU_FTR_HVMODE) &&
+				(kvm->arch.host_lpcr & LPCR_HAIL))
+			lpcr |= LPCR_HAIL;
+	}
+	kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
 	return 0;
 }
 
@@ -5269,6 +5286,10 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		kvm->arch.mmu_ready = 1;
 		lpcr &= ~LPCR_VPM1;
 		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+		if (cpu_has_feature(CPU_FTR_HVMODE) &&
+		    cpu_has_feature(CPU_FTR_ARCH_31) &&
+		    (kvm->arch.host_lpcr & LPCR_HAIL))
+			lpcr |= LPCR_HAIL;
 		ret = kvmppc_init_vm_radix(kvm);
 		if (ret) {
 			kvmppc_free_lpid(kvm->arch.lpid);

From 46f9caf1a246a5c0622fa8cc7e673658e925f97e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:47 +1000
Subject: [PATCH 009/240] powerpc/64s: Keep AMOR SPR a constant ~0 at runtime

This register controls supervisor SPR modifications, and as such is only
relevant for KVM. KVM always sets AMOR to ~0 on guest entry, and never
restores it coming back out to the host, so it can be kept constant and
avoid the mtSPR in KVM guest entry.
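In other words, AMOR is now written exactly once per CPU bring-up (and
again on wakeup from state-losing idle), always with the same value; a
sketch of the invariant, using the register's documented role from the
comment removed in the diff below:

	/* AMOR masks which AMR/IAMR bits a supervisor may modify; keeping
	 * it all-ones at all times means "no restriction", so there is
	 * nothing left for KVM to switch on guest entry/exit. */
	mtspr(SPRN_AMOR, ~0);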
Signed-off-by: Nicholas Piggin
Reviewed-by: Fabiano Rosas
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-10-npiggin@gmail.com
---
 arch/powerpc/kernel/cpu_setup_power.c    |  8 ++++++++
 arch/powerpc/kernel/dt_cpu_ftrs.c        |  2 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c    |  2 --
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  2 --
 arch/powerpc/mm/book3s64/radix_pgtable.c | 15 ---------------
 arch/powerpc/platforms/powernv/idle.c    |  8 +++-----
 6 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/cpu_setup_power.c b/arch/powerpc/kernel/cpu_setup_power.c
index 3cca88ee96d7..a29dc8326622 100644
--- a/arch/powerpc/kernel/cpu_setup_power.c
+++ b/arch/powerpc/kernel/cpu_setup_power.c
@@ -137,6 +137,7 @@ void __setup_cpu_power7(unsigned long offset, struct cpu_spec *t)
 		return;
 
 	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH);
 }
@@ -150,6 +151,7 @@ void __restore_cpu_power7(void)
 		return;
 
 	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH);
 }
@@ -164,6 +166,7 @@ void __setup_cpu_power8(unsigned long offset, struct cpu_spec *t)
 		return;
 
 	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */
 	init_HFSCR();
@@ -184,6 +187,7 @@ void __restore_cpu_power8(void)
 		return;
 
 	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */
 	init_HFSCR();
@@ -202,6 +206,7 @@ void __setup_cpu_power9(unsigned long offset, struct cpu_spec *t)
 	mtspr(SPRN_PSSCR, 0);
 	mtspr(SPRN_LPID, 0);
 	mtspr(SPRN_PID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 			 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
@@ -223,6 +228,7 @@ void __restore_cpu_power9(void)
 	mtspr(SPRN_PSSCR, 0);
 	mtspr(SPRN_LPID, 0);
 	mtspr(SPRN_PID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 			 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
@@ -242,6 +248,7 @@ void __setup_cpu_power10(unsigned long offset, struct cpu_spec *t)
 	mtspr(SPRN_PSSCR, 0);
 	mtspr(SPRN_LPID, 0);
 	mtspr(SPRN_PID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 			 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
@@ -264,6 +271,7 @@ void __restore_cpu_power10(void)
 	mtspr(SPRN_PSSCR, 0);
 	mtspr(SPRN_LPID, 0);
 	mtspr(SPRN_PID, 0);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_PCR, PCR_MASK);
 	init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 			 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index ba527fb52993..de59971319ab 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -80,6 +80,7 @@ static void __restore_cpu_cpufeatures(void)
 	mtspr(SPRN_LPCR, system_registers.lpcr);
 	if (hv_mode) {
 		mtspr(SPRN_LPID, 0);
+		mtspr(SPRN_AMOR, ~0);
 		mtspr(SPRN_HFSCR, system_registers.hfscr);
 		mtspr(SPRN_PCR, system_registers.pcr);
 	}
@@ -216,6 +217,7 @@ static int __init feat_enable_hv(struct dt_cpu_feature *f)
 	}
 
 	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_AMOR, ~0);
 
 	lpcr = mfspr(SPRN_LPCR);
 	lpcr &= ~LPCR_LPES0; /* HV external interrupts */
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index bd8cf0a65ce8..a7f63082b4e3 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -286,8 +286,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
 	mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
 
-	mtspr(SPRN_AMOR, ~0UL);
-
 	local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9;
 
 	/*
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 32a4b4d412b9..c45ec4cd9d52 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -778,10 +778,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	/* Restore AMR and UAMOR, set AMOR to all 1s */
 	ld	r5,VCPU_AMR(r4)
 	ld	r6,VCPU_UAMOR(r4)
-	li	r7,-1
 	mtspr	SPRN_AMR,r5
 	mtspr	SPRN_UAMOR,r6
-	mtspr	SPRN_AMOR,r7
 
 	/* Restore state of CTRL run bit; assume 1 on entry */
 	lwz	r5,VCPU_CTRL(r4)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 3a600bd7fbc6..77820036c722 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -572,18 +572,6 @@ void __init radix__early_init_devtree(void)
 	return;
 }
 
-static void radix_init_amor(void)
-{
-	/*
-	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
-	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
-	 * Register), enable key 0 and set it to 1.
-	 *
-	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
-	 */
-	mtspr(SPRN_AMOR, (3ul << 62));
-}
-
 void __init radix__early_init_mmu(void)
 {
 	unsigned long lpcr;
@@ -644,7 +632,6 @@ void __init radix__early_init_mmu(void)
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 		radix_init_partition_table();
-		radix_init_amor();
 	} else {
 		radix_init_pseries();
 	}
@@ -668,8 +655,6 @@ void radix__early_init_mmu_secondary(void)
 
 		set_ptcr_when_no_uv(__pa(partition_tb) |
 				    (PATB_SIZE_SHIFT - 12));
-
-		radix_init_amor();
 	}
 
 	radix__switch_mmu_context(NULL, &init_mm);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 86e787502e42..3bc84e2fe064 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -306,8 +306,8 @@ struct p7_sprs {
 	/* per thread SPRs that get lost in shallow states */
 	u64 amr;
 	u64 iamr;
-	u64 amor;
 	u64 uamor;
+	/* amor is restored to constant ~0 */
};
 
 static unsigned long power7_idle_insn(unsigned long type)
@@ -378,7 +378,6 @@ static unsigned long power7_idle_insn(unsigned long type)
 	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
 		sprs.amr = mfspr(SPRN_AMR);
 		sprs.iamr = mfspr(SPRN_IAMR);
-		sprs.amor = mfspr(SPRN_AMOR);
 		sprs.uamor = mfspr(SPRN_UAMOR);
 	}
 
@@ -397,7 +396,7 @@ static unsigned long power7_idle_insn(unsigned long type)
 		 */
 		mtspr(SPRN_AMR, sprs.amr);
 		mtspr(SPRN_IAMR, sprs.iamr);
-		mtspr(SPRN_AMOR, sprs.amor);
+		mtspr(SPRN_AMOR, ~0);
 		mtspr(SPRN_UAMOR, sprs.uamor);
 	}
 }
@@ -686,7 +685,6 @@ static unsigned long power9_idle_stop(unsigned long psscr)
 	sprs.amr = mfspr(SPRN_AMR);
 	sprs.iamr = mfspr(SPRN_IAMR);
-	sprs.amor = mfspr(SPRN_AMOR);
 	sprs.uamor = mfspr(SPRN_UAMOR);
 
 	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
@@ -707,7 +705,7 @@ static unsigned long power9_idle_stop(unsigned long psscr)
 	 */
 	mtspr(SPRN_AMR, sprs.amr);
 	mtspr(SPRN_IAMR, sprs.iamr);
-	mtspr(SPRN_AMOR, sprs.amor);
+	mtspr(SPRN_AMOR, ~0);
 	mtspr(SPRN_UAMOR, sprs.uamor);
 
 	/*

From d3c8a2d3740d93778ea102d4c781746d284177bf Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:48 +1000
Subject: [PATCH 010/240] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting

Provide a config option that controls the workaround added by commit
63279eeb7f93 ("KVM: PPC: Book3S HV: Always save guest pmu for guest
capable of nesting"). The option defaults to y for now, but is expected
to go away within a few releases.

Nested capable guests running with the earlier commit 178266389794
("KVM: PPC: Book3S HV Nested: Reflect guest PMU in-use to L0 when guest
SPRs are live") will now indicate the PMU in-use status of their guests,
which means the parent does not need to unconditionally save the PMU for
nested capable guests.

After this latest round of performance optimisations, this option costs
about 540 cycles or 10% entry/exit performance on a POWER9 nested-capable
guest.

Signed-off-by: Nicholas Piggin
Reviewed-by: Fabiano Rosas
References: 178266389794 ("KVM: PPC: Book3S HV Nested: Reflect guest PMU in-use to L0 when guest SPRs are live")
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-11-npiggin@gmail.com
---
 arch/powerpc/kvm/Kconfig     | 15 +++++++++++++++
 arch/powerpc/kvm/book3s_hv.c | 10 ++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index ff581d70f20c..6a58532300c5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -130,6 +130,21 @@ config KVM_BOOK3S_HV_EXIT_TIMING
 
 	  If unsure, say N.
 
+config KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND
+	bool "Nested L0 host workaround for L1 KVM host PMU handling bug" if EXPERT
+	depends on KVM_BOOK3S_HV_POSSIBLE
+	default !EXPERT
+	help
+	  Old nested HV capable Linux guests have a bug where they don't
+	  reflect the PMU in-use status of their L2 guest to the L0 host
+	  while the L2 PMU registers are live. This can result in loss
+	  of L2 PMU register state, causing perf to not work correctly in
+	  L2 guests.
+
+	  Selecting this option for the L0 host implements a workaround for
+	  those buggy L1s which saves the L2 state, at the cost of performance
+	  in all nested-capable guest entry/exit.
+
 config KVM_BOOKE_HV
	bool
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a683ee5f420a..6e760f48bbaf 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4047,8 +4047,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 		vcpu->arch.vpa.dirty = 1;
 		save_pmu = lp->pmcregs_in_use;
 	}
-	/* Must save pmu if this guest is capable of running nested guests */
-	save_pmu |= nesting_enabled(vcpu->kvm);
+	if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) {
+		/*
+		 * Save pmu if this guest is capable of running nested guests.
+		 * This option is for old L1s that do not set their
+		 * lppaca->pmcregs_in_use properly when entering their L2.
+		 */
+		save_pmu |= nesting_enabled(vcpu->kvm);
+	}
 
 	kvmhv_save_guest_pmu(vcpu, save_pmu);
 #ifdef CONFIG_PPC_PSERIES

From 245ebf8e7380b3d84c0aac37fbfd9306b45a3a7a Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:49 +1000
Subject: [PATCH 011/240] powerpc/64s: Always set PMU control registers to frozen/disabled when not in use

KVM PMU management code looks for particular frozen/disabled bits in the
PMU registers so it knows whether it must clear them when coming out of
a guest or not. Setting this up helps KVM make these optimisations
without getting confused.
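Concretely, the "not in use" state the kernel now establishes at boot
looks roughly like this (a sketch using the bit names from the diff
below; the ISA v3.1 bits only exist on POWER10):

	mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);		/* BHRB off (ISA v3.1) */
	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);	/* counters frozen */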
Longer term the better approach might be to move guest/host PMU
switching to the perf subsystem.

Signed-off-by: Nicholas Piggin
Reviewed-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-12-npiggin@gmail.com
---
 arch/powerpc/kernel/cpu_setup_power.c | 4 ++--
 arch/powerpc/kernel/dt_cpu_ftrs.c     | 6 +++---
 arch/powerpc/kvm/book3s_hv.c          | 5 +++++
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/cpu_setup_power.c b/arch/powerpc/kernel/cpu_setup_power.c
index a29dc8326622..3dc61e203f37 100644
--- a/arch/powerpc/kernel/cpu_setup_power.c
+++ b/arch/powerpc/kernel/cpu_setup_power.c
@@ -109,7 +109,7 @@ static void init_PMU_HV_ISA207(void)
 static void init_PMU(void)
 {
 	mtspr(SPRN_MMCRA, 0);
-	mtspr(SPRN_MMCR0, 0);
+	mtspr(SPRN_MMCR0, MMCR0_FC);
 	mtspr(SPRN_MMCR1, 0);
 	mtspr(SPRN_MMCR2, 0);
 }
@@ -123,7 +123,7 @@ static void init_PMU_ISA31(void)
 {
 	mtspr(SPRN_MMCR3, 0);
 	mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
-	mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
+	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
 }
 
 /*
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index de59971319ab..d2b35fb9181d 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -353,7 +353,7 @@ static void init_pmu_power8(void)
 	}
 
 	mtspr(SPRN_MMCRA, 0);
-	mtspr(SPRN_MMCR0, 0);
+	mtspr(SPRN_MMCR0, MMCR0_FC);
 	mtspr(SPRN_MMCR1, 0);
 	mtspr(SPRN_MMCR2, 0);
 	mtspr(SPRN_MMCRS, 0);
@@ -392,7 +392,7 @@ static void init_pmu_power9(void)
 		mtspr(SPRN_MMCRC, 0);
 
 	mtspr(SPRN_MMCRA, 0);
-	mtspr(SPRN_MMCR0, 0);
+	mtspr(SPRN_MMCR0, MMCR0_FC);
 	mtspr(SPRN_MMCR1, 0);
 	mtspr(SPRN_MMCR2, 0);
 }
@@ -428,7 +428,7 @@ static void init_pmu_power10(void)
 
 	mtspr(SPRN_MMCR3, 0);
 	mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
-	mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
+	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
 }
 
 static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6e760f48bbaf..8bf0f6337212 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2715,6 +2715,11 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
 #endif
 #endif
 	vcpu->arch.mmcr[0] = MMCR0_FC;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		vcpu->arch.mmcr[0] |= MMCR0_PMCCEXT;
+		vcpu->arch.mmcra = MMCRA_BHRB_DISABLE;
+	}
+
 	vcpu->arch.ctrl = CTRL_RUNLATCH;
 	/* default to host PVR, since we can't spoof it */
 	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));

From 0a4b4327ce867e3ac1b3ad15f4d2b686b516b3a2 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:50 +1000
Subject: [PATCH 012/240] powerpc/64s: Implement PMU override command line option

It can be useful in simulators (with very constrained environments) to
allow some PMCs to run from boot so they can be sampled directly by a
test harness, rather than having to run perf.

A previous change freezes counters at boot by default, so provide a boot
time option to un-freeze (plus a bit more flexibility).
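Usage examples, following the semantics documented in the
kernel-parameters.txt hunk below (the MMCR1 value here is an arbitrary
placeholder for illustration, not a recommended event encoding):

	pmu_override=on       # unfreeze counters, leave MMCR1 at 0
	pmu_override=0x1e0    # unfreeze counters, program MMCR1 to 0x1e0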
Signed-off-by: Nicholas Piggin
Reviewed-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-13-npiggin@gmail.com
---
 .../admin-guide/kernel-parameters.txt |  8 +++++
 arch/powerpc/perf/core-book3s.c       | 35 +++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9725c546a0d4..2711ddb4835a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4144,6 +4144,14 @@
 			Override pmtimer IOPort with a hex value.
 			e.g. pmtmr=0x508
 
+	pmu_override=	[PPC] Override the PMU.
+			This option takes over the PMU facility, so it is no
+			longer usable by perf. Setting this option starts the
+			PMU counters by setting MMCR0 to 0 (the FC bit is
+			cleared). If a number is given, then MMCR1 is set to
+			that number, otherwise (e.g., 'pmu_override=on'), MMCR1
+			remains 0.
+
 	pm_debug_messages	[SUSPEND,KNL]
 			Enable suspend/resume debug messages during boot up.
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 73e62e9b179b..8d4ff93462fb 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2419,8 +2419,24 @@ int register_power_pmu(struct power_pmu *pmu)
 }
 
 #ifdef CONFIG_PPC64
+static bool pmu_override = false;
+static unsigned long pmu_override_val;
+static void do_pmu_override(void *data)
+{
+	ppc_set_pmu_inuse(1);
+	if (pmu_override_val)
+		mtspr(SPRN_MMCR1, pmu_override_val);
+	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
+}
+
 static int __init init_ppc64_pmu(void)
 {
+	if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
+		pr_warn("disabling perf due to pmu_override= command line option.\n");
+		on_each_cpu(do_pmu_override, NULL, 1);
+		return 0;
+	}
+
 	/* run through all the pmu drivers one at a time */
 	if (!init_power5_pmu())
 		return 0;
@@ -2442,4 +2458,23 @@ static int __init init_ppc64_pmu(void)
 		return init_generic_compat_pmu();
 }
 early_initcall(init_ppc64_pmu);
+
+static int __init pmu_setup(char *str)
+{
+	unsigned long val;
+
+	if (!early_cpu_has_feature(CPU_FTR_HVMODE))
+		return 0;
+
+	pmu_override = true;
+
+	if (kstrtoul(str, 0, &val))
+		val = 0;
+
+	pmu_override_val = val;
+
+	return 1;
+}
+__setup("pmu_override=", pmu_setup);
+
 #endif

From 57dc0eed73caa31bfe36ce8fed234e214e37a5ae Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:51 +1000
Subject: [PATCH 013/240] KVM: PPC: Book3S HV P9: Implement PMU save/restore in C

Implement the P9 path PMU save/restore code in C, and remove the
POWER9/10 code from the P7/8 path assembly.
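A sketch of the ordering the C implementation follows when saving either
side's state (taken in spirit from save_p9_host_pmu() in the diff below:
freeze first so the PMCs and SIAR/SDAR/SIER stop moving, then sample):

	host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0);
	host_os_sprs->mmcra = mfspr(SPRN_MMCRA);
	freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra); /* sets FC, isync */
	host_os_sprs->pmc1 = mfspr(SPRN_PMC1);	/* now stable to read */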
Signed-off-by: Nicholas Piggin
Reviewed-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-14-npiggin@gmail.com
---
 arch/powerpc/include/asm/asm-prototypes.h |   5 -
 arch/powerpc/kvm/book3s_hv.c              | 221 +++++++++++++++++++---
 arch/powerpc/kvm/book3s_hv_interrupts.S   |  13 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  43 +----
 4 files changed, 208 insertions(+), 74 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 222823861a67..41b8a1e1144a 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,11 +141,6 @@ static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
 				 bool preserve_nv) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-void kvmhv_save_host_pmu(void);
-void kvmhv_load_host_pmu(void);
-void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
-void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
-
 void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
 
 long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8bf0f6337212..8fdd640873a3 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3775,6 +3775,196 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	trace_kvmppc_run_core(vc, 1);
 }
 
+/*
+ * Privileged (non-hypervisor) host registers to save.
+ */
+struct p9_host_os_sprs {
+	unsigned long dscr;
+	unsigned long tidr;
+	unsigned long iamr;
+	unsigned long amr;
+	unsigned long fscr;
+
+	unsigned int pmc1;
+	unsigned int pmc2;
+	unsigned int pmc3;
+	unsigned int pmc4;
+	unsigned int pmc5;
+	unsigned int pmc6;
+	unsigned long mmcr0;
+	unsigned long mmcr1;
+	unsigned long mmcr2;
+	unsigned long mmcr3;
+	unsigned long mmcra;
+	unsigned long siar;
+	unsigned long sier1;
+	unsigned long sier2;
+	unsigned long sier3;
+	unsigned long sdar;
+};
+
+static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra)
+{
+	if (!(mmcr0 & MMCR0_FC))
+		goto do_freeze;
+	if (mmcra & MMCRA_SAMPLE_ENABLE)
+		goto do_freeze;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		if (!(mmcr0 & MMCR0_PMCCEXT))
+			goto do_freeze;
+		if (!(mmcra & MMCRA_BHRB_DISABLE))
+			goto do_freeze;
+	}
+	return;
+
+do_freeze:
+	mmcr0 = MMCR0_FC;
+	mmcra = 0;
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		mmcr0 |= MMCR0_PMCCEXT;
+		mmcra = MMCRA_BHRB_DISABLE;
+	}
+
+	mtspr(SPRN_MMCR0, mmcr0);
+	mtspr(SPRN_MMCRA, mmcra);
+	isync();
+}
+
+static void save_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
+{
+	if (ppc_get_pmu_inuse()) {
+		/*
+		 * It might be better to put PMU handling (at least for the
+		 * host) in the perf subsystem because it knows more about what
+		 * is being used.
+		 */
+
+		/* POWER9, POWER10 do not implement HPMC or SPMC */
+
+		host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0);
+		host_os_sprs->mmcra = mfspr(SPRN_MMCRA);
+
+		freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra);
+
+		host_os_sprs->pmc1 = mfspr(SPRN_PMC1);
+		host_os_sprs->pmc2 = mfspr(SPRN_PMC2);
+		host_os_sprs->pmc3 = mfspr(SPRN_PMC3);
+		host_os_sprs->pmc4 = mfspr(SPRN_PMC4);
+		host_os_sprs->pmc5 = mfspr(SPRN_PMC5);
+		host_os_sprs->pmc6 = mfspr(SPRN_PMC6);
+		host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1);
+		host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2);
+		host_os_sprs->sdar = mfspr(SPRN_SDAR);
+		host_os_sprs->siar = mfspr(SPRN_SIAR);
+		host_os_sprs->sier1 = mfspr(SPRN_SIER);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3);
+			host_os_sprs->sier2 = mfspr(SPRN_SIER2);
+			host_os_sprs->sier3 = mfspr(SPRN_SIER3);
+		}
+	}
+}
+
+static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
+{
+	mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
+	mtspr(SPRN_PMC2, vcpu->arch.pmc[1]);
+	mtspr(SPRN_PMC3, vcpu->arch.pmc[2]);
+	mtspr(SPRN_PMC4, vcpu->arch.pmc[3]);
+	mtspr(SPRN_PMC5, vcpu->arch.pmc[4]);
+	mtspr(SPRN_PMC6, vcpu->arch.pmc[5]);
+	mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]);
+	mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]);
+	mtspr(SPRN_SDAR, vcpu->arch.sdar);
+	mtspr(SPRN_SIAR, vcpu->arch.siar);
+	mtspr(SPRN_SIER, vcpu->arch.sier[0]);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+		mtspr(SPRN_MMCR3, vcpu->arch.mmcr[3]);
+		mtspr(SPRN_SIER2, vcpu->arch.sier[1]);
+		mtspr(SPRN_SIER3, vcpu->arch.sier[2]);
+	}
+
+	/* Set MMCRA then MMCR0 last */
+	mtspr(SPRN_MMCRA, vcpu->arch.mmcra);
+	mtspr(SPRN_MMCR0, vcpu->arch.mmcr[0]);
+	/* No isync necessary because we're starting counters */
+}
+
+static void save_p9_guest_pmu(struct kvm_vcpu *vcpu)
+{
+	struct lppaca *lp;
+	int save_pmu = 1;
+
+	lp = vcpu->arch.vpa.pinned_addr;
+	if (lp)
+		save_pmu = lp->pmcregs_in_use;
+	if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) {
+		/*
+		 * Save pmu if this guest is capable of running nested guests.
+		 * This option is for old L1s that do not set their
+		 * lppaca->pmcregs_in_use properly when entering their L2.
+		 */
+		save_pmu |= nesting_enabled(vcpu->kvm);
+	}
+
+	if (save_pmu) {
+		vcpu->arch.mmcr[0] = mfspr(SPRN_MMCR0);
+		vcpu->arch.mmcra = mfspr(SPRN_MMCRA);
+
+		freeze_pmu(vcpu->arch.mmcr[0], vcpu->arch.mmcra);
+
+		vcpu->arch.pmc[0] = mfspr(SPRN_PMC1);
+		vcpu->arch.pmc[1] = mfspr(SPRN_PMC2);
+		vcpu->arch.pmc[2] = mfspr(SPRN_PMC3);
+		vcpu->arch.pmc[3] = mfspr(SPRN_PMC4);
+		vcpu->arch.pmc[4] = mfspr(SPRN_PMC5);
+		vcpu->arch.pmc[5] = mfspr(SPRN_PMC6);
+		vcpu->arch.mmcr[1] = mfspr(SPRN_MMCR1);
+		vcpu->arch.mmcr[2] = mfspr(SPRN_MMCR2);
+		vcpu->arch.sdar = mfspr(SPRN_SDAR);
+		vcpu->arch.siar = mfspr(SPRN_SIAR);
+		vcpu->arch.sier[0] = mfspr(SPRN_SIER);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			vcpu->arch.mmcr[3] = mfspr(SPRN_MMCR3);
+			vcpu->arch.sier[1] = mfspr(SPRN_SIER2);
+			vcpu->arch.sier[2] = mfspr(SPRN_SIER3);
+		}
+	} else {
+		freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA));
+	}
+}
+
+static void load_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
+{
+	if (ppc_get_pmu_inuse()) {
+		mtspr(SPRN_PMC1, host_os_sprs->pmc1);
+		mtspr(SPRN_PMC2, host_os_sprs->pmc2);
+		mtspr(SPRN_PMC3, host_os_sprs->pmc3);
+		mtspr(SPRN_PMC4, host_os_sprs->pmc4);
+		mtspr(SPRN_PMC5, host_os_sprs->pmc5);
+		mtspr(SPRN_PMC6, host_os_sprs->pmc6);
+		mtspr(SPRN_MMCR1, host_os_sprs->mmcr1);
+		mtspr(SPRN_MMCR2, host_os_sprs->mmcr2);
+		mtspr(SPRN_SDAR, host_os_sprs->sdar);
+		mtspr(SPRN_SIAR, host_os_sprs->siar);
+		mtspr(SPRN_SIER, host_os_sprs->sier1);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			mtspr(SPRN_MMCR3, host_os_sprs->mmcr3);
+			mtspr(SPRN_SIER2, host_os_sprs->sier2);
+			mtspr(SPRN_SIER3, host_os_sprs->sier3);
+		}
+
+		/* Set MMCRA then MMCR0 last */
+		mtspr(SPRN_MMCRA, host_os_sprs->mmcra);
+		mtspr(SPRN_MMCR0, host_os_sprs->mmcr0);
+		isync();
+	}
+}
+
 static void load_spr_state(struct kvm_vcpu *vcpu)
 {
 	mtspr(SPRN_DSCR, vcpu->arch.dscr);
@@ -3819,17 +4009,6 @@ static void store_spr_state(struct kvm_vcpu *vcpu)
 	vcpu->arch.dscr = mfspr(SPRN_DSCR);
 }
 
-/*
- * Privileged (non-hypervisor) host registers to save.
- */
-struct p9_host_os_sprs {
-	unsigned long dscr;
-	unsigned long tidr;
-	unsigned long iamr;
-	unsigned long amr;
-	unsigned long fscr;
-};
-
 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
 {
 	host_os_sprs->dscr = mfspr(SPRN_DSCR);
@@ -3879,7 +4058,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	struct p9_host_os_sprs host_os_sprs;
 	s64 dec;
 	u64 tb, next_timer;
-	int trap, save_pmu;
+	int trap;
 
 	WARN_ON_ONCE(vcpu->arch.ceded);
 
@@ -3892,7 +4071,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	save_p9_host_os_sprs(&host_os_sprs);
 
-	kvmhv_save_host_pmu();	/* saves it to PACA kvm_hstate */
+	save_p9_host_pmu(&host_os_sprs);
 
 	kvmppc_subcore_enter_guest();
 
@@ -3922,7 +4101,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 		barrier();
 	}
 #endif
-	kvmhv_load_guest_pmu(vcpu);
+	load_p9_guest_pmu(vcpu);
 
 	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
 	load_fp_state(&vcpu->arch.fp);
@@ -4044,24 +4223,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
 		kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
 
-	save_pmu = 1;
 	if (vcpu->arch.vpa.pinned_addr) {
 		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
 		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
 		lp->yield_count = cpu_to_be32(yield_count);
 		vcpu->arch.vpa.dirty = 1;
-		save_pmu = lp->pmcregs_in_use;
-	}
-	if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) {
-		/*
-		 * Save pmu if this guest is capable of running nested guests.
-		 * This option is for old L1s that do not set their
-		 * lppaca->pmcregs_in_use properly when entering their L2.
-		 */
-		save_pmu |= nesting_enabled(vcpu->kvm);
 	}
 
-	kvmhv_save_guest_pmu(vcpu, save_pmu);
+	save_p9_guest_pmu(vcpu);
 #ifdef CONFIG_PPC_PSERIES
 	if (kvmhv_on_pseries()) {
 		barrier();
@@ -4077,7 +4246,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
-	kvmhv_load_host_pmu();
+	load_p9_host_pmu(&host_os_sprs);
 
 	kvmppc_subcore_exit_guest();
 
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 4444f83cb133..59d89e4b154a 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -104,7 +104,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtlr	r0
 	blr
 
-_GLOBAL(kvmhv_save_host_pmu)
+/*
+ * void kvmhv_save_host_pmu(void)
+ */
+kvmhv_save_host_pmu:
 BEGIN_FTR_SECTION
 	/* Work around P8 PMAE bug */
 	li	r3, -1
@@ -138,14 +141,6 @@ BEGIN_FTR_SECTION
 	std	r8, HSTATE_MMCR2(r13)
 	std	r9, HSTATE_SIER(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-BEGIN_FTR_SECTION
-	mfspr	r5, SPRN_MMCR3
-	mfspr	r6, SPRN_SIER2
-	mfspr	r7, SPRN_SIER3
-	std	r5, HSTATE_MMCR3(r13)
-	std	r6, HSTATE_SIER2(r13)
-	std	r7, HSTATE_SIER3(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 	mfspr	r3, SPRN_PMC1
 	mfspr	r5, SPRN_PMC2
 	mfspr	r6, SPRN_PMC3
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c45ec4cd9d52..a454d65e6353 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -2776,10 +2776,11 @@ kvmppc_msr_interrupt:
 	blr
 
 /*
+ * void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu)
+ *
 * Load up guest PMU state. R3 points to the vcpu struct.
 */
-_GLOBAL(kvmhv_load_guest_pmu)
-EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
+kvmhv_load_guest_pmu:
 	mr	r4, r3
 	mflr	r0
 	li	r3, 1
@@ -2813,27 +2814,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
 	mtspr	SPRN_MMCRA, r6
 	mtspr	SPRN_SIAR, r7
 	mtspr	SPRN_SDAR, r8
-BEGIN_FTR_SECTION
-	ld	r5, VCPU_MMCR + 24(r4)
-	ld	r6, VCPU_SIER + 8(r4)
-	ld	r7, VCPU_SIER + 16(r4)
-	mtspr	SPRN_MMCR3, r5
-	mtspr	SPRN_SIER2, r6
-	mtspr	SPRN_SIER3, r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 BEGIN_FTR_SECTION
 	ld	r5, VCPU_MMCR + 16(r4)
 	ld	r6, VCPU_SIER(r4)
 	mtspr	SPRN_MMCR2, r5
 	mtspr	SPRN_SIER, r6
-BEGIN_FTR_SECTION_NESTED(96)
 	lwz	r7, VCPU_PMC + 24(r4)
 	lwz	r8, VCPU_PMC + 28(r4)
 	ld	r9, VCPU_MMCRS(r4)
 	mtspr	SPRN_SPMC1, r7
 	mtspr	SPRN_SPMC2, r8
 	mtspr	SPRN_MMCRS, r9
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_MMCR0, r3
 	isync
@@ -2841,10 +2832,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	blr
 
 /*
+ * void kvmhv_load_host_pmu(void)
+ *
 * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
 */
-_GLOBAL(kvmhv_load_host_pmu)
-EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
+kvmhv_load_host_pmu:
 	mflr	r0
 	lbz	r4, PACA_PMCINUSE(r13)	/* is the host using the PMU? */
 	cmpwi	r4, 0
@@ -2882,25 +2874,18 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_MMCR2, r8
 	mtspr	SPRN_SIER, r9
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-BEGIN_FTR_SECTION
-	ld	r5, HSTATE_MMCR3(r13)
-	ld	r6, HSTATE_SIER2(r13)
-	ld	r7, HSTATE_SIER3(r13)
-	mtspr	SPRN_MMCR3, r5
-	mtspr	SPRN_SIER2, r6
-	mtspr	SPRN_SIER3, r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 	mtspr	SPRN_MMCR0, r3
 	isync
 	mtlr	r0
23:	blr
 
 /*
+ * void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use)
+ *
 * Save guest PMU state into the vcpu struct.
 * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
 */
-_GLOBAL(kvmhv_save_guest_pmu)
-EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
+kvmhv_save_guest_pmu:
 	mr	r9, r3
 	mr	r8, r4
BEGIN_FTR_SECTION
@@ -2949,14 +2934,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
 	std	r10, VCPU_MMCR + 16(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-BEGIN_FTR_SECTION
-	mfspr	r5, SPRN_MMCR3
-	mfspr	r6, SPRN_SIER2
-	mfspr	r7, SPRN_SIER3
-	std	r5, VCPU_MMCR + 24(r9)
-	std	r6, VCPU_SIER + 8(r9)
-	std	r7, VCPU_SIER + 16(r9)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 	std	r7, VCPU_SIAR(r9)
 	std	r8, VCPU_SDAR(r9)
 	mfspr	r3, SPRN_PMC1
@@ -2974,7 +2951,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_SIER
 	std	r5, VCPU_SIER(r9)
-BEGIN_FTR_SECTION_NESTED(96)
 	mfspr	r6, SPRN_SPMC1
 	mfspr	r7, SPRN_SPMC2
 	mfspr	r8, SPRN_MMCRS
 	stw	r6, VCPU_PMC + 24(r9)
 	stw	r7, VCPU_PMC + 28(r9)
 	std	r8, VCPU_MMCRS(r9)
 	lis	r4, 0x8000
 	mtspr	SPRN_MMCRS, r4
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22:	blr

From 401e1ae372673664465d45a86975c006dc6a488d Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:52 +1000
Subject: [PATCH 014/240] KVM: PPC: Book3S HV P9: Factor PMU save/load into context switch functions

Rather than guest/host save/restore functions, implement context switch
functions that take care of details like the VPA update for nested.

The reason to split these kinds of helpers into explicit save/load
functions is mainly to schedule SPR access nicely, but PMU is a special
case where the load requires mtSPR (to stop counters) and has other
difficulties, so there's less possibility to schedule those nicely.
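The resulting interface is a symmetric pair (signatures as in the diff
after this message):

	static void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
			struct p9_host_os_sprs *host_os_sprs);
	static void switch_pmu_to_host(struct kvm_vcpu *vcpu,
			struct p9_host_os_sprs *host_os_sprs);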
The SPR accesses also have side-effects if the PMU is running, and in
later changes we keep the host PMU running as long as possible so this
code can be better profiled, which also complicates scheduling.

Signed-off-by: Nicholas Piggin
Reviewed-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-15-npiggin@gmail.com
---
 arch/powerpc/kvm/book3s_hv.c | 61 +++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8fdd640873a3..5ffaaf8a30b3 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3830,7 +3830,8 @@ do_freeze:
 	isync();
 }
 
-static void save_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
+static void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
+		struct p9_host_os_sprs *host_os_sprs)
 {
 	if (ppc_get_pmu_inuse()) {
 		/*
@@ -3864,10 +3865,21 @@ static void save_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
 			host_os_sprs->sier3 = mfspr(SPRN_SIER3);
 		}
 	}
-}
 
-static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
-{
+#ifdef CONFIG_PPC_PSERIES
+	if (kvmhv_on_pseries()) {
+		barrier();
+		if (vcpu->arch.vpa.pinned_addr) {
+			struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+			get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
+		} else {
+			get_lppaca()->pmcregs_in_use = 1;
+		}
+		barrier();
+	}
+#endif
+
+	/* load guest */
 	mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
 	mtspr(SPRN_PMC2, vcpu->arch.pmc[1]);
@@ -3892,7 +3904,8 @@ static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
 	/* No isync necessary because we're starting counters */
 }
 
-static void save_p9_guest_pmu(struct kvm_vcpu *vcpu)
+static void switch_pmu_to_host(struct kvm_vcpu *vcpu,
+		struct p9_host_os_sprs *host_os_sprs)
 {
 	struct lppaca *lp;
 	int save_pmu = 1;
@@ -3935,10 +3948,15 @@ static void save_p9_guest_pmu(struct kvm_vcpu *vcpu)
 	} else {
 		freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA));
 	}
-}
 
-static void load_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
-{
+#ifdef CONFIG_PPC_PSERIES
+	if (kvmhv_on_pseries()) {
+		barrier();
+		get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
+		barrier();
+	}
+#endif
+
 	if (ppc_get_pmu_inuse()) {
 		mtspr(SPRN_PMC1, host_os_sprs->pmc1);
 		mtspr(SPRN_PMC2, host_os_sprs->pmc2);
@@ -4071,8 +4089,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	save_p9_host_os_sprs(&host_os_sprs);
 
-	save_p9_host_pmu(&host_os_sprs);
-
 	kvmppc_subcore_enter_guest();
 
 	vc->entry_exit_map = 1;
@@ -4089,19 +4105,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
 		kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
 
-#ifdef CONFIG_PPC_PSERIES
-	if (kvmhv_on_pseries()) {
-		barrier();
-		if (vcpu->arch.vpa.pinned_addr) {
-			struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
-			get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
-		} else {
-			get_lppaca()->pmcregs_in_use = 1;
-		}
-		barrier();
-	}
-#endif
-	load_p9_guest_pmu(vcpu);
+	switch_pmu_to_guest(vcpu, &host_os_sprs);
 
 	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
 	load_fp_state(&vcpu->arch.fp);
@@ -4230,14 +4234,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 		vcpu->arch.vpa.dirty = 1;
 	}
 
-	save_p9_guest_pmu(vcpu);
-#ifdef CONFIG_PPC_PSERIES
-	if (kvmhv_on_pseries()) {
-		barrier();
-		get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
-		barrier();
-	}
-#endif
+	switch_pmu_to_host(vcpu, &host_os_sprs);
 
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
@@ -4246,8 +4243,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
-	load_p9_host_pmu(&host_os_sprs);
-
 	kvmppc_subcore_exit_guest();
 
 	return trap;

From 9d3ddb86d96d9f0314f3baaf0e37f987b40d3eee Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Tue, 23 Nov 2021 19:51:53 +1000
Subject: [PATCH 015/240] KVM: PPC: Book3S HV P9: Demand fault PMU SPRs when marked not inuse

The pmcregs_in_use field in the guest VPA can not be trusted to reflect
what the guest is doing with PMU SPRs, so the PMU must always be managed
(stopped) when exiting the guest, and SPR values set when entering the
guest to ensure it can't cause a covert channel or otherwise cause other
guests or the host to misbehave.

So prevent guest access to the PMU with HFSCR[PM] if pmcregs_in_use is
clear, and avoid the PMU SPR access on every partition switch. Guests
that set pmcregs_in_use incorrectly or when first setting it and using
the PMU will take a hypervisor facility unavailable interrupt that will
bring in the PMU SPRs.

Signed-off-by: Nicholas Piggin
Reviewed-by: Athira Rajeev
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211123095231.1036501-16-npiggin@gmail.com
---
 arch/powerpc/kvm/book3s_hv.c | 131 ++++++++++++++++++++++++++---------
 1 file changed, 98 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5ffaaf8a30b3..e66ce7a19ac6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1421,6 +1421,23 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
 	return RESUME_GUEST;
 }
 
+/*
+ * If the lppaca had pmcregs_in_use clear when we exited the guest, then
+ * HFSCR_PM is cleared for next entry. If the guest then tries to access
+ * the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
+ * back in the guest HFSCR will cause the next entry to load the PMU SPRs and
+ * allow the guest access to continue.
+ */
+static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
+		return EMULATE_FAIL;
+
+	vcpu->arch.hfscr |= HFSCR_PM;
+
+	return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 				 struct task_struct *tsk)
 {
@@ -1702,16 +1719,22 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 	 * to emulate.
 	 * Otherwise, we just generate a program interrupt to the guest.
 	 */
-	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
+		u64 cause = vcpu->arch.hfscr >> 56;
+
 		r = EMULATE_FAIL;
-		if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
-		    cpu_has_feature(CPU_FTR_ARCH_300))
-			r = kvmppc_emulate_doorbell_instr(vcpu);
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			if (cause == FSCR_MSGP_LG)
+				r = kvmppc_emulate_doorbell_instr(vcpu);
+			if (cause == FSCR_PM_LG)
+				r = kvmppc_pmu_unavailable(vcpu);
+		}
 		if (r == EMULATE_FAIL) {
 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 			r = RESUME_GUEST;
 		}
 		break;
+	}
 
 	case BOOK3S_INTERRUPT_HV_RM_HARD:
 		r = RESUME_PASSTHROUGH;
@@ -2750,6 +2773,11 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
+
+	/*
+	 * PM is demand-faulted so start with it clear.
+ */ + vcpu->arch.hfscr &= ~HFSCR_PM; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; @@ -3833,6 +3861,14 @@ do_freeze: static void switch_pmu_to_guest(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { + struct lppaca *lp; + int load_pmu = 1; + + lp = vcpu->arch.vpa.pinned_addr; + if (lp) + load_pmu = lp->pmcregs_in_use; + + /* Save host */ if (ppc_get_pmu_inuse()) { /* * It might be better to put PMU handling (at least for the @@ -3867,41 +3903,47 @@ static void switch_pmu_to_guest(struct kvm_vcpu *vcpu, } #ifdef CONFIG_PPC_PSERIES + /* After saving PMU, before loading guest PMU, flip pmcregs_in_use */ if (kvmhv_on_pseries()) { barrier(); - if (vcpu->arch.vpa.pinned_addr) { - struct lppaca *lp = vcpu->arch.vpa.pinned_addr; - get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; - } else { - get_lppaca()->pmcregs_in_use = 1; - } + get_lppaca()->pmcregs_in_use = load_pmu; barrier(); } #endif - /* load guest */ - mtspr(SPRN_PMC1, vcpu->arch.pmc[0]); - mtspr(SPRN_PMC2, vcpu->arch.pmc[1]); - mtspr(SPRN_PMC3, vcpu->arch.pmc[2]); - mtspr(SPRN_PMC4, vcpu->arch.pmc[3]); - mtspr(SPRN_PMC5, vcpu->arch.pmc[4]); - mtspr(SPRN_PMC6, vcpu->arch.pmc[5]); - mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]); - mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]); - mtspr(SPRN_SDAR, vcpu->arch.sdar); - mtspr(SPRN_SIAR, vcpu->arch.siar); - mtspr(SPRN_SIER, vcpu->arch.sier[0]); + /* + * Load guest. If the VPA said the PMCs are not in use but the guest + * tried to access them anyway, HFSCR[PM] will be set by the HFAC + * fault so we can make forward progress. + */ + if (load_pmu || (vcpu->arch.hfscr & HFSCR_PM)) { + mtspr(SPRN_PMC1, vcpu->arch.pmc[0]); + mtspr(SPRN_PMC2, vcpu->arch.pmc[1]); + mtspr(SPRN_PMC3, vcpu->arch.pmc[2]); + mtspr(SPRN_PMC4, vcpu->arch.pmc[3]); + mtspr(SPRN_PMC5, vcpu->arch.pmc[4]); + mtspr(SPRN_PMC6, vcpu->arch.pmc[5]); + mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]); + mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]); + mtspr(SPRN_SDAR, vcpu->arch.sdar); + mtspr(SPRN_SIAR, vcpu->arch.siar); + mtspr(SPRN_SIER, vcpu->arch.sier[0]); - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - mtspr(SPRN_MMCR3, vcpu->arch.mmcr[3]); - mtspr(SPRN_SIER2, vcpu->arch.sier[1]); - mtspr(SPRN_SIER3, vcpu->arch.sier[2]); + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + mtspr(SPRN_MMCR3, vcpu->arch.mmcr[3]); + mtspr(SPRN_SIER2, vcpu->arch.sier[1]); + mtspr(SPRN_SIER3, vcpu->arch.sier[2]); + } + + /* Set MMCRA then MMCR0 last */ + mtspr(SPRN_MMCRA, vcpu->arch.mmcra); + mtspr(SPRN_MMCR0, vcpu->arch.mmcr[0]); + /* No isync necessary because we're starting counters */ + + if (!vcpu->arch.nested && + (vcpu->arch.hfscr_permitted & HFSCR_PM)) + vcpu->arch.hfscr |= HFSCR_PM; } - - /* Set MMCRA then MMCR0 last */ - mtspr(SPRN_MMCRA, vcpu->arch.mmcra); - mtspr(SPRN_MMCR0, vcpu->arch.mmcr[0]); - /* No isync necessary because we're starting counters */ } static void switch_pmu_to_host(struct kvm_vcpu *vcpu, @@ -3945,9 +3987,32 @@ static void switch_pmu_to_host(struct kvm_vcpu *vcpu, vcpu->arch.sier[1] = mfspr(SPRN_SIER2); vcpu->arch.sier[2] = mfspr(SPRN_SIER3); } - } else { + + } else if (vcpu->arch.hfscr & HFSCR_PM) { + /* + * The guest accessed PMC SPRs without specifying they should + * be preserved, or it cleared pmcregs_in_use after the last + * access. Just ensure they are frozen. + */ freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA)); - } + + /* + * Demand-fault PMU register access in the guest. 
+ * + * This is used to grab the guest's VPA pmcregs_in_use value + * and reflect it into the host's VPA in the case of a nested + * hypervisor. + * + * It also avoids having to zero-out SPRs after each guest + * exit to avoid side-channels. + * + * This is cleared here when we exit the guest, so later HFSCR + * interrupt handling can add it back to run the guest with + * PM enabled next time. + */ + if (!vcpu->arch.nested) + vcpu->arch.hfscr &= ~HFSCR_PM; + } /* otherwise the PMU should still be frozen */ #ifdef CONFIG_PPC_PSERIES if (kvmhv_on_pseries()) { From b1adcf57ceca7eab9bfdafc754802e05e634bfcc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:51:54 +1000 Subject: [PATCH 016/240] KVM: PPC: Book3S HV P9: Factor out yield_count increment Factor duplicated code into a helper function. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-17-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e66ce7a19ac6..bbaf018dcb67 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4131,6 +4131,16 @@ static inline bool hcall_is_xics(unsigned long req) req == H_IPOLL || req == H_XIRR || req == H_XIRR_X; } +static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu) +{ + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + if (lp) { + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + } +} + /* * Guest entry for POWER9 and later CPUs. */ @@ -4159,12 +4169,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->entry_exit_map = 1; vc->in_guest = 1; - if (vcpu->arch.vpa.pinned_addr) { - struct lppaca *lp = vcpu->arch.vpa.pinned_addr; - u32 yield_count = be32_to_cpu(lp->yield_count) + 1; - lp->yield_count = cpu_to_be32(yield_count); - vcpu->arch.vpa.dirty = 1; - } + vcpu_vpa_increment_dispatch(vcpu); if (cpu_has_feature(CPU_FTR_TM) || cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) @@ -4292,12 +4297,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); - if (vcpu->arch.vpa.pinned_addr) { - struct lppaca *lp = vcpu->arch.vpa.pinned_addr; - u32 yield_count = be32_to_cpu(lp->yield_count) + 1; - lp->yield_count = cpu_to_be32(yield_count); - vcpu->arch.vpa.dirty = 1; - } + vcpu_vpa_increment_dispatch(vcpu); switch_pmu_to_host(vcpu, &host_os_sprs); From a1a19e1154e4e9c6c1136474cb040657b1c17817 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:51:55 +1000 Subject: [PATCH 017/240] KVM: PPC: Book3S HV: CTRL SPR does not require read-modify-write Processors that support KVM HV do not require read-modify-write of the CTRL SPR to set/clear their thread's runlatch. Just write 1 or 0 to it. 
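For illustration, the change collapses a read-modify-write into a plain write on both the C and asm paths (a sketch; the diff below is authoritative):

    /* Before: fetch CTRL, clear the run-latch bit (bit 0), write back */
    mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);

    /* After: the runlatch is the only state these paths change,
     * so store the desired value directly.
     */
    mtspr(SPRN_CTRLT, 0);	/* clear runlatch */
    mtspr(SPRN_CTRLT, 1);	/* set runlatch */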
Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-18-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bbaf018dcb67..e4fb36871ce4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4071,7 +4071,7 @@ static void load_spr_state(struct kvm_vcpu *vcpu) */ if (!(vcpu->arch.ctrl & 1)) - mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + mtspr(SPRN_CTRLT, 0); } static void store_spr_state(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a454d65e6353..be79ae7afdf5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -781,12 +781,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_AMR,r5 mtspr SPRN_UAMOR,r6 - /* Restore state of CTRL run bit; assume 1 on entry */ + /* Restore state of CTRL run bit; the host currently has it set to 1 */ lwz r5,VCPU_CTRL(r4) andi. r5,r5,1 bne 4f - mfspr r6,SPRN_CTRLF - clrrdi r6,r6,1 + li r6,0 mtspr SPRN_CTRLT,r6 4: /* Secondary threads wait for primary to have done partition switch */ @@ -1209,12 +1208,12 @@ guest_bypass: stw r0, VCPU_CPU(r9) stw r0, VCPU_THREAD_CPU(r9) - /* Save guest CTRL register, set runlatch to 1 */ + /* Save guest CTRL register, set runlatch to 1 if it was clear */ mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f - ori r6,r6,1 + li r6,1 mtspr SPRN_CTRLT,r6 4: /* @@ -2184,8 +2183,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) * Also clear the runlatch bit before napping. */ kvm_do_nap: - mfspr r0, SPRN_CTRLF - clrrdi r0, r0, 1 + li r0,0 mtspr SPRN_CTRLT, r0 li r0,1 @@ -2204,8 +2202,7 @@ kvm_nap_sequence: /* desired LPCR value in r5 */ bl isa206_idle_insn_mayloss - mfspr r0, SPRN_CTRLF - ori r0, r0, 1 + li r0,1 mtspr SPRN_CTRLT, r0 mtspr SPRN_SRR1, r3 From 174a3ab633392859888fc1a5cff278d5546d8474 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:51:56 +1000 Subject: [PATCH 018/240] KVM: PPC: Book3S HV P9: Move SPRG restore to restore_p9_host_os_sprs Move the SPR update into its relevant helper function. This will help with SPR scheduling improvements in later changes. 
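As a sketch, the helper now owns the SPRG3 shadow restore (abridged from the diff below):

    static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
                                        struct p9_host_os_sprs *host_os_sprs)
    {
            /* moved here from the tail of kvmhv_p9_guest_entry() */
            mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);

            mtspr(SPRN_PSPB, 0);
            mtspr(SPRN_UAMOR, 0);
            /* remaining host SPR restores unchanged */
    }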
Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-19-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e4fb36871ce4..4e6a42b16998 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4106,6 +4106,8 @@ static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { + mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); + mtspr(SPRN_PSPB, 0); mtspr(SPRN_UAMOR, 0); @@ -4306,8 +4308,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, timer_rearm_host_dec(tb); - mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); - kvmppc_subcore_exit_guest(); return trap; From 34e119c96b2b381278d1ddf6b1708678462daba4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:51:57 +1000 Subject: [PATCH 019/240] KVM: PPC: Book3S HV P9: Reduce mtmsrd instructions required to save host SPRs This reduces the number of mtmsrd required to enable facility bits when saving/restoring registers, by having the KVM code set all bits up front rather than using individual facility functions that set their particular MSR bits. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-20-npiggin@gmail.com --- arch/powerpc/include/asm/switch_to.h | 2 + arch/powerpc/kernel/process.c | 28 +++++++++++++ arch/powerpc/kvm/book3s_hv.c | 59 ++++++++++++++++++--------- arch/powerpc/kvm/book3s_hv_p9_entry.c | 1 + 4 files changed, 71 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 9d1fbd8be1c7..e8013cd6b646 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -112,6 +112,8 @@ static inline void clear_task_ebb(struct task_struct *t) #endif } +void kvmppc_save_user_regs(void); + extern int set_thread_tidr(struct task_struct *t); #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 406d7ee9e322..8f841fbe16ad 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1156,6 +1156,34 @@ static inline void save_sprs(struct thread_struct *t) #endif } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void kvmppc_save_user_regs(void) +{ + unsigned long usermsr; + + if (!current->thread.regs) + return; + + usermsr = current->thread.regs->msr; + + if (usermsr & MSR_FP) + save_fpu(current); + + if (usermsr & MSR_VEC) + save_altivec(current); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (usermsr & MSR_TM) { + current->thread.tm_tfhar = mfspr(SPRN_TFHAR); + current->thread.tm_tfiar = mfspr(SPRN_TFIAR); + current->thread.tm_texasr = mfspr(SPRN_TEXASR); + current->thread.regs->msr &= ~MSR_TM; + } +#endif +} +EXPORT_SYMBOL_GPL(kvmppc_save_user_regs); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + static inline void restore_sprs(struct thread_struct *old_thread, struct thread_struct *new_thread) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 4e6a42b16998..541a023e25dd 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4153,6 +4153,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, struct 
p9_host_os_sprs host_os_sprs; s64 dec; u64 tb, next_timer; + unsigned long msr; int trap; WARN_ON_ONCE(vcpu->arch.ceded); @@ -4164,8 +4165,23 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (next_timer < time_limit) time_limit = next_timer; + vcpu->arch.ceded = 0; + save_p9_host_os_sprs(&host_os_sprs); + /* MSR bits may have been cleared by context switch */ + msr = 0; + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr |= MSR_FP; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + msr |= MSR_VEC; + if (cpu_has_feature(CPU_FTR_VSX)) + msr |= MSR_VSX; + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + msr |= MSR_TM; + msr = msr_check_and_set(msr); + kvmppc_subcore_enter_guest(); vc->entry_exit_map = 1; @@ -4174,12 +4190,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu_vpa_increment_dispatch(vcpu); if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + msr = mfmsr(); /* TM restore can update msr */ + } switch_pmu_to_guest(vcpu, &host_os_sprs); - msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); load_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC load_vr_state(&vcpu->arch.vr); @@ -4288,7 +4305,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, restore_p9_host_os_sprs(vcpu, &host_os_sprs); - msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); store_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC store_vr_state(&vcpu->arch.vr); @@ -4851,19 +4867,24 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) unsigned long user_tar = 0; unsigned int user_vrsave; struct kvm *kvm; + unsigned long msr; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; } + /* No need to go into the guest when all we'll do is come back out */ + if (signal_pending(current)) { + run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* * Don't allow entry with a suspended transaction, because * the guest entry/exit code will lose it. - * If the guest has TM enabled, save away their TM-related SPRs - * (they will get restored by the TM unavailable interrupt). */ -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs && (current->thread.regs->msr & MSR_TM)) { if (MSR_TM_ACTIVE(current->thread.regs->msr)) { @@ -4871,12 +4892,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) run->fail_entry.hardware_entry_failure_reason = 0; return -EINVAL; } - /* Enable TM so we can read the TM SPRs */ - mtmsr(mfmsr() | MSR_TM); - current->thread.tm_tfhar = mfspr(SPRN_TFHAR); - current->thread.tm_tfiar = mfspr(SPRN_TFIAR); - current->thread.tm_texasr = mfspr(SPRN_TEXASR); - current->thread.regs->msr &= ~MSR_TM; } #endif @@ -4891,18 +4906,24 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) kvmppc_core_prepare_to_enter(vcpu); - /* No need to go into the guest when all we'll do is come back out */ - if (signal_pending(current)) { - run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } - kvm = vcpu->kvm; atomic_inc(&kvm->arch.vcpus_running); /* Order vcpus_running vs. 
mmu_ready, see kvmppc_alloc_reset_hpt */ smp_mb(); - flush_all_to_thread(current); + msr = 0; + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr |= MSR_FP; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + msr |= MSR_VEC; + if (cpu_has_feature(CPU_FTR_VSX)) + msr |= MSR_VSX; + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + msr |= MSR_TM; + msr = msr_check_and_set(msr); + + kvmppc_save_user_regs(); /* Save userspace EBB and other register values */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) { diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index a7f63082b4e3..fb9cb34445ea 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -224,6 +224,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vc->tb_offset_applied = vc->tb_offset; } + /* Could avoid mfmsr by passing around, but probably no big deal */ msr = mfmsr(); host_hfscr = mfspr(SPRN_HFSCR); From 2251fbe76395e4d89c31099984714c5f1135f052 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:51:58 +1000 Subject: [PATCH 020/240] KVM: PPC: Book3S HV P9: Improve mtmsrd scheduling by delaying MSR[EE] disable Moving the mtmsrd after the host SPRs are saved and before the guest SPRs start to be loaded can prevent an SPR scoreboard stall (because the mtmsrd is L=1 type, which does not cause context synchronisation). This is also now more convenient to combine with the mtmsrd L=0 instruction to enable facilities just below, but that is not done yet. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-21-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 541a023e25dd..6b0689589e13 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4169,6 +4169,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, save_p9_host_os_sprs(&host_os_sprs); + /* + * This could be combined with MSR[RI] clearing, but that expands + * the unrecoverable window. It would be better to cover unrecoverable + * with KVM bad interrupt handling rather than use MSR[RI] at all. + * + * Much more difficult and less worthwhile to combine with IR/DR + * disable. 
+ */ + hard_irq_disable(); + if (lazy_irq_pending()) + return 0; + /* MSR bits may have been cleared by context switch */ msr = 0; if (IS_ENABLED(CONFIG_PPC_FPU)) @@ -4680,6 +4692,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, struct kvmppc_vcore *vc; struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; + unsigned long flags; trace_kvmppc_run_vcpu_enter(vcpu); @@ -4723,11 +4736,11 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (kvm_is_radix(kvm)) kvmppc_prepare_radix_vcpu(vcpu, pcpu); - local_irq_disable(); - hard_irq_disable(); + /* flags save not required, but irq_pmu has no disable/enable API */ + powerpc_local_irq_pmu_save(flags); if (signal_pending(current)) goto sigpend; - if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready) + if (need_resched() || !kvm->arch.mmu_ready) goto out; if (!nested) { @@ -4795,7 +4808,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, } vtime_account_guest_exit(); - local_irq_enable(); + powerpc_local_irq_pmu_restore(flags); cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest); @@ -4853,7 +4866,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, run->exit_reason = KVM_EXIT_INTR; vcpu->arch.ret = -EINTR; out: - local_irq_enable(); + powerpc_local_irq_pmu_restore(flags); preempt_enable(); goto done; } From cf99dedb4b2d2a18e004b1c84852fffa810dc44c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:51:59 +1000 Subject: [PATCH 021/240] KVM: PPC: Book3S HV P9: Add kvmppc_stop_thread to match kvmppc_start_thread Small cleanup makes it a bit easier to match up entry and exit operations. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-22-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6b0689589e13..d326e6a20abd 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3070,6 +3070,13 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) kvmppc_ipi_thread(cpu); } +/* Old path does this in asm */ +static void kvmppc_stop_thread(struct kvm_vcpu *vcpu) +{ + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; +} + static void kvmppc_wait_for_nap(int n_threads) { int cpu = smp_processor_id(); @@ -4310,8 +4317,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, dec = (s32) dec; tb = mftb(); vcpu->arch.dec_expires = dec + tb; - vcpu->cpu = -1; - vcpu->arch.thread_cpu = -1; store_spr_state(vcpu); @@ -4808,6 +4813,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, } vtime_account_guest_exit(); + kvmppc_stop_thread(vcpu); + powerpc_local_irq_pmu_restore(flags); cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest); From 3c1a4322bba79aad2d3f6f996b7e1c336bd909b3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:00 +1000 Subject: [PATCH 022/240] KVM: PPC: Book3S HV: Change dec_expires to be relative to guest timebase Change dec_expires to be relative to the guest timebase, and allow it to be moved into low level P9 guest entry functions, to improve SPR access scheduling. 
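The underlying relation is simple: the guest timebase equals the host timebase plus the vcore's tb_offset, so a guest-relative expiry converts back to host TB with a single subtraction. The new helper in the diff below encodes exactly this:

    /* Expiry time of vcpu DEC relative to host TB */
    static inline u64 kvmppc_dec_expires_host_tb(struct kvm_vcpu *vcpu)
    {
            return vcpu->arch.dec_expires - vcpu->arch.vcore->tb_offset;
    }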
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-23-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s.h | 6 +++ arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 58 +++++++++++++------------ arch/powerpc/kvm/book3s_hv_nested.c | 3 ++ arch/powerpc/kvm/book3s_hv_p9_entry.c | 10 ++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 ------ 6 files changed, 49 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 3d31f2c59e43..91c9f937edcd 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -406,6 +406,12 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) return vcpu->arch.fault_dar; } +/* Expiry time of vcpu DEC relative to host TB */ +static inline u64 kvmppc_dec_expires_host_tb(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.dec_expires - vcpu->arch.vcore->tb_offset; +} + static inline bool is_kvmppc_resume_guest(int r) { return (r == RESUME_GUEST || r == RESUME_GUEST_NV); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e4d23193eba7..21ca15c3bc0b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -741,7 +741,7 @@ struct kvm_vcpu_arch { struct hrtimer dec_timer; u64 dec_jiffies; - u64 dec_expires; + u64 dec_expires; /* Relative to guest timebase. */ unsigned long pending_exceptions; u8 ceded; u8 prodded; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d326e6a20abd..bc4afec760ca 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2261,8 +2261,7 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); break; case KVM_REG_PPC_DEC_EXPIRY: - *val = get_reg_val(id, vcpu->arch.dec_expires + - vcpu->arch.vcore->tb_offset); + *val = get_reg_val(id, vcpu->arch.dec_expires); break; case KVM_REG_PPC_ONLINE: *val = get_reg_val(id, vcpu->arch.online); @@ -2514,8 +2513,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); break; case KVM_REG_PPC_DEC_EXPIRY: - vcpu->arch.dec_expires = set_reg_val(id, *val) - - vcpu->arch.vcore->tb_offset; + vcpu->arch.dec_expires = set_reg_val(id, *val); break; case KVM_REG_PPC_ONLINE: i = set_reg_val(id, *val); @@ -2902,13 +2900,13 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) unsigned long dec_nsec, now; now = get_tb(); - if (now > vcpu->arch.dec_expires) { + if (now > kvmppc_dec_expires_host_tb(vcpu)) { /* decrementer has already gone negative */ kvmppc_core_queue_dec(vcpu); kvmppc_core_prepare_to_enter(vcpu); return; } - dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now); + dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now); hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL); vcpu->arch.timer_running = 1; } @@ -3380,7 +3378,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) */ spin_unlock(&vc->lock); /* cancel pending dec exception if dec is positive */ - if (now < vcpu->arch.dec_expires && + if (now < kvmppc_dec_expires_host_tb(vcpu) && kvmppc_core_pending_dec(vcpu)) kvmppc_core_dequeue_dec(vcpu); @@ -4224,20 +4222,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, load_spr_state(vcpu); - /* - * When setting DEC, we must always deal with irq_work_raise via NMI vs - * setting DEC. 
The problem occurs right as we switch into guest mode - * if a NMI hits and sets pending work and sets DEC, then that will - * apply to the guest and not bring us back to the host. - * - * irq_work_raise could check a flag (or possibly LPCR[HDICE] for - * example) and set HDEC to 1? That wouldn't solve the nested hv - * case which needs to abort the hcall or zero the time limit. - * - * XXX: Another day's problem. - */ - mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb); - if (kvmhv_on_pseries()) { /* * We need to save and restore the guest visible part of the @@ -4263,6 +4247,23 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, hvregs.vcpu_token = vcpu->vcpu_id; } hvregs.hdec_expiry = time_limit; + + /* + * When setting DEC, we must always deal with irq_work_raise + * via NMI vs setting DEC. The problem occurs right as we + * switch into guest mode if a NMI hits and sets pending work + * and sets DEC, then that will apply to the guest and not + * bring us back to the host. + * + * irq_work_raise could check a flag (or possibly LPCR[HDICE] + * for example) and set HDEC to 1? That wouldn't solve the + * nested hv case which needs to abort the hcall or zero the + * time limit. + * + * XXX: Another day's problem. + */ + mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - tb); + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), @@ -4274,6 +4275,12 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); mtspr(SPRN_PSSCR_PR, host_psscr); + dec = mfspr(SPRN_DEC); + if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ + dec = (s32) dec; + tb = mftb(); + vcpu->arch.dec_expires = dec + (tb + vc->tb_offset); + /* H_CEDE has to be handled now, not later */ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && kvmppc_get_gpr(vcpu, 3) == H_CEDE) { @@ -4281,6 +4288,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, kvmppc_set_gpr(vcpu, 3, 0); trap = 0; } + } else { kvmppc_xive_push_vcpu(vcpu); trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr); @@ -4312,12 +4320,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.slb_max = 0; } - dec = mfspr(SPRN_DEC); - if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ - dec = (s32) dec; - tb = mftb(); - vcpu->arch.dec_expires = dec + tb; - store_spr_state(vcpu); restore_p9_host_os_sprs(vcpu, &host_os_sprs); @@ -4827,7 +4829,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, * by L2 and the L1 decrementer is provided in hdec_expires */ if (kvmppc_core_pending_dec(vcpu) && - ((get_tb() < vcpu->arch.dec_expires) || + ((get_tb() < kvmppc_dec_expires_host_tb(vcpu)) || (trap == BOOK3S_INTERRUPT_SYSCALL && kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED))) kvmppc_core_dequeue_dec(vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index ed8a2c9f5629..7bed0b91245e 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -358,6 +358,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* convert TB values/offsets to host (L0) values */ hdec_exp = l2_hv.hdec_expiry - vc->tb_offset; vc->tb_offset += l2_hv.tb_offset; + vcpu->arch.dec_expires += l2_hv.tb_offset; /* set L1 state to L2 state */ vcpu->arch.nested = l2; @@ -399,6 +400,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) if (l2_regs.msr & 
MSR_TS_MASK) vcpu->arch.shregs.msr |= MSR_TS_S; vc->tb_offset = saved_l1_hv.tb_offset; + /* XXX: is this always the same delta as saved_l1_hv.tb_offset? */ + vcpu->arch.dec_expires -= l2_hv.tb_offset; restore_hv_regs(vcpu, &saved_l1_hv); vcpu->arch.purr += delta_purr; vcpu->arch.spurr += delta_spurr; diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index fb9cb34445ea..814b0dfd590f 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -188,7 +188,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvmppc_vcore *vc = vcpu->arch.vcore; - s64 hdec; + s64 hdec, dec; u64 tb, purr, spurr; u64 *exsave; bool ri_set; @@ -317,6 +317,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDEC, hdec); + mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb); + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM tm_return_to_guest: #endif @@ -461,6 +463,12 @@ tm_return_to_guest: vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); + dec = mfspr(SPRN_DEC); + if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ + dec = (s32) dec; + tb = mftb(); + vcpu->arch.dec_expires = dec + tb; + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ mtspr(SPRN_PSSCR, host_psscr | (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index be79ae7afdf5..3f1aeff72438 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -814,10 +814,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) * Set the decrementer to the guest decrementer. */ ld r8,VCPU_DEC_EXPIRES(r4) - /* r8 is a host timebase value here, convert to guest TB */ - ld r5,HSTATE_KVM_VCORE(r13) - ld r6,VCORE_TB_OFFSET_APPL(r5) - add r8,r8,r6 mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 @@ -1192,9 +1188,6 @@ guest_bypass: mftb r6 extsw r5,r5 16: add r5,r5,r6 - /* r5 is a guest timebase value here, convert to host TB */ - ld r4,VCORE_TB_OFFSET_APPL(r3) - subf r5,r4,r5 std r5,VCPU_DEC_EXPIRES(r9) /* Increment exit count, poke other threads to exit */ @@ -2160,9 +2153,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) /* save expiry time of guest decrementer */ add r3, r3, r5 ld r4, HSTATE_KVM_VCPU(r13) - ld r5, HSTATE_KVM_VCORE(r13) - ld r6, VCORE_TB_OFFSET_APPL(r5) - subf r3, r6, r3 /* convert to host TB value */ std r3, VCPU_DEC_EXPIRES(r4) #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING @@ -2259,9 +2249,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM) /* Restore guest decrementer */ ld r3, VCPU_DEC_EXPIRES(r4) - ld r5, HSTATE_KVM_VCORE(r13) - ld r6, VCORE_TB_OFFSET_APPL(r5) - add r3, r3, r6 /* convert host TB to guest TB value */ mftb r7 subf r3, r7, r3 mtspr SPRN_DEC, r3 From 6547af3eba88e4806e853fee7547031b2cc6a560 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:01 +1000 Subject: [PATCH 023/240] KVM: PPC: Book3S HV P9: Move TB updates Move the TB updates between saving and loading guest and host SPRs, to improve scheduling by keeping issue-NTC operations together as much as possible. 
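The TB update sequence itself is unchanged by this patch, only relocated. For reference, it programs the upper 40 bits of the timebase via TBU40, then bumps the target by 2^24 if the free-running low 24 bits are behind it, so the resulting TB is never less than the requested value:

    if (vc->tb_offset) {
            u64 new_tb = tb + vc->tb_offset;
            mtspr(SPRN_TBU40, new_tb);
            tb = mftb();
            if ((tb & 0xffffff) < (new_tb & 0xffffff))
                    mtspr(SPRN_TBU40, new_tb + 0x1000000);
            vc->tb_offset_applied = vc->tb_offset;
    }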
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-24-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 36 +++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 814b0dfd590f..e7793bb806eb 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -215,15 +215,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vcpu->arch.ceded = 0; - if (vc->tb_offset) { - u64 new_tb = tb + vc->tb_offset; - mtspr(SPRN_TBU40, new_tb); - tb = mftb(); - if ((tb & 0xffffff) < (new_tb & 0xffffff)) - mtspr(SPRN_TBU40, new_tb + 0x1000000); - vc->tb_offset_applied = vc->tb_offset; - } - /* Could avoid mfmsr by passing around, but probably no big deal */ msr = mfmsr(); @@ -238,6 +229,15 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc host_dawrx1 = mfspr(SPRN_DAWRX1); } + if (vc->tb_offset) { + u64 new_tb = tb + vc->tb_offset; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = vc->tb_offset; + } + if (vc->pcr) mtspr(SPRN_PCR, vc->pcr | PCR_MASK); mtspr(SPRN_DPDES, vc->dpdes); @@ -469,6 +469,15 @@ tm_return_to_guest: tb = mftb(); vcpu->arch.dec_expires = dec + tb; + if (vc->tb_offset_applied) { + u64 new_tb = tb - vc->tb_offset_applied; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = 0; + } + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ mtspr(SPRN_PSSCR, host_psscr | (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); @@ -503,15 +512,6 @@ tm_return_to_guest: if (vc->pcr) mtspr(SPRN_PCR, PCR_MASK); - if (vc->tb_offset_applied) { - u64 new_tb = mftb() - vc->tb_offset_applied; - mtspr(SPRN_TBU40, new_tb); - tb = mftb(); - if ((tb & 0xffffff) < (new_tb & 0xffffff)) - mtspr(SPRN_TBU40, new_tb + 0x1000000); - vc->tb_offset_applied = 0; - } - /* HDEC must be at least as large as DEC, so decrementer_max fits */ mtspr(SPRN_HDEC, decrementer_max); From cb2553a093093ae46cfaee31321bcedcd0312c5d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:02 +1000 Subject: [PATCH 024/240] KVM: PPC: Book3S HV P9: Optimise timebase reads Reduce the number of mfTB executed by passing the current timebase around entry and exit code rather than read it multiple times. 
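The approach is mechanical: the caller does one serialising mftb read and threads a pointer through the entry path, with callees updating it where they would previously have re-read the register. A sketch of the new calling convention (matching the diff below):

    u64 tb = mftb();	/* one read, reused below */

    vcpu->arch.stolen_logged = vcore_stolen_time(vc, tb);
    kvmppc_create_dtl_entry(vcpu, vc, tb);
    trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb);
    /* on return, tb holds a timebase value read after guest exit */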
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-25-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 88 +++++++++++++----------- arch/powerpc/kvm/book3s_hv_p9_entry.c | 33 +++++---- 3 files changed, 65 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index fff391b9b97b..0a319ed9c2fd 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -154,7 +154,7 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu) return radix; } -int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr); +int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb); #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bc4afec760ca..3a9447f75a9e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -276,22 +276,22 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) * they should never fail.) */ -static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc) +static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb) { unsigned long flags; spin_lock_irqsave(&vc->stoltb_lock, flags); - vc->preempt_tb = mftb(); + vc->preempt_tb = tb; spin_unlock_irqrestore(&vc->stoltb_lock, flags); } -static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc) +static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb) { unsigned long flags; spin_lock_irqsave(&vc->stoltb_lock, flags); if (vc->preempt_tb != TB_NIL) { - vc->stolen_tb += mftb() - vc->preempt_tb; + vc->stolen_tb += tb - vc->preempt_tb; vc->preempt_tb = TB_NIL; } spin_unlock_irqrestore(&vc->stoltb_lock, flags); @@ -301,6 +301,7 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long flags; + u64 now = mftb(); /* * We can test vc->runner without taking the vcore lock, @@ -309,12 +310,12 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) * ever sets it to NULL. 
*/ if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) - kvmppc_core_end_stolen(vc); + kvmppc_core_end_stolen(vc, now); spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && vcpu->arch.busy_preempt != TB_NIL) { - vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt; vcpu->arch.busy_preempt = TB_NIL; } spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); @@ -324,13 +325,14 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long flags; + u64 now = mftb(); if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) - kvmppc_core_start_stolen(vc); + kvmppc_core_start_stolen(vc, now); spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) - vcpu->arch.busy_preempt = mftb(); + vcpu->arch.busy_preempt = now; spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } @@ -685,7 +687,7 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) } static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, - struct kvmppc_vcore *vc) + struct kvmppc_vcore *vc, u64 tb) { struct dtl_entry *dt; struct lppaca *vpa; @@ -696,7 +698,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; - now = mftb(); + now = tb; core_stolen = vcore_stolen_time(vc, now); stolen = core_stolen - vcpu->arch.stolen_logged; vcpu->arch.stolen_logged = core_stolen; @@ -2914,14 +2916,14 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) extern int __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, u64 tb) { u64 now; if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; spin_lock_irq(&vcpu->arch.tbacct_lock); - now = mftb(); + now = tb; vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - vcpu->arch.stolen_logged; vcpu->arch.busy_preempt = now; @@ -3172,14 +3174,14 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) } /* Start accumulating stolen time */ - kvmppc_core_start_stolen(vc); + kvmppc_core_start_stolen(vc, mftb()); } static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc) { struct preempted_vcore_list *lp; - kvmppc_core_end_stolen(vc); + kvmppc_core_end_stolen(vc, mftb()); if (!list_empty(&vc->preempt_list)) { lp = &per_cpu(preempted_vcores, vc->pcpu); spin_lock(&lp->lock); @@ -3306,7 +3308,7 @@ static void prepare_threads(struct kvmppc_vcore *vc) vcpu->arch.ret = RESUME_GUEST; else continue; - kvmppc_remove_runnable(vc, vcpu); + kvmppc_remove_runnable(vc, vcpu, mftb()); wake_up(&vcpu->arch.cpu_run); } } @@ -3325,7 +3327,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) list_del_init(&pvc->preempt_list); if (pvc->runner == NULL) { pvc->vcore_state = VCORE_INACTIVE; - kvmppc_core_end_stolen(pvc); + kvmppc_core_end_stolen(pvc, mftb()); } spin_unlock(&pvc->lock); continue; @@ -3334,7 +3336,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&pvc->lock); continue; } - kvmppc_core_end_stolen(pvc); + kvmppc_core_end_stolen(pvc, mftb()); pvc->vcore_state = VCORE_PIGGYBACK; if (cip->total_threads >= target_threads) break; @@ -3401,7 +3403,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) else ++still_running; } else { - kvmppc_remove_runnable(vc, vcpu); + kvmppc_remove_runnable(vc, vcpu, mftb()); wake_up(&vcpu->arch.cpu_run); 
} } @@ -3410,7 +3412,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) kvmppc_vcore_preempt(vc); } else if (vc->runner) { vc->vcore_state = VCORE_PREEMPT; - kvmppc_core_start_stolen(vc); + kvmppc_core_start_stolen(vc, mftb()); } else { vc->vcore_state = VCORE_INACTIVE; } @@ -3541,7 +3543,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; - kvmppc_remove_runnable(vc, vcpu); + kvmppc_remove_runnable(vc, vcpu, mftb()); wake_up(&vcpu->arch.cpu_run); } goto out; @@ -3673,7 +3675,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) pvc->pcpu = pcpu + thr; for_each_runnable_thread(i, vcpu, pvc) { kvmppc_start_thread(vcpu, pvc); - kvmppc_create_dtl_entry(vcpu, pvc); + kvmppc_create_dtl_entry(vcpu, pvc, mftb()); trace_kvm_guest_enter(vcpu); if (!vcpu->arch.ptid) thr0_done = true; @@ -4152,20 +4154,17 @@ static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu) * Guest entry for POWER9 and later CPUs. */ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, - unsigned long lpcr) + unsigned long lpcr, u64 *tb) { struct kvmppc_vcore *vc = vcpu->arch.vcore; struct p9_host_os_sprs host_os_sprs; s64 dec; - u64 tb, next_timer; + u64 next_timer; unsigned long msr; int trap; - WARN_ON_ONCE(vcpu->arch.ceded); - - tb = mftb(); next_timer = timer_get_next_tb(); - if (tb >= next_timer) + if (*tb >= next_timer) return BOOK3S_INTERRUPT_HV_DECREMENTER; if (next_timer < time_limit) time_limit = next_timer; @@ -4262,7 +4261,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, * * XXX: Another day's problem. */ - mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - tb); + mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb); mtspr(SPRN_DAR, vcpu->arch.shregs.dar); mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); @@ -4278,8 +4277,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ dec = (s32) dec; - tb = mftb(); - vcpu->arch.dec_expires = dec + (tb + vc->tb_offset); + *tb = mftb(); + vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset); /* H_CEDE has to be handled now, not later */ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && @@ -4291,7 +4290,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, } else { kvmppc_xive_push_vcpu(vcpu); - trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr); + trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb); if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && !(vcpu->arch.shregs.msr & MSR_PR)) { unsigned long req = kvmppc_get_gpr(vcpu, 3); @@ -4322,6 +4321,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, store_spr_state(vcpu); + timer_rearm_host_dec(*tb); + restore_p9_host_os_sprs(vcpu, &host_os_sprs); store_fp_state(&vcpu->arch.fp); @@ -4341,8 +4342,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->entry_exit_map = 0x101; vc->in_guest = 0; - timer_rearm_host_dec(tb); - kvmppc_subcore_exit_guest(); return trap; @@ -4596,7 +4595,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu) if ((vc->vcore_state == VCORE_PIGGYBACK || vc->vcore_state == VCORE_RUNNING) && !VCORE_IS_EXITING(vc)) { - kvmppc_create_dtl_entry(vcpu, vc); + kvmppc_create_dtl_entry(vcpu, vc, mftb()); kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if 
(vc->vcore_state == VCORE_SLEEPING) { @@ -4631,7 +4630,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu) for_each_runnable_thread(i, v, vc) { kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { - kvmppc_remove_runnable(vc, v); + kvmppc_remove_runnable(vc, v, mftb()); v->stat.signal_exits++; v->run->exit_reason = KVM_EXIT_INTR; v->arch.ret = -EINTR; @@ -4672,7 +4671,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu) kvmppc_vcore_end_preempt(vc); if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - kvmppc_remove_runnable(vc, vcpu); + kvmppc_remove_runnable(vc, vcpu, mftb()); vcpu->stat.signal_exits++; run->exit_reason = KVM_EXIT_INTR; vcpu->arch.ret = -EINTR; @@ -4700,6 +4699,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; unsigned long flags; + u64 tb; trace_kvmppc_run_vcpu_enter(vcpu); @@ -4710,7 +4710,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vc = vcpu->arch.vcore; vcpu->arch.ceded = 0; vcpu->arch.run_task = current; - vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; vcpu->arch.busy_preempt = TB_NIL; vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; @@ -4735,7 +4734,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, kvmppc_update_vpas(vcpu); init_vcore_to_run(vc); - vc->preempt_tb = TB_NIL; preempt_disable(); pcpu = smp_processor_id(); @@ -4745,6 +4743,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, /* flags save not required, but irq_pmu has no disable/enable API */ powerpc_local_irq_pmu_save(flags); + if (signal_pending(current)) goto sigpend; if (need_resched() || !kvm->arch.mmu_ready) @@ -4767,12 +4766,17 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, goto out; } + tb = mftb(); + + vcpu->arch.stolen_logged = vcore_stolen_time(vc, tb); + vc->preempt_tb = TB_NIL; + kvmppc_clear_host_core(pcpu); local_paca->kvm_hstate.napping = 0; local_paca->kvm_hstate.kvm_split_mode = NULL; kvmppc_start_thread(vcpu, vc); - kvmppc_create_dtl_entry(vcpu, vc); + kvmppc_create_dtl_entry(vcpu, vc, tb); trace_kvm_guest_enter(vcpu); vc->vcore_state = VCORE_RUNNING; @@ -4787,7 +4791,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, /* Tell lockdep that we're about to enable interrupts */ trace_hardirqs_on(); - trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr); + trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb); vcpu->arch.trap = trap; trace_hardirqs_off(); @@ -4829,7 +4833,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, * by L2 and the L1 decrementer is provided in hdec_expires */ if (kvmppc_core_pending_dec(vcpu) && - ((get_tb() < kvmppc_dec_expires_host_tb(vcpu)) || + ((tb < kvmppc_dec_expires_host_tb(vcpu)) || (trap == BOOK3S_INTERRUPT_SYSCALL && kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED))) kvmppc_core_dequeue_dec(vcpu); @@ -4865,7 +4869,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, trace_kvmppc_run_core(vc, 1); done: - kvmppc_remove_runnable(vc, vcpu); + kvmppc_remove_runnable(vc, vcpu, tb); trace_kvmppc_run_vcpu_exit(vcpu); return vcpu->arch.ret; diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index e7793bb806eb..2bd96d8256d1 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -183,13 +183,13 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu) } } -int 
kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) +int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvmppc_vcore *vc = vcpu->arch.vcore; s64 hdec, dec; - u64 tb, purr, spurr; + u64 purr, spurr; u64 *exsave; bool ri_set; int trap; @@ -203,8 +203,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc unsigned long host_dawr1; unsigned long host_dawrx1; - tb = mftb(); - hdec = time_limit - tb; + hdec = time_limit - *tb; if (hdec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; @@ -230,11 +229,13 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc } if (vc->tb_offset) { - u64 new_tb = tb + vc->tb_offset; + u64 new_tb = *tb + vc->tb_offset; mtspr(SPRN_TBU40, new_tb); - tb = mftb(); - if ((tb & 0xffffff) < (new_tb & 0xffffff)) - mtspr(SPRN_TBU40, new_tb + 0x1000000); + if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) { + new_tb += 0x1000000; + mtspr(SPRN_TBU40, new_tb); + } + *tb = new_tb; vc->tb_offset_applied = vc->tb_offset; } @@ -317,7 +318,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDEC, hdec); - mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb); + mtspr(SPRN_DEC, vcpu->arch.dec_expires - *tb); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM tm_return_to_guest: @@ -466,15 +467,17 @@ tm_return_to_guest: dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ dec = (s32) dec; - tb = mftb(); - vcpu->arch.dec_expires = dec + tb; + *tb = mftb(); + vcpu->arch.dec_expires = dec + *tb; if (vc->tb_offset_applied) { - u64 new_tb = tb - vc->tb_offset_applied; + u64 new_tb = *tb - vc->tb_offset_applied; mtspr(SPRN_TBU40, new_tb); - tb = mftb(); - if ((tb & 0xffffff) < (new_tb & 0xffffff)) - mtspr(SPRN_TBU40, new_tb + 0x1000000); + if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) { + new_tb += 0x1000000; + mtspr(SPRN_TBU40, new_tb); + } + *tb = new_tb; vc->tb_offset_applied = 0; } From 9a1e530bbbdaa2184993a7d7fc61d78871540ccd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:03 +1000 Subject: [PATCH 025/240] KVM: PPC: Book3S HV P9: Avoid SPR scoreboard stalls Avoid interleaving mfSPR and mtSPR to reduce SPR scoreboard stalls. 
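The pattern is to batch the mfSPRs and mtSPRs separately so a read result is not immediately consumed by a dependent write. For example, PURR/SPURR accounting now accumulates into paca fields on the exit path and defers the register write-back (per the diff below):

    /* exit: both reads first, no dependent mtspr in between */
    purr = mfspr(SPRN_PURR);
    spurr = mfspr(SPRN_SPURR);
    local_paca->kvm_hstate.host_purr += purr - vcpu->arch.purr;
    local_paca->kvm_hstate.host_spurr += spurr - vcpu->arch.spurr;

    /* ... other exit-side reads ... */

    mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr);
    mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr);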
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-26-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 8 ++++---- arch/powerpc/kvm/book3s_hv_p9_entry.c | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3a9447f75a9e..75a674b5cd84 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4321,10 +4321,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, store_spr_state(vcpu); - timer_rearm_host_dec(*tb); - - restore_p9_host_os_sprs(vcpu, &host_os_sprs); - store_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC store_vr_state(&vcpu->arch.vr); @@ -4339,6 +4335,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, switch_pmu_to_host(vcpu, &host_os_sprs); + timer_rearm_host_dec(*tb); + + restore_p9_host_os_sprs(vcpu, &host_os_sprs); + vc->entry_exit_map = 0x101; vc->in_guest = 0; diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 2bd96d8256d1..bd0021cd3a67 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -228,6 +228,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc host_dawrx1 = mfspr(SPRN_DAWRX1); } + local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); + local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); + if (vc->tb_offset) { u64 new_tb = *tb + vc->tb_offset; mtspr(SPRN_TBU40, new_tb); @@ -244,8 +247,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc mtspr(SPRN_DPDES, vc->dpdes); mtspr(SPRN_VTB, vc->vtb); - local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); - local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); mtspr(SPRN_PURR, vcpu->arch.purr); mtspr(SPRN_SPURR, vcpu->arch.spurr); @@ -448,10 +449,8 @@ tm_return_to_guest: /* Advance host PURR/SPURR by the amount used by guest */ purr = mfspr(SPRN_PURR); spurr = mfspr(SPRN_SPURR); - mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr + - purr - vcpu->arch.purr); - mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr + - spurr - vcpu->arch.spurr); + local_paca->kvm_hstate.host_purr += purr - vcpu->arch.purr; + local_paca->kvm_hstate.host_spurr += spurr - vcpu->arch.spurr; vcpu->arch.purr = purr; vcpu->arch.spurr = spurr; @@ -464,6 +463,9 @@ tm_return_to_guest: vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); + vc->dpdes = mfspr(SPRN_DPDES); + vc->vtb = mfspr(SPRN_VTB); + dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ dec = (s32) dec; @@ -481,6 +483,9 @@ tm_return_to_guest: vc->tb_offset_applied = 0; } + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr); + mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr); + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ mtspr(SPRN_PSSCR, host_psscr | (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); @@ -509,8 +514,6 @@ tm_return_to_guest: if (cpu_has_feature(CPU_FTR_ARCH_31)) asm volatile(PPC_CP_ABORT); - vc->dpdes = mfspr(SPRN_DPDES); - vc->vtb = mfspr(SPRN_VTB); mtspr(SPRN_DPDES, 0); if (vc->pcr) mtspr(SPRN_PCR, PCR_MASK); From 9dfe7aa7bc50556063c8658f59ad475131c09b65 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:04 +1000 Subject: [PATCH 026/240] KVM: PPC: Book3S HV P9: Only execute mtSPR if the value changed Keep better track of the current 
SPR value in places where they are to be loaded with a new context, to reduce expensive mtSPR operations. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-27-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 51 ++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 75a674b5cd84..5c44c4ff5d46 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4055,20 +4055,28 @@ static void switch_pmu_to_host(struct kvm_vcpu *vcpu, } } -static void load_spr_state(struct kvm_vcpu *vcpu) +static void load_spr_state(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) { - mtspr(SPRN_DSCR, vcpu->arch.dscr); - mtspr(SPRN_IAMR, vcpu->arch.iamr); - mtspr(SPRN_PSPB, vcpu->arch.pspb); - mtspr(SPRN_FSCR, vcpu->arch.fscr); mtspr(SPRN_TAR, vcpu->arch.tar); mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); mtspr(SPRN_BESCR, vcpu->arch.bescr); + if (cpu_has_feature(CPU_FTR_P9_TIDR)) mtspr(SPRN_TIDR, vcpu->arch.tid); - mtspr(SPRN_AMR, vcpu->arch.amr); - mtspr(SPRN_UAMOR, vcpu->arch.uamor); + if (host_os_sprs->iamr != vcpu->arch.iamr) + mtspr(SPRN_IAMR, vcpu->arch.iamr); + if (host_os_sprs->amr != vcpu->arch.amr) + mtspr(SPRN_AMR, vcpu->arch.amr); + if (vcpu->arch.uamor != 0) + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + if (host_os_sprs->fscr != vcpu->arch.fscr) + mtspr(SPRN_FSCR, vcpu->arch.fscr); + if (host_os_sprs->dscr != vcpu->arch.dscr) + mtspr(SPRN_DSCR, vcpu->arch.dscr); + if (vcpu->arch.pspb != 0) + mtspr(SPRN_PSPB, vcpu->arch.pspb); /* * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI] @@ -4083,20 +4091,21 @@ static void load_spr_state(struct kvm_vcpu *vcpu) static void store_spr_state(struct kvm_vcpu *vcpu) { - vcpu->arch.ctrl = mfspr(SPRN_CTRLF); - - vcpu->arch.iamr = mfspr(SPRN_IAMR); - vcpu->arch.pspb = mfspr(SPRN_PSPB); - vcpu->arch.fscr = mfspr(SPRN_FSCR); vcpu->arch.tar = mfspr(SPRN_TAR); vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); vcpu->arch.bescr = mfspr(SPRN_BESCR); + if (cpu_has_feature(CPU_FTR_P9_TIDR)) vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.iamr = mfspr(SPRN_IAMR); vcpu->arch.amr = mfspr(SPRN_AMR); vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.fscr = mfspr(SPRN_FSCR); vcpu->arch.dscr = mfspr(SPRN_DSCR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); } static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) @@ -4107,6 +4116,7 @@ static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) host_os_sprs->iamr = mfspr(SPRN_IAMR); host_os_sprs->amr = mfspr(SPRN_AMR); host_os_sprs->fscr = mfspr(SPRN_FSCR); + host_os_sprs->dscr = mfspr(SPRN_DSCR); } /* vcpu guest regs must already be saved */ @@ -4115,19 +4125,20 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, { mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); - mtspr(SPRN_PSPB, 0); - mtspr(SPRN_UAMOR, 0); - - mtspr(SPRN_DSCR, host_os_sprs->dscr); if (cpu_has_feature(CPU_FTR_P9_TIDR)) mtspr(SPRN_TIDR, host_os_sprs->tidr); - mtspr(SPRN_IAMR, host_os_sprs->iamr); - + if (host_os_sprs->iamr != vcpu->arch.iamr) + mtspr(SPRN_IAMR, host_os_sprs->iamr); + if (vcpu->arch.uamor != 0) + mtspr(SPRN_UAMOR, 0); if (host_os_sprs->amr != vcpu->arch.amr) mtspr(SPRN_AMR, host_os_sprs->amr); - if (host_os_sprs->fscr != vcpu->arch.fscr) mtspr(SPRN_FSCR, host_os_sprs->fscr); + if (host_os_sprs->dscr != 
vcpu->arch.dscr) + mtspr(SPRN_DSCR, host_os_sprs->dscr); + if (vcpu->arch.pspb != 0) + mtspr(SPRN_PSPB, 0); /* Save guest CTRL register, set runlatch to 1 */ if (!(vcpu->arch.ctrl & 1)) @@ -4219,7 +4230,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, #endif mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); - load_spr_state(vcpu); + load_spr_state(vcpu, &host_os_sprs); if (kvmhv_on_pseries()) { /* From 0f3b6c4851aef7a98b435c6f08b2c9c88165d254 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:05 +1000 Subject: [PATCH 027/240] KVM: PPC: Book3S HV P9: Juggle SPR switching around This juggles SPR switching on the entry and exit sides to be more symmetric, which makes the next refactoring patch possible with no functional change. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-28-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5c44c4ff5d46..53fe41102c22 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4222,7 +4222,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, msr = mfmsr(); /* TM restore can update msr */ } - switch_pmu_to_guest(vcpu, &host_os_sprs); + load_spr_state(vcpu, &host_os_sprs); load_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC @@ -4230,7 +4230,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, #endif mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); - load_spr_state(vcpu, &host_os_sprs); + switch_pmu_to_guest(vcpu, &host_os_sprs); if (kvmhv_on_pseries()) { /* @@ -4330,6 +4330,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.slb_max = 0; } + switch_pmu_to_host(vcpu, &host_os_sprs); + store_spr_state(vcpu); store_fp_state(&vcpu->arch.fp); @@ -4344,8 +4346,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu_vpa_increment_dispatch(vcpu); - switch_pmu_to_host(vcpu, &host_os_sprs); - timer_rearm_host_dec(*tb); restore_p9_host_os_sprs(vcpu, &host_os_sprs); From 516b334210b831827e0491676625323f484275dd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:06 +1000 Subject: [PATCH 028/240] KVM: PPC: Book3S HV P9: Move vcpu register save/restore into functions This should be no functional difference but makes the caller easier to read. 
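After the refactoring the call sites reduce to a matched pair (abridged from the diff below); the boolean return from the load side flags that the TM restore may have changed the MSR:

    if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
            msr = mfmsr();	/* MSR may have been updated */

    /* ... enter and run the guest ... */

    store_vcpu_state(vcpu);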
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-29-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 65 +++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 53fe41102c22..0eb52f2732a4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4108,6 +4108,44 @@ static void store_spr_state(struct kvm_vcpu *vcpu) vcpu->arch.ctrl = mfspr(SPRN_CTRLF); } +/* Returns true if current MSR and/or guest MSR may have changed */ +static bool load_vcpu_state(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + bool ret = false; + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + ret = true; + } + + load_spr_state(vcpu, host_os_sprs); + + load_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + load_vr_state(&vcpu->arch.vr); +#endif + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); + + return ret; +} + +static void store_vcpu_state(struct kvm_vcpu *vcpu) +{ + store_spr_state(vcpu); + + store_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + store_vr_state(&vcpu->arch.vr); +#endif + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +} + static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) { host_os_sprs->dscr = mfspr(SPRN_DSCR); @@ -4216,19 +4254,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu_vpa_increment_dispatch(vcpu); - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { - kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); - msr = mfmsr(); /* TM restore can update msr */ - } - - load_spr_state(vcpu, &host_os_sprs); - - load_fp_state(&vcpu->arch.fp); -#ifdef CONFIG_ALTIVEC - load_vr_state(&vcpu->arch.vr); -#endif - mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); + if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) + msr = mfmsr(); /* MSR may have been updated */ switch_pmu_to_guest(vcpu, &host_os_sprs); @@ -4332,17 +4359,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, switch_pmu_to_host(vcpu, &host_os_sprs); - store_spr_state(vcpu); - - store_fp_state(&vcpu->arch.fp); -#ifdef CONFIG_ALTIVEC - store_vr_state(&vcpu->arch.vr); -#endif - vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); - - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) - kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + store_vcpu_state(vcpu); vcpu_vpa_increment_dispatch(vcpu); From aabcaf6ae2a0912898bd243f0aec0ce6853983fc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:07 +1000 Subject: [PATCH 029/240] KVM: PPC: Book3S HV P9: Move host OS save/restore functions to built-in Move the P9 guest/host register switching functions to the built-in P9 entry code, and export them for nested to use as well. This allows more flexibility in scheduling these supervisor privileged SPR accesses with the HV privileged and PR SPR accesses in the low level entry code. 
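The mechanics of the move follow the usual built-in/module split, sketched below with invented names (the real series moves more functions and a larger struct): declarations go into a header visible to both sides, and each definition in the always-built-in entry file gains an EXPORT_SYMBOL_GPL so the modular KVM code can still reach it.

    /* shared header (illustrative), included by the built-in entry code
     * and by the modular KVM code */
    struct host_regs { unsigned long amr, iamr; };
    void save_host_regs(struct host_regs *r);

    /* built-in entry file (illustrative) */
    #include <linux/export.h>

    void save_host_regs(struct host_regs *r)
    {
            r->amr = 0;   /* would be mfspr(SPRN_AMR) on real hardware */
            r->iamr = 0;  /* would be mfspr(SPRN_IAMR) */
    }
    EXPORT_SYMBOL_GPL(save_host_regs); /* callable from the kvm-hv module */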
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-30-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 379 +------------------------- arch/powerpc/kvm/book3s_hv.h | 45 +++ arch/powerpc/kvm/book3s_hv_p9_entry.c | 353 ++++++++++++++++++++++++ 3 files changed, 399 insertions(+), 378 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv.h diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0eb52f2732a4..8a9d2314d67c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -80,6 +80,7 @@ #include #include "book3s.h" +#include "book3s_hv.h" #define CREATE_TRACE_POINTS #include "trace_hv.h" @@ -127,11 +128,6 @@ static bool nested = true; module_param(nested, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)"); -static inline bool nesting_enabled(struct kvm *kvm) -{ - return kvm->arch.nested_enable && kvm_is_radix(kvm); -} - static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); /* @@ -3810,379 +3806,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) trace_kvmppc_run_core(vc, 1); } -/* - * Privileged (non-hypervisor) host registers to save. - */ -struct p9_host_os_sprs { - unsigned long dscr; - unsigned long tidr; - unsigned long iamr; - unsigned long amr; - unsigned long fscr; - - unsigned int pmc1; - unsigned int pmc2; - unsigned int pmc3; - unsigned int pmc4; - unsigned int pmc5; - unsigned int pmc6; - unsigned long mmcr0; - unsigned long mmcr1; - unsigned long mmcr2; - unsigned long mmcr3; - unsigned long mmcra; - unsigned long siar; - unsigned long sier1; - unsigned long sier2; - unsigned long sier3; - unsigned long sdar; -}; - -static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra) -{ - if (!(mmcr0 & MMCR0_FC)) - goto do_freeze; - if (mmcra & MMCRA_SAMPLE_ENABLE) - goto do_freeze; - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - if (!(mmcr0 & MMCR0_PMCCEXT)) - goto do_freeze; - if (!(mmcra & MMCRA_BHRB_DISABLE)) - goto do_freeze; - } - return; - -do_freeze: - mmcr0 = MMCR0_FC; - mmcra = 0; - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - mmcr0 |= MMCR0_PMCCEXT; - mmcra = MMCRA_BHRB_DISABLE; - } - - mtspr(SPRN_MMCR0, mmcr0); - mtspr(SPRN_MMCRA, mmcra); - isync(); -} - -static void switch_pmu_to_guest(struct kvm_vcpu *vcpu, - struct p9_host_os_sprs *host_os_sprs) -{ - struct lppaca *lp; - int load_pmu = 1; - - lp = vcpu->arch.vpa.pinned_addr; - if (lp) - load_pmu = lp->pmcregs_in_use; - - /* Save host */ - if (ppc_get_pmu_inuse()) { - /* - * It might be better to put PMU handling (at least for the - * host) in the perf subsystem because it knows more about what - * is being used. 
- */ - - /* POWER9, POWER10 do not implement HPMC or SPMC */ - - host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0); - host_os_sprs->mmcra = mfspr(SPRN_MMCRA); - - freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra); - - host_os_sprs->pmc1 = mfspr(SPRN_PMC1); - host_os_sprs->pmc2 = mfspr(SPRN_PMC2); - host_os_sprs->pmc3 = mfspr(SPRN_PMC3); - host_os_sprs->pmc4 = mfspr(SPRN_PMC4); - host_os_sprs->pmc5 = mfspr(SPRN_PMC5); - host_os_sprs->pmc6 = mfspr(SPRN_PMC6); - host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1); - host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2); - host_os_sprs->sdar = mfspr(SPRN_SDAR); - host_os_sprs->siar = mfspr(SPRN_SIAR); - host_os_sprs->sier1 = mfspr(SPRN_SIER); - - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3); - host_os_sprs->sier2 = mfspr(SPRN_SIER2); - host_os_sprs->sier3 = mfspr(SPRN_SIER3); - } - } - -#ifdef CONFIG_PPC_PSERIES - /* After saving PMU, before loading guest PMU, flip pmcregs_in_use */ - if (kvmhv_on_pseries()) { - barrier(); - get_lppaca()->pmcregs_in_use = load_pmu; - barrier(); - } -#endif - - /* - * Load guest. If the VPA said the PMCs are not in use but the guest - * tried to access them anyway, HFSCR[PM] will be set by the HFAC - * fault so we can make forward progress. - */ - if (load_pmu || (vcpu->arch.hfscr & HFSCR_PM)) { - mtspr(SPRN_PMC1, vcpu->arch.pmc[0]); - mtspr(SPRN_PMC2, vcpu->arch.pmc[1]); - mtspr(SPRN_PMC3, vcpu->arch.pmc[2]); - mtspr(SPRN_PMC4, vcpu->arch.pmc[3]); - mtspr(SPRN_PMC5, vcpu->arch.pmc[4]); - mtspr(SPRN_PMC6, vcpu->arch.pmc[5]); - mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]); - mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]); - mtspr(SPRN_SDAR, vcpu->arch.sdar); - mtspr(SPRN_SIAR, vcpu->arch.siar); - mtspr(SPRN_SIER, vcpu->arch.sier[0]); - - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - mtspr(SPRN_MMCR3, vcpu->arch.mmcr[3]); - mtspr(SPRN_SIER2, vcpu->arch.sier[1]); - mtspr(SPRN_SIER3, vcpu->arch.sier[2]); - } - - /* Set MMCRA then MMCR0 last */ - mtspr(SPRN_MMCRA, vcpu->arch.mmcra); - mtspr(SPRN_MMCR0, vcpu->arch.mmcr[0]); - /* No isync necessary because we're starting counters */ - - if (!vcpu->arch.nested && - (vcpu->arch.hfscr_permitted & HFSCR_PM)) - vcpu->arch.hfscr |= HFSCR_PM; - } -} - -static void switch_pmu_to_host(struct kvm_vcpu *vcpu, - struct p9_host_os_sprs *host_os_sprs) -{ - struct lppaca *lp; - int save_pmu = 1; - - lp = vcpu->arch.vpa.pinned_addr; - if (lp) - save_pmu = lp->pmcregs_in_use; - if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) { - /* - * Save pmu if this guest is capable of running nested guests. - * This is option is for old L1s that do not set their - * lppaca->pmcregs_in_use properly when entering their L2. 
- */ - save_pmu |= nesting_enabled(vcpu->kvm); - } - - if (save_pmu) { - vcpu->arch.mmcr[0] = mfspr(SPRN_MMCR0); - vcpu->arch.mmcra = mfspr(SPRN_MMCRA); - - freeze_pmu(vcpu->arch.mmcr[0], vcpu->arch.mmcra); - - vcpu->arch.pmc[0] = mfspr(SPRN_PMC1); - vcpu->arch.pmc[1] = mfspr(SPRN_PMC2); - vcpu->arch.pmc[2] = mfspr(SPRN_PMC3); - vcpu->arch.pmc[3] = mfspr(SPRN_PMC4); - vcpu->arch.pmc[4] = mfspr(SPRN_PMC5); - vcpu->arch.pmc[5] = mfspr(SPRN_PMC6); - vcpu->arch.mmcr[1] = mfspr(SPRN_MMCR1); - vcpu->arch.mmcr[2] = mfspr(SPRN_MMCR2); - vcpu->arch.sdar = mfspr(SPRN_SDAR); - vcpu->arch.siar = mfspr(SPRN_SIAR); - vcpu->arch.sier[0] = mfspr(SPRN_SIER); - - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - vcpu->arch.mmcr[3] = mfspr(SPRN_MMCR3); - vcpu->arch.sier[1] = mfspr(SPRN_SIER2); - vcpu->arch.sier[2] = mfspr(SPRN_SIER3); - } - - } else if (vcpu->arch.hfscr & HFSCR_PM) { - /* - * The guest accessed PMC SPRs without specifying they should - * be preserved, or it cleared pmcregs_in_use after the last - * access. Just ensure they are frozen. - */ - freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA)); - - /* - * Demand-fault PMU register access in the guest. - * - * This is used to grab the guest's VPA pmcregs_in_use value - * and reflect it into the host's VPA in the case of a nested - * hypervisor. - * - * It also avoids having to zero-out SPRs after each guest - * exit to avoid side-channels when. - * - * This is cleared here when we exit the guest, so later HFSCR - * interrupt handling can add it back to run the guest with - * PM enabled next time. - */ - if (!vcpu->arch.nested) - vcpu->arch.hfscr &= ~HFSCR_PM; - } /* otherwise the PMU should still be frozen */ - -#ifdef CONFIG_PPC_PSERIES - if (kvmhv_on_pseries()) { - barrier(); - get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); - barrier(); - } -#endif - - if (ppc_get_pmu_inuse()) { - mtspr(SPRN_PMC1, host_os_sprs->pmc1); - mtspr(SPRN_PMC2, host_os_sprs->pmc2); - mtspr(SPRN_PMC3, host_os_sprs->pmc3); - mtspr(SPRN_PMC4, host_os_sprs->pmc4); - mtspr(SPRN_PMC5, host_os_sprs->pmc5); - mtspr(SPRN_PMC6, host_os_sprs->pmc6); - mtspr(SPRN_MMCR1, host_os_sprs->mmcr1); - mtspr(SPRN_MMCR2, host_os_sprs->mmcr2); - mtspr(SPRN_SDAR, host_os_sprs->sdar); - mtspr(SPRN_SIAR, host_os_sprs->siar); - mtspr(SPRN_SIER, host_os_sprs->sier1); - - if (cpu_has_feature(CPU_FTR_ARCH_31)) { - mtspr(SPRN_MMCR3, host_os_sprs->mmcr3); - mtspr(SPRN_SIER2, host_os_sprs->sier2); - mtspr(SPRN_SIER3, host_os_sprs->sier3); - } - - /* Set MMCRA then MMCR0 last */ - mtspr(SPRN_MMCRA, host_os_sprs->mmcra); - mtspr(SPRN_MMCR0, host_os_sprs->mmcr0); - isync(); - } -} - -static void load_spr_state(struct kvm_vcpu *vcpu, - struct p9_host_os_sprs *host_os_sprs) -{ - mtspr(SPRN_TAR, vcpu->arch.tar); - mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); - mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); - mtspr(SPRN_BESCR, vcpu->arch.bescr); - - if (cpu_has_feature(CPU_FTR_P9_TIDR)) - mtspr(SPRN_TIDR, vcpu->arch.tid); - if (host_os_sprs->iamr != vcpu->arch.iamr) - mtspr(SPRN_IAMR, vcpu->arch.iamr); - if (host_os_sprs->amr != vcpu->arch.amr) - mtspr(SPRN_AMR, vcpu->arch.amr); - if (vcpu->arch.uamor != 0) - mtspr(SPRN_UAMOR, vcpu->arch.uamor); - if (host_os_sprs->fscr != vcpu->arch.fscr) - mtspr(SPRN_FSCR, vcpu->arch.fscr); - if (host_os_sprs->dscr != vcpu->arch.dscr) - mtspr(SPRN_DSCR, vcpu->arch.dscr); - if (vcpu->arch.pspb != 0) - mtspr(SPRN_PSPB, vcpu->arch.pspb); - - /* - * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI] - * clear (or hstate set appropriately to catch those registers - * being clobbered if 
we take a MCE or SRESET), so those are done - * later. - */ - - if (!(vcpu->arch.ctrl & 1)) - mtspr(SPRN_CTRLT, 0); -} - -static void store_spr_state(struct kvm_vcpu *vcpu) -{ - vcpu->arch.tar = mfspr(SPRN_TAR); - vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); - vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); - vcpu->arch.bescr = mfspr(SPRN_BESCR); - - if (cpu_has_feature(CPU_FTR_P9_TIDR)) - vcpu->arch.tid = mfspr(SPRN_TIDR); - vcpu->arch.iamr = mfspr(SPRN_IAMR); - vcpu->arch.amr = mfspr(SPRN_AMR); - vcpu->arch.uamor = mfspr(SPRN_UAMOR); - vcpu->arch.fscr = mfspr(SPRN_FSCR); - vcpu->arch.dscr = mfspr(SPRN_DSCR); - vcpu->arch.pspb = mfspr(SPRN_PSPB); - - vcpu->arch.ctrl = mfspr(SPRN_CTRLF); -} - -/* Returns true if current MSR and/or guest MSR may have changed */ -static bool load_vcpu_state(struct kvm_vcpu *vcpu, - struct p9_host_os_sprs *host_os_sprs) -{ - bool ret = false; - - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { - kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); - ret = true; - } - - load_spr_state(vcpu, host_os_sprs); - - load_fp_state(&vcpu->arch.fp); -#ifdef CONFIG_ALTIVEC - load_vr_state(&vcpu->arch.vr); -#endif - mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); - - return ret; -} - -static void store_vcpu_state(struct kvm_vcpu *vcpu) -{ - store_spr_state(vcpu); - - store_fp_state(&vcpu->arch.fp); -#ifdef CONFIG_ALTIVEC - store_vr_state(&vcpu->arch.vr); -#endif - vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); - - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) - kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); -} - -static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) -{ - host_os_sprs->dscr = mfspr(SPRN_DSCR); - if (cpu_has_feature(CPU_FTR_P9_TIDR)) - host_os_sprs->tidr = mfspr(SPRN_TIDR); - host_os_sprs->iamr = mfspr(SPRN_IAMR); - host_os_sprs->amr = mfspr(SPRN_AMR); - host_os_sprs->fscr = mfspr(SPRN_FSCR); - host_os_sprs->dscr = mfspr(SPRN_DSCR); -} - -/* vcpu guest regs must already be saved */ -static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, - struct p9_host_os_sprs *host_os_sprs) -{ - mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); - - if (cpu_has_feature(CPU_FTR_P9_TIDR)) - mtspr(SPRN_TIDR, host_os_sprs->tidr); - if (host_os_sprs->iamr != vcpu->arch.iamr) - mtspr(SPRN_IAMR, host_os_sprs->iamr); - if (vcpu->arch.uamor != 0) - mtspr(SPRN_UAMOR, 0); - if (host_os_sprs->amr != vcpu->arch.amr) - mtspr(SPRN_AMR, host_os_sprs->amr); - if (host_os_sprs->fscr != vcpu->arch.fscr) - mtspr(SPRN_FSCR, host_os_sprs->fscr); - if (host_os_sprs->dscr != vcpu->arch.dscr) - mtspr(SPRN_DSCR, host_os_sprs->dscr); - if (vcpu->arch.pspb != 0) - mtspr(SPRN_PSPB, 0); - - /* Save guest CTRL register, set runlatch to 1 */ - if (!(vcpu->arch.ctrl & 1)) - mtspr(SPRN_CTRLT, 1); -} - static inline bool hcall_is_xics(unsigned long req) { return req == H_EOI || req == H_CPPR || req == H_IPI || diff --git a/arch/powerpc/kvm/book3s_hv.h b/arch/powerpc/kvm/book3s_hv.h new file mode 100644 index 000000000000..d7485b9e9762 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Privileged (non-hypervisor) host registers to save. 
+ */ +struct p9_host_os_sprs { + unsigned long dscr; + unsigned long tidr; + unsigned long iamr; + unsigned long amr; + unsigned long fscr; + + unsigned int pmc1; + unsigned int pmc2; + unsigned int pmc3; + unsigned int pmc4; + unsigned int pmc5; + unsigned int pmc6; + unsigned long mmcr0; + unsigned long mmcr1; + unsigned long mmcr2; + unsigned long mmcr3; + unsigned long mmcra; + unsigned long siar; + unsigned long sier1; + unsigned long sier2; + unsigned long sier3; + unsigned long sdar; +}; + +static inline bool nesting_enabled(struct kvm *kvm) +{ + return kvm->arch.nested_enable && kvm_is_radix(kvm); +} + +bool load_vcpu_state(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs); +void store_vcpu_state(struct kvm_vcpu *vcpu); +void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs); +void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs); +void switch_pmu_to_guest(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs); +void switch_pmu_to_host(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs); diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index bd0021cd3a67..784ff5429ebc 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -4,8 +4,361 @@ #include #include #include +#include #include +#include "book3s_hv.h" + +static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra) +{ + if (!(mmcr0 & MMCR0_FC)) + goto do_freeze; + if (mmcra & MMCRA_SAMPLE_ENABLE) + goto do_freeze; + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + if (!(mmcr0 & MMCR0_PMCCEXT)) + goto do_freeze; + if (!(mmcra & MMCRA_BHRB_DISABLE)) + goto do_freeze; + } + return; + +do_freeze: + mmcr0 = MMCR0_FC; + mmcra = 0; + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + mmcr0 |= MMCR0_PMCCEXT; + mmcra = MMCRA_BHRB_DISABLE; + } + + mtspr(SPRN_MMCR0, mmcr0); + mtspr(SPRN_MMCRA, mmcra); + isync(); +} + +void switch_pmu_to_guest(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + struct lppaca *lp; + int load_pmu = 1; + + lp = vcpu->arch.vpa.pinned_addr; + if (lp) + load_pmu = lp->pmcregs_in_use; + + /* Save host */ + if (ppc_get_pmu_inuse()) { + /* + * It might be better to put PMU handling (at least for the + * host) in the perf subsystem because it knows more about what + * is being used. + */ + + /* POWER9, POWER10 do not implement HPMC or SPMC */ + + host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0); + host_os_sprs->mmcra = mfspr(SPRN_MMCRA); + + freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra); + + host_os_sprs->pmc1 = mfspr(SPRN_PMC1); + host_os_sprs->pmc2 = mfspr(SPRN_PMC2); + host_os_sprs->pmc3 = mfspr(SPRN_PMC3); + host_os_sprs->pmc4 = mfspr(SPRN_PMC4); + host_os_sprs->pmc5 = mfspr(SPRN_PMC5); + host_os_sprs->pmc6 = mfspr(SPRN_PMC6); + host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1); + host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2); + host_os_sprs->sdar = mfspr(SPRN_SDAR); + host_os_sprs->siar = mfspr(SPRN_SIAR); + host_os_sprs->sier1 = mfspr(SPRN_SIER); + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3); + host_os_sprs->sier2 = mfspr(SPRN_SIER2); + host_os_sprs->sier3 = mfspr(SPRN_SIER3); + } + } + +#ifdef CONFIG_PPC_PSERIES + /* After saving PMU, before loading guest PMU, flip pmcregs_in_use */ + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = load_pmu; + barrier(); + } +#endif + + /* + * Load guest. 
If the VPA said the PMCs are not in use but the guest + * tried to access them anyway, HFSCR[PM] will be set by the HFAC + * fault so we can make forward progress. + */ + if (load_pmu || (vcpu->arch.hfscr & HFSCR_PM)) { + mtspr(SPRN_PMC1, vcpu->arch.pmc[0]); + mtspr(SPRN_PMC2, vcpu->arch.pmc[1]); + mtspr(SPRN_PMC3, vcpu->arch.pmc[2]); + mtspr(SPRN_PMC4, vcpu->arch.pmc[3]); + mtspr(SPRN_PMC5, vcpu->arch.pmc[4]); + mtspr(SPRN_PMC6, vcpu->arch.pmc[5]); + mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]); + mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]); + mtspr(SPRN_SDAR, vcpu->arch.sdar); + mtspr(SPRN_SIAR, vcpu->arch.siar); + mtspr(SPRN_SIER, vcpu->arch.sier[0]); + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + mtspr(SPRN_MMCR3, vcpu->arch.mmcr[3]); + mtspr(SPRN_SIER2, vcpu->arch.sier[1]); + mtspr(SPRN_SIER3, vcpu->arch.sier[2]); + } + + /* Set MMCRA then MMCR0 last */ + mtspr(SPRN_MMCRA, vcpu->arch.mmcra); + mtspr(SPRN_MMCR0, vcpu->arch.mmcr[0]); + /* No isync necessary because we're starting counters */ + + if (!vcpu->arch.nested && + (vcpu->arch.hfscr_permitted & HFSCR_PM)) + vcpu->arch.hfscr |= HFSCR_PM; + } +} +EXPORT_SYMBOL_GPL(switch_pmu_to_guest); + +void switch_pmu_to_host(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + struct lppaca *lp; + int save_pmu = 1; + + lp = vcpu->arch.vpa.pinned_addr; + if (lp) + save_pmu = lp->pmcregs_in_use; + if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) { + /* + * Save pmu if this guest is capable of running nested guests. + * This is option is for old L1s that do not set their + * lppaca->pmcregs_in_use properly when entering their L2. + */ + save_pmu |= nesting_enabled(vcpu->kvm); + } + + if (save_pmu) { + vcpu->arch.mmcr[0] = mfspr(SPRN_MMCR0); + vcpu->arch.mmcra = mfspr(SPRN_MMCRA); + + freeze_pmu(vcpu->arch.mmcr[0], vcpu->arch.mmcra); + + vcpu->arch.pmc[0] = mfspr(SPRN_PMC1); + vcpu->arch.pmc[1] = mfspr(SPRN_PMC2); + vcpu->arch.pmc[2] = mfspr(SPRN_PMC3); + vcpu->arch.pmc[3] = mfspr(SPRN_PMC4); + vcpu->arch.pmc[4] = mfspr(SPRN_PMC5); + vcpu->arch.pmc[5] = mfspr(SPRN_PMC6); + vcpu->arch.mmcr[1] = mfspr(SPRN_MMCR1); + vcpu->arch.mmcr[2] = mfspr(SPRN_MMCR2); + vcpu->arch.sdar = mfspr(SPRN_SDAR); + vcpu->arch.siar = mfspr(SPRN_SIAR); + vcpu->arch.sier[0] = mfspr(SPRN_SIER); + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + vcpu->arch.mmcr[3] = mfspr(SPRN_MMCR3); + vcpu->arch.sier[1] = mfspr(SPRN_SIER2); + vcpu->arch.sier[2] = mfspr(SPRN_SIER3); + } + + } else if (vcpu->arch.hfscr & HFSCR_PM) { + /* + * The guest accessed PMC SPRs without specifying they should + * be preserved, or it cleared pmcregs_in_use after the last + * access. Just ensure they are frozen. + */ + freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA)); + + /* + * Demand-fault PMU register access in the guest. + * + * This is used to grab the guest's VPA pmcregs_in_use value + * and reflect it into the host's VPA in the case of a nested + * hypervisor. + * + * It also avoids having to zero-out SPRs after each guest + * exit to avoid side-channels when. + * + * This is cleared here when we exit the guest, so later HFSCR + * interrupt handling can add it back to run the guest with + * PM enabled next time. 
+ */ + if (!vcpu->arch.nested) + vcpu->arch.hfscr &= ~HFSCR_PM; + } /* otherwise the PMU should still be frozen */ + +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); + barrier(); + } +#endif + + if (ppc_get_pmu_inuse()) { + mtspr(SPRN_PMC1, host_os_sprs->pmc1); + mtspr(SPRN_PMC2, host_os_sprs->pmc2); + mtspr(SPRN_PMC3, host_os_sprs->pmc3); + mtspr(SPRN_PMC4, host_os_sprs->pmc4); + mtspr(SPRN_PMC5, host_os_sprs->pmc5); + mtspr(SPRN_PMC6, host_os_sprs->pmc6); + mtspr(SPRN_MMCR1, host_os_sprs->mmcr1); + mtspr(SPRN_MMCR2, host_os_sprs->mmcr2); + mtspr(SPRN_SDAR, host_os_sprs->sdar); + mtspr(SPRN_SIAR, host_os_sprs->siar); + mtspr(SPRN_SIER, host_os_sprs->sier1); + + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + mtspr(SPRN_MMCR3, host_os_sprs->mmcr3); + mtspr(SPRN_SIER2, host_os_sprs->sier2); + mtspr(SPRN_SIER3, host_os_sprs->sier3); + } + + /* Set MMCRA then MMCR0 last */ + mtspr(SPRN_MMCRA, host_os_sprs->mmcra); + mtspr(SPRN_MMCR0, host_os_sprs->mmcr0); + isync(); + } +} +EXPORT_SYMBOL_GPL(switch_pmu_to_host); + +static void load_spr_state(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + mtspr(SPRN_TAR, vcpu->arch.tar); + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + + if (cpu_has_feature(CPU_FTR_P9_TIDR)) + mtspr(SPRN_TIDR, vcpu->arch.tid); + if (host_os_sprs->iamr != vcpu->arch.iamr) + mtspr(SPRN_IAMR, vcpu->arch.iamr); + if (host_os_sprs->amr != vcpu->arch.amr) + mtspr(SPRN_AMR, vcpu->arch.amr); + if (vcpu->arch.uamor != 0) + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + if (host_os_sprs->fscr != vcpu->arch.fscr) + mtspr(SPRN_FSCR, vcpu->arch.fscr); + if (host_os_sprs->dscr != vcpu->arch.dscr) + mtspr(SPRN_DSCR, vcpu->arch.dscr); + if (vcpu->arch.pspb != 0) + mtspr(SPRN_PSPB, vcpu->arch.pspb); + + /* + * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI] + * clear (or hstate set appropriately to catch those registers + * being clobbered if we take a MCE or SRESET), so those are done + * later. 
+ */ + + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, 0); +} + +static void store_spr_state(struct kvm_vcpu *vcpu) +{ + vcpu->arch.tar = mfspr(SPRN_TAR); + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + + if (cpu_has_feature(CPU_FTR_P9_TIDR)) + vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.iamr = mfspr(SPRN_IAMR); + vcpu->arch.amr = mfspr(SPRN_AMR); + vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.fscr = mfspr(SPRN_FSCR); + vcpu->arch.dscr = mfspr(SPRN_DSCR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); +} + +/* Returns true if current MSR and/or guest MSR may have changed */ +bool load_vcpu_state(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + bool ret = false; + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + ret = true; + } + + load_spr_state(vcpu, host_os_sprs); + + load_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + load_vr_state(&vcpu->arch.vr); +#endif + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); + + return ret; +} +EXPORT_SYMBOL_GPL(load_vcpu_state); + +void store_vcpu_state(struct kvm_vcpu *vcpu) +{ + store_spr_state(vcpu); + + store_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + store_vr_state(&vcpu->arch.vr); +#endif + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +} +EXPORT_SYMBOL_GPL(store_vcpu_state); + +void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) +{ + if (cpu_has_feature(CPU_FTR_P9_TIDR)) + host_os_sprs->tidr = mfspr(SPRN_TIDR); + host_os_sprs->iamr = mfspr(SPRN_IAMR); + host_os_sprs->amr = mfspr(SPRN_AMR); + host_os_sprs->fscr = mfspr(SPRN_FSCR); + host_os_sprs->dscr = mfspr(SPRN_DSCR); +} +EXPORT_SYMBOL_GPL(save_p9_host_os_sprs); + +/* vcpu guest regs must already be saved */ +void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); + + if (cpu_has_feature(CPU_FTR_P9_TIDR)) + mtspr(SPRN_TIDR, host_os_sprs->tidr); + if (host_os_sprs->iamr != vcpu->arch.iamr) + mtspr(SPRN_IAMR, host_os_sprs->iamr); + if (vcpu->arch.uamor != 0) + mtspr(SPRN_UAMOR, 0); + if (host_os_sprs->amr != vcpu->arch.amr) + mtspr(SPRN_AMR, host_os_sprs->amr); + if (host_os_sprs->fscr != vcpu->arch.fscr) + mtspr(SPRN_FSCR, host_os_sprs->fscr); + if (host_os_sprs->dscr != vcpu->arch.dscr) + mtspr(SPRN_DSCR, host_os_sprs->dscr); + if (vcpu->arch.pspb != 0) + mtspr(SPRN_PSPB, 0); + + /* Save guest CTRL register, set runlatch to 1 */ + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, 1); +} +EXPORT_SYMBOL_GPL(restore_p9_host_os_sprs); + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING static void __start_timing(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next) { From 08b3f08af583c01b3cfdc15bda68063c2a401512 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:08 +1000 Subject: [PATCH 030/240] KVM: PPC: Book3S HV P9: Move nested guest entry into its own function Move the part of the guest entry which is specific to nested HV into its own function. This is just refactoring. 
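Schematically, the refactoring has this shape (a plain C sketch with invented names, not the kernel code itself): the large inline pseries branch of the entry path becomes one helper call, so the caller reads as a simple choice between the nested (hcall) path and the direct HV path.

    static int enter_guest_nested(int time_limit) { (void)time_limit; return 0; /* hcall path */ }
    static int enter_guest_hv(int time_limit)     { (void)time_limit; return 0; /* direct path */ }

    static int guest_entry(int time_limit, int on_pseries)
    {
            if (on_pseries)
                    return enter_guest_nested(time_limit);
            return enter_guest_hv(time_limit);
    }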
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-31-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 125 +++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8a9d2314d67c..69631309b6af 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3822,6 +3822,72 @@ static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu) } } +/* call our hypervisor to load up HV regs and go */ +static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long host_psscr; + struct hv_guest_state hvregs; + int trap; + s64 dec; + + /* + * We need to save and restore the guest visible part of the + * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor + * doesn't do this for us. Note only required if pseries since + * this is done in kvmhv_vcpu_entry_p9() below otherwise. + */ + host_psscr = mfspr(SPRN_PSSCR_PR); + mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); + kvmhv_save_hv_regs(vcpu, &hvregs); + hvregs.lpcr = lpcr; + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + hvregs.version = HV_GUEST_STATE_VERSION; + if (vcpu->arch.nested) { + hvregs.lpid = vcpu->arch.nested->shadow_lpid; + hvregs.vcpu_token = vcpu->arch.nested_vcpu_id; + } else { + hvregs.lpid = vcpu->kvm->arch.lpid; + hvregs.vcpu_token = vcpu->vcpu_id; + } + hvregs.hdec_expiry = time_limit; + + /* + * When setting DEC, we must always deal with irq_work_raise + * via NMI vs setting DEC. The problem occurs right as we + * switch into guest mode if a NMI hits and sets pending work + * and sets DEC, then that will apply to the guest and not + * bring us back to the host. + * + * irq_work_raise could check a flag (or possibly LPCR[HDICE] + * for example) and set HDEC to 1? That wouldn't solve the + * nested hv case which needs to abort the hcall or zero the + * time limit. + * + * XXX: Another day's problem. + */ + mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb); + + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), + __pa(&vcpu->arch.regs)); + kvmhv_restore_hv_return_state(vcpu, &hvregs); + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + vcpu->arch.shregs.dar = mfspr(SPRN_DAR); + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); + mtspr(SPRN_PSSCR_PR, host_psscr); + + dec = mfspr(SPRN_DEC); + if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ + dec = (s32) dec; + *tb = mftb(); + vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset); + + return trap; +} + /* * Guest entry for POWER9 and later CPUs. */ @@ -3830,7 +3896,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, { struct kvmppc_vcore *vc = vcpu->arch.vcore; struct p9_host_os_sprs host_os_sprs; - s64 dec; u64 next_timer; unsigned long msr; int trap; @@ -3883,63 +3948,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, switch_pmu_to_guest(vcpu, &host_os_sprs); if (kvmhv_on_pseries()) { - /* - * We need to save and restore the guest visible part of the - * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor - * doesn't do this for us. Note only required if pseries since - * this is done in kvmhv_vcpu_entry_p9() below otherwise. 
- */ - unsigned long host_psscr; - /* call our hypervisor to load up HV regs and go */ - struct hv_guest_state hvregs; - - host_psscr = mfspr(SPRN_PSSCR_PR); - mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); - kvmhv_save_hv_regs(vcpu, &hvregs); - hvregs.lpcr = lpcr; - vcpu->arch.regs.msr = vcpu->arch.shregs.msr; - hvregs.version = HV_GUEST_STATE_VERSION; - if (vcpu->arch.nested) { - hvregs.lpid = vcpu->arch.nested->shadow_lpid; - hvregs.vcpu_token = vcpu->arch.nested_vcpu_id; - } else { - hvregs.lpid = vcpu->kvm->arch.lpid; - hvregs.vcpu_token = vcpu->vcpu_id; - } - hvregs.hdec_expiry = time_limit; - - /* - * When setting DEC, we must always deal with irq_work_raise - * via NMI vs setting DEC. The problem occurs right as we - * switch into guest mode if a NMI hits and sets pending work - * and sets DEC, then that will apply to the guest and not - * bring us back to the host. - * - * irq_work_raise could check a flag (or possibly LPCR[HDICE] - * for example) and set HDEC to 1? That wouldn't solve the - * nested hv case which needs to abort the hcall or zero the - * time limit. - * - * XXX: Another day's problem. - */ - mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb); - - mtspr(SPRN_DAR, vcpu->arch.shregs.dar); - mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); - trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), - __pa(&vcpu->arch.regs)); - kvmhv_restore_hv_return_state(vcpu, &hvregs); - vcpu->arch.shregs.msr = vcpu->arch.regs.msr; - vcpu->arch.shregs.dar = mfspr(SPRN_DAR); - vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); - vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); - mtspr(SPRN_PSSCR_PR, host_psscr); - - dec = mfspr(SPRN_DEC); - if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ - dec = (s32) dec; - *tb = mftb(); - vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset); + trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb); /* H_CEDE has to be handled now, not later */ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && From d5f480194577423731ee8413791a5486f26a95ab Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:09 +1000 Subject: [PATCH 031/240] KVM: PPC: Book3S HV P9: Move remaining SPR and MSR access into low level entry Move register saving and loading from kvmhv_p9_guest_entry() into the HV and nested entry handlers. Accesses are scheduled to reduce mtSPR / mfSPR interleaving which reduces SPR scoreboard stalls. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-32-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 79 ++++++++++------------ arch/powerpc/kvm/book3s_hv_p9_entry.c | 96 ++++++++++++++++++++------- 2 files changed, 109 insertions(+), 66 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 69631309b6af..40bee0d61482 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3827,9 +3827,15 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns { struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long host_psscr; + unsigned long msr; struct hv_guest_state hvregs; - int trap; + struct p9_host_os_sprs host_os_sprs; s64 dec; + int trap; + + switch_pmu_to_guest(vcpu, &host_os_sprs); + + save_p9_host_os_sprs(&host_os_sprs); /* * We need to save and restore the guest visible part of the @@ -3838,6 +3844,27 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns * this is done in kvmhv_vcpu_entry_p9() below otherwise. 
*/ host_psscr = mfspr(SPRN_PSSCR_PR); + + hard_irq_disable(); + if (lazy_irq_pending()) + return 0; + + /* MSR bits may have been cleared by context switch */ + msr = 0; + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr |= MSR_FP; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + msr |= MSR_VEC; + if (cpu_has_feature(CPU_FTR_VSX)) + msr |= MSR_VSX; + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + msr |= MSR_TM; + msr = msr_check_and_set(msr); + + if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) + msr = mfmsr(); /* TM restore can update msr */ + mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); kvmhv_save_hv_regs(vcpu, &hvregs); hvregs.lpcr = lpcr; @@ -3879,12 +3906,20 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); mtspr(SPRN_PSSCR_PR, host_psscr); + store_vcpu_state(vcpu); + dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ dec = (s32) dec; *tb = mftb(); vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset); + timer_rearm_host_dec(*tb); + + restore_p9_host_os_sprs(vcpu, &host_os_sprs); + + switch_pmu_to_host(vcpu, &host_os_sprs); + return trap; } @@ -3895,9 +3930,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - struct p9_host_os_sprs host_os_sprs; u64 next_timer; - unsigned long msr; int trap; next_timer = timer_get_next_tb(); @@ -3908,33 +3941,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.ceded = 0; - save_p9_host_os_sprs(&host_os_sprs); - - /* - * This could be combined with MSR[RI] clearing, but that expands - * the unrecoverable window. It would be better to cover unrecoverable - * with KVM bad interrupt handling rather than use MSR[RI] at all. - * - * Much more difficult and less worthwhile to combine with IR/DR - * disable. 
- */ - hard_irq_disable(); - if (lazy_irq_pending()) - return 0; - - /* MSR bits may have been cleared by context switch */ - msr = 0; - if (IS_ENABLED(CONFIG_PPC_FPU)) - msr |= MSR_FP; - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - msr |= MSR_VEC; - if (cpu_has_feature(CPU_FTR_VSX)) - msr |= MSR_VSX; - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) - msr |= MSR_TM; - msr = msr_check_and_set(msr); - kvmppc_subcore_enter_guest(); vc->entry_exit_map = 1; @@ -3942,11 +3948,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu_vpa_increment_dispatch(vcpu); - if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) - msr = mfmsr(); /* MSR may have been updated */ - - switch_pmu_to_guest(vcpu, &host_os_sprs); - if (kvmhv_on_pseries()) { trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb); @@ -3989,16 +3990,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.slb_max = 0; } - switch_pmu_to_host(vcpu, &host_os_sprs); - - store_vcpu_state(vcpu); - vcpu_vpa_increment_dispatch(vcpu); - timer_rearm_host_dec(*tb); - - restore_p9_host_os_sprs(vcpu, &host_os_sprs); - vc->entry_exit_map = 0x101; vc->in_guest = 0; diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 784ff5429ebc..fa080533bd8d 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -538,6 +538,7 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu) int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { + struct p9_host_os_sprs host_os_sprs; struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -567,9 +568,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vcpu->arch.ceded = 0; - /* Could avoid mfmsr by passing around, but probably no big deal */ - msr = mfmsr(); - host_hfscr = mfspr(SPRN_HFSCR); host_ciabr = mfspr(SPRN_CIABR); host_dawr0 = mfspr(SPRN_DAWR0); @@ -584,6 +582,41 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); + switch_pmu_to_guest(vcpu, &host_os_sprs); + + save_p9_host_os_sprs(&host_os_sprs); + + /* + * This could be combined with MSR[RI] clearing, but that expands + * the unrecoverable window. It would be better to cover unrecoverable + * with KVM bad interrupt handling rather than use MSR[RI] at all. + * + * Much more difficult and less worthwhile to combine with IR/DR + * disable. + */ + hard_irq_disable(); + if (lazy_irq_pending()) { + trap = 0; + goto out; + } + + /* MSR bits may have been cleared by context switch */ + msr = 0; + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr |= MSR_FP; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + msr |= MSR_VEC; + if (cpu_has_feature(CPU_FTR_VSX)) + msr |= MSR_VSX; + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + msr |= MSR_TM; + msr = msr_check_and_set(msr); + /* Save MSR for restore. This is after hard disable, so EE is clear. 
*/ + + if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) + msr = mfmsr(); /* MSR may have been updated */ + if (vc->tb_offset) { u64 new_tb = *tb + vc->tb_offset; mtspr(SPRN_TBU40, new_tb); @@ -642,6 +675,14 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); + /* + * It might be preferable to load_vcpu_state here, in order to get the + * GPR/FP register loads executing in parallel with the previous mtSPR + * instructions, but for now that can't be done because the TM handling + * in load_vcpu_state can change some SPRs and vcpu state (nip, msr). + * But TM could be split out if this would be a significant benefit. + */ + local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9; /* @@ -819,6 +860,20 @@ tm_return_to_guest: vc->dpdes = mfspr(SPRN_DPDES); vc->vtb = mfspr(SPRN_VTB); + save_clear_guest_mmu(kvm, vcpu); + switch_mmu_to_host(kvm, host_pidr); + + /* + * If we are in real mode, only switch MMU on after the MMU is + * switched to host, to avoid the P9_RADIX_PREFETCH_BUG. + */ + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + vcpu->arch.shregs.msr & MSR_TS_MASK) + msr |= MSR_TS_S; + __mtmsrd(msr, 0); + + store_vcpu_state(vcpu); + dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ dec = (s32) dec; @@ -851,6 +906,19 @@ tm_return_to_guest: mtspr(SPRN_DAWRX1, host_dawrx1); } + mtspr(SPRN_DPDES, 0); + if (vc->pcr) + mtspr(SPRN_PCR, PCR_MASK); + + /* HDEC must be at least as large as DEC, so decrementer_max fits */ + mtspr(SPRN_HDEC, decrementer_max); + + timer_rearm_host_dec(*tb); + + restore_p9_host_os_sprs(vcpu, &host_os_sprs); + + local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE; + if (kvm_is_radix(kvm)) { /* * Since this is radix, do a eieio; tlbsync; ptesync sequence @@ -867,26 +935,8 @@ tm_return_to_guest: if (cpu_has_feature(CPU_FTR_ARCH_31)) asm volatile(PPC_CP_ABORT); - mtspr(SPRN_DPDES, 0); - if (vc->pcr) - mtspr(SPRN_PCR, PCR_MASK); - - /* HDEC must be at least as large as DEC, so decrementer_max fits */ - mtspr(SPRN_HDEC, decrementer_max); - - save_clear_guest_mmu(kvm, vcpu); - switch_mmu_to_host(kvm, host_pidr); - local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE; - - /* - * If we are in real mode, only switch MMU on after the MMU is - * switched to host, to avoid the P9_RADIX_PREFETCH_BUG. - */ - if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && - vcpu->arch.shregs.msr & MSR_TS_MASK) - msr |= MSR_TS_S; - - __mtmsrd(msr, 0); +out: + switch_pmu_to_host(vcpu, &host_os_sprs); end_timing(vcpu); From 3f9e2966d1b0dd81bcfaeb816335e0ddeedde3c1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:10 +1000 Subject: [PATCH 032/240] KVM: PPC: Book3S HV P9: Implement TM fastpath for guest entry/exit If TM is not active, only TM register state needs to be saved and restored, avoiding several mfmsr/mtmsrd instructions and improving performance. 
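The whole fastpath rests on one predicate: when the guest MSR shows no transaction active or suspended, the three TM SPRs can be copied with plain moves and the heavyweight checkpointed-state handling is skipped entirely. A standalone model of that split (the MSR bit placement and all names here are illustrative stand-ins):

    #define MSR_TS_MASK      (3ULL << 33)  /* illustrative transaction-state bits */
    #define MSR_TM_ACTIVE(m) (((m) & MSR_TS_MASK) != 0)

    struct tm_sprs { unsigned long long texasr, tfhar, tfiar; };

    /* stubs standing in for the heavyweight restore path and for mtspr */
    static void full_tm_restore(const struct tm_sprs *t) { (void)t; }
    static void write_spr(const char *name, unsigned long long val) { (void)name; (void)val; }

    static void restore_guest_tm(unsigned long long guest_msr, const struct tm_sprs *t)
    {
            if (MSR_TM_ACTIVE(guest_msr)) {
                    full_tm_restore(t);              /* slow: must recheckpoint state */
            } else {
                    write_spr("TEXASR", t->texasr);  /* fast: three plain SPR moves */
                    write_spr("TFHAR", t->tfhar);
                    write_spr("TFIAR", t->tfiar);
            }
    }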
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-33-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index fa080533bd8d..6bef509bccb8 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -287,11 +287,20 @@ bool load_vcpu_state(struct kvm_vcpu *vcpu, { bool ret = false; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (cpu_has_feature(CPU_FTR_TM) || cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { - kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); - ret = true; + unsigned long guest_msr = vcpu->arch.shregs.msr; + if (MSR_TM_ACTIVE(guest_msr)) { + kvmppc_restore_tm_hv(vcpu, guest_msr, true); + ret = true; + } else { + mtspr(SPRN_TEXASR, vcpu->arch.texasr); + mtspr(SPRN_TFHAR, vcpu->arch.tfhar); + mtspr(SPRN_TFIAR, vcpu->arch.tfiar); + } } +#endif load_spr_state(vcpu, host_os_sprs); @@ -315,9 +324,19 @@ void store_vcpu_state(struct kvm_vcpu *vcpu) #endif vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) - kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { + unsigned long guest_msr = vcpu->arch.shregs.msr; + if (MSR_TM_ACTIVE(guest_msr)) { + kvmppc_save_tm_hv(vcpu, guest_msr, true); + } else { + vcpu->arch.texasr = mfspr(SPRN_TEXASR); + vcpu->arch.tfhar = mfspr(SPRN_TFHAR); + vcpu->arch.tfiar = mfspr(SPRN_TFIAR); + } + } +#endif } EXPORT_SYMBOL_GPL(store_vcpu_state); From 3e7b3379023dad2e78c3200373a6368f5d0ee599 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:11 +1000 Subject: [PATCH 033/240] KVM: PPC: Book3S HV P9: Switch PMU to guest as late as possible This moves the PMU switch to guest as late as possible in entry, and the switch back to host as early as possible at exit. This gives the host as much perf coverage of the KVM entry/exit code as possible. This is slightly suboptimal from an SPR scheduling point of view when the PMU is enabled, but when perf is disabled there is no real difference. 
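The effect is easiest to see as a window: after this change the PMU handoff brackets only the real guest execution, so everything else on the entry/exit path stays visible to host perf. An illustrative sketch (stub names invented; run_guest stands for the real low level entry):

    static void setup_guest_context(void)  { /* still counted by host perf */ }
    static void pmu_to_guest(void)         { }
    static void run_guest(void)            { }
    static void pmu_to_host(void)          { }
    static void restore_host_context(void) { /* counted by host perf again */ }

    static void guest_run_once(void)
    {
            setup_guest_context();
            pmu_to_guest();          /* as late as possible */
            run_guest();
            pmu_to_host();           /* as early as possible */
            restore_host_context();
    }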
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-34-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 6 ++---- arch/powerpc/kvm/book3s_hv_p9_entry.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 40bee0d61482..c14467cf23d3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3833,8 +3833,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns s64 dec; int trap; - switch_pmu_to_guest(vcpu, &host_os_sprs); - save_p9_host_os_sprs(&host_os_sprs); /* @@ -3897,9 +3895,11 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns mtspr(SPRN_DAR, vcpu->arch.shregs.dar); mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + switch_pmu_to_guest(vcpu, &host_os_sprs); trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), __pa(&vcpu->arch.regs)); kvmhv_restore_hv_return_state(vcpu, &hvregs); + switch_pmu_to_host(vcpu, &host_os_sprs); vcpu->arch.shregs.msr = vcpu->arch.regs.msr; vcpu->arch.shregs.dar = mfspr(SPRN_DAR); vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); @@ -3918,8 +3918,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns restore_p9_host_os_sprs(vcpu, &host_os_sprs); - switch_pmu_to_host(vcpu, &host_os_sprs); - return trap; } diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 6bef509bccb8..619bbcd47b92 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -601,8 +601,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); - switch_pmu_to_guest(vcpu, &host_os_sprs); - save_p9_host_os_sprs(&host_os_sprs); /* @@ -744,7 +742,9 @@ tm_return_to_guest: accumulate_time(vcpu, &vcpu->arch.guest_time); + switch_pmu_to_guest(vcpu, &host_os_sprs); kvmppc_p9_enter_guest(vcpu); + switch_pmu_to_host(vcpu, &host_os_sprs); accumulate_time(vcpu, &vcpu->arch.rm_intr); @@ -955,8 +955,6 @@ tm_return_to_guest: asm volatile(PPC_CP_ABORT); out: - switch_pmu_to_host(vcpu, &host_os_sprs); - end_timing(vcpu); return trap; From d55b1eccc7aa14a1750aecf271806365478ca805 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:12 +1000 Subject: [PATCH 034/240] KVM: PPC: Book3S HV P9: Restrict DSISR canary workaround to processors that require it Use CPU_FTR_P9_RADIX_PREFETCH_BUG to apply the workaround, to test for DD2.1 and below processors. This saves a mtSPR in guest entry. 
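The change follows the standard pattern for erratum workarounds: key the extra work off a CPU feature bit so unaffected parts never pay for it. A reduced model (the flag and the canary value here are stand-ins for cpu_has_feature() and the kernel's constant):

    #include <stdbool.h>

    #define HDSISR_CANARY 0x7fff    /* recognisable dummy value; illustrative here */

    static bool radix_prefetch_bug; /* stand-in for the cpu_has_feature() test */

    static void write_hdsisr(unsigned long val) { (void)val; /* would be mtspr() */ }

    static void arm_hdsisr_canary(void)
    {
            /* only DD2.1-and-below parts, which can deliver a stale HDSISR,
             * pay for the extra SPR write */
            if (radix_prefetch_bug)
                    write_hdsisr(HDSISR_CANARY);
    }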
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-35-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 3 ++- arch/powerpc/kvm/book3s_hv_p9_entry.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c14467cf23d3..3795080d5403 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1590,7 +1590,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, unsigned long vsid; long err; - if (vcpu->arch.fault_dsisr == HDSISR_CANARY) { + if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) && + unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) { r = RESUME_GUEST; /* Just retry if it's the canary */ break; } diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 619bbcd47b92..67f57b03a896 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -683,9 +683,11 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc * HDSI which should correctly update the HDSISR the second time HDSI * entry. * - * Just do this on all p9 processors for now. + * The "radix prefetch bug" test can be used to test for this bug, as + * it also exists for DD2.1 and below. */ - mtspr(SPRN_HDSISR, HDSISR_CANARY); + if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) + mtspr(SPRN_HDSISR, HDSISR_CANARY); mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); From 34e02d555d8fa36cc756a083de1eeb56edab0e00 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:13 +1000 Subject: [PATCH 035/240] KVM: PPC: Book3S HV P9: More SPR speed improvements This avoids more scoreboard stalls and reduces mtSPRs. 
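The recurring trick in this patch is the one visible throughout the diff below: cache the host (or previous) value, compare, and only issue the mtSPR when the value actually changes, since every SPR move can stall against the SPR scoreboard. As a minimal model (names invented for the illustration):

    struct spr_cache { unsigned long dawr0; };

    static void write_dawr0(unsigned long val) { (void)val; /* would be mtspr() */ }

    static void load_guest_dawr0(const struct spr_cache *host, unsigned long guest_dawr0)
    {
            if (guest_dawr0 != host->dawr0)  /* skip the expensive move when equal */
                    write_dawr0(guest_dawr0);
    }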
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-36-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 73 ++++++++++++++++----------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 67f57b03a896..a23f09fa7d2d 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -645,24 +645,29 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vc->tb_offset_applied = vc->tb_offset; } - if (vc->pcr) - mtspr(SPRN_PCR, vc->pcr | PCR_MASK); - mtspr(SPRN_DPDES, vc->dpdes); mtspr(SPRN_VTB, vc->vtb); - mtspr(SPRN_PURR, vcpu->arch.purr); mtspr(SPRN_SPURR, vcpu->arch.spurr); + if (vc->pcr) + mtspr(SPRN_PCR, vc->pcr | PCR_MASK); + if (vc->dpdes) + mtspr(SPRN_DPDES, vc->dpdes); + if (dawr_enabled()) { - mtspr(SPRN_DAWR0, vcpu->arch.dawr0); - mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0); + if (vcpu->arch.dawr0 != host_dawr0) + mtspr(SPRN_DAWR0, vcpu->arch.dawr0); + if (vcpu->arch.dawrx0 != host_dawrx0) + mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0); if (cpu_has_feature(CPU_FTR_DAWR1)) { - mtspr(SPRN_DAWR1, vcpu->arch.dawr1); - mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1); + if (vcpu->arch.dawr1 != host_dawr1) + mtspr(SPRN_DAWR1, vcpu->arch.dawr1); + if (vcpu->arch.dawrx1 != host_dawrx1) + mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1); } } - mtspr(SPRN_CIABR, vcpu->arch.ciabr); - mtspr(SPRN_IC, vcpu->arch.ic); + if (vcpu->arch.ciabr != host_ciabr) + mtspr(SPRN_CIABR, vcpu->arch.ciabr); mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); @@ -881,20 +886,6 @@ tm_return_to_guest: vc->dpdes = mfspr(SPRN_DPDES); vc->vtb = mfspr(SPRN_VTB); - save_clear_guest_mmu(kvm, vcpu); - switch_mmu_to_host(kvm, host_pidr); - - /* - * If we are in real mode, only switch MMU on after the MMU is - * switched to host, to avoid the P9_RADIX_PREFETCH_BUG. - */ - if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && - vcpu->arch.shregs.msr & MSR_TS_MASK) - msr |= MSR_TS_S; - __mtmsrd(msr, 0); - - store_vcpu_state(vcpu); - dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */ dec = (s32) dec; @@ -912,6 +903,22 @@ tm_return_to_guest: vc->tb_offset_applied = 0; } + save_clear_guest_mmu(kvm, vcpu); + switch_mmu_to_host(kvm, host_pidr); + + /* + * Enable MSR here in order to have facilities enabled to save + * guest registers. This enables MMU (if we were in realmode), so + * only switch MMU on after the MMU is switched to host, to avoid + * the P9_RADIX_PREFETCH_BUG or hash guest context. 
+ */ + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + vcpu->arch.shregs.msr & MSR_TS_MASK) + msr |= MSR_TS_S; + __mtmsrd(msr, 0); + + store_vcpu_state(vcpu); + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr); mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr); @@ -919,15 +926,21 @@ tm_return_to_guest: mtspr(SPRN_PSSCR, host_psscr | (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); mtspr(SPRN_HFSCR, host_hfscr); - mtspr(SPRN_CIABR, host_ciabr); - mtspr(SPRN_DAWR0, host_dawr0); - mtspr(SPRN_DAWRX0, host_dawrx0); + if (vcpu->arch.ciabr != host_ciabr) + mtspr(SPRN_CIABR, host_ciabr); + if (vcpu->arch.dawr0 != host_dawr0) + mtspr(SPRN_DAWR0, host_dawr0); + if (vcpu->arch.dawrx0 != host_dawrx0) + mtspr(SPRN_DAWRX0, host_dawrx0); if (cpu_has_feature(CPU_FTR_DAWR1)) { - mtspr(SPRN_DAWR1, host_dawr1); - mtspr(SPRN_DAWRX1, host_dawrx1); + if (vcpu->arch.dawr1 != host_dawr1) + mtspr(SPRN_DAWR1, host_dawr1); + if (vcpu->arch.dawrx1 != host_dawrx1) + mtspr(SPRN_DAWRX1, host_dawrx1); } - mtspr(SPRN_DPDES, 0); + if (vc->dpdes) + mtspr(SPRN_DPDES, 0); if (vc->pcr) mtspr(SPRN_PCR, PCR_MASK); From a3e18ca8ab6f7f2260978f0a3842845414d799c0 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:14 +1000 Subject: [PATCH 036/240] KVM: PPC: Book3S HV P9: Demand fault EBB facility registers Use HFSCR facility disabling to implement demand faulting for EBB, with a hysteresis counter similar to the load_fp etc counters in context switching that implement the equivalent demand faulting for userspace facilities. This speeds up guest entry/exit by avoiding the register save/restore when a guest is not frequently using them. When a guest does use them often, there will be some additional demand fault overhead, but these are not commonly used facilities. 
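The hysteresis counter works the same way as load_fp in the context switch code: every exit that finds the facility enabled increments a u8, and only when the counter wraps back to zero is the facility disabled again, so a guest that uses EBB heavily re-faults at most about once per 256 uses. A standalone model of the wrap logic:

    #include <stdint.h>
    #include <stdio.h>

    struct vcpu_model { uint8_t load_ebb; int hfscr_ebb; };

    static void on_guest_exit_with_ebb(struct vcpu_model *v)
    {
            v->load_ebb++;
            if (!v->load_ebb)         /* u8 wrapped: 256 uses since the fault */
                    v->hfscr_ebb = 0; /* disable; the next guest use demand-faults */
    }

    int main(void)
    {
            struct vcpu_model v = { .load_ebb = 0, .hfscr_ebb = 1 };
            for (int i = 0; i < 256; i++)
                    on_guest_exit_with_ebb(&v);
            printf("EBB enabled after 256 exits: %d\n", v.hfscr_ebb); /* prints 0 */
            return 0;
    }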
Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-37-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 16 +++++++++++++-- arch/powerpc/kvm/book3s_hv_p9_entry.c | 28 +++++++++++++++++++++------ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 21ca15c3bc0b..7a55b19eb6c0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -579,6 +579,7 @@ struct kvm_vcpu_arch { ulong cfar; ulong ppr; u32 pspb; + u8 load_ebb; ulong fscr; ulong shadow_fscr; ulong ebbhr; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3795080d5403..da29cf9236c8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1436,6 +1436,16 @@ static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB)) + return EMULATE_FAIL; + + vcpu->arch.hfscr |= HFSCR_EBB; + + return RESUME_GUEST; +} + static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -1727,6 +1737,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, r = kvmppc_emulate_doorbell_instr(vcpu); if (cause == FSCR_PM_LG) r = kvmppc_pmu_unavailable(vcpu); + if (cause == FSCR_EBB_LG) + r = kvmppc_ebb_unavailable(vcpu); } if (r == EMULATE_FAIL) { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); @@ -2771,9 +2783,9 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) vcpu->arch.hfscr_permitted = vcpu->arch.hfscr; /* - * PM is demand-faulted so start with it clear. + * PM, EBB is demand-faulted so start with it clear. */ - vcpu->arch.hfscr &= ~HFSCR_PM; + vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB); kvmppc_mmu_book3s_hv_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index a23f09fa7d2d..929a7c336b09 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -232,9 +232,12 @@ static void load_spr_state(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { mtspr(SPRN_TAR, vcpu->arch.tar); - mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); - mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); - mtspr(SPRN_BESCR, vcpu->arch.bescr); + + if (vcpu->arch.hfscr & HFSCR_EBB) { + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + } if (cpu_has_feature(CPU_FTR_P9_TIDR)) mtspr(SPRN_TIDR, vcpu->arch.tid); @@ -265,9 +268,22 @@ static void load_spr_state(struct kvm_vcpu *vcpu, static void store_spr_state(struct kvm_vcpu *vcpu) { vcpu->arch.tar = mfspr(SPRN_TAR); - vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); - vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); - vcpu->arch.bescr = mfspr(SPRN_BESCR); + + if (vcpu->arch.hfscr & HFSCR_EBB) { + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + /* + * This is like load_fp in context switching, turn off the + * facility after it wraps the u8 to try avoiding saving + * and restoring the registers each partition switch. 
+ */ + if (!vcpu->arch.nested) { + vcpu->arch.load_ebb++; + if (!vcpu->arch.load_ebb) + vcpu->arch.hfscr &= ~HFSCR_EBB; + } + } if (cpu_has_feature(CPU_FTR_P9_TIDR)) vcpu->arch.tid = mfspr(SPRN_TIDR); From 022ecb960c89faad42ff0b417a71d9255dd115a3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:15 +1000 Subject: [PATCH 037/240] KVM: PPC: Book3S HV P9: Demand fault TM facility registers Use HFSCR facility disabling to implement demand faulting for TM, with a hysteresis counter similar to the load_fp etc counters in context switching that implement the equivalent demand faulting for userspace facilities. This speeds up guest entry/exit by avoiding the register save/restore when a guest is not frequently using them. When a guest does use them often, there will be some additional demand fault overhead, but these are not commonly used facilities. Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-38-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_host.h | 3 +++ arch/powerpc/kvm/book3s_hv.c | 26 ++++++++++++++++++++------ arch/powerpc/kvm/book3s_hv_p9_entry.c | 15 +++++++++++---- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7a55b19eb6c0..d7004412b859 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -580,6 +580,9 @@ struct kvm_vcpu_arch { ulong ppr; u32 pspb; u8 load_ebb; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + u8 load_tm; +#endif ulong fscr; ulong shadow_fscr; ulong ebbhr; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index da29cf9236c8..198f6d997330 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1446,6 +1446,16 @@ static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hfscr_permitted & HFSCR_TM)) + return EMULATE_FAIL; + + vcpu->arch.hfscr |= HFSCR_TM; + + return RESUME_GUEST; +} + static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -1739,6 +1749,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, r = kvmppc_pmu_unavailable(vcpu); if (cause == FSCR_EBB_LG) r = kvmppc_ebb_unavailable(vcpu); + if (cause == FSCR_TM_LG) + r = kvmppc_tm_unavailable(vcpu); } if (r == EMULATE_FAIL) { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); @@ -2783,9 +2795,9 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) vcpu->arch.hfscr_permitted = vcpu->arch.hfscr; /* - * PM, EBB is demand-faulted so start with it clear. + * PM, EBB, TM are demand-faulted so start with it clear. 
*/ - vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB); + vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM); kvmppc_mmu_book3s_hv_init(vcpu); @@ -3868,8 +3880,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns msr |= MSR_VEC; if (cpu_has_feature(CPU_FTR_VSX)) msr |= MSR_VSX; - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + if ((cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) && + (vcpu->arch.hfscr & HFSCR_TM)) msr |= MSR_TM; msr = msr_check_and_set(msr); @@ -4608,8 +4621,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) msr |= MSR_VEC; if (cpu_has_feature(CPU_FTR_VSX)) msr |= MSR_VSX; - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + if ((cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) && + (vcpu->arch.hfscr & HFSCR_TM)) msr |= MSR_TM; msr = msr_check_and_set(msr); diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 929a7c336b09..8499e8a9ca8f 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -310,7 +310,7 @@ bool load_vcpu_state(struct kvm_vcpu *vcpu, if (MSR_TM_ACTIVE(guest_msr)) { kvmppc_restore_tm_hv(vcpu, guest_msr, true); ret = true; - } else { + } else if (vcpu->arch.hfscr & HFSCR_TM) { mtspr(SPRN_TEXASR, vcpu->arch.texasr); mtspr(SPRN_TFHAR, vcpu->arch.tfhar); mtspr(SPRN_TFIAR, vcpu->arch.tfiar); @@ -346,10 +346,16 @@ void store_vcpu_state(struct kvm_vcpu *vcpu) unsigned long guest_msr = vcpu->arch.shregs.msr; if (MSR_TM_ACTIVE(guest_msr)) { kvmppc_save_tm_hv(vcpu, guest_msr, true); - } else { + } else if (vcpu->arch.hfscr & HFSCR_TM) { vcpu->arch.texasr = mfspr(SPRN_TEXASR); vcpu->arch.tfhar = mfspr(SPRN_TFHAR); vcpu->arch.tfiar = mfspr(SPRN_TFIAR); + + if (!vcpu->arch.nested) { + vcpu->arch.load_tm++; /* see load_ebb comment */ + if (!vcpu->arch.load_tm) + vcpu->arch.hfscr &= ~HFSCR_TM; + } } } #endif @@ -641,8 +647,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc msr |= MSR_VEC; if (cpu_has_feature(CPU_FTR_VSX)) msr |= MSR_VSX; - if (cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + if ((cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) && + (vcpu->arch.hfscr & HFSCR_TM)) msr |= MSR_TM; msr = msr_check_and_set(msr); /* Save MSR for restore. This is after hard disable, so EE is clear. */ From 5236756d04454c7ce9f45e27b434d75b8d6f8759 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:16 +1000 Subject: [PATCH 038/240] KVM: PPC: Book3S HV P9: Use Linux SPR save/restore to manage some host SPRs Linux implements SPR save/restore including storage space for registers in the task struct for process context switching. Make use of this similarly to the way we make use of the context switching fp/vec save restore. This improves code reuse, allows some stack space to be saved, and helps with avoiding VRSAVE updates if they are not required. 
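The payoff is the compare-against-cached-host-value pattern that the
thread struct makes possible. A minimal stand-alone model, with plain
variables standing in for one SPR (DSCR) and for thread_struct; none of
these names are the kernel's:

    #include <stdint.h>

    static uint64_t spr_dscr;                   /* models the register */
    static inline void mtspr_dscr(uint64_t v) { spr_dscr = v; }

    struct thread_model { uint64_t dscr; };  /* host copy, kept by save_sprs() */
    struct vcpu_model { uint64_t dscr; };    /* guest copy */

    /* Entry: the host value is already cached in the thread struct, so
     * no mfspr is needed and the mtspr is skipped on a match. */
    static void load_guest_dscr(const struct thread_model *thr,
                                const struct vcpu_model *v)
    {
        if (thr->dscr != v->dscr)
            mtspr_dscr(v->dscr);
    }

    /* Exit: restore from the same cache. This must run before anything
     * that could context switch, or the switch would save guest values
     * into the thread struct. */
    static void restore_host_dscr(const struct thread_model *thr,
                                  const struct vcpu_model *v)
    {
        if (thr->dscr != v->dscr)
            mtspr_dscr(thr->dscr);
    }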
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-39-npiggin@gmail.com --- arch/powerpc/include/asm/switch_to.h | 1 + arch/powerpc/kernel/process.c | 6 ++ arch/powerpc/kvm/book3s_hv.c | 21 +----- arch/powerpc/kvm/book3s_hv.h | 3 - arch/powerpc/kvm/book3s_hv_p9_entry.c | 93 +++++++++++++++++++-------- 5 files changed, 73 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index e8013cd6b646..1f43ef696033 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -113,6 +113,7 @@ static inline void clear_task_ebb(struct task_struct *t) } void kvmppc_save_user_regs(void); +void kvmppc_save_current_sprs(void); extern int set_thread_tidr(struct task_struct *t); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8f841fbe16ad..5d2333d2a283 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1182,6 +1182,12 @@ void kvmppc_save_user_regs(void) #endif } EXPORT_SYMBOL_GPL(kvmppc_save_user_regs); + +void kvmppc_save_current_sprs(void) +{ + save_sprs(¤t->thread); +} +EXPORT_SYMBOL_GPL(kvmppc_save_current_sprs); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ static inline void restore_sprs(struct thread_struct *old_thread, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 198f6d997330..7d48aa8aebb2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4566,9 +4566,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; int r; int srcu_idx; - unsigned long ebb_regs[3] = {}; /* shut up GCC */ - unsigned long user_tar = 0; - unsigned int user_vrsave; struct kvm *kvm; unsigned long msr; @@ -4629,14 +4626,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) kvmppc_save_user_regs(); - /* Save userspace EBB and other register values */ - if (cpu_has_feature(CPU_FTR_ARCH_207S)) { - ebb_regs[0] = mfspr(SPRN_EBBHR); - ebb_regs[1] = mfspr(SPRN_EBBRR); - ebb_regs[2] = mfspr(SPRN_BESCR); - user_tar = mfspr(SPRN_TAR); - } - user_vrsave = mfspr(SPRN_VRSAVE); + kvmppc_save_current_sprs(); vcpu->arch.waitp = &vcpu->arch.vcore->wait; vcpu->arch.pgdir = kvm->mm->pgd; @@ -4677,15 +4667,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) } } while (is_kvmppc_resume_guest(r)); - /* Restore userspace EBB and other register values */ - if (cpu_has_feature(CPU_FTR_ARCH_207S)) { - mtspr(SPRN_EBBHR, ebb_regs[0]); - mtspr(SPRN_EBBRR, ebb_regs[1]); - mtspr(SPRN_BESCR, ebb_regs[2]); - mtspr(SPRN_TAR, user_tar); - } - mtspr(SPRN_VRSAVE, user_vrsave); - vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&kvm->arch.vcpus_running); diff --git a/arch/powerpc/kvm/book3s_hv.h b/arch/powerpc/kvm/book3s_hv.h index d7485b9e9762..6b7f07d9026b 100644 --- a/arch/powerpc/kvm/book3s_hv.h +++ b/arch/powerpc/kvm/book3s_hv.h @@ -4,11 +4,8 @@ * Privileged (non-hypervisor) host registers to save. 
*/ struct p9_host_os_sprs { - unsigned long dscr; - unsigned long tidr; unsigned long iamr; unsigned long amr; - unsigned long fscr; unsigned int pmc1; unsigned int pmc2; diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 8499e8a9ca8f..093ac0453d91 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -231,15 +231,26 @@ EXPORT_SYMBOL_GPL(switch_pmu_to_host); static void load_spr_state(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { + /* TAR is very fast */ mtspr(SPRN_TAR, vcpu->arch.tar); +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC) && + current->thread.vrsave != vcpu->arch.vrsave) + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); +#endif + if (vcpu->arch.hfscr & HFSCR_EBB) { - mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); - mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); - mtspr(SPRN_BESCR, vcpu->arch.bescr); + if (current->thread.ebbhr != vcpu->arch.ebbhr) + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + if (current->thread.ebbrr != vcpu->arch.ebbrr) + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + if (current->thread.bescr != vcpu->arch.bescr) + mtspr(SPRN_BESCR, vcpu->arch.bescr); } - if (cpu_has_feature(CPU_FTR_P9_TIDR)) + if (cpu_has_feature(CPU_FTR_P9_TIDR) && + current->thread.tidr != vcpu->arch.tid) mtspr(SPRN_TIDR, vcpu->arch.tid); if (host_os_sprs->iamr != vcpu->arch.iamr) mtspr(SPRN_IAMR, vcpu->arch.iamr); @@ -247,9 +258,9 @@ static void load_spr_state(struct kvm_vcpu *vcpu, mtspr(SPRN_AMR, vcpu->arch.amr); if (vcpu->arch.uamor != 0) mtspr(SPRN_UAMOR, vcpu->arch.uamor); - if (host_os_sprs->fscr != vcpu->arch.fscr) + if (current->thread.fscr != vcpu->arch.fscr) mtspr(SPRN_FSCR, vcpu->arch.fscr); - if (host_os_sprs->dscr != vcpu->arch.dscr) + if (current->thread.dscr != vcpu->arch.dscr) mtspr(SPRN_DSCR, vcpu->arch.dscr); if (vcpu->arch.pspb != 0) mtspr(SPRN_PSPB, vcpu->arch.pspb); @@ -269,20 +280,15 @@ static void store_spr_state(struct kvm_vcpu *vcpu) { vcpu->arch.tar = mfspr(SPRN_TAR); +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); +#endif + if (vcpu->arch.hfscr & HFSCR_EBB) { vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); vcpu->arch.bescr = mfspr(SPRN_BESCR); - /* - * This is like load_fp in context switching, turn off the - * facility after it wraps the u8 to try avoiding saving - * and restoring the registers each partition switch. 
-		 */
-		if (!vcpu->arch.nested) {
-			vcpu->arch.load_ebb++;
-			if (!vcpu->arch.load_ebb)
-				vcpu->arch.hfscr &= ~HFSCR_EBB;
-		}
 	}
 
 	if (cpu_has_feature(CPU_FTR_P9_TIDR))
@@ -324,7 +330,6 @@ bool load_vcpu_state(struct kvm_vcpu *vcpu,
 #ifdef CONFIG_ALTIVEC
 	load_vr_state(&vcpu->arch.vr);
 #endif
-	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 
 	return ret;
 }
@@ -338,7 +343,6 @@ void store_vcpu_state(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_ALTIVEC
 	store_vr_state(&vcpu->arch.vr);
 #endif
-	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (cpu_has_feature(CPU_FTR_TM) ||
@@ -364,12 +368,8 @@ EXPORT_SYMBOL_GPL(store_vcpu_state);
 
 void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
 {
-	if (cpu_has_feature(CPU_FTR_P9_TIDR))
-		host_os_sprs->tidr = mfspr(SPRN_TIDR);
 	host_os_sprs->iamr = mfspr(SPRN_IAMR);
 	host_os_sprs->amr = mfspr(SPRN_AMR);
-	host_os_sprs->fscr = mfspr(SPRN_FSCR);
-	host_os_sprs->dscr = mfspr(SPRN_DSCR);
 }
 EXPORT_SYMBOL_GPL(save_p9_host_os_sprs);
 
@@ -377,26 +377,63 @@ EXPORT_SYMBOL_GPL(save_p9_host_os_sprs);
 void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
 			     struct p9_host_os_sprs *host_os_sprs)
 {
+	/*
+	 * current->thread.xxx registers must all be restored to host
+	 * values before a potential context switch, otherwise the context
+	 * switch itself will overwrite current->thread.xxx with the values
+	 * from the guest SPRs.
+	 */
+
 	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
-	if (cpu_has_feature(CPU_FTR_P9_TIDR))
-		mtspr(SPRN_TIDR, host_os_sprs->tidr);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR) &&
+	    current->thread.tidr != vcpu->arch.tid)
+		mtspr(SPRN_TIDR, current->thread.tidr);
 	if (host_os_sprs->iamr != vcpu->arch.iamr)
 		mtspr(SPRN_IAMR, host_os_sprs->iamr);
 	if (vcpu->arch.uamor != 0)
 		mtspr(SPRN_UAMOR, 0);
 	if (host_os_sprs->amr != vcpu->arch.amr)
 		mtspr(SPRN_AMR, host_os_sprs->amr);
-	if (host_os_sprs->fscr != vcpu->arch.fscr)
-		mtspr(SPRN_FSCR, host_os_sprs->fscr);
-	if (host_os_sprs->dscr != vcpu->arch.dscr)
-		mtspr(SPRN_DSCR, host_os_sprs->dscr);
+	if (current->thread.fscr != vcpu->arch.fscr)
+		mtspr(SPRN_FSCR, current->thread.fscr);
+	if (current->thread.dscr != vcpu->arch.dscr)
+		mtspr(SPRN_DSCR, current->thread.dscr);
 	if (vcpu->arch.pspb != 0)
 		mtspr(SPRN_PSPB, 0);
 
 	/* Save guest CTRL register, set runlatch to 1 */
 	if (!(vcpu->arch.ctrl & 1))
 		mtspr(SPRN_CTRLT, 1);
+
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+	    vcpu->arch.vrsave != current->thread.vrsave)
+		mtspr(SPRN_VRSAVE, current->thread.vrsave);
+#endif
+	if (vcpu->arch.hfscr & HFSCR_EBB) {
+		if (vcpu->arch.bescr != current->thread.bescr)
+			mtspr(SPRN_BESCR, current->thread.bescr);
+		if (vcpu->arch.ebbhr != current->thread.ebbhr)
+			mtspr(SPRN_EBBHR, current->thread.ebbhr);
+		if (vcpu->arch.ebbrr != current->thread.ebbrr)
+			mtspr(SPRN_EBBRR, current->thread.ebbrr);
+
+		if (!vcpu->arch.nested) {
+			/*
+			 * This is like load_fp in context switching, turn off
			 * the facility after it wraps the u8 to try avoiding
			 * saving and restoring the registers each partition
			 * switch.
+ */ + vcpu->arch.load_ebb++; + if (!vcpu->arch.load_ebb) + vcpu->arch.hfscr &= ~HFSCR_EBB; + } + } + + if (vcpu->arch.tar != current->thread.tar) + mtspr(SPRN_TAR, current->thread.tar); } EXPORT_SYMBOL_GPL(restore_p9_host_os_sprs); From cf3b16cfa6503b1fe5e680f9711262e6a51ef097 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:17 +1000 Subject: [PATCH 039/240] KVM: PPC: Book3S HV P9: Comment and fix MMU context switching code Tighten up partition switching code synchronisation and comments. In particular, hwsync ; isync is required after the last access that is performed in the context of a partition, before the partition is switched away from. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-40-npiggin@gmail.com --- arch/powerpc/kvm/book3s_64_entry.S | 11 +++++-- arch/powerpc/kvm/book3s_64_mmu_radix.c | 4 +++ arch/powerpc/kvm/book3s_hv_p9_entry.c | 40 +++++++++++++++++++------- 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S index 983b8c18bc31..05e003eb5d90 100644 --- a/arch/powerpc/kvm/book3s_64_entry.S +++ b/arch/powerpc/kvm/book3s_64_entry.S @@ -374,11 +374,16 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) BEGIN_FTR_SECTION mtspr SPRN_DAWRX1,r10 END_FTR_SECTION_IFSET(CPU_FTR_DAWR1) - mtspr SPRN_PID,r10 /* - * Switch to host MMU mode + * Switch to host MMU mode (don't have the real host PID but we aren't + * going back to userspace). */ + hwsync + isync + + mtspr SPRN_PID,r10 + ld r10, HSTATE_KVM_VCPU(r13) ld r10, VCPU_KVM(r10) lwz r10, KVM_HOST_LPID(r10) @@ -389,6 +394,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR1) ld r10, KVM_HOST_LPCR(r10) mtspr SPRN_LPCR,r10 + isync + /* * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear * MSR_RI in r12 ([H]SRR1) so the handler won't try to return. diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 16359525a40f..8cebe5542256 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -57,6 +57,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, preempt_disable(); + asm volatile("hwsync" ::: "memory"); + isync(); /* switch the lpid first to avoid running host with unallocated pid */ old_lpid = mfspr(SPRN_LPID); if (old_lpid != lpid) @@ -75,6 +77,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, ret = __copy_to_user_inatomic((void __user *)to, from, n); pagefault_enable(); + asm volatile("hwsync" ::: "memory"); + isync(); /* switch the pid first to avoid running host with unallocated pid */ if (quadrant == 1 && pid != old_pid) mtspr(SPRN_PID, old_pid); diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 093ac0453d91..323b692bbfe2 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -531,17 +531,19 @@ static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u6 lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; /* - * All the isync()s are overkill but trivially follow the ISA - * requirements. Some can likely be replaced with justification - * comment for why they are not needed. + * Prior memory accesses to host PID Q3 must be completed before we + * start switching, and stores must be drained to avoid not-my-LPAR + * logic (see switch_mmu_to_host). 
*/ + asm volatile("hwsync" ::: "memory"); isync(); mtspr(SPRN_LPID, lpid); - isync(); mtspr(SPRN_LPCR, lpcr); - isync(); mtspr(SPRN_PID, vcpu->arch.pid); - isync(); + /* + * isync not required here because we are HRFID'ing to guest before + * any guest context access, which is context synchronising. + */ } static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr) @@ -551,25 +553,41 @@ static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpid = kvm->arch.lpid; + /* + * See switch_mmu_to_guest_radix. ptesync should not be required here + * even if the host is in HPT mode because speculative accesses would + * not cause RC updates (we are in real mode). + */ + asm volatile("hwsync" ::: "memory"); + isync(); mtspr(SPRN_LPID, lpid); mtspr(SPRN_LPCR, lpcr); mtspr(SPRN_PID, vcpu->arch.pid); for (i = 0; i < vcpu->arch.slb_max; i++) mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv); - - isync(); + /* + * isync not required here, see switch_mmu_to_guest_radix. + */ } static void switch_mmu_to_host(struct kvm *kvm, u32 pid) { + /* + * The guest has exited, so guest MMU context is no longer being + * non-speculatively accessed, but a hwsync is needed before the + * mtLPIDR / mtPIDR switch, in order to ensure all stores are drained, + * so the not-my-LPAR tlbie logic does not overlook them. + */ + asm volatile("hwsync" ::: "memory"); isync(); mtspr(SPRN_PID, pid); - isync(); mtspr(SPRN_LPID, kvm->arch.host_lpid); - isync(); mtspr(SPRN_LPCR, kvm->arch.host_lpcr); - isync(); + /* + * isync is not required after the switch, because mtmsrd with L=0 + * is performed after this switch, which is context synchronising. + */ if (!radix_enabled()) slb_restore_bolted_realmode(); From 9c75f65f3583b0cf467c378a1076f0b50bbc2fb1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:18 +1000 Subject: [PATCH 040/240] KVM: PPC: Book3S HV P9: Test dawr_enabled() before saving host DAWR SPRs Some of the DAWR SPR access is already predicated on dawr_enabled(), apply this to the remainder of the accesses. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-41-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 34 ++++++++++++++++----------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 323b692bbfe2..0f341011816c 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -666,13 +666,16 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc host_hfscr = mfspr(SPRN_HFSCR); host_ciabr = mfspr(SPRN_CIABR); - host_dawr0 = mfspr(SPRN_DAWR0); - host_dawrx0 = mfspr(SPRN_DAWRX0); host_psscr = mfspr(SPRN_PSSCR); host_pidr = mfspr(SPRN_PID); - if (cpu_has_feature(CPU_FTR_DAWR1)) { - host_dawr1 = mfspr(SPRN_DAWR1); - host_dawrx1 = mfspr(SPRN_DAWRX1); + + if (dawr_enabled()) { + host_dawr0 = mfspr(SPRN_DAWR0); + host_dawrx0 = mfspr(SPRN_DAWRX0); + if (cpu_has_feature(CPU_FTR_DAWR1)) { + host_dawr1 = mfspr(SPRN_DAWR1); + host_dawrx1 = mfspr(SPRN_DAWRX1); + } } local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); @@ -1006,15 +1009,18 @@ tm_return_to_guest: mtspr(SPRN_HFSCR, host_hfscr); if (vcpu->arch.ciabr != host_ciabr) mtspr(SPRN_CIABR, host_ciabr); - if (vcpu->arch.dawr0 != host_dawr0) - mtspr(SPRN_DAWR0, host_dawr0); - if (vcpu->arch.dawrx0 != host_dawrx0) - mtspr(SPRN_DAWRX0, host_dawrx0); - if (cpu_has_feature(CPU_FTR_DAWR1)) { - if (vcpu->arch.dawr1 != host_dawr1) - mtspr(SPRN_DAWR1, host_dawr1); - if (vcpu->arch.dawrx1 != host_dawrx1) - mtspr(SPRN_DAWRX1, host_dawrx1); + + if (dawr_enabled()) { + if (vcpu->arch.dawr0 != host_dawr0) + mtspr(SPRN_DAWR0, host_dawr0); + if (vcpu->arch.dawrx0 != host_dawrx0) + mtspr(SPRN_DAWRX0, host_dawrx0); + if (cpu_has_feature(CPU_FTR_DAWR1)) { + if (vcpu->arch.dawr1 != host_dawr1) + mtspr(SPRN_DAWR1, host_dawr1); + if (vcpu->arch.dawrx1 != host_dawrx1) + mtspr(SPRN_DAWRX1, host_dawrx1); + } } if (vc->dpdes) From a089a6869e7f613a8d961ac65bafd127317e4c5c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:19 +1000 Subject: [PATCH 041/240] KVM: PPC: Book3S HV P9: Don't restore PSSCR if not needed This also moves the PSSCR update in nested entry to avoid a SPR scoreboard stall. 
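A stand-alone sketch of the resulting entry/exit round trip, with a
plain variable modelling the guest-visible PSSCR field; this shows the
shape of the change, not the literal kernel code:

    #include <stdint.h>

    static uint64_t spr_psscr;  /* models SPRN_PSSCR_PR */
    static inline uint64_t mfspr_psscr(void) { return spr_psscr; }
    static inline void mtspr_psscr(uint64_t v) { spr_psscr = v; }

    /* Each mtspr happens only on a mismatch, so a guest whose PSSCR
     * equals the host's pays no SPR writes at all on this path. */
    static void psscr_round_trip(uint64_t *guest_psscr)
    {
        uint64_t host_psscr = mfspr_psscr();    /* snapshot at entry */

        if (*guest_psscr != host_psscr)
            mtspr_psscr(*guest_psscr);
        /* ... guest executes ... */
        *guest_psscr = mfspr_psscr();           /* snapshot at exit */
        if (*guest_psscr != host_psscr)
            mtspr_psscr(host_psscr);
    }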
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-42-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 7 +++++-- arch/powerpc/kvm/book3s_hv_p9_entry.c | 26 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7d48aa8aebb2..9da27f19a697 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3889,7 +3889,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) msr = mfmsr(); /* TM restore can update msr */ - mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); + if (vcpu->arch.psscr != host_psscr) + mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); + kvmhv_save_hv_regs(vcpu, &hvregs); hvregs.lpcr = lpcr; vcpu->arch.regs.msr = vcpu->arch.shregs.msr; @@ -3930,7 +3932,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns vcpu->arch.shregs.dar = mfspr(SPRN_DAR); vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); - mtspr(SPRN_PSSCR_PR, host_psscr); store_vcpu_state(vcpu); @@ -3943,6 +3944,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns timer_rearm_host_dec(*tb); restore_p9_host_os_sprs(vcpu, &host_os_sprs); + if (vcpu->arch.psscr != host_psscr) + mtspr(SPRN_PSSCR_PR, host_psscr); return trap; } diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 0f341011816c..eae9d806d704 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -649,6 +649,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc unsigned long host_dawr0; unsigned long host_dawrx0; unsigned long host_psscr; + unsigned long host_hpsscr; unsigned long host_pidr; unsigned long host_dawr1; unsigned long host_dawrx1; @@ -666,7 +667,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc host_hfscr = mfspr(SPRN_HFSCR); host_ciabr = mfspr(SPRN_CIABR); - host_psscr = mfspr(SPRN_PSSCR); + host_psscr = mfspr(SPRN_PSSCR_PR); + if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + host_hpsscr = mfspr(SPRN_PSSCR); host_pidr = mfspr(SPRN_PID); if (dawr_enabled()) { @@ -750,8 +753,14 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc if (vcpu->arch.ciabr != host_ciabr) mtspr(SPRN_CIABR, vcpu->arch.ciabr); - mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | - (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + + if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { + mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + } else { + if (vcpu->arch.psscr != host_psscr) + mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr); + } mtspr(SPRN_HFSCR, vcpu->arch.hfscr); @@ -957,7 +966,7 @@ tm_return_to_guest: vcpu->arch.ic = mfspr(SPRN_IC); vcpu->arch.pid = mfspr(SPRN_PID); - vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS; + vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR); vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0); vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1); @@ -1003,9 +1012,12 @@ tm_return_to_guest: mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr); mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr); - /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ - mtspr(SPRN_PSSCR, host_psscr | - (local_paca->kvm_hstate.fake_suspend << 
PSSCR_FAKE_SUSPEND_LG)); + if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) { + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ + mtspr(SPRN_PSSCR, host_hpsscr | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + } + mtspr(SPRN_HFSCR, host_hfscr); if (vcpu->arch.ciabr != host_ciabr) mtspr(SPRN_CIABR, host_ciabr); From 0ba0e5d5a691806cca3d4f290dcc61f656049872 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:20 +1000 Subject: [PATCH 042/240] KVM: PPC: Book3S HV: Split P8 from P9 path guest vCPU TLB flushing This creates separate functions for old and new paths for vCPU TLB flushing, which will reduce complexity of the next change. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-43-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_ppc.h | 3 +- arch/powerpc/kvm/book3s_hv_builtin.c | 53 ++++------------------- arch/powerpc/kvm/book3s_hv_p9_entry.c | 62 ++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 671fbd1a765e..2b76d51e4b13 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -552,8 +552,7 @@ extern void kvm_hv_vm_activated(void); extern void kvm_hv_vm_deactivated(void); extern bool kvm_hv_mode_active(void); -extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu, - struct kvm_nested_guest *nested); +extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu); #else static inline void __init kvm_cma_reserve(void) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 70b7a8f97153..ad70756a777c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -682,60 +682,23 @@ static void flush_guest_tlb(struct kvm *kvm) unsigned long rb, set; rb = PPC_BIT(52); /* IS = 2 */ - if (kvm_is_radix(kvm)) { - /* R=1 PRS=1 RIC=2 */ + for (set = 0; set < kvm->arch.tlb_sets; ++set) { + /* R=0 PRS=0 RIC=0 */ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r" (rb), "i" (1), "i" (1), "i" (2), + : : "r" (rb), "i" (0), "i" (0), "i" (0), "r" (0) : "memory"); - for (set = 1; set < kvm->arch.tlb_sets; ++set) { - rb += PPC_BIT(51); /* increment set number */ - /* R=1 PRS=1 RIC=0 */ - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r" (rb), "i" (1), "i" (1), "i" (0), - "r" (0) : "memory"); - } - asm volatile("ptesync": : :"memory"); - // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. - asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory"); - } else { - for (set = 0; set < kvm->arch.tlb_sets; ++set) { - /* R=0 PRS=0 RIC=0 */ - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r" (rb), "i" (0), "i" (0), "i" (0), - "r" (0) : "memory"); - rb += PPC_BIT(51); /* increment set number */ - } - asm volatile("ptesync": : :"memory"); - // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. 
- if (cpu_has_feature(CPU_FTR_ARCH_300)) - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory"); + rb += PPC_BIT(51); /* increment set number */ } + asm volatile("ptesync": : :"memory"); } -void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu, - struct kvm_nested_guest *nested) +void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu) { - cpumask_t *need_tlb_flush; - - /* - * On POWER9, individual threads can come in here, but the - * TLB is shared between the 4 threads in a core, hence - * invalidating on one thread invalidates for all. - * Thus we make all 4 threads use the same bit. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - pcpu = cpu_first_tlb_thread_sibling(pcpu); - - if (nested) - need_tlb_flush = &nested->need_tlb_flush; - else - need_tlb_flush = &kvm->arch.need_tlb_flush; - - if (cpumask_test_cpu(pcpu, need_tlb_flush)) { + if (cpumask_test_cpu(pcpu, &kvm->arch.need_tlb_flush)) { flush_guest_tlb(kvm); /* Clear the bit after the TLB flush */ - cpumask_clear_cpu(pcpu, need_tlb_flush); + cpumask_clear_cpu(pcpu, &kvm->arch.need_tlb_flush); } } EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush); diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index eae9d806d704..d0216d32ec91 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -632,6 +632,66 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu) } } +static void flush_guest_tlb(struct kvm *kvm) +{ + unsigned long rb, set; + + rb = PPC_BIT(52); /* IS = 2 */ + if (kvm_is_radix(kvm)) { + /* R=1 PRS=1 RIC=2 */ + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r" (rb), "i" (1), "i" (1), "i" (2), + "r" (0) : "memory"); + for (set = 1; set < kvm->arch.tlb_sets; ++set) { + rb += PPC_BIT(51); /* increment set number */ + /* R=1 PRS=1 RIC=0 */ + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r" (rb), "i" (1), "i" (1), "i" (0), + "r" (0) : "memory"); + } + asm volatile("ptesync": : :"memory"); + // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. + asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory"); + } else { + for (set = 0; set < kvm->arch.tlb_sets; ++set) { + /* R=0 PRS=0 RIC=0 */ + asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + : : "r" (rb), "i" (0), "i" (0), "i" (0), + "r" (0) : "memory"); + rb += PPC_BIT(51); /* increment set number */ + } + asm volatile("ptesync": : :"memory"); + // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory"); + } +} + +static void check_need_tlb_flush(struct kvm *kvm, int pcpu, + struct kvm_nested_guest *nested) +{ + cpumask_t *need_tlb_flush; + + /* + * On POWER9, individual threads can come in here, but the + * TLB is shared between the 4 threads in a core, hence + * invalidating on one thread invalidates for all. + * Thus we make all 4 threads use the same bit. 
+ */ + pcpu = cpu_first_tlb_thread_sibling(pcpu); + + if (nested) + need_tlb_flush = &nested->need_tlb_flush; + else + need_tlb_flush = &kvm->arch.need_tlb_flush; + + if (cpumask_test_cpu(pcpu, need_tlb_flush)) { + flush_guest_tlb(kvm); + + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu, need_tlb_flush); + } +} + int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { struct p9_host_os_sprs host_os_sprs; @@ -819,7 +879,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc } /* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */ - kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested); + check_need_tlb_flush(kvm, vc->pcpu, nested); /* * P9 suppresses the HDEC exception when LPCR[HDICE] = 0, From d5c0e8332d82c04deee25dd6f28c5bbe84d49a73 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:21 +1000 Subject: [PATCH 043/240] KVM: PPC: Book3S HV P9: Avoid tlbsync sequence on radix guest exit Use the existing TLB flushing logic to IPI the previous CPU and run the necessary barriers before running a guest vCPU on a new physical CPU, to do the necessary radix GTSE barriers for handling the case of an interrupted guest tlbie sequence. This requires the vCPU TLB flush sequence that is currently just done on one thread, to be expanded to ensure the other threads execute a ptesync, because causing them to exit the guest will no longer cause a ptesync by itself. This results in more IPIs than the TLB flush logic requires, but it's a significant win for common case scheduling when the vCPU remains on the same physical CPU. This saves about 520 cycles (nearly 10%) on a guest entry+exit micro benchmark on a POWER9. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-44-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 48 +++++++++++++++++++++------ arch/powerpc/kvm/book3s_hv_p9_entry.c | 48 +++++++++++++++------------ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 6 ---- 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9da27f19a697..df4e3f88398d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3002,29 +3002,54 @@ static void kvmppc_release_hwthread(int cpu) static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { struct kvm_nested_guest *nested = vcpu->arch.nested; - cpumask_t *cpu_in_guest; + cpumask_t *cpu_in_guest, *need_tlb_flush; int i; - cpu = cpu_first_tlb_thread_sibling(cpu); if (nested) { - cpumask_set_cpu(cpu, &nested->need_tlb_flush); + need_tlb_flush = &nested->need_tlb_flush; cpu_in_guest = &nested->cpu_in_guest; } else { - cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + need_tlb_flush = &kvm->arch.need_tlb_flush; cpu_in_guest = &kvm->arch.cpu_in_guest; } + + cpu = cpu_first_tlb_thread_sibling(cpu); + for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu); + i += cpu_tlb_thread_sibling_step()) + cpumask_set_cpu(i, need_tlb_flush); + /* * Make sure setting of bit in need_tlb_flush precedes * testing of cpu_in_guest bits. The matching barrier on * the other side is the first smp_mb() in kvmppc_run_core(). 
 	 */
 	smp_mb();
+
 	for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
 			i += cpu_tlb_thread_sibling_step())
 		if (cpumask_test_cpu(i, cpu_in_guest))
 			smp_call_function_single(i, do_nothing, NULL, 1);
 }
 
+static void do_migrate_away_vcpu(void *arg)
+{
+	struct kvm_vcpu *vcpu = arg;
+	struct kvm *kvm = vcpu->kvm;
+
+	/*
+	 * If the guest has GTSE, it may execute tlbie, so do an eieio; tlbsync;
+	 * ptesync sequence on the old CPU before migrating to a new one, in
+	 * case we interrupted the guest between a tlbie ; eieio ;
+	 * tlbsync; ptesync sequence.
+	 *
+	 * Otherwise, ptesync is sufficient for ordering tlbiel sequences.
+	 */
+	if (kvm->arch.lpcr & LPCR_GTSE)
+		asm volatile("eieio; tlbsync; ptesync");
+	else
+		asm volatile("ptesync");
+}
+
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 {
 	struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -3048,14 +3073,17 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 	 * can move around between pcpus. To cope with this, when
 	 * a vcpu moves from one pcpu to another, we need to tell
 	 * any vcpus running on the same core as this vcpu previously
-	 * ran to flush the TLB. The TLB is shared between threads,
-	 * so we use a single bit in .need_tlb_flush for all 4 threads.
+	 * ran to flush the TLB.
 	 */
 	if (prev_cpu != pcpu) {
-		if (prev_cpu >= 0 &&
-		    cpu_first_tlb_thread_sibling(prev_cpu) !=
-		    cpu_first_tlb_thread_sibling(pcpu))
-			radix_flush_cpu(kvm, prev_cpu, vcpu);
+		if (prev_cpu >= 0) {
+			if (cpu_first_tlb_thread_sibling(prev_cpu) !=
+			    cpu_first_tlb_thread_sibling(pcpu))
+				radix_flush_cpu(kvm, prev_cpu, vcpu);
+
+			smp_call_function_single(prev_cpu,
+					do_migrate_away_vcpu, vcpu, 1);
+		}
 		if (nested)
 			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
 		else
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index d0216d32ec91..9e899c813803 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -670,26 +670,41 @@ static void check_need_tlb_flush(struct kvm *kvm, int pcpu,
 			       struct kvm_nested_guest *nested)
 {
 	cpumask_t *need_tlb_flush;
-
-	/*
-	 * On POWER9, individual threads can come in here, but the
-	 * TLB is shared between the 4 threads in a core, hence
-	 * invalidating on one thread invalidates for all.
-	 * Thus we make all 4 threads use the same bit.
-	 */
-	pcpu = cpu_first_tlb_thread_sibling(pcpu);
+	bool all_set = true;
+	int i;
 
 	if (nested)
 		need_tlb_flush = &nested->need_tlb_flush;
 	else
 		need_tlb_flush = &kvm->arch.need_tlb_flush;
 
-	if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
-		flush_guest_tlb(kvm);
+	if (likely(!cpumask_test_cpu(pcpu, need_tlb_flush)))
+		return;
 
-		/* Clear the bit after the TLB flush */
-		cpumask_clear_cpu(pcpu, need_tlb_flush);
+	/*
+	 * Individual threads can come in here, but the TLB is shared between
+	 * the 4 threads in a core, hence invalidating on one thread
+	 * invalidates for all, so only invalidate the first time (if all bits
+	 * were set). The others must still execute a ptesync.
+	 *
+	 * If a race occurs and two threads do the TLB flush, that is not a
+	 * problem, just sub-optimal.
+ */ + for (i = cpu_first_tlb_thread_sibling(pcpu); + i <= cpu_last_tlb_thread_sibling(pcpu); + i += cpu_tlb_thread_sibling_step()) { + if (!cpumask_test_cpu(i, need_tlb_flush)) { + all_set = false; + break; + } } + if (all_set) + flush_guest_tlb(kvm); + else + asm volatile("ptesync" ::: "memory"); + + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu, need_tlb_flush); } int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) @@ -1109,15 +1124,6 @@ tm_return_to_guest: local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE; - if (kvm_is_radix(kvm)) { - /* - * Since this is radix, do a eieio; tlbsync; ptesync sequence - * in case we interrupted the guest between a tlbie and a - * ptesync. - */ - asm volatile("eieio; tlbsync; ptesync"); - } - /* * cp_abort is required if the processor supports local copy-paste * to clear the copy buffer that was under control of the guest. diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 2c1f3c6e72d1..2257fb18cb72 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -55,12 +55,6 @@ static int global_invalidates(struct kvm *kvm) smp_wmb(); cpumask_setall(&kvm->arch.need_tlb_flush); cpu = local_paca->kvm_hstate.kvm_vcore->pcpu; - /* - * On POWER9, threads are independent but the TLB is shared, - * so use the bit for the first thread to represent the core. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - cpu = cpu_first_tlb_thread_sibling(cpu); cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush); } From 46dea77f790c1e7ab2e9f7452e34de0dc5da9b13 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:22 +1000 Subject: [PATCH 044/240] KVM: PPC: Book3S HV Nested: Avoid extra mftb() in nested entry mftb() is expensive and one can be avoided on nested guest dispatch. If the time checking code distinguishes between the L0 timer and the nested HV timer, then both can be tested in the same place with the same mftb() value. This also nicely illustrates the relationship between the L0 and nested HV timers. 
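The combined check can be modelled in a few lines; the interrupt values
and names below are stand-ins for the BOOK3S_INTERRUPT_* constants:

    #include <stdint.h>

    #define MODEL_HV_DEC        0x980   /* stand-in values */
    #define MODEL_NESTED_HV_DEC 0x1980

    /* One timebase sample serves both checks: the L0 host timer and
     * the nested HV decrementer (the caller's time_limit). */
    static int check_both_timers(uint64_t tb, uint64_t next_host_timer,
                                 uint64_t *time_limit)
    {
        if (tb >= next_host_timer)
            return MODEL_HV_DEC;            /* L0's own timer expired */
        if (next_host_timer < *time_limit)
            *time_limit = next_host_timer;  /* host timer fires first */
        else if (tb >= *time_limit)
            return MODEL_NESTED_HV_DEC;     /* nested HV timer expired */
        return 0;                           /* proceed with guest entry */
    }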
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-45-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_asm.h | 1 + arch/powerpc/kvm/book3s_hv.c | 12 ++++++++++++ arch/powerpc/kvm/book3s_hv_nested.c | 5 ----- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index fbbf3cec92e9..d68d71987d5c 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -79,6 +79,7 @@ #define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 #define BOOK3S_INTERRUPT_DECREMENTER 0x900 #define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980 +#define BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER 0x1980 #define BOOK3S_INTERRUPT_DOORBELL 0xa00 #define BOOK3S_INTERRUPT_SYSCALL 0xc00 #define BOOK3S_INTERRUPT_TRACE 0xd00 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index df4e3f88398d..65c9157579a3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1486,6 +1486,10 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, run->ready_for_interrupt_injection = 1; switch (vcpu->arch.trap) { /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER: + WARN_ON_ONCE(1); /* Should never happen */ + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; + fallthrough; case BOOK3S_INTERRUPT_HV_DECREMENTER: vcpu->stat.dec_exits++; r = RESUME_GUEST; @@ -1814,6 +1818,12 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; + /* These need to go to the nested HV */ + case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER: + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; + vcpu->stat.dec_exits++; + r = RESUME_HOST; + break; /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: @@ -3993,6 +4003,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, return BOOK3S_INTERRUPT_HV_DECREMENTER; if (next_timer < time_limit) time_limit = next_timer; + else if (*tb >= time_limit) /* nested time limit */ + return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER; vcpu->arch.ceded = 0; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 7bed0b91245e..e57c08b968c0 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -375,11 +375,6 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; do { - if (mftb() >= hdec_exp) { - vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; - r = RESUME_HOST; - break; - } r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr); } while (is_kvmppc_resume_guest(r)); From b49c65c5f9f1dac4ef1764578ad55bacf526eb38 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:23 +1000 Subject: [PATCH 045/240] KVM: PPC: Book3S HV P9: Improve mfmsr performance on entry Rearrange the MSR saving on entry so it does not follow the mtmsrd to disable interrupts, avoiding a possible RAW scoreboard stall. 
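A simplified stand-alone model of the reordering; the MSR bit value and
the mtmsrd stand-in are illustrative, and the real function also takes
a plain __hard_irq_disable() branch when no facility bits are missing:

    #include <stdint.h>

    #define MODEL_MSR_EE (1ULL << 15)   /* bit choice is illustrative */

    static uint64_t spr_msr;
    static inline void model_mtmsrd(uint64_t v) { spr_msr = v; }

    /* The caller samples the MSR once, up front, so the read is never
     * issued right after a prior mtmsrd (the RAW hazard this patch
     * avoids). EE is cleared and any missing facility bits are set in
     * one combined write. */
    static uint64_t hard_disable_set_facilities(uint64_t msr,
                                                uint64_t facilities)
    {
        msr &= ~MODEL_MSR_EE;
        if ((msr & facilities) != facilities)
            msr |= facilities;
        model_mtmsrd(msr);  /* single MSR update covers both jobs */
        return msr;
    }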
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-46-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 + arch/powerpc/kvm/book3s_hv.c | 18 ++----- arch/powerpc/kvm/book3s_hv_p9_entry.c | 66 +++++++++++++++--------- 3 files changed, 47 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0a319ed9c2fd..96f0fda50a07 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -154,6 +154,8 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu) return radix; } +unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, unsigned long msr); + int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb); #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 65c9157579a3..e532a7010dba 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3896,6 +3896,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns s64 dec; int trap; + msr = mfmsr(); + save_p9_host_os_sprs(&host_os_sprs); /* @@ -3906,24 +3908,10 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns */ host_psscr = mfspr(SPRN_PSSCR_PR); - hard_irq_disable(); + kvmppc_msr_hard_disable_set_facilities(vcpu, msr); if (lazy_irq_pending()) return 0; - /* MSR bits may have been cleared by context switch */ - msr = 0; - if (IS_ENABLED(CONFIG_PPC_FPU)) - msr |= MSR_FP; - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - msr |= MSR_VEC; - if (cpu_has_feature(CPU_FTR_VSX)) - msr |= MSR_VSX; - if ((cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) && - (vcpu->arch.hfscr & HFSCR_TM)) - msr |= MSR_TM; - msr = msr_check_and_set(msr); - if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) msr = mfmsr(); /* TM restore can update msr */ diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 9e899c813803..d123813296ba 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -707,6 +707,44 @@ static void check_need_tlb_flush(struct kvm *kvm, int pcpu, cpumask_clear_cpu(pcpu, need_tlb_flush); } +unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, unsigned long msr) +{ + unsigned long msr_needed = 0; + + msr &= ~MSR_EE; + + /* MSR bits may have been cleared by context switch so must recheck */ + if (IS_ENABLED(CONFIG_PPC_FPU)) + msr_needed |= MSR_FP; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + msr_needed |= MSR_VEC; + if (cpu_has_feature(CPU_FTR_VSX)) + msr_needed |= MSR_VSX; + if ((cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) && + (vcpu->arch.hfscr & HFSCR_TM)) + msr_needed |= MSR_TM; + + /* + * This could be combined with MSR[RI] clearing, but that expands + * the unrecoverable window. It would be better to cover unrecoverable + * with KVM bad interrupt handling rather than use MSR[RI] at all. + * + * Much more difficult and less worthwhile to combine with IR/DR + * disable. 
+ */ + if ((msr & msr_needed) != msr_needed) { + msr |= msr_needed; + __mtmsrd(msr, 0); + } else { + __hard_irq_disable(); + } + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + return msr; +} +EXPORT_SYMBOL_GPL(kvmppc_msr_hard_disable_set_facilities); + int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { struct p9_host_os_sprs host_os_sprs; @@ -740,6 +778,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vcpu->arch.ceded = 0; + /* Save MSR for restore, with EE clear. */ + msr = mfmsr() & ~MSR_EE; + host_hfscr = mfspr(SPRN_HFSCR); host_ciabr = mfspr(SPRN_CIABR); host_psscr = mfspr(SPRN_PSSCR_PR); @@ -761,35 +802,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc save_p9_host_os_sprs(&host_os_sprs); - /* - * This could be combined with MSR[RI] clearing, but that expands - * the unrecoverable window. It would be better to cover unrecoverable - * with KVM bad interrupt handling rather than use MSR[RI] at all. - * - * Much more difficult and less worthwhile to combine with IR/DR - * disable. - */ - hard_irq_disable(); + msr = kvmppc_msr_hard_disable_set_facilities(vcpu, msr); if (lazy_irq_pending()) { trap = 0; goto out; } - /* MSR bits may have been cleared by context switch */ - msr = 0; - if (IS_ENABLED(CONFIG_PPC_FPU)) - msr |= MSR_FP; - if (cpu_has_feature(CPU_FTR_ALTIVEC)) - msr |= MSR_VEC; - if (cpu_has_feature(CPU_FTR_VSX)) - msr |= MSR_VSX; - if ((cpu_has_feature(CPU_FTR_TM) || - cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) && - (vcpu->arch.hfscr & HFSCR_TM)) - msr |= MSR_TM; - msr = msr_check_and_set(msr); - /* Save MSR for restore. This is after hard disable, so EE is clear. */ - if (unlikely(load_vcpu_state(vcpu, &host_os_sprs))) msr = mfmsr(); /* MSR may have been updated */ From 241d1f19f0e5c257881a0661f201b51dc3e57f8c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:24 +1000 Subject: [PATCH 046/240] KVM: PPC: Book3S HV P9: Optimise hash guest SLB saving slbmfee/slbmfev instructions are very expensive, moreso than a regular mfspr instruction, so minimising them significantly improves hash guest exit performance. The slbmfev is only required if slbmfee found a valid SLB entry. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-47-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index d123813296ba..8fa48ba01f79 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -487,10 +487,22 @@ static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator #define accumulate_time(vcpu, next) do {} while (0) #endif -static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev) +static inline u64 mfslbv(unsigned int idx) { - asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx)); - asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx)); + u64 slbev; + + asm volatile("slbmfev %0,%1" : "=r" (slbev) : "r" (idx)); + + return slbev; +} + +static inline u64 mfslbe(unsigned int idx) +{ + u64 slbee; + + asm volatile("slbmfee %0,%1" : "=r" (slbee) : "r" (idx)); + + return slbee; } static inline void mtslb(u64 slbee, u64 slbev) @@ -620,8 +632,10 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu) */ for (i = 0; i < vcpu->arch.slb_nr; i++) { u64 slbee, slbev; - mfslb(i, &slbee, &slbev); + + slbee = mfslbe(i); if (slbee & SLB_ESID_V) { + slbev = mfslbv(i); vcpu->arch.slb[nr].orige = slbee | i; vcpu->arch.slb[nr].origv = slbev; nr++; From f08cbf5c7d1f86f12143a1dce23740411b03a807 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:25 +1000 Subject: [PATCH 047/240] KVM: PPC: Book3S HV P9: Avoid changing MSR[RI] in entry and exit kvm_hstate.in_guest provides the equivalent of MSR[RI]=0 protection, and it covers the existing MSR[RI]=0 section in late entry and early exit, so clearing and setting MSR[RI] in those cases does not actually do anything useful. Remove the RI manipulation and replace it with comments. Make the in_guest memory accesses a bit closer to a proper critical section pattern. This speeds up guest entry/exit performance. This also removes the MSR[RI] warnings which aren't very interesting and would cause crashes if they hit due to causing an interrupt in non-recoverable code. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-48-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 50 ++++++++++++--------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 8fa48ba01f79..6120cdf281b9 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -904,7 +904,15 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc * But TM could be split out if this would be a significant benefit. */ - local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9; + /* + * MSR[RI] does not need to be cleared (and is not, for radix guests + * with no prefetch bug), because in_guest is set. If we take a SRESET + * or MCE with in_guest set but still in HV mode, then + * kvmppc_p9_bad_interrupt handles the interrupt, which effectively + * clears MSR[RI] and doesn't return. 
+ */ + WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_HV_P9); + barrier(); /* Open in_guest critical section */ /* * Hash host, hash guest, or radix guest with prefetch bug, all have @@ -916,14 +924,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc save_clear_host_mmu(kvm); - if (kvm_is_radix(kvm)) { + if (kvm_is_radix(kvm)) switch_mmu_to_guest_radix(kvm, vcpu, lpcr); - if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) - __mtmsrd(0, 1); /* clear RI */ - - } else { + else switch_mmu_to_guest_hpt(kvm, vcpu, lpcr); - } /* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */ check_need_tlb_flush(kvm, vc->pcpu, nested); @@ -978,19 +982,16 @@ tm_return_to_guest: vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2; /* - * Only set RI after reading machine check regs (DAR, DSISR, SRR0/1) - * and hstate scratch (which we need to move into exsave to make - * re-entrant vs SRESET/MCE) + * After reading machine check regs (DAR, DSISR, SRR0/1) and hstate + * scratch (which we need to move into exsave to make re-entrant vs + * SRESET/MCE), register state is protected from reentrancy. However + * timebase, MMU, among other state is still set to guest, so don't + * enable MSR[RI] here. It gets enabled at the end, after in_guest + * is cleared. + * + * It is possible an NMI could come in here, which is why it is + * important to save the above state early so it can be debugged. */ - if (ri_set) { - if (unlikely(!(mfmsr() & MSR_RI))) { - __mtmsrd(MSR_RI, 1); - WARN_ON_ONCE(1); - } - } else { - WARN_ON_ONCE(mfmsr() & MSR_RI); - __mtmsrd(MSR_RI, 1); - } vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)]; vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)]; @@ -1048,13 +1049,6 @@ tm_return_to_guest: */ mtspr(SPRN_HSRR0, vcpu->arch.regs.nip); mtspr(SPRN_HSRR1, vcpu->arch.shregs.msr); - - /* - * tm_return_to_guest re-loads SRR0/1, DAR, - * DSISR after RI is cleared, in case they had - * been clobbered by a MCE. - */ - __mtmsrd(0, 1); /* clear RI */ goto tm_return_to_guest; } } @@ -1154,7 +1148,9 @@ tm_return_to_guest: restore_p9_host_os_sprs(vcpu, &host_os_sprs); - local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE; + barrier(); /* Close in_guest critical section */ + WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_NONE); + /* Interrupts are recoverable at this point */ /* * cp_abort is required if the processor supports local copy-paste From 4c9a68914eab1f17f6c428c579ffd75c4448461e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:26 +1000 Subject: [PATCH 048/240] KVM: PPC: Book3S HV P9: Add unlikely annotation for !mmu_ready The mmu will almost always be ready. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-49-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e532a7010dba..4056605d3367 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4426,7 +4426,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vc->runner = vcpu; /* See if the MMU is ready to go */ - if (!kvm->arch.mmu_ready) { + if (unlikely(!kvm->arch.mmu_ready)) { r = kvmhv_setup_mmu(vcpu); if (r) { run->exit_reason = KVM_EXIT_FAIL_ENTRY; From 434398ab5eed03dbc0075af9436e871712bfb45a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:27 +1000 Subject: [PATCH 049/240] KVM: PPC: Book3S HV P9: Avoid cpu_in_guest atomics on entry and exit cpu_in_guest is set to determine if a CPU needs to be IPI'ed to exit the guest and notice the need_tlb_flush bit. This can be implemented as a global per-CPU pointer to the currently running guest instead of per-guest cpumasks, saving 2 atomics per entry/exit. P7/8 doesn't require cpu_in_guest, nor does a nested HV (only the L0 does), so move it to the P9 HV path. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-50-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 1 - arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/book3s_hv.c | 39 +++++++++++++----------- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 96f0fda50a07..fe07558173ef 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -44,7 +44,6 @@ struct kvm_nested_guest { struct mutex tlb_lock; /* serialize page faults and tlbies */ struct kvm_nested_guest *next; cpumask_t need_tlb_flush; - cpumask_t cpu_in_guest; short prev_cpu[NR_CPUS]; u8 radix; /* is this nested guest radix */ }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7004412b859..17263276189e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -287,7 +287,6 @@ struct kvm_arch { u32 online_vcores; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; - cpumask_t cpu_in_guest; u8 radix; u8 fwnmi_enabled; u8 secure_guest; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 4056605d3367..00c1e102c103 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3009,19 +3009,18 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } +static DEFINE_PER_CPU(struct kvm *, cpu_in_guest); + static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { struct kvm_nested_guest *nested = vcpu->arch.nested; - cpumask_t *cpu_in_guest, *need_tlb_flush; + cpumask_t *need_tlb_flush; int i; - if (nested) { + if (nested) need_tlb_flush = &nested->need_tlb_flush; - cpu_in_guest = &nested->cpu_in_guest; - } else { + else need_tlb_flush = &kvm->arch.need_tlb_flush; - cpu_in_guest = &kvm->arch.cpu_in_guest; - } cpu = cpu_first_tlb_thread_sibling(cpu); for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu); @@ -3029,16 +3028,21 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) cpumask_set_cpu(i, need_tlb_flush); /* - * Make sure setting of 
bit in need_tlb_flush precedes - * testing of cpu_in_guest bits. The matching barrier on - * the other side is the first smp_mb() in kvmppc_run_core(). + * Make sure setting of bit in need_tlb_flush precedes testing of + * cpu_in_guest. The matching barrier on the other side is hwsync + * when switching to guest MMU mode, which happens between + * cpu_in_guest being set to the guest kvm, and need_tlb_flush bit + * being tested. */ smp_mb(); for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu); - i += cpu_tlb_thread_sibling_step()) - if (cpumask_test_cpu(i, cpu_in_guest)) + i += cpu_tlb_thread_sibling_step()) { + struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i); + + if (running == kvm) smp_call_function_single(i, do_nothing, NULL, 1); + } } static void do_migrate_away_vcpu(void *arg) @@ -3105,7 +3109,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; - struct kvm *kvm = vc->kvm; cpu = vc->pcpu; if (vcpu) { @@ -3116,7 +3119,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) cpu += vcpu->arch.ptid; vcpu->cpu = vc->pcpu; vcpu->arch.thread_cpu = cpu; - cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = paca_ptrs[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; @@ -3847,7 +3849,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); - cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } spin_unlock(&vc->lock); @@ -4015,8 +4016,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, } } else { + struct kvm *kvm = vcpu->kvm; + kvmppc_xive_push_vcpu(vcpu); + + __this_cpu_write(cpu_in_guest, kvm); trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb); + __this_cpu_write(cpu_in_guest, NULL); + if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && !(vcpu->arch.shregs.msr & MSR_PR)) { unsigned long req = kvmppc_get_gpr(vcpu, 3); @@ -4041,7 +4048,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, } kvmppc_xive_pull_vcpu(vcpu); - if (kvm_is_radix(vcpu->kvm)) + if (kvm_is_radix(kvm)) vcpu->arch.slb_max = 0; } @@ -4531,8 +4538,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, powerpc_local_irq_pmu_restore(flags); - cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest); - preempt_enable(); /* From ecb6a7207f92e33c2b7a1271165ecf5d8f420bba Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:28 +1000 Subject: [PATCH 050/240] KVM: PPC: Book3S HV P9: Remove most of the vcore logic The P9 path always uses one vcpu per vcore, so none of the vcore, locks, stolen time, blocking logic, shared waitq, etc., is required. Remove most of it. 
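To illustrate the replacement for the shared vcore waitq: the P9 path now blocks on the vcpu's own rcuwait. A simplified sketch of the halt loop added to kvmhv_run_single_vcpu() in the diff below (bookkeeping trimmed):

	prepare_to_rcuwait(&vcpu->wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (signal_pending(current)) {
			vcpu->arch.ret = -EINTR;	/* surfaces as KVM_EXIT_INTR */
			break;
		}
		if (kvmppc_vcpu_check_block(vcpu))
			break;	/* interrupt pending or vcpu no longer ceded */
		schedule();
	}
	finish_rcuwait(&vcpu->wait);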
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-51-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 147 ++++++++++++++++++++--------------- 1 file changed, 85 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 00c1e102c103..e56804b84804 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -276,6 +276,8 @@ static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb) { unsigned long flags; + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + spin_lock_irqsave(&vc->stoltb_lock, flags); vc->preempt_tb = tb; spin_unlock_irqrestore(&vc->stoltb_lock, flags); @@ -285,6 +287,8 @@ static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb) { unsigned long flags; + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + spin_lock_irqsave(&vc->stoltb_lock, flags); if (vc->preempt_tb != TB_NIL) { vc->stolen_tb += tb - vc->preempt_tb; @@ -297,7 +301,12 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long flags; - u64 now = mftb(); + u64 now; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; + + now = mftb(); /* * We can test vc->runner without taking the vcore lock, @@ -321,7 +330,12 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long flags; - u64 now = mftb(); + u64 now; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; + + now = mftb(); if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) kvmppc_core_start_stolen(vc, now); @@ -673,6 +687,8 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) u64 p; unsigned long flags; + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + spin_lock_irqsave(&vc->stoltb_lock, flags); p = vc->stolen_tb; if (vc->vcore_state != VCORE_INACTIVE && @@ -695,13 +711,19 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; now = tb; - core_stolen = vcore_stolen_time(vc, now); - stolen = core_stolen - vcpu->arch.stolen_logged; - vcpu->arch.stolen_logged = core_stolen; - spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); - stolen += vcpu->arch.busy_stolen; - vcpu->arch.busy_stolen = 0; - spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + stolen = 0; + } else { + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); + } + if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); @@ -898,13 +920,14 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) * mode handler is not called but no other threads are in the * source vcore. 
*/ - - spin_lock(&vcore->lock); - if (target->arch.state == KVMPPC_VCPU_RUNNABLE && - vcore->vcore_state != VCORE_INACTIVE && - vcore->runner) - target = vcore->runner; - spin_unlock(&vcore->lock); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + spin_lock(&vcore->lock); + if (target->arch.state == KVMPPC_VCPU_RUNNABLE && + vcore->vcore_state != VCORE_INACTIVE && + vcore->runner) + target = vcore->runner; + spin_unlock(&vcore->lock); + } return kvm_vcpu_yield_to(target); } @@ -3131,13 +3154,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) kvmppc_ipi_thread(cpu); } -/* Old path does this in asm */ -static void kvmppc_stop_thread(struct kvm_vcpu *vcpu) -{ - vcpu->cpu = -1; - vcpu->arch.thread_cpu = -1; -} - static void kvmppc_wait_for_nap(int n_threads) { int cpu = smp_processor_id(); @@ -3226,6 +3242,8 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) { struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + vc->vcore_state = VCORE_PREEMPT; vc->pcpu = smp_processor_id(); if (vc->num_threads < threads_per_vcore(vc->kvm)) { @@ -3242,6 +3260,8 @@ static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc) { struct preempted_vcore_list *lp; + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + kvmppc_core_end_stolen(vc, mftb()); if (!list_empty(&vc->preempt_list)) { lp = &per_cpu(preempted_vcores, vc->pcpu); @@ -3983,7 +4003,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { - struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 next_timer; int trap; @@ -3999,9 +4018,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, kvmppc_subcore_enter_guest(); - vc->entry_exit_map = 1; - vc->in_guest = 1; - vcpu_vpa_increment_dispatch(vcpu); if (kvmhv_on_pseries()) { @@ -4054,9 +4070,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu_vpa_increment_dispatch(vcpu); - vc->entry_exit_map = 0x101; - vc->in_guest = 0; - kvmppc_subcore_exit_guest(); return trap; @@ -4122,6 +4135,13 @@ static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu) return false; } +static bool kvmppc_vcpu_check_block(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu)) + return true; + return false; +} + /* * Check to see if any of the runnable vcpus on the vcore have pending * exceptions or are no longer ceded @@ -4132,7 +4152,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu)) + if (kvmppc_vcpu_check_block(vcpu)) return 1; } @@ -4149,6 +4169,8 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) int do_sleep = 1; u64 block_ns; + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + /* Poll for pending exceptions and ceded state */ cur = start_poll = ktime_get(); if (vc->halt_poll_ns) { @@ -4426,11 +4448,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; - vcpu->arch.busy_preempt = TB_NIL; vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; - vc->runnable_threads[0] = vcpu; - vc->n_runnable = 1; - vc->runner = vcpu; /* See if the MMU is ready to go */ if (unlikely(!kvm->arch.mmu_ready)) { @@ -4448,11 +4466,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, kvmppc_update_vpas(vcpu); - 
init_vcore_to_run(vc); - preempt_disable(); pcpu = smp_processor_id(); - vc->pcpu = pcpu; if (kvm_is_radix(kvm)) kvmppc_prepare_radix_vcpu(vcpu, pcpu); @@ -4481,21 +4496,23 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, goto out; } + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } + tb = mftb(); - vcpu->arch.stolen_logged = vcore_stolen_time(vc, tb); - vc->preempt_tb = TB_NIL; + vcpu->cpu = pcpu; + vcpu->arch.thread_cpu = pcpu; + vc->pcpu = pcpu; + local_paca->kvm_hstate.kvm_vcpu = vcpu; + local_paca->kvm_hstate.ptid = 0; + local_paca->kvm_hstate.fake_suspend = 0; - kvmppc_clear_host_core(pcpu); - - local_paca->kvm_hstate.napping = 0; - local_paca->kvm_hstate.kvm_split_mode = NULL; - kvmppc_start_thread(vcpu, vc); kvmppc_create_dtl_entry(vcpu, vc, tb); - trace_kvm_guest_enter(vcpu); - vc->vcore_state = VCORE_RUNNING; - trace_kvmppc_run_core(vc, 0); + trace_kvm_guest_enter(vcpu); guest_enter_irqoff(); @@ -4517,8 +4534,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, set_irq_happened(trap); - kvmppc_set_host_core(pcpu); - context_tracking_guest_exit(); if (!vtime_accounting_enabled_this_cpu()) { local_irq_enable(); @@ -4534,7 +4549,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, } vtime_account_guest_exit(); - kvmppc_stop_thread(vcpu); + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; powerpc_local_irq_pmu_restore(flags); @@ -4561,28 +4577,31 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, } vcpu->arch.ret = r; - if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && - !kvmppc_vcpu_woken(vcpu)) { + if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) { kvmppc_set_timer(vcpu); - while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) { + + prepare_to_rcuwait(&vcpu->wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) { vcpu->stat.signal_exits++; run->exit_reason = KVM_EXIT_INTR; vcpu->arch.ret = -EINTR; break; } - spin_lock(&vc->lock); - kvmppc_vcore_blocked(vc); - spin_unlock(&vc->lock); + + if (kvmppc_vcpu_check_block(vcpu)) + break; + + trace_kvmppc_vcore_blocked(vc, 0); + schedule(); + trace_kvmppc_vcore_blocked(vc, 1); } + finish_rcuwait(&vcpu->wait); } vcpu->arch.ceded = 0; - vc->vcore_state = VCORE_INACTIVE; - trace_kvmppc_run_core(vc, 1); - done: - kvmppc_remove_runnable(vc, vcpu, tb); trace_kvmppc_run_vcpu_exit(vcpu); return vcpu->arch.ret; @@ -4664,7 +4683,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) kvmppc_save_current_sprs(); - vcpu->arch.waitp = &vcpu->arch.vcore->wait; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + vcpu->arch.waitp = &vcpu->arch.vcore->wait; vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; @@ -5126,6 +5146,9 @@ void kvmppc_alloc_host_rm_ops(void) int cpu, core; int size; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return; + /* Not the first time here ? */ if (kvmppc_host_rm_ops_hv != NULL) return; From 617326ff01df30796d897895ebd18ce583c9b883 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:29 +1000 Subject: [PATCH 051/240] KVM: PPC: Book3S HV P9: Tidy kvmppc_create_dtl_entry This goes further to removing vcores from the P9 path. Also avoid the memset in favour of explicitly initialising all fields. 
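In sketch form, the entry write is split so the P9 path no longer touches vcore state:

	/* P9 path: one vcpu per vcore, no stolen time to fold in */
	__kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0);

	/* pre-POWER9 path: fold in vcore stolen time, then log */
	kvmppc_create_dtl_entry(vcpu, vc);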
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-52-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 60 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e56804b84804..2d598291d8cf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -698,41 +698,30 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) return p; } -static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, - struct kvmppc_vcore *vc, u64 tb) +static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, + unsigned int pcpu, u64 now, + unsigned long stolen) { struct dtl_entry *dt; struct lppaca *vpa; - unsigned long stolen; - unsigned long core_stolen; - u64 now; - unsigned long flags; dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; - now = tb; - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - stolen = 0; - } else { - core_stolen = vcore_stolen_time(vc, now); - stolen = core_stolen - vcpu->arch.stolen_logged; - vcpu->arch.stolen_logged = core_stolen; - spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); - stolen += vcpu->arch.busy_stolen; - vcpu->arch.busy_stolen = 0; - spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); - } if (!dt || !vpa) return; - memset(dt, 0, sizeof(struct dtl_entry)); + dt->dispatch_reason = 7; - dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid); - dt->timebase = cpu_to_be64(now + vc->tb_offset); + dt->preempt_reason = 0; + dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid); dt->enqueue_to_dispatch_time = cpu_to_be32(stolen); + dt->ready_to_enqueue_time = 0; + dt->waiting_to_ready_time = 0; + dt->timebase = cpu_to_be64(now); + dt->fault_addr = 0; dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu)); dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr); + ++dt; if (dt == vcpu->arch.dtl.pinned_end) dt = vcpu->arch.dtl.pinned_addr; @@ -743,6 +732,27 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, vcpu->arch.dtl.dirty = true; } +static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, + struct kvmppc_vcore *vc) +{ + unsigned long stolen; + unsigned long core_stolen; + u64 now; + unsigned long flags; + + now = mftb(); + + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); + + __kvmppc_create_dtl_entry(vcpu, vc->pcpu, now + vc->tb_offset, stolen); +} + /* See if there is a doorbell interrupt pending for a vcpu */ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) { @@ -3756,7 +3766,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) pvc->pcpu = pcpu + thr; for_each_runnable_thread(i, vcpu, pvc) { kvmppc_start_thread(vcpu, pvc); - kvmppc_create_dtl_entry(vcpu, pvc, mftb()); + kvmppc_create_dtl_entry(vcpu, pvc); trace_kvm_guest_enter(vcpu); if (!vcpu->arch.ptid) thr0_done = true; @@ -4332,7 +4342,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu) if ((vc->vcore_state == VCORE_PIGGYBACK || vc->vcore_state == VCORE_RUNNING) && !VCORE_IS_EXITING(vc)) { - kvmppc_create_dtl_entry(vcpu, vc, mftb()); + kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { @@ -4510,7 +4520,7 @@ int 
kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, local_paca->kvm_hstate.ptid = 0; local_paca->kvm_hstate.fake_suspend = 0; - kvmppc_create_dtl_entry(vcpu, vc, tb); + __kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0); trace_kvm_guest_enter(vcpu); From 6398326b9ba182936bdc9d66475c09e39b701aa2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:30 +1000 Subject: [PATCH 052/240] KVM: PPC: Book3S HV P9: Stop using vc->dpdes The P9 path uses vc->dpdes only for msgsndp / SMT emulation. This adds an ordering requirement between vcpu->doorbell_request and vc->dpdes for no real benefit. Use vcpu->doorbell_request directly. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-53-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 18 ++++++++++-------- arch/powerpc/kvm/book3s_hv_builtin.c | 2 ++ arch/powerpc/kvm/book3s_hv_p9_entry.c | 14 ++++++++++---- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2d598291d8cf..214481e5d56d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -761,6 +761,8 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) if (vcpu->arch.doorbell_request) return true; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return false; /* * Ensure that the read of vcore->dpdes comes after the read * of vcpu->doorbell_request. This barrier matches the @@ -2185,8 +2187,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, * either vcore->dpdes or doorbell_request. * On POWER8, doorbell_request is 0. */ - *val = get_reg_val(id, vcpu->arch.vcore->dpdes | - vcpu->arch.doorbell_request); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + *val = get_reg_val(id, vcpu->arch.doorbell_request); + else + *val = get_reg_val(id, vcpu->arch.vcore->dpdes); break; case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); @@ -2423,7 +2427,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, vcpu->arch.pspb = set_reg_val(id, *val); break; case KVM_REG_PPC_DPDES: - vcpu->arch.vcore->dpdes = set_reg_val(id, *val); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1; + else + vcpu->arch.vcore->dpdes = set_reg_val(id, *val); break; case KVM_REG_PPC_VTB: vcpu->arch.vcore->vtb = set_reg_val(id, *val); @@ -4491,11 +4498,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (!nested) { kvmppc_core_prepare_to_enter(vcpu); - if (vcpu->arch.doorbell_request) { - vc->dpdes = 1; - smp_wmb(); - vcpu->arch.doorbell_request = 0; - } if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) lpcr |= LPCR_MER; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index ad70756a777c..7d6d91338c3f 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -649,6 +649,8 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) int ext; unsigned long lpcr; + WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)); + /* Insert EXTERNAL bit into LPCR at the MER bit position */ ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1; lpcr = mfspr(SPRN_LPCR); diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 6120cdf281b9..72119bc13e1d 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -780,6 +780,7 @@ int 
kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc unsigned long host_pidr; unsigned long host_dawr1; unsigned long host_dawrx1; + unsigned long dpdes; hdec = time_limit - *tb; if (hdec < 0) @@ -842,8 +843,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc if (vc->pcr) mtspr(SPRN_PCR, vc->pcr | PCR_MASK); - if (vc->dpdes) - mtspr(SPRN_DPDES, vc->dpdes); + if (vcpu->arch.doorbell_request) { + vcpu->arch.doorbell_request = 0; + mtspr(SPRN_DPDES, 1); + } if (dawr_enabled()) { if (vcpu->arch.dawr0 != host_dawr0) @@ -1074,7 +1077,10 @@ tm_return_to_guest: vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); - vc->dpdes = mfspr(SPRN_DPDES); + dpdes = mfspr(SPRN_DPDES); + if (dpdes) + vcpu->arch.doorbell_request = 1; + vc->vtb = mfspr(SPRN_VTB); dec = mfspr(SPRN_DEC); @@ -1136,7 +1142,7 @@ tm_return_to_guest: } } - if (vc->dpdes) + if (dpdes) mtspr(SPRN_DPDES, 0); if (vc->pcr) mtspr(SPRN_PCR, PCR_MASK); From 9c5a432a558105d6145b058fad78eb6fcf3d4c38 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 23 Nov 2021 19:52:31 +1000 Subject: [PATCH 053/240] KVM: PPC: Book3S HV P9: Remove subcore HMI handling On POWER9 and newer, rather than the complex HMI synchronisation and subcore state, have each thread un-apply the guest TB offset before calling into the early HMI handler. This allows the subcore state to be avoided, including subcore enter / exit guest, which includes an expensive divide that shows up slightly in profiles. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211123095231.1036501-54-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/kvm/book3s_hv.c | 12 +++--- arch/powerpc/kvm/book3s_hv_hmi.c | 7 +++- arch/powerpc/kvm/book3s_hv_p9_entry.c | 2 +- arch/powerpc/kvm/book3s_hv_ras.c | 54 +++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2b76d51e4b13..33db83b82fbd 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -759,6 +759,7 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); void kvmppc_subcore_enter_guest(void); void kvmppc_subcore_exit_guest(void); long kvmppc_realmode_hmi_handler(void); +long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu); long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel); long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 214481e5d56d..98e90bdf1f27 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4033,8 +4033,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.ceded = 0; - kvmppc_subcore_enter_guest(); - vcpu_vpa_increment_dispatch(vcpu); if (kvmhv_on_pseries()) { @@ -4087,8 +4085,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu_vpa_increment_dispatch(vcpu); - kvmppc_subcore_exit_guest(); - return trap; } @@ -6102,9 +6098,11 @@ static int kvmppc_book3s_init_hv(void) if (r) return r; - r = kvm_init_subcore_bitmap(); - if (r) - return r; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + r = kvm_init_subcore_bitmap(); + if (r) + return r; + } /* * We need a way of accessing the XICS interrupt controller, diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c 
b/arch/powerpc/kvm/book3s_hv_hmi.c index 9af660476314..1ec50c69678b 100644 --- a/arch/powerpc/kvm/book3s_hv_hmi.c +++ b/arch/powerpc/kvm/book3s_hv_hmi.c @@ -20,10 +20,15 @@ void wait_for_subcore_guest_exit(void) /* * NULL bitmap pointer indicates that KVM module hasn't - * been loaded yet and hence no guests are running. + * been loaded yet and hence no guests are running, or running + * on POWER9 or newer CPU. + * * If no KVM is in use, no need to co-ordinate among threads * as all of them will always be in host and no one is going * to modify TB other than the opal hmi handler. + * + * POWER9 and newer don't need this synchronisation. + * * Hence, just return from here. */ if (!local_paca->sibling_subcore_state) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index 72119bc13e1d..ebb4781859e2 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -1013,7 +1013,7 @@ tm_return_to_guest: kvmppc_realmode_machine_check(vcpu); } else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) { - kvmppc_realmode_hmi_handler(); + kvmppc_p9_realmode_hmi_handler(vcpu); } else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) { vcpu->arch.emul_inst = mfspr(SPRN_HEIR); diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index d4bca93b79f6..ccfd96965630 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -136,6 +136,60 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) vcpu->arch.mce_evt = mce_evt; } + +long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + long ret = 0; + + /* + * Unapply and clear the offset first. That way, if the TB was not + * resynced then it will remain in host-offset, and if it was resynced + * then it is brought into host-offset. Then the tb offset is + * re-applied before continuing with the KVM exit. + * + * This way, we don't need to actually know whether or not OPAL resynced + * the timebase or do any of the complicated dance that the P7/8 + * path requires. + */ + if (vc->tb_offset_applied) { + u64 new_tb = mftb() - vc->tb_offset_applied; + mtspr(SPRN_TBU40, new_tb); + if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) { + new_tb += 0x1000000; + mtspr(SPRN_TBU40, new_tb); + } + vc->tb_offset_applied = 0; + } + + local_paca->hmi_irqs++; + + if (hmi_handle_debugtrig(NULL) >= 0) { + ret = 1; + goto out; + } + + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(NULL); + +out: + if (vc->tb_offset) { + u64 new_tb = mftb() + vc->tb_offset; + mtspr(SPRN_TBU40, new_tb); + if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) { + new_tb += 0x1000000; + mtspr(SPRN_TBU40, new_tb); + } + vc->tb_offset_applied = vc->tb_offset; + } + + return ret; +} + +/* + * The following subcore HMI handling is all only for pre-POWER9 CPUs. + */ + /* Check if dynamic split is in force and return subcore size accordingly. */ static inline int kvmppc_cur_subcore_size(void) { From f6e82647ff71d427d4148964b71f239fba9d7937 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 20 Nov 2015 20:33:19 +0000 Subject: [PATCH 054/240] powerpc/6xx: add missing of_node_put for_each_compatible_node performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. A simplified version of the semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ expression e; local idexpression n; @@ for_each_compatible_node(n,...) { ... 
( of_node_put(n); | e = n | + of_node_put(n); ? break; ) ... } ... when != n // Signed-off-by: Julia Lawall Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1448051604-25256-2-git-send-email-Julia.Lawall@lip6.fr --- arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index 15396333a90b..a4b020e4b6af 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -214,6 +214,7 @@ void hlwd_pic_probe(void) irq_set_chained_handler(cascade_virq, hlwd_pic_irq_cascade); hlwd_irq_host = host; + of_node_put(np); break; } } From 7d405a939ca960162eb30c1475759cb2fdf38f8c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 20 Nov 2015 20:33:21 +0000 Subject: [PATCH 055/240] powerpc/powernv: add missing of_node_put for_each_compatible_node performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. A simplified version of the semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ local idexpression n; expression e; @@ for_each_compatible_node(n,...) { ... ( of_node_put(n); | e = n | + of_node_put(n); ? break; ) ... } ... when != n // Signed-off-by: Julia Lawall Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1448051604-25256-4-git-send-email-Julia.Lawall@lip6.fr --- arch/powerpc/platforms/powernv/opal-lpc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 1e5d51db40f8..5390c888db16 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -396,6 +396,7 @@ void __init opal_lpc_init(void) if (!of_get_property(np, "primary", NULL)) continue; opal_lpc_chip_id = of_get_ibm_chip_id(np); + of_node_put(np); break; } if (opal_lpc_chip_id < 0) From a841fd009e51c8c0a8f07c942e9ab6bb48da8858 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 20 Nov 2015 21:33:24 +0100 Subject: [PATCH 056/240] powerpc/cell: add missing of_node_put for_each_node_by_name performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. A simplified version of the semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ expression e,e1; local idexpression n; @@ for_each_node_by_name(n, e1) { ... when != of_node_put(n) when != e = n ( return n; | + of_node_put(n); ? return ...; ) ... } // Signed-off-by: Julia Lawall Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1448051604-25256-7-git-send-email-Julia.Lawall@lip6.fr --- arch/powerpc/platforms/cell/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index fa08699aedeb..d32f24de8479 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -977,6 +977,7 @@ static int __init cell_iommu_fixed_mapping_init(void) if (hbase < dbase || (hend > (dbase + dsize))) { pr_debug("iommu: hash window doesn't fit in" "real DMA window\n"); + of_node_put(np); return -1; } } From a1d2b210ffa52d60acabbf7b6af3ef7e1e69cda0 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 20 Nov 2015 20:33:23 +0000 Subject: [PATCH 057/240] powerpc/btext: add missing of_node_put for_each_node_by_type performs an of_node_get on each iteration, so a break out of the loop requires an of_node_put. 
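For illustration, the bug pattern looks like this (the helper name is hypothetical, not taken from the driver):

	struct device_node *np;

	for_each_node_by_type(np, "display") {
		if (try_init_display(np)) {	/* hypothetical helper */
			of_node_put(np);	/* drop the reference the iterator holds */
			break;
		}
	}

Without the of_node_put() before the break, the reference taken on np by the iterator is never released.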
A simplified version of the semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ local idexpression n; expression e; @@ for_each_node_by_type(n,...) { ... ( of_node_put(n); | e = n | + of_node_put(n); ? break; ) ... } ... when != n // Signed-off-by: Julia Lawall Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1448051604-25256-6-git-send-email-Julia.Lawall@lip6.fr --- arch/powerpc/kernel/btext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 803c2a45b22a..1cffb5e7c38d 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -241,8 +241,10 @@ int __init btext_find_display(int allow_nonstdout) rc = btext_initialize(np); printk("result: %d\n", rc); } - if (rc == 0) + if (rc == 0) { + of_node_put(np); break; + } } return rc; } From d02fa40d759ff9a53c93b10d8a4b591688982b26 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 3 May 2021 23:02:43 +1000 Subject: [PATCH 058/240] powerpc/powernv: Remove POWER9 PVR version check for entry and uaccess flushes These aren't necessarily POWER9 only, and it's not to say some new vulnerability may not get discovered on other processors for which we would like the flexibility of having the workaround enabled by firmware. Remove the restriction that the workarounds only apply to POWER9. However POWER7 and POWER8 are not affected, and they may have older firmware that does not advertise this, so clear these workarounds manually. Signed-off-by: Nicholas Piggin Reviewed-by: Joel Stanley [mpe: Incorporate changes from Nick, reword comment slightly.] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210503130243.891868-5-npiggin@gmail.com --- arch/powerpc/platforms/powernv/setup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index ad56a54ac9c5..5ef6b8afb3d0 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -123,10 +123,14 @@ static void pnv_setup_security_mitigations(void) } /* - * If we are non-Power9 bare metal, we don't need to flush on kernel - * entry or after user access: they fix a P9 specific vulnerability. + * The issues addressed by the entry and uaccess flush don't affect P7 + * or P8, so on bare metal disable them explicitly in case firmware does + * not include the features to disable them. POWER9 and newer processors + * should have the appropriate firmware flags. */ - if (!pvr_version_is(PVR_POWER9)) { + if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p) || + pvr_version_is(PVR_POWER8E) || pvr_version_is(PVR_POWER8NVL) || + pvr_version_is(PVR_POWER8)) { security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY); security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS); } From 44b9c8ddcbc351d47ead974f0870d09bfc74b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:26 +0100 Subject: [PATCH 059/240] powerpc/xive: Replace pr_devel() by pr_debug() to ease debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These routines are not on hot code paths and pr_debug() is easier to activate. Also add a '0x' prefix to hex printed values (HW IRQ number). 
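For context: pr_devel() compiles to nothing unless the file is built with DEBUG defined, whereas pr_debug() can also be enabled at run time through dynamic debug when CONFIG_DYNAMIC_DEBUG is set, for example (assuming debugfs is mounted at /sys/kernel/debug):

	echo 'file arch/powerpc/sysdev/xive/common.c +p' > /sys/kernel/debug/dynamic_debug/control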
Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-2-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 29 +++++++++++------------ arch/powerpc/sysdev/xive/spapr.c | 38 +++++++++++++++---------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 7b69299c2912..442642be3505 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -451,6 +451,8 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd, { u64 val; + pr_debug("%s: HW 0x%x %smask\n", __func__, xd->hw_irq, mask ? "" : "un"); + /* * If the interrupt had P set, it may be in a queue. * @@ -612,8 +614,8 @@ static unsigned int xive_irq_startup(struct irq_data *d) xd->saved_p = false; xd->stale_p = false; - pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", - d->irq, hw_irq, d); + + pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d); /* Pick a target */ target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); @@ -654,8 +656,7 @@ static void xive_irq_shutdown(struct irq_data *d) struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n", - d->irq, hw_irq, d); + pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d); if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) return; @@ -679,7 +680,7 @@ static void xive_irq_unmask(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); - pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd); + pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); xive_do_source_set_mask(xd, false); } @@ -688,7 +689,7 @@ static void xive_irq_mask(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); - pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd); + pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd); xive_do_source_set_mask(xd, true); } @@ -702,7 +703,7 @@ static int xive_irq_set_affinity(struct irq_data *d, u32 target, old_target; int rc = 0; - pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq); + pr_debug("%s: irq %d/0x%x\n", __func__, d->irq, hw_irq); /* Is this valid ? 
*/ if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) @@ -975,7 +976,7 @@ EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { - pr_debug("%s for HW %x\n", __func__, xd->hw_irq); + pr_debug("%s for HW 0x%x\n", __func__, xd->hw_irq); if (xd->eoi_mmio) { iounmap(xd->eoi_mmio); @@ -1211,8 +1212,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu) pr_err("Failed to map IPI CPU %d\n", cpu); return -EIO; } - pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu, - xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); + pr_debug("CPU %d HW IPI 0x%x, virq %d, trig_mmio=%p\n", cpu, + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); /* Unmask it */ xive_do_source_set_mask(&xc->ipi_data, false); @@ -1390,7 +1391,7 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, if (rc) return rc; - pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + pr_debug("%s %d/0x%lx #%d\n", __func__, virq, hwirq, nr_irqs); for (i = 0; i < nr_irqs; i++) { /* TODO: call xive_irq_domain_map() */ @@ -1504,7 +1505,7 @@ static void xive_setup_cpu(void) #ifdef CONFIG_SMP void xive_smp_setup_cpu(void) { - pr_devel("SMP setup CPU %d\n", smp_processor_id()); + pr_debug("SMP setup CPU %d\n", smp_processor_id()); /* This will have already been done on the boot CPU */ if (smp_processor_id() != boot_cpuid) @@ -1650,10 +1651,10 @@ bool __init xive_core_init(struct device_node *np, const struct xive_ops *ops, ppc_md.get_irq = xive_get_irq; __xive_enabled = true; - pr_devel("Initializing host..\n"); + pr_debug("Initializing host..\n"); xive_init_host(np); - pr_devel("Initializing boot CPU..\n"); + pr_debug("Initializing boot CPU..\n"); /* Allocate per-CPU data and queues */ xive_prepare_cpu(smp_processor_id()); diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index f143b6f111ac..77943dc70860 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -173,7 +173,7 @@ static long plpar_int_get_source_info(unsigned long flags, } while (plpar_busy_delay(rc)); if (rc) { - pr_err("H_INT_GET_SOURCE_INFO lisn=%ld failed %ld\n", lisn, rc); + pr_err("H_INT_GET_SOURCE_INFO lisn=0x%lx failed %ld\n", lisn, rc); return rc; } @@ -182,8 +182,8 @@ static long plpar_int_get_source_info(unsigned long flags, *trig_page = retbuf[2]; *esb_shift = retbuf[3]; - pr_devel("H_INT_GET_SOURCE_INFO flags=%lx eoi=%lx trig=%lx shift=%lx\n", - retbuf[0], retbuf[1], retbuf[2], retbuf[3]); + pr_debug("H_INT_GET_SOURCE_INFO lisn=0x%lx flags=0x%lx eoi=0x%lx trig=0x%lx shift=0x%lx\n", + lisn, retbuf[0], retbuf[1], retbuf[2], retbuf[3]); return 0; } @@ -200,8 +200,8 @@ static long plpar_int_set_source_config(unsigned long flags, long rc; - pr_devel("H_INT_SET_SOURCE_CONFIG flags=%lx lisn=%lx target=%lx prio=%lx sw_irq=%lx\n", - flags, lisn, target, prio, sw_irq); + pr_debug("H_INT_SET_SOURCE_CONFIG flags=0x%lx lisn=0x%lx target=%ld prio=%ld sw_irq=%ld\n", + flags, lisn, target, prio, sw_irq); do { @@ -210,7 +210,7 @@ static long plpar_int_set_source_config(unsigned long flags, } while (plpar_busy_delay(rc)); if (rc) { - pr_err("H_INT_SET_SOURCE_CONFIG lisn=%ld target=%lx prio=%lx failed %ld\n", + pr_err("H_INT_SET_SOURCE_CONFIG lisn=0x%lx target=%ld prio=%ld failed %ld\n", lisn, target, prio, rc); return rc; } @@ -227,7 +227,7 @@ static long plpar_int_get_source_config(unsigned long flags, unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; long rc; - pr_devel("H_INT_GET_SOURCE_CONFIG flags=%lx lisn=%lx\n", flags, lisn); + 
pr_debug("H_INT_GET_SOURCE_CONFIG flags=0x%lx lisn=0x%lx\n", flags, lisn); do { rc = plpar_hcall(H_INT_GET_SOURCE_CONFIG, retbuf, flags, lisn, @@ -235,7 +235,7 @@ static long plpar_int_get_source_config(unsigned long flags, } while (plpar_busy_delay(rc)); if (rc) { - pr_err("H_INT_GET_SOURCE_CONFIG lisn=%ld failed %ld\n", + pr_err("H_INT_GET_SOURCE_CONFIG lisn=0x%lx failed %ld\n", lisn, rc); return rc; } @@ -244,8 +244,8 @@ static long plpar_int_get_source_config(unsigned long flags, *prio = retbuf[1]; *sw_irq = retbuf[2]; - pr_devel("H_INT_GET_SOURCE_CONFIG target=%lx prio=%lx sw_irq=%lx\n", - retbuf[0], retbuf[1], retbuf[2]); + pr_debug("H_INT_GET_SOURCE_CONFIG target=%ld prio=%ld sw_irq=%ld\n", + retbuf[0], retbuf[1], retbuf[2]); return 0; } @@ -273,8 +273,8 @@ static long plpar_int_get_queue_info(unsigned long flags, *esn_page = retbuf[0]; *esn_size = retbuf[1]; - pr_devel("H_INT_GET_QUEUE_INFO page=%lx size=%lx\n", - retbuf[0], retbuf[1]); + pr_debug("H_INT_GET_QUEUE_INFO cpu=%ld prio=%ld page=0x%lx size=0x%lx\n", + target, priority, retbuf[0], retbuf[1]); return 0; } @@ -289,8 +289,8 @@ static long plpar_int_set_queue_config(unsigned long flags, { long rc; - pr_devel("H_INT_SET_QUEUE_CONFIG flags=%lx target=%lx priority=%lx qpage=%lx qsize=%lx\n", - flags, target, priority, qpage, qsize); + pr_debug("H_INT_SET_QUEUE_CONFIG flags=0x%lx target=%ld priority=0x%lx qpage=0x%lx qsize=0x%lx\n", + flags, target, priority, qpage, qsize); do { rc = plpar_hcall_norets(H_INT_SET_QUEUE_CONFIG, flags, target, @@ -298,7 +298,7 @@ static long plpar_int_set_queue_config(unsigned long flags, } while (plpar_busy_delay(rc)); if (rc) { - pr_err("H_INT_SET_QUEUE_CONFIG cpu=%ld prio=%ld qpage=%lx returned %ld\n", + pr_err("H_INT_SET_QUEUE_CONFIG cpu=%ld prio=%ld qpage=0x%lx returned %ld\n", target, priority, qpage, rc); return rc; } @@ -315,7 +315,7 @@ static long plpar_int_sync(unsigned long flags, unsigned long lisn) } while (plpar_busy_delay(rc)); if (rc) { - pr_err("H_INT_SYNC lisn=%ld returned %ld\n", lisn, rc); + pr_err("H_INT_SYNC lisn=0x%lx returned %ld\n", lisn, rc); return rc; } @@ -333,8 +333,8 @@ static long plpar_int_esb(unsigned long flags, unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; long rc; - pr_devel("H_INT_ESB flags=%lx lisn=%lx offset=%lx in=%lx\n", - flags, lisn, offset, in_data); + pr_debug("H_INT_ESB flags=0x%lx lisn=0x%lx offset=0x%lx in=0x%lx\n", + flags, lisn, offset, in_data); do { rc = plpar_hcall(H_INT_ESB, retbuf, flags, lisn, offset, @@ -342,7 +342,7 @@ static long plpar_int_esb(unsigned long flags, } while (plpar_busy_delay(rc)); if (rc) { - pr_err("H_INT_ESB lisn=%ld offset=%ld returned %ld\n", + pr_err("H_INT_ESB lisn=0x%lx offset=0x%lx returned %ld\n", lisn, offset, rc); return rc; } From bd5b00c6cf0c37fce1bcd94390044d7e1dd638e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:27 +0100 Subject: [PATCH 060/240] powerpc/xive: Introduce an helper to print out interrupt characteristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and extend output of debugfs and xmon with addresses of the ESB management and trigger pages. 
Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-3-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 52 +++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 442642be3505..b74b8f18b80c 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -227,6 +227,19 @@ static void xive_esb_write(struct xive_irq_data *xd, u32 offset, u64 data) out_be64(xd->eoi_mmio + offset, data); } +static void xive_irq_data_dump(struct xive_irq_data *xd, char *buffer, size_t size) +{ + u64 val = xive_esb_read(xd, XIVE_ESB_GET); + + snprintf(buffer, size, "flags=%c%c%c PQ=%c%c 0x%016llx 0x%016llx", + xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ', + xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', + xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', + val & XIVE_ESB_VAL_P ? 'P' : '-', + val & XIVE_ESB_VAL_Q ? 'Q' : '-', + xd->trig_page, xd->eoi_page); +} + #ifdef CONFIG_XMON static notrace void xive_dump_eq(const char *name, struct xive_q *q) { @@ -252,11 +265,10 @@ notrace void xmon_xive_do_dump(int cpu) #ifdef CONFIG_SMP { - u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET); + char buffer[128]; - xmon_printf("IPI=0x%08x PQ=%c%c ", xc->hw_ipi, - val & XIVE_ESB_VAL_P ? 'P' : '-', - val & XIVE_ESB_VAL_Q ? 'Q' : '-'); + xive_irq_data_dump(&xc->ipi_data, buffer, sizeof(buffer)); + xmon_printf("IPI=0x%08x %s", xc->hw_ipi, buffer); } #endif xive_dump_eq("EQ", &xc->queue[xive_irq_priority]); @@ -291,15 +303,11 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) d = xive_get_irq_data(hw_irq); if (d) { - struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); - u64 val = xive_esb_read(xd, XIVE_ESB_GET); + char buffer[128]; - xmon_printf("flags=%c%c%c PQ=%c%c", - xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ', - xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', - xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', - val & XIVE_ESB_VAL_P ? 'P' : '-', - val & XIVE_ESB_VAL_Q ? 'Q' : '-'); + xive_irq_data_dump(irq_data_get_irq_handler_data(d), + buffer, sizeof(buffer)); + xmon_printf("%s", buffer); } xmon_printf("\n"); @@ -1702,11 +1710,10 @@ static void xive_debug_show_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SMP { - u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET); + char buffer[128]; - seq_printf(m, "IPI=0x%08x PQ=%c%c ", xc->hw_ipi, - val & XIVE_ESB_VAL_P ? 'P' : '-', - val & XIVE_ESB_VAL_Q ? 'Q' : '-'); + xive_irq_data_dump(&xc->ipi_data, buffer, sizeof(buffer)); + seq_printf(m, "IPI=0x%08x %s", xc->hw_ipi, buffer); } #endif { @@ -1733,8 +1740,7 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) u32 target; u8 prio; u32 lirq; - struct xive_irq_data *xd; - u64 val; + char buffer[128]; rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq); if (rc) { @@ -1745,14 +1751,8 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", hw_irq, target, prio, lirq); - xd = irq_data_get_irq_handler_data(d); - val = xive_esb_read(xd, XIVE_ESB_GET); - seq_printf(m, "flags=%c%c%c PQ=%c%c", - xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ', - xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', - xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', - val & XIVE_ESB_VAL_P ? 'P' : '-', - val & XIVE_ESB_VAL_Q ? 
'Q' : '-'); + xive_irq_data_dump(irq_data_get_irq_handler_data(d), buffer, sizeof(buffer)); + seq_puts(m, buffer); seq_puts(m, "\n"); } From 756c52c632f5c2b054bb54b1ea9177329e4b8ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:28 +0100 Subject: [PATCH 061/240] powerpc/xive: Activate StoreEOI on P10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StoreEOI (the capability to EOI with a store) requires load-after-store ordering in some cases to be reliable. P10 introduced a new offset for load operations to enforce correct ordering and the XIVE driver has the required support since kernel 5.8, commit b1f9be9392f0 ("powerpc/xive: Enforce load-after-store ordering when StoreEOI is active") Since skiboot v7, StoreEOI support is advertised on P10 with a new flag on the PowerNV platform. See skiboot commit 4bd7d84afe46 ("xive/p10: Introduce a new OPAL_XIVE_IRQ_STORE_EOI2 flag"). When detected, activate the feature. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-4-clg@kaod.org --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/sysdev/xive/native.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0b63ba7d5917..a2bc4b95e703 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1094,6 +1094,7 @@ enum { OPAL_XIVE_IRQ_SHIFT_BUG = 0x00000008, /* P9 DD1.0 workaround */ OPAL_XIVE_IRQ_MASK_VIA_FW = 0x00000010, /* P9 DD1.0 workaround */ OPAL_XIVE_IRQ_EOI_VIA_FW = 0x00000020, /* P9 DD1.0 workaround */ + OPAL_XIVE_IRQ_STORE_EOI2 = 0x00000040, }; /* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */ diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 1aec282cd650..7ec8911dad57 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -63,6 +63,8 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) opal_flags = be64_to_cpu(flags); if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI2) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; if (opal_flags & OPAL_XIVE_IRQ_LSI) data->flags |= XIVE_IRQ_FLAG_LSI; data->eoi_page = be64_to_cpu(eoi_page); From 412877dfae3dc12733bc711ccbd3d02338803865 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:29 +0100 Subject: [PATCH 062/240] powerpc/xive: Introduce xive_core_debugfs_create() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and fix some compile issues when !CONFIG_DEBUG_FS. 
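The fix follows the usual config-guard pattern: the debugfs setup is wrapped in CONFIG_DEBUG_FS, and an empty inline stub keeps the caller free of #ifdefs:

	#ifdef CONFIG_DEBUG_FS
	static void xive_core_debugfs_create(void)
	{
		debugfs_create_file("xive", 0400, arch_debugfs_dir,
				    NULL, &xive_core_debug_fops);
	}
	#else
	static inline void xive_core_debugfs_create(void) { }
	#endif /* CONFIG_DEBUG_FS */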
Signed-off-by: Cédric Le Goater [mpe: Add empty stub to fix !CONFIG_DEBUG_FS build] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-5-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index b74b8f18b80c..6c9092db74d0 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -227,6 +227,7 @@ static void xive_esb_write(struct xive_irq_data *xd, u32 offset, u64 data) out_be64(xd->eoi_mmio + offset, data); } +#if defined(CONFIG_XMON) || defined(CONFIG_DEBUG_FS) static void xive_irq_data_dump(struct xive_irq_data *xd, char *buffer, size_t size) { u64 val = xive_esb_read(xd, XIVE_ESB_GET); @@ -239,6 +240,7 @@ static void xive_irq_data_dump(struct xive_irq_data *xd, char *buffer, size_t si val & XIVE_ESB_VAL_Q ? 'Q' : '-', xd->trig_page, xd->eoi_page); } +#endif #ifdef CONFIG_XMON static notrace void xive_dump_eq(const char *name, struct xive_q *q) @@ -1700,6 +1702,7 @@ static int __init xive_off(char *arg) } __setup("xive=off", xive_off); +#ifdef CONFIG_DEBUG_FS static void xive_debug_show_cpu(struct seq_file *m, int cpu) { struct xive_cpu *xc = per_cpu(xive_cpu, cpu); @@ -1778,10 +1781,19 @@ static int xive_core_debug_show(struct seq_file *m, void *private) } DEFINE_SHOW_ATTRIBUTE(xive_core_debug); +static void xive_core_debugfs_create(void) +{ + debugfs_create_file("xive", 0400, arch_debugfs_dir, + NULL, &xive_core_debug_fops); +} +#else +static inline void xive_core_debugfs_create(void) { } +#endif /* CONFIG_DEBUG_FS */ + int xive_core_debug_init(void) { - if (xive_enabled()) - debugfs_create_file("xive", 0400, arch_debugfs_dir, - NULL, &xive_core_debug_fops); + if (xive_enabled() && IS_ENABLED(CONFIG_DEBUG_FS)) + xive_core_debugfs_create(); + return 0; } From baed14de78b5ee3ca04eae43c5b16e3eeb6e33a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:30 +0100 Subject: [PATCH 063/240] powerpc/xive: Change the debugfs file 'xive' into a directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a 'cpus' file to dump CPU states and 'interrupts' to dump IRQ states. 
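Since arch_debugfs_dir is the powerpc directory, the new files can be read as follows (assuming debugfs is mounted at /sys/kernel/debug):

	# cat /sys/kernel/debug/powerpc/xive/cpus
	# cat /sys/kernel/debug/powerpc/xive/interrupts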
Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-6-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 36 +++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 6c9092db74d0..34bbae9ee963 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1759,17 +1759,10 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d) seq_puts(m, "\n"); } -static int xive_core_debug_show(struct seq_file *m, void *private) +static int xive_irq_debug_show(struct seq_file *m, void *private) { unsigned int i; struct irq_desc *desc; - int cpu; - - if (xive_ops->debug_show) - xive_ops->debug_show(m, private); - - for_each_possible_cpu(cpu) - xive_debug_show_cpu(m, cpu); for_each_irq_desc(i, desc) { struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); @@ -1779,12 +1772,33 @@ static int xive_core_debug_show(struct seq_file *m, void *private) } return 0; } -DEFINE_SHOW_ATTRIBUTE(xive_core_debug); +DEFINE_SHOW_ATTRIBUTE(xive_irq_debug); + +static int xive_cpu_debug_show(struct seq_file *m, void *private) +{ + int cpu; + + if (xive_ops->debug_show) + xive_ops->debug_show(m, private); + + for_each_possible_cpu(cpu) + xive_debug_show_cpu(m, cpu); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(xive_cpu_debug); static void xive_core_debugfs_create(void) { - debugfs_create_file("xive", 0400, arch_debugfs_dir, - NULL, &xive_core_debug_fops); + struct dentry *xive_dir; + + xive_dir = debugfs_create_dir("xive", arch_debugfs_dir); + if (IS_ERR(xive_dir)) + return; + + debugfs_create_file("cpus", 0400, xive_dir, + NULL, &xive_cpu_debug_fops); + debugfs_create_file("interrupts", 0400, xive_dir, + NULL, &xive_irq_debug_fops); } #else static inline void xive_core_debugfs_create(void) { } From 33e1d4a152ce55272b54a16884461218d12d4f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:31 +0100 Subject: [PATCH 064/240] powerpc/xive: Rename the 'cpus' debugfs file to 'ipis' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and remove the EQ entries output which is not very useful since only the next two events of the queue are taken into account. We will improve the dump of the EQ in the next patches. 
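An illustrative line from the renamed file, following the formats in the code (the values are made up):

	# cat /sys/kernel/debug/powerpc/xive/ipis
	CPU 0: pp=00 CPPR=ff IPI=0x00000010 flags=S   PQ=-- 0x0003fc0000000000 0x0003fc0000010000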
Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-7-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 34bbae9ee963..3ef3cc413b31 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1703,11 +1703,11 @@ static int __init xive_off(char *arg) __setup("xive=off", xive_off); #ifdef CONFIG_DEBUG_FS -static void xive_debug_show_cpu(struct seq_file *m, int cpu) +static void xive_debug_show_ipi(struct seq_file *m, int cpu) { struct xive_cpu *xc = per_cpu(xive_cpu, cpu); - seq_printf(m, "CPU %d:", cpu); + seq_printf(m, "CPU %d: ", cpu); if (xc) { seq_printf(m, "pp=%02x CPPR=%02x ", xc->pending_prio, xc->cppr); @@ -1719,19 +1719,6 @@ static void xive_debug_show_cpu(struct seq_file *m, int cpu) seq_printf(m, "IPI=0x%08x %s", xc->hw_ipi, buffer); } #endif - { - struct xive_q *q = &xc->queue[xive_irq_priority]; - u32 i0, i1, idx; - - if (q->qpage) { - idx = q->idx; - i0 = be32_to_cpup(q->qpage + idx); - idx = (idx + 1) & q->msk; - i1 = be32_to_cpup(q->qpage + idx); - seq_printf(m, "EQ idx=%d T=%d %08x %08x ...", - q->idx, q->toggle, i0, i1); - } - } } seq_puts(m, "\n"); } @@ -1774,7 +1761,7 @@ static int xive_irq_debug_show(struct seq_file *m, void *private) } DEFINE_SHOW_ATTRIBUTE(xive_irq_debug); -static int xive_cpu_debug_show(struct seq_file *m, void *private) +static int xive_ipi_debug_show(struct seq_file *m, void *private) { int cpu; @@ -1782,10 +1769,10 @@ static int xive_cpu_debug_show(struct seq_file *m, void *private) xive_ops->debug_show(m, private); for_each_possible_cpu(cpu) - xive_debug_show_cpu(m, cpu); + xive_debug_show_ipi(m, cpu); return 0; } -DEFINE_SHOW_ATTRIBUTE(xive_cpu_debug); +DEFINE_SHOW_ATTRIBUTE(xive_ipi_debug); static void xive_core_debugfs_create(void) { @@ -1795,8 +1782,8 @@ static void xive_core_debugfs_create(void) if (IS_ERR(xive_dir)) return; - debugfs_create_file("cpus", 0400, xive_dir, - NULL, &xive_cpu_debug_fops); + debugfs_create_file("ipis", 0400, xive_dir, + NULL, &xive_ipi_debug_fops); debugfs_create_file("interrupts", 0400, xive_dir, NULL, &xive_irq_debug_fops); } From 08f3f610214f395561bbda03344e641579f6e917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:32 +0100 Subject: [PATCH 065/240] powerpc/xive: Add a debugfs file to dump EQs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XIVE driver under Linux uses a single interrupt priority and only one event queue is configured per CPU. Expose the contents under a 'xive/eqs/cpuX' debugfs file. 
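Reading a per-CPU file prints the EQ header followed by the queue page, eight 32-bit entries per row; an illustrative dump (the priority number and queue contents are made-up values):

	# cat /sys/kernel/debug/powerpc/xive/eqs/cpu0
	EQ6 idx=2 T=1
	00000 80000011 80000012 00000000 00000000 00000000 00000000 00000000 00000000
	00008 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
	...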
Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-8-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 37 +++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 3ef3cc413b31..ff6a2d1ed41d 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1774,9 +1774,40 @@ static int xive_ipi_debug_show(struct seq_file *m, void *private) } DEFINE_SHOW_ATTRIBUTE(xive_ipi_debug); +static void xive_eq_debug_show_one(struct seq_file *m, struct xive_q *q, u8 prio) +{ + int i; + + seq_printf(m, "EQ%d idx=%d T=%d\n", prio, q->idx, q->toggle); + if (q->qpage) { + for (i = 0; i < q->msk + 1; i++) { + if (!(i % 8)) + seq_printf(m, "%05d ", i); + seq_printf(m, "%08x%s", be32_to_cpup(q->qpage + i), + (i + 1) % 8 ? " " : "\n"); + } + } + seq_puts(m, "\n"); +} + +static int xive_eq_debug_show(struct seq_file *m, void *private) +{ + int cpu = (long)m->private; + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + if (xc) + xive_eq_debug_show_one(m, &xc->queue[xive_irq_priority], + xive_irq_priority); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(xive_eq_debug); + static void xive_core_debugfs_create(void) { struct dentry *xive_dir; + struct dentry *xive_eq_dir; + long cpu; + char name[16]; xive_dir = debugfs_create_dir("xive", arch_debugfs_dir); if (IS_ERR(xive_dir)) @@ -1786,6 +1817,12 @@ static void xive_core_debugfs_create(void) NULL, &xive_ipi_debug_fops); debugfs_create_file("interrupts", 0400, xive_dir, NULL, &xive_irq_debug_fops); + xive_eq_dir = debugfs_create_dir("eqs", xive_dir); + for_each_possible_cpu(cpu) { + snprintf(name, sizeof(name), "cpu%ld", cpu); + debugfs_create_file(name, 0400, xive_eq_dir, (void *)cpu, + &xive_eq_debug_fops); + } } #else static inline void xive_core_debugfs_create(void) { } From d7bc1e376cb786e9e8483455584d89cad4b5808f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:33 +0100 Subject: [PATCH 066/240] powerpc/xive: Add a debugfs toggle for StoreEOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It can be used to temporarily deactivate StoreEOI for tests or performance measurements on platforms supporting the feature (POWER10). Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-9-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index ff6a2d1ed41d..b3b1dbf29d4d 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -84,6 +84,16 @@ static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); /* An invalid CPU target */ #define XIVE_INVALID_TARGET (-1) +/* + * Global toggle to switch on/off StoreEOI + */ +static bool xive_store_eoi = true; + +static bool xive_is_store_eoi(struct xive_irq_data *xd) +{ + return xd->flags & XIVE_IRQ_FLAG_STORE_EOI && xive_store_eoi; +} + /* * Read the next entry in a queue, return its content if it's valid or 0 if there is no new entry. 
@@ -208,7 +218,7 @@ static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset) { u64 val; - if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + if (offset == XIVE_ESB_SET_PQ_10 && xive_is_store_eoi(xd)) offset |= XIVE_ESB_LD_ST_MO; if ((xd->flags & XIVE_IRQ_FLAG_H_INT_ESB) && xive_ops->esb_rw) @@ -233,7 +243,7 @@ static void xive_irq_data_dump(struct xive_irq_data *xd, char *buffer, size_t si u64 val = xive_esb_read(xd, XIVE_ESB_GET); snprintf(buffer, size, "flags=%c%c%c PQ=%c%c 0x%016llx 0x%016llx", - xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ', + xive_is_store_eoi(xd) ? 'S' : ' ', xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ', xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ', val & XIVE_ESB_VAL_P ? 'P' : '-', @@ -395,7 +405,7 @@ static void xive_do_source_eoi(struct xive_irq_data *xd) xd->stale_p = false; /* If the XIVE supports the new "store EOI facility, use it */ - if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) { + if (xive_is_store_eoi(xd)) { xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); return; } @@ -1823,6 +1833,7 @@ static void xive_core_debugfs_create(void) debugfs_create_file(name, 0400, xive_eq_dir, (void *)cpu, &xive_eq_debug_fops); } + debugfs_create_bool("store-eoi", 0600, xive_dir, &xive_store_eoi); } #else static inline void xive_core_debugfs_create(void) { } From c21ee04f11ae068aa132cce56d09f618d4a66259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:34 +0100 Subject: [PATCH 067/240] powerpc/xive: Add a kernel parameter for StoreEOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StoreEOI is activated by default on platforms supporting the feature (POWER10) and will be used as soon as firmware advertises its availability. The kernel parameter provides a way to deactivate its use. It can still be reactivated through debugfs. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-10-clg@kaod.org --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/powerpc/sysdev/xive/common.c | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2711ddb4835a..6248a061788a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6446,6 +6446,12 @@ controller on both pseries and powernv platforms. Only useful on POWER9 and above. + xive.store-eoi=off [PPC] + By default on POWER10 and above, the kernel will use + stores for EOI handling when the XIVE interrupt mode + is active. This option allows the XIVE driver to use + loads instead, as on POWER9. + xhci-hcd.quirks [USB,KNL] A hex value specifying bitmask with supplemental xhci host controller quirks.
Meaning of each bit can be diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index b3b1dbf29d4d..3b453973a5d2 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1712,6 +1712,19 @@ static int __init xive_off(char *arg) } __setup("xive=off", xive_off); +static int __init xive_store_eoi_cmdline(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strncmp(arg, "off", 3) == 0) { + pr_info("StoreEOI disabled on kernel command line\n"); + xive_store_eoi = false; + } + return 0; +} +__setup("xive.store-eoi=", xive_store_eoi_cmdline); + #ifdef CONFIG_DEBUG_FS static void xive_debug_show_ipi(struct seq_file *m, int cpu) { From 1e7684dc4fc70271c8bf86d397bd4fbfb3581e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 5 Nov 2021 11:26:35 +0100 Subject: [PATCH 068/240] powerpc/xive: Add a debugfs toggle for save-restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On POWER10, the automatic "save & restore" of interrupt context is always available. Provide a way to deactivate it for tests or performance. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105102636.1016378-11-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 1 + arch/powerpc/sysdev/xive/native.c | 2 +- arch/powerpc/sysdev/xive/xive-internal.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 3b453973a5d2..43f7f7df6407 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1847,6 +1847,7 @@ static void xive_core_debugfs_create(void) &xive_eq_debug_fops); } debugfs_create_bool("store-eoi", 0600, xive_dir, &xive_store_eoi); + debugfs_create_bool("save-restore", 0600, xive_dir, &xive_has_save_restore); } #else static inline void xive_core_debugfs_create(void) { } diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 7ec8911dad57..d6a091dc1bce 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -41,7 +41,7 @@ static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; static bool xive_has_single_esc; -static bool xive_has_save_restore; +bool xive_has_save_restore; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h index 504e7edce358..e0941bc64430 100644 --- a/arch/powerpc/sysdev/xive/xive-internal.h +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -72,5 +72,6 @@ static inline u32 xive_alloc_order(u32 queue_shift) } extern bool xive_cmdline_disabled; +extern bool xive_has_save_restore; #endif /* __XIVE_INTERNAL_H */ From 10b34ece132ee46dc4e6459c765d180c422a09fa Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Fri, 15 Oct 2021 18:06:27 +1100 Subject: [PATCH 069/240] powerpc/eeh: Small refactor of eeh_handle_normal_event() The control flow of eeh_handle_normal_event() is a bit tricky. Break out one of the error handling paths - rather than be in an else block, we'll make it part of the regular body of the function and put a 'goto out;' in the true limb of the if. 
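In skeleton form, the change looks roughly like this ('recovered' is an illustrative stand-in for the actual success condition, not a variable from the code):

  /* before: failure handling is carried in an else block */
  if (recovered) {
          pr_info("EEH: Recovery successful.\n");
  } else {
          /* permanent failure handling, one indent level deep */
  }
  out:
          ...

  /* after: the success path jumps to out, failure handling is de-indented */
  if (recovered) {
          pr_info("EEH: Recovery successful.\n");
          goto out;
  }
  /* permanent failure handling, now part of the regular function body */
  out:
          ...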
Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211015070628.1331635-1-dja@axtens.net --- arch/powerpc/kernel/eeh_driver.c | 69 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 350dab18e137..b676fc079356 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1054,45 +1054,46 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } pr_info("EEH: Recovery successful.\n"); - } else { - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); + goto out; + } - eeh_slot_error_detail(pe, EEH_LOG_PERM); + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); + /* Notify all devices that they're about to go down. */ + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; - } + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; } out: From 157616f3c2284f13ca7db9897293f944e6ab8199 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 15 Oct 2021 18:06:28 +1100 Subject: [PATCH 070/240] powerpc/eeh: Use a goto for recovery failures The EEH recovery logic in eeh_handle_normal_event() has some pretty strange flow control. If we remove all the actual recovery logic we're left with the following skeleton: if (result != PCI_ERS_RESULT_DISCONNECT) { ... } if (result != PCI_ERS_RESULT_DISCONNECT) { ... } if (result == PCI_ERS_RESULT_NONE) { ... 
} if (result == PCI_ERS_RESULT_CAN_RECOVER) { ... } if (result == PCI_ERS_RESULT_CAN_RECOVER) { ... } if (result == PCI_ERS_RESULT_NEED_RESET) { ... } if ((result == PCI_ERS_RESULT_RECOVERED) || (result == PCI_ERS_RESULT_NONE)) { ... goto out; } /* * unsuccessful recovery / PCI_ERS_RESULT_DISCONNECT * handling is here. */ ... out: ... Most of the "if () { ... }" blocks above change "result" to PCI_ERS_RESULT_DISCONNECT if an error occurs in that recovery step. This makes the control flow a bit confusing since it breaks the early-exit pattern that is generally used in Linux. In any case we end up handling the error in the final else block so why not just jump there directly? Doing so also allows us to de-indent a bunch of code. No functional changes. [dja: rebase on top of linux-next + my preceding refactor, move clearing the EEH_DEV_NO_HANDLER bit above the first goto so that it is always clear in the error handler code as it was before.] Signed-off-by: Oliver O'Halloran Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211015070628.1331635-2-dja@axtens.net --- arch/powerpc/kernel/eeh_driver.c | 93 ++++++++++++++++---------------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index b676fc079356..422f80b5b27b 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -905,18 +905,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } #endif /* CONFIG_STACKTRACE */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + edev->mode &= ~EEH_DEV_NO_HANDLER; + eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > eeh_max_freezes) { pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - result = PCI_ERS_RESULT_DISCONNECT; - } - eeh_for_each_pe(pe, tmp_pe) - eeh_pe_for_each_dev(tmp_pe, edev, tmp) - edev->mode &= ~EEH_DEV_NO_HANDLER; + goto recover_failed; + } /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -928,39 +929,38 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, - eeh_report_error, &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; - } + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if (result == PCI_ERS_RESULT_DISCONNECT) + goto recover_failed; + + /* + * Errors logged on a PHB are always fences which need a full + * PHB reset to clear so force that to happen.
+ */ + if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE) + result = PCI_ERS_RESULT_NEED_RESET; /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - result = PCI_ERS_RESULT_DISCONNECT; - } + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + goto recover_failed; } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); - } + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -970,9 +970,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Reset with hotplug activity\n"); rc = eeh_reset_device(pe, bus, NULL, false); if (rc) { - pr_warn("%s: Unable to reset, err=%d\n", - __func__, rc); - result = PCI_ERS_RESULT_DISCONNECT; + pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); + goto recover_failed; } } @@ -980,10 +979,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (result == PCI_ERS_RESULT_CAN_RECOVER) { pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + if (rc < 0) + goto recover_failed; - if (rc < 0) { - result = PCI_ERS_RESULT_DISCONNECT; - } else if (rc) { + if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -991,15 +990,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_report_mmio_enabled, &result); } } - - /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + if (rc < 0) + goto recover_failed; - if (rc < 0) { - result = PCI_ERS_RESULT_DISCONNECT; - } else if (rc) { + if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -1017,16 +1014,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Reset without hotplug activity\n"); rc = eeh_reset_device(pe, bus, &rmv_data, true); if (rc) { - pr_warn("%s: Cannot reset, err=%d\n", - __func__, rc); - result = PCI_ERS_RESULT_DISCONNECT; - } else { - result = PCI_ERS_RESULT_NONE; - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, - &result); + pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); + goto recover_failed; } + + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); } if ((result == PCI_ERS_RESULT_RECOVERED) || @@ -1057,6 +1053,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) goto out; } +recover_failed: /* * About 90% of all real-life EEH failures in the field * are due to poorly seated PCI cards. 
Only 10% or so are From c9ce7c36e4870bd307101ba7a00a39d9aad270f3 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 12 Oct 2021 18:00:49 +0530 Subject: [PATCH 071/240] bpf powerpc: Remove unused SEEN_STACK SEEN_STACK is unused on PowerPC. Remove it. Also, have SEEN_TAILCALL use 0x40000000. Signed-off-by: Ravi Bangoria Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-2-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 7e9b978b768e..89bd744c2bff 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -125,8 +125,7 @@ #define COND_LE (CR0_GT | COND_CMP_FALSE) #define SEEN_FUNC 0x20000000 /* might call external helpers */ -#define SEEN_STACK 0x40000000 /* uses BPF stack */ -#define SEEN_TAILCALL 0x80000000 /* uses tail calls */ +#define SEEN_TAILCALL 0x40000000 /* uses tail calls */ #define SEEN_VREG_MASK 0x1ff80000 /* Volatile registers r3-r12 */ #define SEEN_NVREG_MASK 0x0003ffff /* Non volatile registers r14-r31 */ From 04c04205bc35d0ecdc57146995ca9eb957d4f379 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 12 Oct 2021 18:00:50 +0530 Subject: [PATCH 072/240] bpf powerpc: Remove extra_pass from bpf_jit_build_body() In case of extra_pass, usual JIT passes are always skipped. So, extra_pass is always false while calling bpf_jit_build_body() and can be removed. Signed-off-by: Ravi Bangoria Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-3-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit.h | 2 +- arch/powerpc/net/bpf_jit_comp.c | 6 +++--- arch/powerpc/net/bpf_jit_comp32.c | 4 ++-- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 89bd744c2bff..7145b651fc2a 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -175,7 +175,7 @@ static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i) void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func); int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs, bool extra_pass); + u32 *addrs); void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); void bpf_jit_realloc_regs(struct codegen_context *ctx); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 90ce75f0f1e2..f39ad8d123dd 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -149,7 +149,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) cgctx.stack_size = round_up(fp->aux->stack_depth, 16); /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { /* We hit something illegal or unsupported. */ fp = org_fp; goto out_addrs; @@ -162,7 +162,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ if (cgctx.seen & SEEN_TAILCALL) { cgctx.idx = 0; - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { fp = org_fp; goto out_addrs; } @@ -210,7 +210,7 @@ skip_init_ctx: /* Now build the prologue, body code & epilogue for real. 
*/ cgctx.idx = 0; bpf_jit_build_prologue(code_base, &cgctx); - if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass)) { + if (bpf_jit_build_body(fp, code_base, &cgctx, addrs)) { bpf_jit_binary_free(bpf_hdr); fp = org_fp; goto out_addrs; diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 0da31d41d413..903f945601c0 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -268,7 +268,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs, bool extra_pass) + u32 *addrs) { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; @@ -862,7 +862,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + ret = bpf_jit_get_func_addr(fp, &insn[i], false, &func_addr, &func_addr_fixed); if (ret < 0) return ret; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 8b5157ccfeba..b25bf9b11b9d 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -297,7 +297,7 @@ asm ( /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs, bool extra_pass) + u32 *addrs) { enum stf_barrier_type stf_barrier = stf_barrier_type_get(); const struct bpf_insn *insn = fp->insnsi; @@ -831,7 +831,7 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + ret = bpf_jit_get_func_addr(fp, &insn[i], false, &func_addr, &func_addr_fixed); if (ret < 0) return ret; From efa95f031bf38c85cf865413335a3dc044e3194e Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 12 Oct 2021 18:00:51 +0530 Subject: [PATCH 073/240] bpf powerpc: refactor JIT compiler code Refactor powerpc LDX JITing code to simplify adding BPF_PROBE_MEM support. 
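The resulting shape, common to the 32-bit and 64-bit JITs, is roughly the following (sketch based on the diff below, not the verbatim patch):

  case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */
  case BPF_LDX | BPF_MEM | BPF_H:
  case BPF_LDX | BPF_MEM | BPF_W:
  case BPF_LDX | BPF_MEM | BPF_DW:
          /* size = BPF_SIZE(code), computed once per instruction */
          switch (size) {
          case BPF_B:
                  EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off));
                  break;
          case BPF_H:
                  EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off));
                  break;
          /* BPF_W and BPF_DW handled likewise */
          }
          /* common zero-extension handling follows, once, after the switch */
          break;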
Signed-off-by: Hari Bathini Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-4-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp32.c | 33 ++++++++++++++++++------------- arch/powerpc/net/bpf_jit_comp64.c | 31 +++++++++++++++++------------ 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 903f945601c0..8b2ac1c27f1f 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -284,6 +284,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * u32 src_reg = bpf_to_ppc(ctx, insn[i].src_reg); u32 src_reg_h = src_reg - 1; u32 tmp_reg = bpf_to_ppc(ctx, TMP_REG); + u32 size = BPF_SIZE(code); s16 off = insn[i].off; s32 imm = insn[i].imm; bool func_addr_fixed; @@ -812,23 +813,27 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ - EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); - if (!fp->aux->verifier_zext) - EMIT(PPC_RAW_LI(dst_reg_h, 0)); - break; case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ - EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); - if (!fp->aux->verifier_zext) - EMIT(PPC_RAW_LI(dst_reg_h, 0)); - break; case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ - EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); - if (!fp->aux->verifier_zext) - EMIT(PPC_RAW_LI(dst_reg_h, 0)); - break; case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ - EMIT(PPC_RAW_LWZ(dst_reg_h, src_reg, off)); - EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off + 4)); + switch (size) { + case BPF_B: + EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); + break; + case BPF_DW: + EMIT(PPC_RAW_LWZ(dst_reg_h, src_reg, off)); + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off + 4)); + break; + } + + if (size != BPF_DW && !fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); break; /* diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index b25bf9b11b9d..ad852f15ca61 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -311,6 +311,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * u32 code = insn[i].code; u32 dst_reg = b2p[insn[i].dst_reg]; u32 src_reg = b2p[insn[i].src_reg]; + u32 size = BPF_SIZE(code); s16 off = insn[i].off; s32 imm = insn[i].imm; bool func_addr_fixed; @@ -778,25 +779,29 @@ emit_clear: */ /* dst = *(u8 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_B: - EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); - if (insn_is_zext(&insn[i + 1])) - addrs[++i] = ctx->idx * 4; - break; /* dst = *(u16 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_H: - EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); - if (insn_is_zext(&insn[i + 1])) - addrs[++i] = ctx->idx * 4; - break; /* dst = *(u32 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_W: - EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); - if (insn_is_zext(&insn[i + 1])) - addrs[++i] = ctx->idx * 4; - break; /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_DW: - PPC_BPF_LL(dst_reg, src_reg, off); + switch (size) { + case BPF_B: + EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); + break; + case BPF_H: + EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_LWZ(dst_reg, 
src_reg, off)); + break; + case BPF_DW: + PPC_BPF_LL(dst_reg, src_reg, off); + break; + } + + if (size != BPF_DW && insn_is_zext(&insn[i + 1])) + addrs[++i] = ctx->idx * 4; break; /* From f15a71b3880bf07b40810644e5ac6f177c2a7c8f Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 12 Oct 2021 18:00:52 +0530 Subject: [PATCH 074/240] powerpc/ppc-opcode: introduce PPC_RAW_BRANCH() macro Define and use the PPC_RAW_BRANCH() macro instead of open coding it. This macro is used while adding BPF_PROBE_MEM support. Signed-off-by: Hari Bathini Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-5-hbathini@linux.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 2 ++ arch/powerpc/net/bpf_jit.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index baea657bc868..f50213e2a3e0 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -566,6 +566,8 @@ #define PPC_RAW_MTSPR(spr, d) (0x7c0003a6 | ___PPC_RS(d) | __PPC_SPR(spr)) #define PPC_RAW_EIEIO() (0x7c0006ac) +#define PPC_RAW_BRANCH(addr) (PPC_INST_BRANCH | ((addr) & 0x03fffffc)) + /* Deal with instructions that older assemblers aren't aware of */ #define PPC_BCCTR_FLUSH stringify_in_c(.long PPC_INST_BCCTR_FLUSH) #define PPC_CP_ABORT stringify_in_c(.long PPC_RAW_CP_ABORT) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 7145b651fc2a..6a945f6211f4 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -31,7 +31,7 @@ pr_err_ratelimited("Branch offset 0x%lx (@%u) out of range\n", offset, ctx->idx); \ return -ERANGE; \ } \ - EMIT(PPC_INST_BRANCH | (offset & 0x03fffffc)); \ + EMIT(PPC_RAW_BRANCH(offset)); \ } while (0) /* blr; (unconditional 'branch' with link) to absolute address */ From 983bdc0245a29cdefcd30d9d484d3edbc4b6d787 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 12 Oct 2021 18:00:53 +0530 Subject: [PATCH 075/240] bpf ppc64: Add BPF_PROBE_MEM support for JIT A BPF load instruction with BPF_PROBE_MEM mode can cause a fault inside the kernel. Append an exception table for such instructions within the BPF program. Unlike other archs, which use the extable 'fixup' field to pass dest_reg and nip, the BPF exception table on PowerPC follows the generic PowerPC exception table design, populating both fixup and extable sections within the BPF program. The fixup section contains two instructions: the first clears dest_reg and the second jumps to the next instruction in the BPF code. The extable 'insn' field contains the relative offset of the faulting instruction and the 'fixup' field contains the relative offset of the fixup entry.
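For reference, the generic PowerPC exception table entry is a pair of 32-bit self-relative offsets (see arch/powerpc/include/asm/extable.h), which is why both values are stored as deltas from the entry's own fields:

  struct exception_table_entry {
          int insn;       /* offset of the faulting instruction, relative to &insn */
          int fixup;      /* offset of the fixup code, relative to &fixup */
  };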
Example layout of BPF program with extable present: +------------------+ | | | | 0x4020 -->| ld r27,4(r3) | | | | | 0x40ac -->| lwz r3,0(r4) | | | | | |------------------| 0x4280 -->| li r27,0 | \ fixup entry | b 0x4024 | / 0x4288 -->| li r3,0 | | b 0x40b0 | |------------------| 0x4290 -->| insn=0xfffffd90 | \ extable entry | fixup=0xffffffec | / 0x4298 -->| insn=0xfffffe14 | | fixup=0xffffffec | +------------------+ (Addresses shown here are chosen at random, not real) Signed-off-by: Ravi Bangoria Signed-off-by: Hari Bathini Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-6-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit.h | 8 +++- arch/powerpc/net/bpf_jit_comp.c | 66 ++++++++++++++++++++++++++++--- arch/powerpc/net/bpf_jit_comp32.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 13 +++++- 4 files changed, 80 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 6a945f6211f4..444c9debce91 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -150,8 +150,11 @@ struct codegen_context { unsigned int idx; unsigned int stack_size; int b2p[ARRAY_SIZE(b2p)]; + unsigned int exentry_idx; }; +#define BPF_FIXUP_LEN 2 /* Two instructions => 8 bytes */ + static inline void bpf_flush_icache(void *start, void *end) { smp_wmb(); /* smp write barrier */ @@ -175,11 +178,14 @@ static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i) void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func); int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs); + u32 *addrs, int pass); void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); void bpf_jit_realloc_regs(struct codegen_context *ctx); +int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct codegen_context *ctx, + int insn_idx, int jmp_off, int dst_reg); + #endif #endif diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index f39ad8d123dd..a936dca7331e 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -101,6 +101,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_prog *tmp_fp; bool bpf_blinded = false; bool extra_pass = false; + u32 extable_len; + u32 fixup_len; if (!fp->jit_requested) return org_fp; @@ -131,7 +133,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) image = jit_data->image; bpf_hdr = jit_data->header; proglen = jit_data->proglen; - alloclen = proglen + FUNCTION_DESCR_SIZE; extra_pass = true; goto skip_init_ctx; } @@ -149,7 +150,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) cgctx.stack_size = round_up(fp->aux->stack_depth, 16); /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) { /* We hit something illegal or unsupported.
*/ fp = org_fp; goto out_addrs; @@ -162,7 +163,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ if (cgctx.seen & SEEN_TAILCALL) { cgctx.idx = 0; - if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { + if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) { fp = org_fp; goto out_addrs; } @@ -177,8 +178,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) bpf_jit_build_prologue(0, &cgctx); bpf_jit_build_epilogue(0, &cgctx); + fixup_len = fp->aux->num_exentries * BPF_FIXUP_LEN * 4; + extable_len = fp->aux->num_exentries * sizeof(struct exception_table_entry); + proglen = cgctx.idx * 4; - alloclen = proglen + FUNCTION_DESCR_SIZE; + alloclen = proglen + FUNCTION_DESCR_SIZE + fixup_len + extable_len; bpf_hdr = bpf_jit_binary_alloc(alloclen, &image, 4, bpf_jit_fill_ill_insns); if (!bpf_hdr) { @@ -186,6 +190,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) goto out_addrs; } + if (extable_len) + fp->aux->extable = (void *)image + FUNCTION_DESCR_SIZE + proglen + fixup_len; + skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); @@ -210,7 +217,7 @@ skip_init_ctx: /* Now build the prologue, body code & epilogue for real. */ cgctx.idx = 0; bpf_jit_build_prologue(code_base, &cgctx); - if (bpf_jit_build_body(fp, code_base, &cgctx, addrs)) { + if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, pass)) { bpf_jit_binary_free(bpf_hdr); fp = org_fp; goto out_addrs; @@ -238,7 +245,7 @@ skip_codegen_passes: fp->bpf_func = (void *)image; fp->jited = 1; - fp->jited_len = alloclen; + fp->jited_len = proglen + FUNCTION_DESCR_SIZE; bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); if (!fp->is_func || extra_pass) { @@ -262,3 +269,50 @@ out: return fp; } + +/* + * The caller should check for (BPF_MODE(code) == BPF_PROBE_MEM) before calling + * this function, as this only applies to BPF_PROBE_MEM, for now. 
+ */ +int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct codegen_context *ctx, + int insn_idx, int jmp_off, int dst_reg) +{ + off_t offset; + unsigned long pc; + struct exception_table_entry *ex; + u32 *fixup; + + /* Populate extable entries only in the last pass */ + if (pass != 2) + return 0; + + if (!fp->aux->extable || + WARN_ON_ONCE(ctx->exentry_idx >= fp->aux->num_exentries)) + return -EINVAL; + + pc = (unsigned long)&image[insn_idx]; + + fixup = (void *)fp->aux->extable - + (fp->aux->num_exentries * BPF_FIXUP_LEN * 4) + + (ctx->exentry_idx * BPF_FIXUP_LEN * 4); + + fixup[0] = PPC_RAW_LI(dst_reg, 0); + + fixup[BPF_FIXUP_LEN - 1] = + PPC_RAW_BRANCH((long)(pc + jmp_off) - (long)&fixup[BPF_FIXUP_LEN - 1]); + + ex = &fp->aux->extable[ctx->exentry_idx]; + + offset = pc - (long)&ex->insn; + if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) + return -ERANGE; + ex->insn = offset; + + offset = (long)fixup - (long)&ex->fixup; + if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) + return -ERANGE; + ex->fixup = offset; + + ctx->exentry_idx++; + return 0; +} diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 8b2ac1c27f1f..54e7cef3e1f2 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -268,7 +268,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs) + u32 *addrs, int pass) { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index ad852f15ca61..ede8cb3e453f 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -297,7 +297,7 @@ asm ( /* Assemble the body code between the prologue & epilogue */ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, - u32 *addrs) + u32 *addrs, int pass) { enum stf_barrier_type stf_barrier = stf_barrier_type_get(); const struct bpf_insn *insn = fp->insnsi; @@ -779,12 +779,16 @@ emit_clear: */ /* dst = *(u8 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_B: + case BPF_LDX | BPF_PROBE_MEM | BPF_B: /* dst = *(u16 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_H: + case BPF_LDX | BPF_PROBE_MEM | BPF_H: /* dst = *(u32 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_W: + case BPF_LDX | BPF_PROBE_MEM | BPF_W: /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: switch (size) { case BPF_B: EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); @@ -802,6 +806,13 @@ emit_clear: if (size != BPF_DW && insn_is_zext(&insn[i + 1])) addrs[++i] = ctx->idx * 4; + + if (BPF_MODE(code) == BPF_PROBE_MEM) { + ret = bpf_add_extable_entry(fp, image, pass, ctx, ctx->idx - 1, + 4, dst_reg); + if (ret) + return ret; + } break; /* From 9c70c7147ffec31de67d33243570a533b29f9759 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 12 Oct 2021 18:00:54 +0530 Subject: [PATCH 076/240] bpf ppc64: Access only if addr is kernel address On PPC64 with KUAP enabled, any kernel code which wants to access userspace needs to be surrounded by disable-enable KUAP. But that is not happening for the BPF_PROBE_MEM load instruction. So, when a BPF program tries to access an invalid userspace address, the page-fault handler considers it a bad KUAP fault: Kernel attempted to read user page (d0000000) - exploit attempt?
(uid: 0) Considering the fact that PTR_TO_BTF_ID (which uses BPF_PROBE_MEM mode) could either be a valid kernel pointer or NULL but should never be a pointer to a userspace address, execute the BPF_PROBE_MEM load only if addr is a kernel address; otherwise set dst_reg=0 and move on. This will catch NULL and valid or invalid userspace pointers. Only a bad kernel pointer will be handled by the BPF exception table. [Alexei suggested for x86] Suggested-by: Alexei Starovoitov Signed-off-by: Ravi Bangoria Signed-off-by: Hari Bathini Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-7-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp64.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index ede8cb3e453f..472d4a551945 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -789,6 +789,32 @@ emit_clear: /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + /* + * As PTR_TO_BTF_ID that uses BPF_PROBE_MEM mode could either be a valid + * kernel pointer or NULL but not a userspace address, execute BPF_PROBE_MEM + * load only if addr is kernel address (see is_kernel_addr()), otherwise + * set dst_reg=0 and move on. + */ + if (BPF_MODE(code) == BPF_PROBE_MEM) { + EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], src_reg, off)); + if (IS_ENABLED(CONFIG_PPC_BOOK3E_64)) + PPC_LI64(b2p[TMP_REG_2], 0x8000000000000000ul); + else /* BOOK3S_64 */ + PPC_LI64(b2p[TMP_REG_2], PAGE_OFFSET); + EMIT(PPC_RAW_CMPLD(b2p[TMP_REG_1], b2p[TMP_REG_2])); + PPC_BCC(COND_GT, (ctx->idx + 4) * 4); + EMIT(PPC_RAW_LI(dst_reg, 0)); + /* + * Check if 'off' is word aligned because PPC_BPF_LL() + * (BPF_DW case) generates two instructions if 'off' is not + * word-aligned and one instruction otherwise. + */ + if (BPF_SIZE(code) == BPF_DW && (off & 3)) + PPC_JMP((ctx->idx + 3) * 4); + else + PPC_JMP((ctx->idx + 2) * 4); + } + switch (size) { case BPF_B: EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); From 23b51916ee129833453d8a3d6bde0ff392f82fce Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 12 Oct 2021 18:00:55 +0530 Subject: [PATCH 077/240] bpf ppc32: Add BPF_PROBE_MEM support for JIT A BPF load instruction with BPF_PROBE_MEM mode can cause a fault inside the kernel. Append an exception table for such instructions within the BPF program. Unlike other archs, which use the extable 'fixup' field to pass dest_reg and nip, the BPF exception table on PowerPC follows the generic PowerPC exception table design, populating both fixup and extable sections within the BPF program. The fixup section contains 3 instructions: the first 2 clear dest_reg (lower & higher 32-bit registers) and the last jumps to the next instruction in the BPF code. The extable 'insn' field contains the relative offset of the faulting instruction and the 'fixup' field contains the relative offset of the fixup entry.
Example layout of BPF program with extable present: +------------------+ | | | | 0x4020 -->| lwz r28,4(r4) | | | | | 0x40ac -->| lwz r3,0(r24) | | lwz r4,4(r24) | | | | | |------------------| 0x4278 -->| li r28,0 | \ | li r27,0 | | fixup entry | b 0x4024 | / 0x4284 -->| li r4,0 | | li r3,0 | | b 0x40b4 | |------------------| 0x4290 -->| insn=0xfffffd90 | \ extable entry | fixup=0xffffffe4 | / 0x4298 -->| insn=0xfffffe14 | | fixup=0xffffffe8 | +------------------+ (Addresses shown here are chosen at random, not real) Signed-off-by: Hari Bathini Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-8-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit.h | 4 ++++ arch/powerpc/net/bpf_jit_comp.c | 2 ++ arch/powerpc/net/bpf_jit_comp32.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 444c9debce91..b20a2a83a6e7 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -153,7 +153,11 @@ struct codegen_context { unsigned int exentry_idx; }; +#ifdef CONFIG_PPC32 +#define BPF_FIXUP_LEN 3 /* Three instructions => 12 bytes */ +#else #define BPF_FIXUP_LEN 2 /* Two instructions => 8 bytes */ +#endif static inline void bpf_flush_icache(void *start, void *end) { diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index a936dca7331e..d6ffdd0f2309 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -297,6 +297,8 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct code (ctx->exentry_idx * BPF_FIXUP_LEN * 4); fixup[0] = PPC_RAW_LI(dst_reg, 0); + if (IS_ENABLED(CONFIG_PPC32)) + fixup[1] = PPC_RAW_LI(dst_reg - 1, 0); /* clear higher 32-bit register too */ fixup[BPF_FIXUP_LEN - 1] = PPC_RAW_BRANCH((long)(pc + jmp_off) - (long)&fixup[BPF_FIXUP_LEN - 1]); diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 54e7cef3e1f2..5dc45e393d1d 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -813,9 +813,13 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: switch (size) { case BPF_B: EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); @@ -834,6 +838,32 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * if (size != BPF_DW && !fp->aux->verifier_zext) EMIT(PPC_RAW_LI(dst_reg_h, 0)); + + if (BPF_MODE(code) == BPF_PROBE_MEM) { + int insn_idx = ctx->idx - 1; + int jmp_off = 4; + + /* + * In case of BPF_DW, two lwz instructions are emitted, one + * for higher 32-bit and another for lower 32-bit. So, set + * ex->insn to the first of the two and jump over both + * instructions in fixup. + * + * Similarly, with !verifier_zext, two instructions are + * emitted for BPF_B/H/W case. So, set ex->insn to the + * instruction that could fault and skip over both + * instructions.
*/ + if (size == BPF_DW || !fp->aux->verifier_zext) { + insn_idx -= 1; + jmp_off += 4; + } + + ret = bpf_add_extable_entry(fp, image, pass, ctx, insn_idx, + jmp_off, dst_reg); + if (ret) + return ret; + } break; /* From e919c0b2323bedec00e1ecc6280498ff81f59b15 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 12 Oct 2021 18:00:56 +0530 Subject: [PATCH 078/240] bpf ppc32: Access only if addr is kernel address With KUAP enabled, any kernel code which wants to access userspace needs to be surrounded by disable-enable KUAP. But that is not happening for the BPF_PROBE_MEM load instruction. Though PPC32 does not support read protection, considering the fact that PTR_TO_BTF_ID (which uses BPF_PROBE_MEM mode) could either be a valid kernel pointer or NULL but should never be a pointer to a userspace address, execute the BPF_PROBE_MEM load only if addr is a kernel address; otherwise set dst_reg=0 and move on. This will catch NULL and valid or invalid userspace pointers. Only a bad kernel pointer will be handled by the BPF exception table. [Alexei suggested for x86] Suggested-by: Alexei Starovoitov Signed-off-by: Hari Bathini Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211012123056.485795-9-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp32.c | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 5dc45e393d1d..d3a52cd42f53 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -820,6 +820,40 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + /* + * As PTR_TO_BTF_ID that uses BPF_PROBE_MEM mode could either be a valid + * kernel pointer or NULL but not a userspace address, execute BPF_PROBE_MEM + * load only if addr is kernel address (see is_kernel_addr()), otherwise + * set dst_reg=0 and move on. + */ + if (BPF_MODE(code) == BPF_PROBE_MEM) { + PPC_LI32(_R0, TASK_SIZE - off); + EMIT(PPC_RAW_CMPLW(src_reg, _R0)); + PPC_BCC(COND_GT, (ctx->idx + 5) * 4); + EMIT(PPC_RAW_LI(dst_reg, 0)); + /* + * For BPF_DW case, "li reg_h,0" would be needed when + * !fp->aux->verifier_zext. Emit NOP otherwise. + * + * Note that "li reg_h,0" is emitted for BPF_B/H/W case, + * if necessary. So, jump there instead of emitting an + * additional "li reg_h,0" instruction. + */ + if (size == BPF_DW && !fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + else + EMIT(PPC_RAW_NOP()); + /* + * Need to jump two instructions instead of one for BPF_DW case + * as there are two load instructions for dst_reg_h & dst_reg + * respectively. + */ + if (size == BPF_DW) + PPC_JMP((ctx->idx + 3) * 4); + else + PPC_JMP((ctx->idx + 2) * 4); + } + switch (size) { case BPF_B: EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); From a3bcfc182b2c968fd740101322bd128844724961 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sun, 14 Nov 2021 19:56:16 +0800 Subject: [PATCH 079/240] powerpc/tsi108: make EXPORT_SYMBOL follow its function immediately EXPORT_SYMBOL(foo); should immediately follow its function/variable.
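That is, the expected style, as the diff below produces for tsi108, is:

  u32 get_vir_csrbase(void)
  {
          return (u32) (ioremap(get_csrbase(), 0x10000));
  }
  EXPORT_SYMBOL(get_vir_csrbase);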
Signed-off-by: Jason Wang Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211114115616.493815-1-wangborong@cdjrlc.com --- arch/powerpc/sysdev/tsi108_dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index 4c4a6efd5e5f..9e13fb35ed5c 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -51,13 +51,12 @@ phys_addr_t get_csrbase(void) } return tsi108_csr_base; } +EXPORT_SYMBOL(get_csrbase); u32 get_vir_csrbase(void) { return (u32) (ioremap(get_csrbase(), 0x10000)); } - -EXPORT_SYMBOL(get_csrbase); EXPORT_SYMBOL(get_vir_csrbase); static int __init tsi108_eth_of_init(void) From 8b8a8f0ab3f5519e45c526f826a655817486c5bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 15 Nov 2021 11:12:22 +0100 Subject: [PATCH 080/240] powerpc/code-patching: Improve verification of patchability Today, patch_instruction() assumes that it is called exclusively on valid addresses, and only checks that it is not called on an init address after init section has been freed. Improve verification by calling kernel_text_address() instead. kernel_text_address() already includes a verification of initmem release. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/bc683d499a411730504b132a924de0ccc2ef1f79.1636971137.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/setup.h | 1 - arch/powerpc/lib/code-patching.c | 5 ++--- arch/powerpc/mm/mem.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 6c1a7d217d1a..426a2d8d028f 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -9,7 +9,6 @@ extern void ppc_printk_progress(char *s, unsigned short hex); extern unsigned int rtas_data; extern unsigned long long memory_limit; -extern bool init_mem_is_free; extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask); struct device_node; diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c5ed98823835..5e2fe133639e 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -190,10 +190,9 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr) int patch_instruction(u32 *addr, struct ppc_inst instr) { /* Make sure we aren't patching a freed init section */ - if (init_mem_is_free && init_section_contains(addr, 4)) { - pr_debug("Skipping init section patching addr: 0x%px\n", addr); + if (!kernel_text_address((unsigned long)addr)) return 0; - } + return do_patch_instruction(addr, instr); } NOKPROBE_SYMBOL(patch_instruction); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index bd5d91a31183..8e301cd8925b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -26,7 +26,6 @@ #include unsigned long long memory_limit; -bool init_mem_is_free; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -312,7 +311,6 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); - init_mem_is_free = true; free_initmem_default(POISON_FREE_INITMEM); } From 53cadf7deee0ce65d7c33770b7810c98a2a0ee6a Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 16 Nov 2021 15:58:06 -0600 Subject: [PATCH 081/240] powerpc/rtas: kernel-doc fixes Fix the following issues reported by kernel-doc: $ scripts/kernel-doc -v -none arch/powerpc/kernel/rtas.c 
arch/powerpc/kernel/rtas.c:810: info: Scanning doc for function rtas_activate_firmware arch/powerpc/kernel/rtas.c:818: warning: contents before sections arch/powerpc/kernel/rtas.c:841: info: Scanning doc for function rtas_call_reentrant arch/powerpc/kernel/rtas.c:893: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Find a specific pseries error log in an RTAS extended event log. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211116215806.928235-1-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index ff80bbad22a5..ca27421f471a 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -809,13 +809,13 @@ void rtas_os_term(char *str) /** * rtas_activate_firmware() - Activate a new version of firmware. * + * Context: This function may sleep. + * * Activate a new version of partition firmware. The OS must call this * after resuming from a partition hibernation or migration in order * to maintain the ability to perform live firmware updates. It's not * catastrophic for this method to be absent or to fail; just log the * condition in that case. - * - * Context: This function may sleep. */ void rtas_activate_firmware(void) { @@ -890,11 +890,12 @@ int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...) #endif /* CONFIG_PPC_PSERIES */ /** - * Find a specific pseries error log in an RTAS extended event log. + * get_pseries_errorlog() - Find a specific pseries error log in an RTAS + * extended event log. * @log: RTAS error/event log * @section_id: two character section identifier * - * Returns a pointer to the specified errorlog or NULL if not found. + * Return: A pointer to the specified errorlog or NULL if not found. */ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, uint16_t section_id) From 22887f319a39929e357810a1f964fcba7ae42c59 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 20 Sep 2021 12:32:03 -0500 Subject: [PATCH 082/240] powerpc/pseries: delete scanlog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the pseries scanlog driver. This code supports functions from Power4-era servers that are not present on targets currently supported by arch/powerpc. System manuals from this time have this description: Scan Dump data is a set of chip data that the service processor gathers after a system malfunction. It consists of chip scan rings, chip trace arrays, and Scan COM (SCOM) registers. This data is stored in the scan-log partition of the system’s Nonvolatile Random Access Memory (NVRAM). PowerVM partition firmware developers don't recognize the associated function call or property, and they see no references to them in their codebase. It seems to have been specific to non-virtualized pseries.
References: Historical Linux commit from February 2003 (interesting to note this seems to be the source of non-GPL exports for rtas_call etc): https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=f92e361842d5251e50562b09664082dcbd0548bb IntelliStation and pSeries docs which refer to the feature: http://ps-2.retropc.se/basil.holloway/ALL%20PDF/380635.pdf http://ps-2.kev009.com/rs6000/manuals/p/p615-6C3-6E3/6C3_and_6E3_Users_Guide_SA38-0629.pdf Signed-off-by: Nathan Lynch Reviewed-by: Tyrel Datwyler Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210920173203.1800475-1-nathanl@linux.ibm.com --- arch/powerpc/configs/ppc64_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/powerpc/platforms/pseries/Kconfig | 4 - arch/powerpc/platforms/pseries/Makefile | 1 - arch/powerpc/platforms/pseries/scanlog.c | 195 ----------------------- 5 files changed, 202 deletions(-) delete mode 100644 arch/powerpc/platforms/pseries/scanlog.c diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 203d0b7f0bb8..c8b0e80d613b 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -26,7 +26,6 @@ CONFIG_PPC64=y CONFIG_NR_CPUS=2048 CONFIG_PPC_SPLPAR=y CONFIG_DTL=y -CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y CONFIG_PPC_SVM=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index de7641adb899..243076f3e1a9 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -38,7 +38,6 @@ CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_PARTITION_ADVANCED=y CONFIG_PPC_SPLPAR=y CONFIG_DTL=y -CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y CONFIG_PAPR_SCM=m diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 2e57391e0778..9bd542164128 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -61,10 +61,6 @@ config PSERIES_ENERGY Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint -config SCANLOG - tristate "Scanlog dump interface" - depends on RTAS_PROC && PPC_PSERIES - config IO_EVENT_IRQ bool "IO Event Interrupt support" depends on PPC_PSERIES diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 41d8aee98da4..ee60b59024b4 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -8,7 +8,6 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SCANLOG) += scanlog.o obj-$(CONFIG_KEXEC_CORE) += kexec.o obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c deleted file mode 100644 index 2879c4f0ceb7..000000000000 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * c 2001 PPC 64 Team, IBM Corp - * - * scan-log-data driver for PPC64 Todd Inglett - * - * When ppc64 hardware fails the service processor dumps internal state - * of the system. After a reboot the operating system can access a dump - * of this data using this driver. A dump exists if the device-tree - * /chosen/ibm,scan-log-data property exists. 
- * - * This driver exports /proc/powerpc/scan-log-dump which can be read. - * The driver supports only sequential reads. - * - * The driver looks at a write to the driver for the single word "reset". - * If given, the driver will reset the scanlog so the platform can free it. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_VERS "1.0" -#define MODULE_NAME "scanlog" - -/* Status returns from ibm,scan-log-dump */ -#define SCANLOG_COMPLETE 0 -#define SCANLOG_HWERROR -1 -#define SCANLOG_CONTINUE 1 - - -static unsigned int ibm_scan_log_dump; /* RTAS token */ -static unsigned int *scanlog_buffer; /* The data buffer */ - -static ssize_t scanlog_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - unsigned int *data = scanlog_buffer; - int status; - unsigned long len, off; - unsigned int wait_time; - - if (count > RTAS_DATA_BUF_SIZE) - count = RTAS_DATA_BUF_SIZE; - - if (count < 1024) { - /* This is the min supported by this RTAS call. Rather - * than do all the buffering we insist the user code handle - * larger reads. As long as cp works... :) - */ - printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count); - return -EINVAL; - } - - if (!access_ok(buf, count)) - return -EFAULT; - - for (;;) { - wait_time = 500; /* default wait if no data */ - spin_lock(&rtas_data_buf_lock); - memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE); - status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, - (u32) __pa(rtas_data_buf), (u32) count); - memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE); - spin_unlock(&rtas_data_buf_lock); - - pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \ - "data[2]=%x\n", status, data[0], data[1], data[2]); - switch (status) { - case SCANLOG_COMPLETE: - pr_debug("scanlog: hit eof\n"); - return 0; - case SCANLOG_HWERROR: - pr_debug("scanlog: hardware error reading data\n"); - return -EIO; - case SCANLOG_CONTINUE: - /* We may or may not have data yet */ - len = data[1]; - off = data[2]; - if (len > 0) { - if (copy_to_user(buf, ((char *)data)+off, len)) - return -EFAULT; - return len; - } - /* Break to sleep default time */ - break; - default: - /* Assume extended busy */ - wait_time = rtas_busy_delay_time(status); - if (!wait_time) { - printk(KERN_ERR "scanlog: unknown error " \ - "from rtas: %d\n", status); - return -EIO; - } - } - /* Apparently no data yet. Wait and try again. 
*/ - msleep_interruptible(wait_time); - } - /*NOTREACHED*/ -} - -static ssize_t scanlog_write(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - char stkbuf[20]; - int status; - - if (count > 19) count = 19; - if (copy_from_user (stkbuf, buf, count)) { - return -EFAULT; - } - stkbuf[count] = 0; - - if (buf) { - if (strncmp(stkbuf, "reset", 5) == 0) { - pr_debug("scanlog: reset scanlog\n"); - status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0); - pr_debug("scanlog: rtas returns %d\n", status); - } - } - return count; -} - -static int scanlog_open(struct inode * inode, struct file * file) -{ - unsigned int *data = scanlog_buffer; - - if (data[0] != 0) { - /* This imperfect test stops a second copy of the - * data (or a reset while data is being copied) - */ - return -EBUSY; - } - - data[0] = 0; /* re-init so we restart the scan */ - - return 0; -} - -static int scanlog_release(struct inode * inode, struct file * file) -{ - unsigned int *data = scanlog_buffer; - - data[0] = 0; - return 0; -} - -static const struct proc_ops scanlog_proc_ops = { - .proc_read = scanlog_read, - .proc_write = scanlog_write, - .proc_open = scanlog_open, - .proc_release = scanlog_release, - .proc_lseek = noop_llseek, -}; - -static int __init scanlog_init(void) -{ - struct proc_dir_entry *ent; - int err = -ENOMEM; - - ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); - if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) - return -ENODEV; - - /* Ideally we could allocate a buffer < 4G */ - scanlog_buffer = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); - if (!scanlog_buffer) - goto err; - - ent = proc_create("powerpc/rtas/scan-log-dump", 0400, NULL, - &scanlog_proc_ops); - if (!ent) - goto err; - return 0; -err: - kfree(scanlog_buffer); - return err; -} - -static void __exit scanlog_cleanup(void) -{ - remove_proc_entry("powerpc/rtas/scan-log-dump", NULL); - kfree(scanlog_buffer); -} - -module_init(scanlog_init); -module_exit(scanlog_cleanup); -MODULE_LICENSE("GPL"); From 38f7b7067dae0c101be573106018e8af22a90fdf Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 17 Nov 2021 00:02:58 -0600 Subject: [PATCH 083/240] powerpc/rtas: rtas_busy_delay() improvements Generally RTAS cannot block, and in PAPR it is required to return control to the OS within a few tens of microseconds. In order to support operations which may take longer to complete, many RTAS primitives can return intermediate -2 ("busy") or 990x ("extended delay") values, which indicate that the OS should reattempt the same call with the same arguments at some point in the future. Current versions of PAPR are less than clear about this, but the intended meanings of these values in more detail are: RTAS_BUSY (-2): RTAS has suspended a potentially long-running operation in order to meet its latency obligation and give the OS the opportunity to perform other work. RTAS can resume making progress as soon as the OS reattempts the call. RTAS_EXTENDED_DELAY_{MIN...MAX} (9900-9905): RTAS must wait for an external event to occur or for internal contention to resolve before it can complete the requested operation. The value encodes a non-binding hint as to roughly how long the OS should wait before calling again, but the OS is allowed to reattempt the call sooner or even immediately. Linux of course must take its own CPU scheduling obligations into account when handling these statuses; e.g. a task which receives an RTAS_BUSY status should check whether to reschedule before it attempts the RTAS call again to avoid starving other tasks. 
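To make these semantics concrete, an open-coded consumer of these statuses might look like the sketch below. This is illustrative only: example_handle_rtas_status() is a hypothetical name, while RTAS_BUSY, RTAS_EXTENDED_DELAY_MIN/MAX, rtas_busy_delay_time(), msleep() and cond_resched() are the existing kernel facilities. The patch below centralizes exactly this policy in rtas_busy_delay():

	/* Sketch: handle one RTAS status, returning true if the caller
	 * should retry the call. */
	static bool example_handle_rtas_status(int status)
	{
		switch (status) {
		case RTAS_BUSY:
			cond_resched();	/* yield if needed, retry at once */
			return true;
		case RTAS_EXTENDED_DELAY_MIN ... RTAS_EXTENDED_DELAY_MAX:
			/* sleep roughly as long as the status hints */
			msleep(rtas_busy_delay_time(status));
			return true;
		default:
			return false;	/* done, or a real error */
		}
	}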
rtas_busy_delay() is a helper function that "consumes" a busy or extended delay status. Common usage:

	int rc;

	do {
		rc = rtas_call(rtas_token("some-function"), ...);
	} while (rtas_busy_delay(rc));

	/* convert rc to Linux error value, etc */

If rc is a busy or extended delay status, the caller can rely on rtas_busy_delay() to perform an appropriate sleep or reschedule and return nonzero. Other statuses are handled normally by the caller.

The current implementation of rtas_busy_delay() both oversleeps and overuses the CPU:

* It performs msleep() for all 990x statuses, and even when no delay is suggested (-2); in practice msleep() is understood to sleep for a minimum of two jiffies (20ms with HZ=100). 9900 (1ms) and 9901 (10ms) appear to be the most common extended delay statuses, and the oversleeping measurably lengthens DLPAR operations, which perform many RTAS calls.

* It does not sleep on 990x unless need_resched() is true, causing code like the loop above to needlessly retry, wasting CPU time.

Alter the logic to align better with the intended meanings:

* When passed RTAS_BUSY, perform cond_resched() and return without sleeping. The caller should reattempt the call immediately.

* Always sleep when passed an extended delay status, using usleep_range() for precise shorter sleeps. Limit the sleep time to one second even though there are higher architected values.

Change rtas_busy_delay()'s return type to bool to better reflect its usage, and add kernel-doc.

rtas_busy_delay_time() is unchanged, even though it "incorrectly" returns 1 for RTAS_BUSY. There are users of that API with open-coded delay loops in sensitive contexts that will have to be taken on an individual basis.

Brief results for addition and removal of 5GB memory on a small P9 PowerVM partition follow. Load was generated with stress-ng --cpu N. For add, elapsed time is greatly reduced without significant change in the number of RTAS calls or time spent on CPU. For remove, elapsed time is modestly reduced, with significant reductions in RTAS calls and time spent on CPU.
With no competing workload (- before, + after): Performance counter stats for 'bash -c echo "memory add count 20" > /sys/kernel/dlpar' (10 runs): - 1,935 probe:rtas_call # 0.003 M/sec ( +- 0.22% ) - 609.99 msec task-clock # 0.183 CPUs utilized ( +- 0.19% ) + 1,956 probe:rtas_call # 0.003 M/sec ( +- 0.17% ) + 618.56 msec task-clock # 0.278 CPUs utilized ( +- 0.11% ) - 3.3322 +- 0.0670 seconds time elapsed ( +- 2.01% ) + 2.2222 +- 0.0416 seconds time elapsed ( +- 1.87% ) Performance counter stats for 'bash -c echo "memory remove count 20" > /sys/kernel/dlpar' (10 runs): - 6,224 probe:rtas_call # 0.008 M/sec ( +- 2.57% ) - 750.36 msec task-clock # 0.190 CPUs utilized ( +- 2.01% ) + 843 probe:rtas_call # 0.003 M/sec ( +- 0.12% ) + 250.66 msec task-clock # 0.068 CPUs utilized ( +- 0.17% ) - 3.9394 +- 0.0890 seconds time elapsed ( +- 2.26% ) + 3.678 +- 0.113 seconds time elapsed ( +- 3.07% ) With all CPUs 100% busy (- before, + after): Performance counter stats for 'bash -c echo "memory add count 20" > /sys/kernel/dlpar' (10 runs): - 2,979 probe:rtas_call # 0.003 M/sec ( +- 0.12% ) - 1,096.62 msec task-clock # 0.105 CPUs utilized ( +- 0.10% ) + 2,981 probe:rtas_call # 0.003 M/sec ( +- 0.22% ) + 1,095.26 msec task-clock # 0.154 CPUs utilized ( +- 0.21% ) - 10.476 +- 0.104 seconds time elapsed ( +- 1.00% ) + 7.1124 +- 0.0865 seconds time elapsed ( +- 1.22% ) Performance counter stats for 'bash -c echo "memory remove count 20" > /sys/kernel/dlpar' (10 runs): - 2,702 probe:rtas_call # 0.004 M/sec ( +- 4.00% ) - 722.71 msec task-clock # 0.067 CPUs utilized ( +- 2.41% ) + 1,246 probe:rtas_call # 0.003 M/sec ( +- 0.25% ) + 487.73 msec task-clock # 0.049 CPUs utilized ( +- 0.20% ) - 10.829 +- 0.163 seconds time elapsed ( +- 1.51% ) + 9.9887 +- 0.0866 seconds time elapsed ( +- 0.87% ) Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211117060259.957178-2-nathanl@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 2 +- arch/powerpc/kernel/rtas.c | 74 +++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 9dc97d2f9d27..82e5b055fa2a 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -264,7 +264,7 @@ extern void rtas_get_rtc_time(struct rtc_time *rtc_time); extern int rtas_set_rtc_time(struct rtc_time *rtc_time); extern unsigned int rtas_busy_delay_time(int status); -extern unsigned int rtas_busy_delay(int status); +bool rtas_busy_delay(int status); extern int early_init_dt_scan_rtas(unsigned long node, const char *uname, int depth, void *data); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index ca27421f471a..048599a68834 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -513,17 +513,77 @@ unsigned int rtas_busy_delay_time(int status) } EXPORT_SYMBOL(rtas_busy_delay_time); -/* For an RTAS busy status code, perform the hinted delay. */ -unsigned int rtas_busy_delay(int status) +/** + * rtas_busy_delay() - helper for RTAS busy and extended delay statuses + * + * @status: a value returned from rtas_call() or similar APIs which return + * the status of a RTAS function call. + * + * Context: Process context. May sleep or schedule. + * + * Return: + * * true - @status is RTAS_BUSY or an extended delay hint. The + * caller may assume that the CPU has been yielded if necessary, + * and that an appropriate delay for @status has elapsed. 
+ * Generally the caller should reattempt the RTAS call which + * yielded @status. + * + * * false - @status is not @RTAS_BUSY nor an extended delay hint. The + * caller is responsible for handling @status. + */ +bool rtas_busy_delay(int status) { unsigned int ms; + bool ret; - might_sleep(); - ms = rtas_busy_delay_time(status); - if (ms && need_resched()) - msleep(ms); + switch (status) { + case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX: + ret = true; + ms = rtas_busy_delay_time(status); + /* + * The extended delay hint can be as high as 100 seconds. + * Surely any function returning such a status is either + * buggy or isn't going to be significantly slowed by us + * polling at 1HZ. Clamp the sleep time to one second. + */ + ms = clamp(ms, 1U, 1000U); + /* + * The delay hint is an order-of-magnitude suggestion, not + * a minimum. It is fine, possibly even advantageous, for + * us to pause for less time than hinted. For small values, + * use usleep_range() to ensure we don't sleep much longer + * than actually needed. + * + * See Documentation/timers/timers-howto.rst for + * explanation of the threshold used here. In effect we use + * usleep_range() for 9900 and 9901, msleep() for + * 9902-9905. + */ + if (ms <= 20) + usleep_range(ms * 100, ms * 1000); + else + msleep(ms); + break; + case RTAS_BUSY: + ret = true; + /* + * We should call again immediately if there's no other + * work to do. + */ + cond_resched(); + break; + default: + ret = false; + /* + * Not a busy or extended delay status; the caller should + * handle @status itself. Ensure we warn on misuses in + * atomic context regardless. + */ + might_sleep(); + break; + } - return ms; + return ret; } EXPORT_SYMBOL(rtas_busy_delay); From dd5cde457a5eb77088d1d9eecface47c0563cd43 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 17 Nov 2021 00:02:59 -0600 Subject: [PATCH 084/240] powerpc/rtas: rtas_busy_delay_time() kernel-doc Provide API documentation for rtas_busy_delay_time(), explaining why we return the same value for 9900 and -2. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211117060259.957178-3-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 048599a68834..733e6ef36758 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -492,8 +492,25 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) } EXPORT_SYMBOL(rtas_call); -/* For RTAS_BUSY (-2), delay for 1 millisecond. For an extended busy status - * code of 990n, perform the hinted delay of 10^n (last digit) milliseconds. +/** + * rtas_busy_delay_time() - From an RTAS status value, calculate the + * suggested delay time in milliseconds. + * + * @status: a value returned from rtas_call() or similar APIs which return + * the status of a RTAS function call. + * + * Context: Any context. + * + * Return: + * * 100000 - If @status is 9905. + * * 10000 - If @status is 9904. + * * 1000 - If @status is 9903. + * * 100 - If @status is 9902. + * * 10 - If @status is 9901. + * * 1 - If @status is either 9900 or -2. This is "wrong" for -2, but + * some callers depend on this behavior, and the worst outcome + * is that they will delay for longer than necessary. + * * 0 - If @status is not a busy or extended delay value. 
*/ unsigned int rtas_busy_delay_time(int status) { From 869fb7e5aecbc163003f93f36dcc26d0554319f6 Mon Sep 17 00:00:00 2001 From: Peiwei Hu Date: Fri, 19 Nov 2021 17:12:18 +0800 Subject: [PATCH 085/240] powerpc/prom_init: Fix improper check of prom_getprop() prom_getprop() can return PROM_ERROR. Binary operator can not identify it. Fixes: 94d2dde738a5 ("[POWERPC] Efika: prune fixups and make them more carefull") Signed-off-by: Peiwei Hu Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/tencent_BA28CC6897B7C95A92EB8C580B5D18589105@qq.com --- arch/powerpc/kernel/prom_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 18b04b08b983..f845065c860e 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2991,7 +2991,7 @@ static void __init fixup_device_tree_efika_add_phy(void) /* Check if the phy-handle property exists - bail if it does */ rv = prom_getprop(node, "phy-handle", prop, sizeof(prop)); - if (!rv) + if (rv <= 0) return; /* From 5dad4ba68a2483fc80d70b9dc90bbe16e1f27263 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Nov 2021 12:50:53 +1000 Subject: [PATCH 086/240] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race It is possible for all CPUs to miss the pending cpumask becoming clear, and then nobody resetting it, which will cause the lockup detector to stop working. It will eventually expire, but watchdog_smp_panic will avoid doing anything if the pending mask is clear and it will never be reset. Order the cpumask clear vs the subsequent test to close this race. Add an extra check for an empty pending mask when the watchdog fires and finds its bit still clear, to try to catch any other possible races or bugs here and keep the watchdog working. The extra test in arch_touch_nmi_watchdog is required to prevent the new warning from firing off. Signed-off-by: Nicholas Piggin Reviewed-by: Laurent Dufour Debugged-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211110025056.2084347-2-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 41 +++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 3fa6d240bade..ad94a2c6b733 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -135,6 +135,10 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) { cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); + /* + * See wd_smp_clear_cpu_pending() + */ + smp_mb(); if (cpumask_empty(&wd_smp_cpus_pending)) { wd_smp_last_reset_tb = tb; cpumask_andnot(&wd_smp_cpus_pending, @@ -221,13 +225,44 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); wd_smp_unlock(&flags); + } else { + /* + * The last CPU to clear pending should have reset the + * watchdog so we generally should not find it empty + * here if our CPU was clear. However it could happen + * due to a rare race with another CPU taking the + * last CPU out of the mask concurrently. + * + * We can't add a warning for it. But just in case + * there is a problem with the watchdog that is causing + * the mask to not be reset, try to kick it along here. 
+ */ + if (unlikely(cpumask_empty(&wd_smp_cpus_pending))) + goto none_pending; } return; } + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + + /* + * Order the store to clear pending with the load(s) to check all + * words in the pending mask to check they are all empty. This orders + * with the same barrier on another CPU. This prevents two CPUs + * clearing the last 2 pending bits, but neither seeing the other's + * store when checking if the mask is empty, and missing an empty + * mask, which ends with a false positive. + */ + smp_mb(); if (cpumask_empty(&wd_smp_cpus_pending)) { unsigned long flags; +none_pending: + /* + * Double check under lock because more than one CPU could see + * a clear mask with the lockless check after clearing their + * pending bits. + */ wd_smp_lock(&flags); if (cpumask_empty(&wd_smp_cpus_pending)) { wd_smp_last_reset_tb = tb; @@ -318,8 +353,12 @@ void arch_touch_nmi_watchdog(void) { unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); - u64 tb = get_tb(); + u64 tb; + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return; + + tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { per_cpu(wd_timer_tb, cpu) = tb; wd_smp_clear_cpu_pending(cpu, tb); From 858c93c31504ac1507084493d7eafbe7e2302dc2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Nov 2021 12:50:54 +1000 Subject: [PATCH 087/240] powerpc/watchdog: tighten non-atomic read-modify-write access Most updates to wd_smp_cpus_pending are under lock except the watchdog interrupt bit clear. This can race with non-atomic RMW updates to the mask under lock, which can happen in two instances: Firstly, if another CPU detects this one is stuck, removes it from the mask, mask becomes empty and is re-filled with non-atomic stores. This is okay because it would re-fill the mask with this CPU's bit clear anyway (because this CPU is now stuck), so it doesn't matter that the bit clear update got "lost". Add a comment for this. Secondly, if another CPU detects a different CPU is stuck and removes it from the pending mask with a non-atomic store to bytes which also include the bit of this CPU. This case can result in the bit clear being lost and the end result being the bit is set. This should be so rare it hardly matters, but to make things simpler to reason about just avoid the non-atomic access for that case. 
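A sketch of that second case, to illustrate the hazard (pending_word stands in for one word of wd_smp_cpus_pending, and both helper names are hypothetical, not from the patch):

	static unsigned long pending_word;	/* one word of the pending mask */

	/* CPU A clears its own bit: atomic, cannot itself be lost. */
	static void cpu_a_clear_self(int a)
	{
		clear_bit(a, &pending_word);
	}

	/* CPU B removes a different, stuck CPU with a plain RMW store.
	 * If B reads the word before A's clear_bit() and stores after it,
	 * B's store rewrites bit A as set, and A's clear is lost. */
	static void cpu_b_remove_stuck(int b)
	{
		pending_word &= ~BIT(b);	/* non-atomic read-modify-write */
	}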
Signed-off-by: Nicholas Piggin Reviewed-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211110025056.2084347-3-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 36 ++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index ad94a2c6b733..588f54350d19 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -131,10 +131,10 @@ static void wd_lockup_ipi(struct pt_regs *regs) /* Do not panic from here because that can recurse into NMI IPI layer */ } -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) +static bool set_cpu_stuck(int cpu, u64 tb) { - cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); - cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); /* * See wd_smp_clear_cpu_pending() */ @@ -144,11 +144,9 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) cpumask_andnot(&wd_smp_cpus_pending, &wd_cpus_enabled, &wd_smp_cpus_stuck); + return true; } -} -static void set_cpu_stuck(int cpu, u64 tb) -{ - set_cpumask_stuck(cpumask_of(cpu), tb); + return false; } static void watchdog_smp_panic(int cpu, u64 tb) @@ -177,15 +175,17 @@ static void watchdog_smp_panic(int cpu, u64 tb) * get a backtrace on all of them anyway. */ for_each_cpu(c, &wd_smp_cpus_pending) { + bool empty; if (c == cpu) continue; + /* Take the stuck CPUs out of the watch group */ + empty = set_cpu_stuck(c, tb); smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + if (empty) + break; } } - /* Take the stuck CPUs out of the watch group */ - set_cpumask_stuck(&wd_smp_cpus_pending, tb); - wd_smp_unlock(&flags); if (sysctl_hardlockup_all_cpu_backtrace) @@ -243,6 +243,22 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) return; } + /* + * All other updates to wd_smp_cpus_pending are performed under + * wd_smp_lock. All of them are atomic except the case where the + * mask becomes empty and is reset. This will not happen here because + * cpu was tested to be in the bitmap (above), and a CPU only clears + * its own bit. _Except_ in the case where another CPU has detected a + * hard lockup on our CPU and takes us out of the pending mask. So in + * normal operation there will be no race here, no problem. + * + * In the lockup case, this atomic clear-bit vs a store that refills + * other bits in the accessed word will not be a problem. The bit clear + * is atomic so it will not cause the store to get lost, and the store + * will never set this bit so it will not overwrite the bit clear. The + * only way for a stuck CPU to return to the pending bitmap is to + * become unstuck itself.
+ */ cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); From 76521c4b0291ad25723638ade5a0ff4d5f659771 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Nov 2021 12:50:55 +1000 Subject: [PATCH 088/240] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi

There is a deadlock with the console_owner lock and the wd_smp_lock:

  CPU x takes the console_owner lock
  CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
  CPU x takes a soft-NMI interrupt, detects deadlock, spins on __wd_smp_lock
  CPU y detects deadlock, tries to print something and spins on console_owner
  -> deadlock

Change the watchdog locking scheme so wd_smp_lock protects the watchdog internal data, but "reporting" (printing, issuing NMI IPIs, taking any action outside of watchdog) uses a non-waiting exclusion. If a CPU detects a problem but can not take the reporting lock, it just returns because something else is already reporting. It will try again at some point.

Typically hard lockup watchdog report usefulness is not impacted by a failure to spew a large enough amount of data in as short a time as possible, but by messages getting garbled.

Laurent debugged this and found the deadlock, and this patch is based on his general approach of avoiding expensive operations while holding the lock, with the addition of the reporting exclusion.

Signed-off-by: Laurent Dufour [np: rework to add reporting exclusion, update changelog] Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211110025056.2084347-4-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 93 +++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 588f54350d19..af16a835ddec 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -85,10 +85,36 @@ static DEFINE_PER_CPU(u64, wd_timer_tb); /* SMP checker bits */ static unsigned long __wd_smp_lock; +static unsigned long __wd_reporting; static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; +/* + * Try to take the exclusive watchdog action / NMI IPI / printing lock. + * wd_smp_lock must be held. If this fails, we should return and wait + * for the watchdog to kick in again (or another CPU to trigger it). + * + * Importantly, if hardlockup_panic is set, wd_try_report failure should + * not delay the panic, because whichever other CPU is reporting will + * call panic. + */ +static bool wd_try_report(void) +{ + if (__wd_reporting) + return false; + __wd_reporting = 1; + return true; +} + +/* End printing after successful wd_try_report. wd_smp_lock not required.
*/ +static void wd_end_reporting(void) +{ + smp_mb(); /* End printing "critical section" */ + WARN_ON_ONCE(__wd_reporting == 0); + WRITE_ONCE(__wd_reporting, 0); +} + static inline void wd_smp_lock(unsigned long *flags) { /* @@ -151,6 +177,7 @@ static bool set_cpu_stuck(int cpu, u64 tb) static void watchdog_smp_panic(int cpu, u64 tb) { + static cpumask_t wd_smp_cpus_ipi; // protected by reporting unsigned long flags; int c; @@ -160,11 +187,26 @@ static void watchdog_smp_panic(int cpu, u64 tb) goto out; if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) goto out; - if (cpumask_weight(&wd_smp_cpus_pending) == 0) + if (!wd_try_report()) goto out; + for_each_online_cpu(c) { + if (!cpumask_test_cpu(c, &wd_smp_cpus_pending)) + continue; + if (c == cpu) + continue; // should not happen + + __cpumask_set_cpu(c, &wd_smp_cpus_ipi); + if (set_cpu_stuck(c, tb)) + break; + } + if (cpumask_empty(&wd_smp_cpus_ipi)) { + wd_end_reporting(); + goto out; + } + wd_smp_unlock(&flags); pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n", - cpu, cpumask_pr_args(&wd_smp_cpus_pending)); + cpu, cpumask_pr_args(&wd_smp_cpus_ipi)); pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n", cpu, tb, wd_smp_last_reset_tb, tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000); @@ -174,22 +216,14 @@ static void watchdog_smp_panic(int cpu, u64 tb) * Try to trigger the stuck CPUs, unless we are going to * get a backtrace on all of them anyway. */ - for_each_cpu(c, &wd_smp_cpus_pending) { - bool empty; - if (c == cpu) - continue; - /* Take the stuck CPUs out of the watch group */ - empty = set_cpu_stuck(c, tb); + for_each_cpu(c, &wd_smp_cpus_ipi) { smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); - if (empty) - break; + __cpumask_clear_cpu(c, &wd_smp_cpus_ipi); } - } - - wd_smp_unlock(&flags); - - if (sysctl_hardlockup_all_cpu_backtrace) + } else { trigger_allbutself_cpu_backtrace(); + cpumask_clear(&wd_smp_cpus_ipi); + } /* * Force flush any remote buffers that might be stuck in IRQ context @@ -200,6 +234,8 @@ static void watchdog_smp_panic(int cpu, u64 tb) if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); + wd_end_reporting(); + return; out: @@ -213,8 +249,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) struct pt_regs *regs = get_irq_regs(); unsigned long flags; - wd_smp_lock(&flags); - pr_emerg("CPU %d became unstuck TB:%lld\n", cpu, tb); print_irqtrace_events(current); @@ -223,6 +257,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) else dump_stack(); + wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); wd_smp_unlock(&flags); } else { @@ -318,13 +353,28 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { + /* + * Taking wd_smp_lock here means it is a soft-NMI lock, which + * means we can't take any regular or irqsafe spin locks while + * holding this lock. This is why timers can't printk while + * holding the lock. 
+ */ wd_smp_lock(&flags); if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) { wd_smp_unlock(&flags); return 0; } + if (!wd_try_report()) { + wd_smp_unlock(&flags); + /* Couldn't report, try again in 100ms */ + mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000); + return 0; + } + set_cpu_stuck(cpu, tb); + wd_smp_unlock(&flags); + pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n", cpu, (void *)regs->nip); pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n", @@ -334,14 +384,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) print_irqtrace_events(current); show_regs(regs); - wd_smp_unlock(&flags); - if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + + wd_end_reporting(); } + /* + * We are okay to change DEC in soft_nmi_interrupt because the masked + * handler has marked a DEC as pending, so the timer interrupt will be + * replayed as soon as local irqs are enabled again. + */ if (wd_panic_timeout_tb < 0x7fffffff) mtspr(SPRN_DEC, wd_panic_timeout_tb); From 1f01bf90765fa5f88fbae452c131c1edf5cda7ba Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Nov 2021 12:50:56 +1000 Subject: [PATCH 089/240] powerpc/watchdog: read TB close to where it is used When taking watchdog actions, printing messages, comparing and re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use and under wd_smp_lock or printing lock (if applicable). This should keep timebase mostly monotonic with kernel log messages, and could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to something a long way in the past, and causing other CPUs to appear to be stuck. These additional TB reads are all slowpath (lockup has been detected), so performance does not matter. 
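The shape of the change, as a minimal sketch (assuming the watchdog's internal wd_smp_lock()/wd_smp_unlock() and wd_smp_last_reset_tb from the patches above; the two function names are illustrative, not from the patch):

	/* Before: tb captured by the caller, possibly long ago. */
	static void reset_pending_stale(u64 tb)
	{
		unsigned long flags;

		wd_smp_lock(&flags);
		wd_smp_last_reset_tb = tb;	/* may already be stale */
		wd_smp_unlock(&flags);
	}

	/* After: read the timebase under the lock, at the point of use. */
	static void reset_pending_fresh(void)
	{
		unsigned long flags;

		wd_smp_lock(&flags);
		wd_smp_last_reset_tb = get_tb();
		wd_smp_unlock(&flags);
	}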
Signed-off-by: Nicholas Piggin Reviewed-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211110025056.2084347-5-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index af16a835ddec..b6533539386b 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -157,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs) /* Do not panic from here because that can recurse into NMI IPI layer */ } -static bool set_cpu_stuck(int cpu, u64 tb) +static bool set_cpu_stuck(int cpu) { cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); @@ -166,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb) */ smp_mb(); if (cpumask_empty(&wd_smp_cpus_pending)) { - wd_smp_last_reset_tb = tb; + wd_smp_last_reset_tb = get_tb(); cpumask_andnot(&wd_smp_cpus_pending, &wd_cpus_enabled, &wd_smp_cpus_stuck); @@ -175,14 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb) return false; } -static void watchdog_smp_panic(int cpu, u64 tb) +static void watchdog_smp_panic(int cpu) { static cpumask_t wd_smp_cpus_ipi; // protected by reporting unsigned long flags; + u64 tb; int c; wd_smp_lock(&flags); /* Double check some things under lock */ + tb = get_tb(); if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) goto out; if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) @@ -196,7 +198,7 @@ static void watchdog_smp_panic(int cpu, u64 tb) continue; // should not happen __cpumask_set_cpu(c, &wd_smp_cpus_ipi); - if (set_cpu_stuck(c, tb)) + if (set_cpu_stuck(c)) break; } if (cpumask_empty(&wd_smp_cpus_ipi)) { @@ -242,7 +244,7 @@ out: wd_smp_unlock(&flags); } -static void wd_smp_clear_cpu_pending(int cpu, u64 tb) +static void wd_smp_clear_cpu_pending(int cpu) { if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { @@ -250,7 +252,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb) unsigned long flags; pr_emerg("CPU %d became unstuck TB:%lld\n", - cpu, tb); + cpu, get_tb()); print_irqtrace_events(current); if (regs) show_regs(regs); @@ -316,7 +318,7 @@ none_pending: */ wd_smp_lock(&flags); if (cpumask_empty(&wd_smp_cpus_pending)) { - wd_smp_last_reset_tb = tb; + wd_smp_last_reset_tb = get_tb(); cpumask_andnot(&wd_smp_cpus_pending, &wd_cpus_enabled, &wd_smp_cpus_stuck); @@ -331,10 +333,10 @@ static void watchdog_timer_interrupt(int cpu) per_cpu(wd_timer_tb, cpu) = tb; - wd_smp_clear_cpu_pending(cpu, tb); + wd_smp_clear_cpu_pending(cpu); if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) - watchdog_smp_panic(cpu, tb); + watchdog_smp_panic(cpu); } DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) @@ -371,7 +373,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) return 0; } - set_cpu_stuck(cpu, tb); + set_cpu_stuck(cpu); wd_smp_unlock(&flags); @@ -432,7 +434,7 @@ void arch_touch_nmi_watchdog(void) tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { per_cpu(wd_timer_tb, cpu) = tb; - wd_smp_clear_cpu_pending(cpu, tb); + wd_smp_clear_cpu_pending(cpu); } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); @@ -490,7 +492,7 @@ static void stop_watchdog(void *arg) cpumask_clear_cpu(cpu, &wd_cpus_enabled); wd_smp_unlock(&flags); - wd_smp_clear_cpu_pending(cpu, get_tb()); + wd_smp_clear_cpu_pending(cpu); } static int stop_watchdog_on_cpu(unsigned int cpu) From 
4afc78eae10cd74c5a0b70822b9754d1d094c5d6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Nov 2021 11:44:15 +1100 Subject: [PATCH 090/240] powerpc/microwatt: Make microwatt_get_random_darn() static Make microwatt_get_random_darn() static, because it can be. Reported-by: kernel test robot Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211118004415.1706863-1-mpe@ellerman.id.au --- arch/powerpc/platforms/microwatt/rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/microwatt/rng.c b/arch/powerpc/platforms/microwatt/rng.c index 3d8ee6eb7dad..7bc4d1cbfaf0 100644 --- a/arch/powerpc/platforms/microwatt/rng.c +++ b/arch/powerpc/platforms/microwatt/rng.c @@ -14,7 +14,7 @@ #define DARN_ERR 0xFFFFFFFFFFFFFFFFul -int microwatt_get_random_darn(unsigned long *v) +static int microwatt_get_random_darn(unsigned long *v) { unsigned long val; From 3d030e301856da366380b3865fce6c03037b08a6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 25 Nov 2021 20:33:46 +1000 Subject: [PATCH 091/240] powerpc/watchdog: Fix wd_smp_last_reset_tb reporting wd_smp_last_reset_tb now gets reset by watchdog_smp_panic() as part of marking CPUs stuck and removing them from the pending mask before it begins any printing. This causes last reset times reported to be off. Fix this by reading it into a local variable before it gets reset. Fixes: 76521c4b0291 ("powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211125103346.1188958-1-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index b6533539386b..23745af38d62 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -179,13 +179,14 @@ static void watchdog_smp_panic(int cpu) { static cpumask_t wd_smp_cpus_ipi; // protected by reporting unsigned long flags; - u64 tb; + u64 tb, last_reset; int c; wd_smp_lock(&flags); /* Double check some things under lock */ tb = get_tb(); - if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) + last_reset = wd_smp_last_reset_tb; + if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb) goto out; if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) goto out; @@ -210,8 +211,7 @@ static void watchdog_smp_panic(int cpu) pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_ipi)); pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n", - cpu, tb, wd_smp_last_reset_tb, - tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000); + cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000); if (!sysctl_hardlockup_all_cpu_backtrace) { /* From af3fdce4ab0781ea183107c90de9cbf21d701c54 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 29 Nov 2021 17:41:52 +1100 Subject: [PATCH 092/240] Revert "powerpc/code-patching: Improve verification of patchability" This reverts commit 8b8a8f0ab3f5519e45c526f826a655817486c5bb. As reported[1] by Sachin this causes problems with ftrace, and it also causes the code patching selftests to fail as reported[2] by Stephen. So revert it for now. 
1: https://lore.kernel.org/linuxppc-dev/3668743C-09DF-4673-B15C-2FFE2A57F7D7@linux.vnet.ibm.com/ 2: https://lore.kernel.org/linuxppc-dev/20211126161747.1f7795b0@canb.auug.org.au/ Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/setup.h | 1 + arch/powerpc/lib/code-patching.c | 5 +++-- arch/powerpc/mm/mem.c | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 426a2d8d028f..6c1a7d217d1a 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -9,6 +9,7 @@ extern void ppc_printk_progress(char *s, unsigned short hex); extern unsigned int rtas_data; extern unsigned long long memory_limit; +extern bool init_mem_is_free; extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask); struct device_node; diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 5e2fe133639e..c5ed98823835 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -190,9 +190,10 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr) int patch_instruction(u32 *addr, struct ppc_inst instr) { /* Make sure we aren't patching a freed init section */ - if (!kernel_text_address((unsigned long)addr)) + if (init_mem_is_free && init_section_contains(addr, 4)) { + pr_debug("Skipping init section patching addr: 0x%px\n", addr); return 0; - + } return do_patch_instruction(addr, instr); } NOKPROBE_SYMBOL(patch_instruction); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8e301cd8925b..bd5d91a31183 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -26,6 +26,7 @@ #include unsigned long long memory_limit; +bool init_mem_is_free; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -311,6 +312,7 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); + init_mem_is_free = true; free_initmem_default(POISON_FREE_INITMEM); } From b350111bf7b3f4a780d28c44f18f7c9fcbe6d11b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 5 Nov 2021 13:50:41 +1000 Subject: [PATCH 093/240] powerpc: remove cpu_online_cores_map function This function builds the cores online map with on-stack cpumasks which can cause high stack usage with large NR_CPUS. It is not used in any performance sensitive paths, so instead just check for first thread sibling. Signed-off-by: Nicholas Piggin Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105035042.1398309-1-npiggin@gmail.com --- arch/powerpc/include/asm/cputhreads.h | 33 ----------------------- arch/powerpc/platforms/powernv/idle.c | 10 +++---- arch/powerpc/platforms/powernv/opal-imc.c | 6 ++--- 3 files changed, 8 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index b167186aaee4..f26c430f3982 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -32,44 +32,11 @@ extern cpumask_t threads_core_mask; #define threads_core_mask (*get_cpu_mask(0)) #endif -/* cpu_thread_mask_to_cores - Return a cpumask of one per cores - * hit by the argument - * - * @threads: a cpumask of online threads - * - * This function returns a cpumask which will have one online cpu's - * bit set for each core that has at least one thread set in the argument. 
- * - * This can typically be used for things like IPI for tlb invalidations - * since those need to be done only once per core/TLB - */ -static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads) -{ - cpumask_t tmp, res; - int i, cpu; - - cpumask_clear(&res); - for (i = 0; i < NR_CPUS; i += threads_per_core) { - cpumask_shift_left(&tmp, &threads_core_mask, i); - if (cpumask_intersects(threads, &tmp)) { - cpu = cpumask_next_and(-1, &tmp, cpu_online_mask); - if (cpu < nr_cpu_ids) - cpumask_set_cpu(cpu, &res); - } - } - return res; -} - static inline int cpu_nr_cores(void) { return nr_cpu_ids >> threads_shift; } -static inline cpumask_t cpu_online_cores_map(void) -{ - return cpu_thread_mask_to_cores(cpu_online_mask); -} - #ifdef CONFIG_SMP int cpu_core_index_of_thread(int cpu); int cpu_first_thread_of_core(int core); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 3bc84e2fe064..95458fd9572c 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -146,9 +146,13 @@ EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); static void pnv_fastsleep_workaround_apply(void *info) { + int cpu = smp_processor_id(); int rc; int *err = info; + if (cpu_first_thread_sibling(cpu) != cpu) + return; + rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP, OPAL_CONFIG_IDLE_APPLY); if (rc) @@ -175,7 +179,6 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - cpumask_t primary_thread_mask; int err; u8 val; @@ -200,10 +203,7 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, power7_fastsleep_workaround_exit = false; cpus_read_lock(); - primary_thread_mask = cpu_online_cores_map(); - on_each_cpu_mask(&primary_thread_mask, - pnv_fastsleep_workaround_apply, - &err, 1); + on_each_cpu(pnv_fastsleep_workaround_apply, &err, 1); cpus_read_unlock(); if (err) { pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply"); diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 05d3832019b9..3fea5da6d1b3 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -200,13 +200,13 @@ static void disable_nest_pmu_counters(void) static void disable_core_pmu_counters(void) { - cpumask_t cores_map; int cpu, rc; cpus_read_lock(); /* Disable the IMC Core functions */ - cores_map = cpu_online_cores_map(); - for_each_cpu(cpu, &cores_map) { + for_each_online_cpu(cpu) { + if (cpu_first_thread_sibling(cpu) != cpu) + continue; rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(cpu)); if (rc) From 2eafc4748bc08c5b9b6ee0b5b65ad20b30f7d704 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 5 Nov 2021 13:50:42 +1000 Subject: [PATCH 094/240] powerpc: select CPUMASK_OFFSTACK if NR_CPUS >= 8192 Some core kernel code starts to go beyond the 2048 byte stack size warning at NR_CPUS=8192, so select CPUMASK_OFFSTACK in that case. x86 does similarly for very large NR_CPUS. 
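For scale: with NR_CPUS=8192 a struct cpumask is 8192/8 = 1024 bytes, so a single on-stack mask already consumes half of the 2048-byte frame-size warning budget. With CPUMASK_OFFSTACK=y, cpumask_var_t becomes a pointer and callers allocate the storage instead, roughly as in this sketch (example_use_mask() is an illustrative name; the cpumask API calls are the existing ones):

	static int example_use_mask(void)
	{
		cpumask_var_t mask;

		/* heap-allocated when CPUMASK_OFFSTACK=y, on-stack otherwise */
		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		cpumask_copy(mask, cpu_online_mask);
		/* ... use mask without burning 1KB of stack ... */

		free_cpumask_var(mask);
		return 0;
	}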
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211105035042.1398309-2-npiggin@gmail.com --- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index dea74d7717c0..bb5e44e4d7c7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -165,6 +165,7 @@ config PPC select BINFMT_ELF select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS + select CPUMASK_OFFSTACK if NR_CPUS >= 8192 select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN select DMA_OPS_BYPASS if PPC64 select DMA_OPS if PPC64 From 4ea9e321c27fd531a8dfe0fa1d1b2ee15fc3444e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 24 Nov 2021 20:32:49 +1100 Subject: [PATCH 095/240] powerpc/85xx: Fix no previous prototype warning for mpc85xx_setup_pmc() Fixes the following W=1 warning: arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c:89:12: warning: no previous prototype for 'mpc85xx_setup_pmc' Reported-by: kernel test robot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211124093254.1054750-1-mpe@ellerman.id.au --- arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c index 4a8af80011a6..f7ac92a8ae97 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c @@ -15,6 +15,8 @@ #include #include +#include "smp.h" + static struct ccsr_guts __iomem *guts; #ifdef CONFIG_FSL_PMC From 84a61fb43fdfc528a3a7ff00e0b14ba91f5eb745 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 24 Nov 2021 20:32:50 +1100 Subject: [PATCH 096/240] powerpc/85xx: Make mpc85xx_smp_kexec_cpu_down() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To fix the W=1 warning: arch/powerpc/platforms/85xx/smp.c:369:6: error: no previous prototype for ‘mpc85xx_smp_kexec_cpu_down’ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211124093254.1054750-2-mpe@ellerman.id.au --- arch/powerpc/platforms/85xx/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 83f4a6389a28..0abc1da2c14f 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -366,7 +366,7 @@ struct smp_ops_t smp_85xx_ops = { #ifdef CONFIG_PPC32 atomic_t kexec_down_cpus = ATOMIC_INIT(0); -void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) +static void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) { local_irq_disable(); @@ -384,7 +384,7 @@ static void mpc85xx_smp_kexec_down(void *arg) ppc_md.kexec_cpu_down(0,1); } #else -void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) +static void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) { int cpu = smp_processor_id(); int sibling = cpu_last_thread_sibling(cpu); From d9150d5bb5586dc20b6c424e59d5ea29fe1b3030 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 24 Nov 2021 20:32:51 +1100 Subject: [PATCH 097/240] powerpc/85xx: Make c293_pcie_pic_init() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To fix the W=1 warning: linux/arch/powerpc/platforms/85xx/c293pcie.c:22:13: error: no previous prototype for ‘c293_pcie_pic_init’ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211124093254.1054750-3-mpe@ellerman.id.au --- 
arch/powerpc/platforms/85xx/c293pcie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/85xx/c293pcie.c b/arch/powerpc/platforms/85xx/c293pcie.c index 8d9a2503dd0f..58a398c89e97 100644 --- a/arch/powerpc/platforms/85xx/c293pcie.c +++ b/arch/powerpc/platforms/85xx/c293pcie.c @@ -19,7 +19,7 @@ #include "mpc85xx.h" -void __init c293_pcie_pic_init(void) +static void __init c293_pcie_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | MPIC_SINGLE_DEST_CPU, 0, 256, " OpenPIC "); From ff47a95d1a67477e9bc2049a840d93b68508e079 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 24 Nov 2021 20:32:52 +1100 Subject: [PATCH 098/240] powerpc/mm: Move tlbcam_sz() and make it static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with W=1 we see a warning: linux/arch/powerpc/mm/nohash/fsl_book3e.c:63:15: error: no previous prototype for ‘tlbcam_sz’ tlbcam_sz() is not used outside this file, so we can make it static. However it's only used inside #ifdef CONFIG_PPC32, so move it within that ifdef, otherwise we would get a defined but not used error. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211124093254.1054750-4-mpe@ellerman.id.au --- arch/powerpc/mm/nohash/fsl_book3e.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c index b231a54f540c..7f71bc3bf85f 100644 --- a/arch/powerpc/mm/nohash/fsl_book3e.c +++ b/arch/powerpc/mm/nohash/fsl_book3e.c @@ -60,11 +60,6 @@ struct tlbcamrange { phys_addr_t phys; } tlbcam_addrs[NUM_TLBCAMS]; -unsigned long tlbcam_sz(int idx) -{ - return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; -} - #ifdef CONFIG_FSL_BOOKE /* * Return PA for this VA if it is mapped by a CAM, or 0 @@ -264,6 +259,11 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } +static unsigned long tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + void __init adjust_total_lowmem(void) { unsigned long ram; From a4ac0d249a5db80e79d573db9e4ad29354b643a8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 24 Nov 2021 20:32:53 +1100 Subject: [PATCH 099/240] powerpc/smp: Move setup_profiling_timer() under CONFIG_PROFILING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit setup_profiling_timer() is only needed when CONFIG_PROFILING is enabled. 
Fixes the following W=1 warning when CONFIG_PROFILING=n: linux/arch/powerpc/kernel/smp.c:1638:5: error: no previous prototype for ‘setup_profiling_timer’ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211124093254.1054750-5-mpe@ellerman.id.au --- arch/powerpc/kernel/smp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index c23ee842c4c3..aee3a7119f97 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1635,10 +1635,12 @@ void start_secondary(void *unused) BUG(); } +#ifdef CONFIG_PROFILING int setup_profiling_timer(unsigned int multiplier) { return 0; } +#endif static void fixup_topology(void) { From ab85a273957eadfcf7906bcd8a0adf5909d802ee Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 24 Nov 2021 20:32:54 +1100 Subject: [PATCH 100/240] powerpc: Mark probe_machine() __init and static Prior to commit b1923caa6e64 ("powerpc: Merge 32-bit and 64-bit setup_arch()") probe_machine() was called from setup_32/64.c and lived in setup-common.c. But now it's only called from setup-common.c so it can be static and __init, and we don't need the declaration in machdep.h either. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211124093254.1054750-6-mpe@ellerman.id.au --- arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/kernel/setup-common.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 9c3c9f04129f..e821037f74f0 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -235,8 +235,6 @@ extern struct machdep_calls *machine_id; machine_id == &mach_##name; \ }) -extern void probe_machine(void); - #ifdef CONFIG_PPC_PMAC /* * Power macintoshes have either a CUDA, PMU or SMU controlling diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4f1322b65760..f8da937df918 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -582,7 +582,7 @@ static __init int add_pcspkr(void) device_initcall(add_pcspkr); #endif /* CONFIG_PCSPKR_PLATFORM */ -void probe_machine(void) +static __init void probe_machine(void) { extern struct machdep_calls __machine_desc_start; extern struct machdep_calls __machine_desc_end; From 88670fdb26800228606c078ba4a018e9522a75a8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Oct 2021 14:24:02 +0200 Subject: [PATCH 101/240] powerpc/ftrace: No need to read LR from stack in _mcount()

All functions calling _mcount do it exactly the same way, with the following sequence of instructions:

  c07de788: 7c 08 02 a6	mflr	r0
  c07de78c: 90 01 00 04	stw	r0,4(r1)
  c07de790: 4b 84 13 65	bl	c001faf4 <_mcount>

Although LR is pushed on the stack, it is still in r0 while entering _mcount(). Function arguments are in r3-r10, so r11 and r12 are still available at that point. As on PPC64, use r12 to move LR into CTR, so that r0 is preserved and doesn't need to be restored from the stack.

While at it, bring back the EXPORT_SYMBOL at the end of _mcount.
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/24a3ba7db388537c44a038026f926d885372e6d3.1635423081.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/trace/ftrace_32.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index e023ae59c429..c7d57124cc59 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -14,16 +14,16 @@ _GLOBAL(mcount) _GLOBAL(_mcount) /* * It is required that _mcount on PPC32 must preserve the - * link register. But we have r0 to play with. We use r0 + * link register. But we have r12 to play with. We use r12 * to push the return address back to the caller of mcount * into the ctr register, restore the link register and * then jump back using the ctr register. */ - mflr r0 - mtctr r0 - lwz r0, 4(r1) + mflr r12 + mtctr r12 mtlr r0 bctr +EXPORT_SYMBOL(_mcount) _GLOBAL(ftrace_caller) MCOUNT_SAVE_FRAME @@ -43,7 +43,6 @@ _GLOBAL(ftrace_graph_stub) /* old link register ends up in ctr reg */ bctr -EXPORT_SYMBOL(_mcount) _GLOBAL(ftrace_stub) blr From c93d4f6ecf4b0699d0f2088f7bd9cd09af45d65a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Oct 2021 14:24:03 +0200 Subject: [PATCH 102/240] powerpc/ftrace: Add module_trampoline_target() for PPC32 module_trampoline_target() is used by __ftrace_modify_call(). Implement it for PPC32 so that CONFIG_DYNAMIC_FTRACE_WITH_REGS can be activated on PPC32 as well. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/42345f464fb465f0fc76f3090e250be8fc1729f0.1635423081.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/module_32.c | 25 ++++++++++++++++++++ arch/powerpc/kernel/trace/ftrace.c | 37 ++++-------------------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index f417afc08d33..5dedd76346b2 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -273,6 +273,31 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, } #ifdef CONFIG_DYNAMIC_FTRACE +int module_trampoline_target(struct module *mod, unsigned long addr, + unsigned long *target) +{ + unsigned int jmp[4]; + + /* Find where the trampoline jumps to */ + if (copy_from_kernel_nofault(jmp, (void *)addr, sizeof(jmp))) + return -EFAULT; + + /* verify that this is what we expect it to be */ + if ((jmp[0] & 0xffff0000) != PPC_RAW_LIS(_R12, 0) || + (jmp[1] & 0xffff0000) != PPC_RAW_ADDI(_R12, _R12, 0) || + jmp[2] != PPC_RAW_MTCTR(_R12) || + jmp[3] != PPC_RAW_BCTR()) + return -EINVAL; + + addr = (jmp[1] & 0xffff) | ((jmp[0] & 0xffff) << 16); + if (addr & 0x8000) + addr -= 0x10000; + + *target = addr; + + return 0; +} + int module_finalize_ftrace(struct module *module, const Elf_Shdr *sechdrs) { module->arch.tramp = do_plt_call(module->core_layout.base, diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index d89c5df4f206..c1d54c18e912 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -222,9 +222,8 @@ __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { struct ppc_inst op; - unsigned int jmp[4]; unsigned long ip = rec->ip; - unsigned long tramp; + unsigned long tramp, ptr; if (copy_from_kernel_nofault(&op, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; @@ -238,41 +237,13 @@ __ftrace_make_nop(struct module *mod, /* lets 
find where the pointer goes */ tramp = find_bl_target(ip, op); - /* - * On PPC32 the trampoline looks like: - * 0x3d, 0x80, 0x00, 0x00 lis r12,sym@ha - * 0x39, 0x8c, 0x00, 0x00 addi r12,r12,sym@l - * 0x7d, 0x89, 0x03, 0xa6 mtctr r12 - * 0x4e, 0x80, 0x04, 0x20 bctr - */ - - pr_devel("ip:%lx jumps to %lx", ip, tramp); - /* Find where the trampoline jumps to */ - if (copy_from_kernel_nofault(jmp, (void *)tramp, sizeof(jmp))) { - pr_err("Failed to read %lx\n", tramp); + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); return -EFAULT; } - pr_devel(" %08x %08x ", jmp[0], jmp[1]); - - /* verify that this is what we expect it to be */ - if (((jmp[0] & 0xffff0000) != 0x3d800000) || - ((jmp[1] & 0xffff0000) != 0x398c0000) || - (jmp[2] != 0x7d8903a6) || - (jmp[3] != 0x4e800420)) { - pr_err("Not a trampoline\n"); - return -EINVAL; - } - - tramp = (jmp[1] & 0xffff) | - ((jmp[0] & 0xffff) << 16); - if (tramp & 0x8000) - tramp -= 0x10000; - - pr_devel(" %lx ", tramp); - - if (tramp != addr) { + if (ptr != addr) { pr_err("Trampoline location %08lx does not match addr\n", tramp); return -EINVAL; From 7dfbfb87c243cf08bc2b9cc23699ac207b726458 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Oct 2021 14:24:04 +0200 Subject: [PATCH 103/240] powerpc/ftrace: Activate HAVE_DYNAMIC_FTRACE_WITH_REGS on PPC32 Unlike PPC64, PPC32 doesn't require any special compiler option to get _mcount() call not clobbering registers. Provide ftrace_regs_caller() and ftrace_regs_call() and activate HAVE_DYNAMIC_FTRACE_WITH_REGS. That's heavily copied from ftrace_64_mprofile.S For the time being leave livepatching aside, it will come with following patch. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1862dc7719855cc2a4eec80920d94c955877557e.1635423081.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 4 +- arch/powerpc/kernel/module_32.c | 8 ++ arch/powerpc/kernel/trace/ftrace.c | 16 +++- arch/powerpc/kernel/trace/ftrace_32.S | 111 +++++++++++++++++++++++--- 4 files changed, 126 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bb5e44e4d7c7..5c61f3511d5a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -206,7 +206,7 @@ config PPC select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE - select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL + select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL || PPC32 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FAST_GUP @@ -230,7 +230,7 @@ config PPC select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES select HAVE_LD_DEAD_CODE_DATA_ELIMINATION - select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS && PPC64 select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_OPTPROBES diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 5dedd76346b2..a491ad481d85 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -306,6 +306,14 @@ int module_finalize_ftrace(struct module *module, const Elf_Shdr *sechdrs) if (!module->arch.tramp) return -ENOENT; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + module->arch.tramp_regs = do_plt_call(module->core_layout.base, + (unsigned long)ftrace_regs_caller, + sechdrs, module); + if (!module->arch.tramp_regs) + return -ENOENT; +#endif + return 0; 
} #endif diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index c1d54c18e912..faa0fa29ac20 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -561,6 +561,8 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) int err; struct ppc_inst op; u32 *ip = (u32 *)rec->ip; + struct module *mod = rec->arch.mod; + unsigned long tramp; /* read where this goes */ if (copy_inst_from_kernel_nofault(&op, ip)) @@ -573,13 +575,23 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* If we never set up a trampoline to ftrace_caller, then bail */ - if (!rec->arch.mod->arch.tramp) { +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + if (!mod->arch.tramp || !mod->arch.tramp_regs) { +#else + if (!mod->arch.tramp) { +#endif pr_err("No ftrace trampoline\n"); return -EINVAL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + if (rec->flags & FTRACE_FL_REGS) + tramp = mod->arch.tramp_regs; + else +#endif + tramp = mod->arch.tramp; /* create the branch to the trampoline */ - err = create_branch(&op, ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + err = create_branch(&op, ip, tramp, BRANCH_SET_LINK); if (err) { pr_err("REL24 out of range!\n"); return -EINVAL; diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index c7d57124cc59..0a02c0cb12d9 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -9,6 +9,7 @@ #include #include #include +#include _GLOBAL(mcount) _GLOBAL(_mcount) @@ -29,17 +30,21 @@ _GLOBAL(ftrace_caller) MCOUNT_SAVE_FRAME /* r3 ends up with link register */ subi r3, r3, MCOUNT_INSN_SIZE + lis r5,function_trace_op@ha + lwz r5,function_trace_op@l(r5) + li r6, 0 .globl ftrace_call ftrace_call: bl ftrace_stub nop + MCOUNT_RESTORE_FRAME +ftrace_caller_common: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: b ftrace_graph_stub _GLOBAL(ftrace_graph_stub) #endif - MCOUNT_RESTORE_FRAME /* old link register ends up in ctr reg */ bctr @@ -47,15 +52,91 @@ _GLOBAL(ftrace_graph_stub) _GLOBAL(ftrace_stub) blr +_GLOBAL(ftrace_regs_caller) + /* Save the original return address in A's stack frame */ + stw r0,LRSAVE(r1) + + /* Create our stack frame + pt_regs */ + stwu r1,-INT_FRAME_SIZE(r1) + + /* Save all gprs to pt_regs */ + stw r0, GPR0(r1) + stmw r2, GPR2(r1) + + /* Save previous stack pointer (r1) */ + addi r8, r1, INT_FRAME_SIZE + stw r8, GPR1(r1) + + /* Load special regs for save below */ + mfmsr r8 + mfctr r9 + mfxer r10 + mfcr r11 + + /* Get the _mcount() call site out of LR */ + mflr r7 + /* Save it as pt_regs->nip */ + stw r7, _NIP(r1) + /* Save the read LR in pt_regs->link */ + stw r0, _LINK(r1) + + lis r3,function_trace_op@ha + lwz r5,function_trace_op@l(r3) + + /* Calculate ip from nip-4 into r3 for call below */ + subi r3, r7, MCOUNT_INSN_SIZE + + /* Put the original return address in r4 as parent_ip */ + mr r4, r0 + + /* Save special regs */ + stw r8, _MSR(r1) + stw r9, _CTR(r1) + stw r10, _XER(r1) + stw r11, _CCR(r1) + + /* Load &pt_regs in r6 for call below */ + addi r6, r1, STACK_FRAME_OVERHEAD + + /* ftrace_call(r3, r4, r5, r6) */ +.globl ftrace_regs_call +ftrace_regs_call: + bl ftrace_stub + nop + + /* Load ctr with the possibly modified NIP */ + lwz r3, _NIP(r1) + mtctr r3 + + /* Restore gprs */ + lmw r2, GPR2(r1) + + /* Restore possibly modified LR */ + lwz r0, _LINK(r1) + mtlr r0 + + /* Pop our stack frame */ + addi r1, r1, INT_FRAME_SIZE + + b ftrace_caller_common + #ifdef 
CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) - addi r5, r1, 48 - /* load r4 with local address */ - lwz r4, 44(r1) - subi r4, r4, MCOUNT_INSN_SIZE + stwu r1,-48(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) - /* Grab the LR out of the caller stack frame */ - lwz r3,52(r1) + addi r5, r1, 48 + mfctr r4 /* ftrace_caller has moved local addr here */ + stw r4, 44(r1) + mflr r3 /* ftrace_caller has restored LR from stack */ + subi r4, r4, MCOUNT_INSN_SIZE bl prepare_ftrace_return nop @@ -65,9 +146,21 @@ _GLOBAL(ftrace_graph_caller) * Change the LR in the callers stack frame to this. */ stw r3,52(r1) + mtlr r3 + lwz r0,44(r1) + mtctr r0 + + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + + addi r1, r1, 48 - MCOUNT_RESTORE_FRAME - /* old link register ends up in ctr reg */ bctr _GLOBAL(return_to_handler) From cdc81aece8041fd5437bdabde6c543cdeb2891a8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Nov 2021 11:30:03 +0100 Subject: [PATCH 104/240] powerpc/ptdump: Fix display of BAT's size unit We display wrong units for BAT sizes (G instead of M, M instead of ...) ---[ Instruction Block Address Translation ]--- 0: 0xc0000000-0xc03fffff 0x00000000 4G Kernel x m 1: 0xc0400000-0xc05fffff 0x00400000 2G Kernel x m 2: 0xc0600000-0xc06fffff 0x00600000 1G Kernel x m 3: 0xc0700000-0xc077ffff 0x00700000 512M Kernel x m 4: 0xc0780000-0xc079ffff 0x00780000 128M Kernel x m 5: 0xc07a0000-0xc07bffff 0x007a0000 128M Kernel x m 6: - 7: - This is because pt_dump_size() expects a size in Kbytes but bat_show_603() gives the size in bytes. To avoid risk of confusion, change pt_dump_size() to take bytes. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f16c30f5c9185a63335322cf1a8b22f189d335ef.1637922595.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/ptdump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index bf251191e78d..031956d0ee84 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -123,7 +123,7 @@ static struct ptdump_range ptdump_range[] __ro_after_init = { void pt_dump_size(struct seq_file *m, unsigned long size) { - static const char units[] = "KMGTPE"; + static const char units[] = " KMGTPE"; const char *unit = units; /* Work out what appropriate unit to use */ @@ -176,7 +176,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); - pt_dump_size(st->seq, (addr - st->start_address) >> 10); + pt_dump_size(st->seq, addr - st->start_address); } static void note_prot_wx(struct pg_state *st, unsigned long addr) From 57dd3a7bdf311e4a499fe0decabcdf2484e2538a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Nov 2021 12:43:33 +0100 Subject: [PATCH 105/240] powerpc: Don't bother about .data..Lubsan sections Since commit 9a427556fb8e ("vmlinux.lds.h: catch compound literals into data and BSS") .data..Lubsan sections are taken into account in DATA_MAIN which is included in DATA_DATA macro.
No need to take care of them anymore in powerpc vmlinux.lds.S Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3eb14570612eef17e01bb67f14a4450136001794.1637840601.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/vmlinux.lds.S | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 18e42c74abdd..dfc3f39d365f 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -322,10 +322,6 @@ SECTIONS #ifdef CONFIG_PPC32 .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA -#ifdef CONFIG_UBSAN - *(.data..Lubsan_data*) - *(.data..Lubsan_type*) -#endif *(.data.rel*) *(SDATA_MAIN) *(.sdata2) @@ -336,10 +332,6 @@ SECTIONS #else .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA -#ifdef CONFIG_UBSAN - *(.data..Lubsan_data*) - *(.data..Lubsan_type*) -#endif *(.data.rel*) *(.toc1) *(.branch_lt) From e012c499985c608c936410d8bab29d9596d62859 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 19 Nov 2021 21:31:46 +1000 Subject: [PATCH 106/240] powerpc/watchdog: help remote CPUs to flush NMI printk output The printk layer at the moment does not seem to have a good way to force flush printk messages that are created in NMI context, except in the panic path. NMI-context printk messages normally get to the console with irq_work, but that won't help if the CPU is stuck with irqs disabled, as can be the case for hard lockup watchdog messages. The watchdog currently flushes the printk buffers after detecting a lockup on remote CPUs, but they may not have processed their NMI IPI yet by that stage, or they may have self-detected a lockup in which case they won't go via this NMI IPI path. Improve the situation by having NMI-context mark a flag if it called printk, and have watchdog timer interrupts check if that flag was set and try to flush if it was. Latency is not a big problem because we were already stuck for a while, just need to try to make sure the messages eventually make it out. Depends-on: 5d5e4522a7f4 ("printk: restore flushing of NMI buffers on remote CPUs after NMI backtraces") Signed-off-by: Nicholas Piggin Reviewed-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211119113146.752759-6-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 37 ++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 23745af38d62..bfc27496fe7e 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -86,6 +86,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb); /* SMP checker bits */ static unsigned long __wd_smp_lock; static unsigned long __wd_reporting; +static unsigned long __wd_nmi_output; static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; @@ -154,6 +155,23 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); + /* + * __wd_nmi_output must be set after we printk from NMI context. + * + * printk from NMI context defers printing to the console to irq_work. + * If that NMI was taken in some code that is hard-locked, then irqs + * are disabled so irq_work will never fire. That can result in the + * hard lockup messages being delayed (indefinitely, until something + * else kicks the console drivers). + * + * Setting __wd_nmi_output will cause another CPU to notice and kick + * the console drivers for us. 
+ * xchg is not needed here (it could be a smp_mb and store), but xchg + * gives the memory ordering and atomicity required. + */ + xchg(&__wd_nmi_output, 1); + /* Do not panic from here because that can recurse into NMI IPI layer */ } @@ -227,12 +245,6 @@ static void watchdog_smp_panic(int cpu) cpumask_clear(&wd_smp_cpus_ipi); } - /* - * Force flush any remote buffers that might be stuck in IRQ context - * and therefore could not run their irq_work. - */ - printk_trigger_flush(); - if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); @@ -337,6 +349,17 @@ static void watchdog_timer_interrupt(int cpu) if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) watchdog_smp_panic(cpu); + + if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) { + /* + * Something has called printk from NMI context. It might be + * stuck, so this triggers a flush that will get that + * printk output to the console. + * + * See wd_lockup_ipi. + */ + printk_trigger_flush(); + } } DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) @@ -386,6 +409,8 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) print_irqtrace_events(current); show_regs(regs); + xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi + if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); From aebd1fb45c622e9a2b06fb70665d084d3a8d6c78 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 22 Oct 2021 16:13:22 +1000 Subject: [PATCH 107/240] powerpc: flexible GPR range save/restore macros Introduce macros that operate on a (start, end) range of GPRs, which reduces lines of code and the need to do mental arithmetic while reading the code. Signed-off-by: Nicholas Piggin Reviewed-by: Segher Boessenkool Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211022061322.2671178-1-npiggin@gmail.com --- arch/powerpc/boot/crt0.S | 31 +++++++------ arch/powerpc/crypto/md5-asm.S | 10 ++--- arch/powerpc/crypto/sha1-powerpc-asm.S | 6 +-- arch/powerpc/include/asm/ppc_asm.h | 43 ++++++++++++------- arch/powerpc/kernel/entry_32.S | 23 ++++------ arch/powerpc/kernel/exceptions-64e.S | 14 ++---- arch/powerpc/kernel/exceptions-64s.S | 6 +-- arch/powerpc/kernel/head_32.h | 3 +- arch/powerpc/kernel/head_booke.h | 3 +- arch/powerpc/kernel/interrupt_64.S | 34 ++++++--------- arch/powerpc/kernel/optprobes_head.S | 4 +- arch/powerpc/kernel/tm.S | 15 ++----- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 15 +++---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5 +-- .../lib/test_emulate_step_exec_instr.S | 8 ++-- 15 files changed, 94 insertions(+), 126 deletions(-) diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S index 1d83966f5ef6..e8f10a599659 100644 --- a/arch/powerpc/boot/crt0.S +++ b/arch/powerpc/boot/crt0.S @@ -226,16 +226,19 @@ p_base: mflr r10 /* r10 now points to runtime addr of p_base */ #ifdef __powerpc64__ #define PROM_FRAME_SIZE 512 -#define SAVE_GPR(n, base) std n,8*(n)(base) -#define REST_GPR(n, base) ld n,8*(n)(base) -#define SAVE_2GPRS(n, base) SAVE_GPR(n, base); SAVE_GPR(n+1, base) -#define SAVE_4GPRS(n, base) SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base) -#define SAVE_8GPRS(n, base) SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base) -#define SAVE_10GPRS(n, base) SAVE_8GPRS(n, base); SAVE_2GPRS(n+8, base) -#define REST_2GPRS(n, base) REST_GPR(n, base); REST_GPR(n+1, base) -#define REST_4GPRS(n, base) REST_2GPRS(n, base); REST_2GPRS(n+2, base) -#define REST_8GPRS(n, base) REST_4GPRS(n, base); REST_4GPRS(n+4, base) -#define REST_10GPRS(n, base) REST_8GPRS(n, base); REST_2GPRS(n+8,
base) + +.macro OP_REGS op, width, start, end, base, offset + .Lreg=\start + .rept (\end - \start + 1) + \op .Lreg,\offset+\width*.Lreg(\base) + .Lreg=.Lreg+1 + .endr +.endm + +#define SAVE_GPRS(start, end, base) OP_REGS std, 8, start, end, base, 0 +#define REST_GPRS(start, end, base) OP_REGS ld, 8, start, end, base, 0 +#define SAVE_GPR(n, base) SAVE_GPRS(n, n, base) +#define REST_GPR(n, base) REST_GPRS(n, n, base) /* prom handles the jump into and return from firmware. The prom args pointer is loaded in r3. */ @@ -246,9 +249,7 @@ prom: stdu r1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */ SAVE_GPR(2, r1) - SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) + SAVE_GPRS(13, 31, r1) mfcr r10 std r10,8*32(r1) mfmsr r10 @@ -283,9 +284,7 @@ prom: /* Restore other registers */ REST_GPR(2, r1) - REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) + REST_GPRS(13, 31, r1) ld r10,8*32(r1) mtcr r10 diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S index 948d100a2934..fa6bc440cf4a 100644 --- a/arch/powerpc/crypto/md5-asm.S +++ b/arch/powerpc/crypto/md5-asm.S @@ -38,15 +38,11 @@ #define INITIALIZE \ PPC_STLU r1,-INT_FRAME_SIZE(r1); \ - SAVE_8GPRS(14, r1); /* push registers onto stack */ \ - SAVE_4GPRS(22, r1); \ - SAVE_GPR(26, r1) + SAVE_GPRS(14, 26, r1) /* push registers onto stack */ #define FINALIZE \ - REST_8GPRS(14, r1); /* pop registers from stack */ \ - REST_4GPRS(22, r1); \ - REST_GPR(26, r1); \ - addi r1,r1,INT_FRAME_SIZE; + REST_GPRS(14, 26, r1); /* pop registers from stack */ \ + addi r1,r1,INT_FRAME_SIZE #ifdef __BIG_ENDIAN__ #define LOAD_DATA(reg, off) \ diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S b/arch/powerpc/crypto/sha1-powerpc-asm.S index 23e248beff71..f0d5ed557ab1 100644 --- a/arch/powerpc/crypto/sha1-powerpc-asm.S +++ b/arch/powerpc/crypto/sha1-powerpc-asm.S @@ -125,8 +125,7 @@ _GLOBAL(powerpc_sha_transform) PPC_STLU r1,-INT_FRAME_SIZE(r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) + SAVE_GPRS(14, 31, r1) /* Load up A - E */ lwz RA(0),0(r3) /* A */ @@ -184,7 +183,6 @@ _GLOBAL(powerpc_sha_transform) stw RD(0),12(r3) stw RE(0),16(r3) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) + REST_GPRS(14, 31, r1) addi r1,r1,INT_FRAME_SIZE blr diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 7be24048b8d1..f21e6bde17a1 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -16,30 +16,41 @@ #define SZL (BITS_PER_LONG/8) +/* + * This expands to a sequence of operations with reg incrementing from + * start to end inclusive, of this form: + * + * op reg, (offset + (width * reg))(base) + * + * Note that offset is not the offset of the first operation unless start + * is zero (or width is zero). + */ +.macro OP_REGS op, width, start, end, base, offset + .Lreg=\start + .rept (\end - \start + 1) + \op .Lreg, \offset + \width * .Lreg(\base) + .Lreg=.Lreg+1 + .endr +.endm + /* * Macros for storing registers into and loading registers from * exception frames. 
*/ #ifdef __powerpc64__ -#define SAVE_GPR(n, base) std n,GPR0+8*(n)(base) -#define REST_GPR(n, base) ld n,GPR0+8*(n)(base) -#define SAVE_NVGPRS(base) SAVE_8GPRS(14, base); SAVE_10GPRS(22, base) -#define REST_NVGPRS(base) REST_8GPRS(14, base); REST_10GPRS(22, base) +#define SAVE_GPRS(start, end, base) OP_REGS std, 8, start, end, base, GPR0 +#define REST_GPRS(start, end, base) OP_REGS ld, 8, start, end, base, GPR0 +#define SAVE_NVGPRS(base) SAVE_GPRS(14, 31, base) +#define REST_NVGPRS(base) REST_GPRS(14, 31, base) #else -#define SAVE_GPR(n, base) stw n,GPR0+4*(n)(base) -#define REST_GPR(n, base) lwz n,GPR0+4*(n)(base) -#define SAVE_NVGPRS(base) SAVE_GPR(13, base); SAVE_8GPRS(14, base); SAVE_10GPRS(22, base) -#define REST_NVGPRS(base) REST_GPR(13, base); REST_8GPRS(14, base); REST_10GPRS(22, base) +#define SAVE_GPRS(start, end, base) OP_REGS stw, 4, start, end, base, GPR0 +#define REST_GPRS(start, end, base) OP_REGS lwz, 4, start, end, base, GPR0 +#define SAVE_NVGPRS(base) SAVE_GPRS(13, 31, base) +#define REST_NVGPRS(base) REST_GPRS(13, 31, base) #endif -#define SAVE_2GPRS(n, base) SAVE_GPR(n, base); SAVE_GPR(n+1, base) -#define SAVE_4GPRS(n, base) SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base) -#define SAVE_8GPRS(n, base) SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base) -#define SAVE_10GPRS(n, base) SAVE_8GPRS(n, base); SAVE_2GPRS(n+8, base) -#define REST_2GPRS(n, base) REST_GPR(n, base); REST_GPR(n+1, base) -#define REST_4GPRS(n, base) REST_2GPRS(n, base); REST_2GPRS(n+2, base) -#define REST_8GPRS(n, base) REST_4GPRS(n, base); REST_4GPRS(n+4, base) -#define REST_10GPRS(n, base) REST_8GPRS(n, base); REST_2GPRS(n+8, base) +#define SAVE_GPR(n, base) SAVE_GPRS(n, n, base) +#define REST_GPR(n, base) REST_GPRS(n, n, base) #define SAVE_FPR(n, base) stfd n,8*TS_FPRWIDTH*(n)(base) #define SAVE_2FPRS(n, base) SAVE_FPR(n, base); SAVE_FPR(n+1, base) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 61fdd53cdd9a..c62dd9815965 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -90,8 +90,7 @@ transfer_to_syscall: stw r12,8(r1) stw r2,_TRAP(r1) SAVE_GPR(0, r1) - SAVE_4GPRS(3, r1) - SAVE_2GPRS(7, r1) + SAVE_GPRS(3, 8, r1) addi r2,r10,-THREAD SAVE_NVGPRS(r1) @@ -139,7 +138,7 @@ syscall_exit_finish: mtxer r5 lwz r0,GPR0(r1) lwz r3,GPR3(r1) - REST_8GPRS(4,r1) + REST_GPRS(4, 11, r1) lwz r12,GPR12(r1) b 1b @@ -232,9 +231,9 @@ fast_exception_return: beq 3f /* if not, we've got problems */ #endif -2: REST_4GPRS(3, r11) +2: REST_GPRS(3, 6, r11) lwz r10,_CCR(r11) - REST_2GPRS(1, r11) + REST_GPRS(1, 2, r11) mtcr r10 lwz r10,_LINK(r11) mtlr r10 @@ -298,16 +297,14 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) * the reliable stack unwinder later on. Clear it. 
*/ stw r0,8(r1) - REST_4GPRS(7, r1) - REST_2GPRS(11, r1) + REST_GPRS(7, 12, r1) mtcr r3 mtlr r4 mtctr r5 mtspr SPRN_XER,r6 - REST_4GPRS(2, r1) - REST_GPR(6, r1) + REST_GPRS(2, 6, r1) REST_GPR(0, r1) REST_GPR(1, r1) rfi @@ -341,8 +338,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) lwz r6,_CCR(r1) li r0,0 - REST_4GPRS(7, r1) - REST_2GPRS(11, r1) + REST_GPRS(7, 12, r1) mtlr r3 mtctr r4 @@ -354,7 +350,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) */ stw r0,8(r1) - REST_4GPRS(2, r1) + REST_GPRS(2, 5, r1) bne- cr1,1f /* emulate stack store */ mtcr r6 @@ -430,8 +426,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return) bne interrupt_return; \ lwz r0,GPR0(r1); \ lwz r2,GPR2(r1); \ - REST_4GPRS(3, r1); \ - REST_2GPRS(7, r1); \ + REST_GPRS(3, 8, r1); \ lwz r10,_XER(r1); \ lwz r11,_CTR(r1); \ mtspr SPRN_XER,r10; \ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 711c66b76df1..67dc4e3179a0 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -198,8 +198,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) stdcx. r0,0,r1 /* to clear the reservation */ - REST_4GPRS(2, r1) - REST_4GPRS(6, r1) + REST_GPRS(2, 9, r1) ld r10,_CTR(r1) ld r11,_XER(r1) @@ -375,9 +374,7 @@ ret_from_mc_except: exc_##n##_common: \ std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - std r9,GPR9(r1); /* save r9 in stackframe */ \ + SAVE_GPRS(3, 9, r1); /* save r3 - r9 in stackframe */ \ std r10,_NIP(r1); /* save SRR0 to stackframe */ \ std r11,_MSR(r1); /* save SRR1 to stackframe */ \ beq 2f; /* if from kernel mode */ \ @@ -1061,9 +1058,7 @@ bad_stack_book3e: std r11,_ESR(r1) std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - std r9,GPR9(r1); /* save r9 in stackframe */ \ + SAVE_GPRS(3, 9, r1); /* save r3 - r9 in stackframe */ \ ld r3,PACA_EXGEN+EX_R10(r13);/* get back r10 */ \ ld r4,PACA_EXGEN+EX_R11(r13);/* get back r11 */ \ mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 XXX can be wrong */ \ @@ -1077,8 +1072,7 @@ bad_stack_book3e: std r10,_LINK(r1) std r11,_CTR(r1) std r12,_XER(r1) - SAVE_10GPRS(14,r1) - SAVE_8GPRS(24,r1) + SAVE_GPRS(14, 31, r1) lhz r12,PACA_TRAP_SAVE(r13) std r12,_TRAP(r1) addi r11,r1,INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eaf1f72131a1..277eccf0f086 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -574,8 +574,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r10,IAREA+EX_CTR(r13) std r10,_CTR(r1) std r2,GPR2(r1) /* save r2 in stackframe */ - SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ - SAVE_2GPRS(7, r1) /* save r7, r8 in stackframe */ + SAVE_GPRS(3, 8, r1) /* save r3 - r8 in stackframe */ mflr r9 /* Get LR, later save to stack */ ld r2,PACATOC(r13) /* get kernel TOC into r2 */ std r9,_LINK(r1) @@ -693,8 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) mtlr r9 ld r9,_CCR(r1) mtcr r9 - REST_8GPRS(2, r1) - REST_4GPRS(10, r1) + REST_GPRS(2, 13, r1) REST_GPR(0, r1) /* restore original r1. 
*/ ld r1,GPR1(r1) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 6b1ec9e3541b..25887303651a 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -115,8 +115,7 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) stw r10,8(r1) li r10, \trapno stw r10,_TRAP(r1) - SAVE_4GPRS(3, r1) - SAVE_2GPRS(7, r1) + SAVE_GPRS(3, 8, r1) SAVE_NVGPRS(r1) stw r2,GPR2(r1) stw r12,_NIP(r1) diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index ef8d1b1c234e..bb6d5d0fc4ac 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -87,8 +87,7 @@ END_BTB_FLUSH_SECTION stw r10, 8(r1) li r10, \trapno stw r10,_TRAP(r1) - SAVE_4GPRS(3, r1) - SAVE_2GPRS(7, r1) + SAVE_GPRS(3, 8, r1) SAVE_NVGPRS(r1) stw r2,GPR2(r1) stw r12,_NIP(r1) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index ec950b08a8dc..2ad223597ca2 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -162,10 +162,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * The value of AMR only matters while we're in the kernel. */ mtcr r2 - ld r2,GPR2(r1) - ld r3,GPR3(r1) - ld r13,GPR13(r1) - ld r1,GPR1(r1) + REST_GPRS(2, 3, r1) + REST_GPR(13, r1) + REST_GPR(1, r1) RFSCV_TO_USER b . /* prevent speculative execution */ @@ -183,9 +182,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mtctr r3 mtlr r4 mtspr SPRN_XER,r5 - REST_10GPRS(2, r1) - REST_2GPRS(12, r1) - ld r1,GPR1(r1) + REST_GPRS(2, 13, r1) + REST_GPR(1, r1) RFI_TO_USER .Lsyscall_vectored_\name\()_rst_end: @@ -374,10 +372,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * The value of AMR only matters while we're in the kernel. */ mtcr r2 - ld r2,GPR2(r1) - ld r3,GPR3(r1) - ld r13,GPR13(r1) - ld r1,GPR1(r1) + REST_GPRS(2, 3, r1) + REST_GPR(13, r1) + REST_GPR(1, r1) RFI_TO_USER b . 
/* prevent speculative execution */ @@ -388,8 +385,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mtctr r3 mtspr SPRN_XER,r4 ld r0,GPR0(r1) - REST_8GPRS(4, r1) - ld r12,GPR12(r1) + REST_GPRS(4, 12, r1) b .Lsyscall_restore_regs_cont .Lsyscall_rst_end: @@ -518,17 +514,14 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) ld r6,_XER(r1) li r0,0 - REST_4GPRS(7, r1) - REST_2GPRS(11, r1) - REST_GPR(13, r1) + REST_GPRS(7, 13, r1) mtcr r3 mtlr r4 mtctr r5 mtspr SPRN_XER,r6 - REST_4GPRS(2, r1) - REST_GPR(6, r1) + REST_GPRS(2, 6, r1) REST_GPR(0, r1) REST_GPR(1, r1) .ifc \srr,srr @@ -625,8 +618,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) ld r6,_CCR(r1) li r0,0 - REST_4GPRS(7, r1) - REST_2GPRS(11, r1) + REST_GPRS(7, 12, r1) mtlr r3 mtctr r4 @@ -638,7 +630,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) */ std r0,STACK_FRAME_OVERHEAD-16(r1) - REST_4GPRS(2, r1) + REST_GPRS(2, 5, r1) bne- cr1,1f /* emulate stack store */ mtcr r6 diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index 19ea3312403c..5c7f0b4b784b 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -10,8 +10,8 @@ #include #ifdef CONFIG_PPC64 -#define SAVE_30GPRS(base) SAVE_10GPRS(2,base); SAVE_10GPRS(12,base); SAVE_10GPRS(22,base) -#define REST_30GPRS(base) REST_10GPRS(2,base); REST_10GPRS(12,base); REST_10GPRS(22,base) +#define SAVE_30GPRS(base) SAVE_GPRS(2, 31, base) +#define REST_30GPRS(base) REST_GPRS(2, 31, base) #define TEMPLATE_FOR_IMM_LOAD_INSNS nop; nop; nop; nop; nop #else #define SAVE_30GPRS(base) stmw r2, GPR2(base) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 2b91f233b05d..3beecc32940b 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -226,11 +226,8 @@ _GLOBAL(tm_reclaim) /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ SAVE_GPR(0, r7) /* user r0 */ - SAVE_GPR(2, r7) /* user r2 */ - SAVE_4GPRS(3, r7) /* user r3-r6 */ - SAVE_GPR(8, r7) /* user r8 */ - SAVE_GPR(9, r7) /* user r9 */ - SAVE_GPR(10, r7) /* user r10 */ + SAVE_GPRS(2, 6, r7) /* user r2-r6 */ + SAVE_GPRS(8, 10, r7) /* user r8-r10 */ ld r3, GPR1(r1) /* user r1 */ ld r4, GPR7(r1) /* user r7 */ ld r5, GPR11(r1) /* user r11 */ @@ -445,12 +442,8 @@ restore_gprs: ld r6, THREAD_TM_PPR(r3) REST_GPR(0, r7) /* GPR0 */ - REST_2GPRS(2, r7) /* GPR2-3 */ - REST_GPR(4, r7) /* GPR4 */ - REST_4GPRS(8, r7) /* GPR8-11 */ - REST_2GPRS(12, r7) /* GPR12-13 */ - - REST_NVGPRS(r7) /* GPR14-31 */ + REST_GPRS(2, 4, r7) /* GPR2-4 */ + REST_GPRS(8, 31, r7) /* GPR8-31 */ /* Load up PPR and DSCR here so we don't run with user values for long */ mtspr SPRN_DSCR, r5 diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index f9fd5f743eba..d636fc755f60 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -41,15 +41,14 @@ _GLOBAL(ftrace_regs_caller) /* Save all gprs to pt_regs */ SAVE_GPR(0, r1) - SAVE_10GPRS(2, r1) + SAVE_GPRS(2, 11, r1) /* Ok to continue? 
*/ lbz r3, PACA_FTRACE_ENABLED(r13) cmpdi r3, 0 beq ftrace_no_trace - SAVE_10GPRS(12, r1) - SAVE_10GPRS(22, r1) + SAVE_GPRS(12, 31, r1) /* Save previous stack pointer (r1) */ addi r8, r1, SWITCH_FRAME_SIZE @@ -108,10 +107,8 @@ ftrace_regs_call: #endif /* Restore gprs */ - REST_GPR(0,r1) - REST_10GPRS(2,r1) - REST_10GPRS(12,r1) - REST_10GPRS(22,r1) + REST_GPR(0, r1) + REST_GPRS(2, 31, r1) /* Restore possibly modified LR */ ld r0, _LINK(r1) @@ -157,7 +154,7 @@ _GLOBAL(ftrace_caller) stdu r1, -SWITCH_FRAME_SIZE(r1) /* Save all gprs to pt_regs */ - SAVE_8GPRS(3, r1) + SAVE_GPRS(3, 10, r1) lbz r3, PACA_FTRACE_ENABLED(r13) cmpdi r3, 0 @@ -194,7 +191,7 @@ ftrace_call: mtctr r3 /* Restore gprs */ - REST_8GPRS(3,r1) + REST_GPRS(3, 10, r1) /* Restore callee's TOC */ ld r2, 24(r1) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 3f1aeff72438..d185dee26026 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2693,8 +2693,7 @@ kvmppc_bad_host_intr: std r0, GPR0(r1) std r9, GPR1(r1) std r2, GPR2(r1) - SAVE_4GPRS(3, r1) - SAVE_2GPRS(7, r1) + SAVE_GPRS(3, 8, r1) srdi r0, r12, 32 clrldi r12, r12, 32 std r0, _CCR(r1) @@ -2717,7 +2716,7 @@ kvmppc_bad_host_intr: ld r9, HSTATE_SCRATCH2(r13) ld r12, HSTATE_SCRATCH0(r13) GET_SCRATCH0(r0) - SAVE_4GPRS(9, r1) + SAVE_GPRS(9, 12, r1) std r0, GPR13(r1) SAVE_NVGPRS(r1) ld r5, HSTATE_CFAR(r13) diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S index 9ef941d958d8..5473f9d03df3 100644 --- a/arch/powerpc/lib/test_emulate_step_exec_instr.S +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S @@ -37,7 +37,7 @@ _GLOBAL(exec_instr) * The stack pointer (GPR1) and the thread pointer (GPR13) are not * saved as these should not be modified anyway. */ - SAVE_2GPRS(2, r1) + SAVE_GPRS(2, 3, r1) SAVE_NVGPRS(r1) /* @@ -75,8 +75,7 @@ _GLOBAL(exec_instr) /* Load GPRs from pt_regs */ REST_GPR(0, r31) - REST_10GPRS(2, r31) - REST_GPR(12, r31) + REST_GPRS(2, 12, r31) REST_NVGPRS(r31) /* Placeholder for the test instruction */ @@ -99,8 +98,7 @@ _GLOBAL(exec_instr) subi r3, r3, GPR0 SAVE_GPR(0, r3) SAVE_GPR(2, r3) - SAVE_8GPRS(4, r3) - SAVE_GPR(12, r3) + SAVE_GPRS(4, 12, r3) SAVE_NVGPRS(r3) /* Save resulting LR to pt_regs */ From fb350784d8d17952afa93383bb47aaa6b715c459 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 21 Sep 2021 17:09:47 +0200 Subject: [PATCH 108/240] powerpc/bitops: Use immediate operand when possible Today we get the following code generation for bitops like set or clear bit: c0009fe0: 39 40 08 00 li r10,2048 c0009fe4: 7c e0 40 28 lwarx r7,0,r8 c0009fe8: 7c e7 53 78 or r7,r7,r10 c0009fec: 7c e0 41 2d stwcx. r7,0,r8 c000d568: 39 00 18 00 li r8,6144 c000d56c: 7c c0 38 28 lwarx r6,0,r7 c000d570: 7c c6 40 78 andc r6,r6,r8 c000d574: 7c c0 39 2d stwcx. r6,0,r7 Most bits being set are constants that fit in the lower 16 bits, so the operation can easily be replaced by its "immediate" version. Allow GCC to choose between the normal or immediate form. For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' when all bits to be cleared are consecutive. On 64 bits we don't have any equivalent single operation for clearing single bits or a few bits; we'd need two 'rldicl', so it is not worth it, the li/andc sequence does the same job. With this patch we get: c0009fe0: 7d 00 50 28 lwarx r8,0,r10 c0009fe4: 61 08 08 00 ori r8,r8,2048 c0009fe8: 7d 00 51 2d stwcx.
r8,0,r10 c000d558: 7c e0 40 28 lwarx r7,0,r8 c000d55c: 54 e7 05 64 rlwinm r7,r7,0,21,18 c000d560: 7c e0 41 2d stwcx. r7,0,r8 On pmac32_defconfig, it reduces the text by approx 10 kbytes. Signed-off-by: Christophe Leroy Reviewed-by: Segher Boessenkool Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e6f815d9181bab09df3b350af51149437863e9f9.1632236981.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 11847b6a244e..a05d8c62cbea 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -71,19 +71,61 @@ static inline void fn(unsigned long mask, \ __asm__ __volatile__ ( \ prefix \ "1:" PPC_LLARX "%0,0,%3,0\n" \ - stringify_in_c(op) "%0,%0,%2\n" \ + #op "%I2 %0,%0,%2\n" \ PPC_STLCX "%0,0,%3\n" \ "bne- 1b\n" \ : "=&r" (old), "+m" (*p) \ - : "r" (mask), "r" (p) \ + : "rK" (mask), "r" (p) \ : "cc", "memory"); \ } DEFINE_BITOP(set_bits, or, "") -DEFINE_BITOP(clear_bits, andc, "") -DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER) DEFINE_BITOP(change_bits, xor, "") +static __always_inline bool is_rlwinm_mask_valid(unsigned long x) +{ + if (!x) + return false; + if (x & 1) + x = ~x; // make the mask non-wrapping + x += x & -x; // adding the low set bit results in at most one bit set + + return !(x & (x - 1)); +} + +#define DEFINE_CLROP(fn, prefix) \ +static inline void fn(unsigned long mask, volatile unsigned long *_p) \ +{ \ + unsigned long old; \ + unsigned long *p = (unsigned long *)_p; \ + \ + if (IS_ENABLED(CONFIG_PPC32) && \ + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\ + asm volatile ( \ + prefix \ + "1:" "lwarx %0,0,%3\n" \ + "rlwinm %0,%0,0,%2\n" \ + "stwcx. %0,0,%3\n" \ + "bne- 1b\n" \ + : "=&r" (old), "+m" (*p) \ + : "n" (~mask), "r" (p) \ + : "cc", "memory"); \ + } else { \ + asm volatile ( \ + prefix \ + "1:" PPC_LLARX "%0,0,%3,0\n" \ + "andc %0,%0,%2\n" \ + PPC_STLCX "%0,0,%3\n" \ + "bne- 1b\n" \ + : "=&r" (old), "+m" (*p) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + } \ +} + +DEFINE_CLROP(clear_bits, "") +DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER) + static inline void arch_set_bit(int nr, volatile unsigned long *addr) { set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); @@ -116,12 +158,12 @@ static inline unsigned long fn( \ __asm__ __volatile__ ( \ prefix \ "1:" PPC_LLARX "%0,0,%3,%4\n" \ - stringify_in_c(op) "%1,%0,%2\n" \ + #op "%I2 %1,%0,%2\n" \ PPC_STLCX "%1,0,%3\n" \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ + : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? 
eh : 0) \ : "cc", "memory"); \ return (old & mask); \ } @@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 0) DEFINE_TESTOP(test_and_set_bits_lock, or, "", PPC_ACQUIRE_BARRIER, 1) -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER, - PPC_ATOMIC_EXIT_BARRIER, 0) DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 0) +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p) +{ + unsigned long old, t; + unsigned long *p = (unsigned long *)_p; + + if (IS_ENABLED(CONFIG_PPC32) && + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) { + asm volatile ( + PPC_ATOMIC_ENTRY_BARRIER + "1:" "lwarx %0,0,%3\n" + "rlwinm %1,%0,0,%2\n" + "stwcx. %1,0,%3\n" + "bne- 1b\n" + PPC_ATOMIC_EXIT_BARRIER + : "=&r" (old), "=&r" (t) + : "n" (~mask), "r" (p) + : "cc", "memory"); + } else { + asm volatile ( + PPC_ATOMIC_ENTRY_BARRIER + "1:" PPC_LLARX "%0,0,%3,0\n" + "andc %1,%0,%2\n" + PPC_STLCX "%1,0,%3\n" + "bne- 1b\n" + PPC_ATOMIC_EXIT_BARRIER + : "=&r" (old), "=&r" (t) + : "r" (mask), "r" (p) + : "cc", "memory"); + } + + return (old & mask); +} + static inline int arch_test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { From 41d65207de9fbff58acd8937a7c3f8940c186a87 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 21 Sep 2021 17:09:48 +0200 Subject: [PATCH 109/240] powerpc/atomics: Use immediate operand when possible Today we get the following code generation for atomic operations: c001bb2c: 39 20 00 01 li r9,1 c001bb30: 7d 40 18 28 lwarx r10,0,r3 c001bb34: 7d 09 50 50 subf r8,r9,r10 c001bb38: 7d 00 19 2d stwcx. r8,0,r3 c001c7a8: 39 40 00 01 li r10,1 c001c7ac: 7d 00 18 28 lwarx r8,0,r3 c001c7b0: 7c ea 42 14 add r7,r10,r8 c001c7b4: 7c e0 19 2d stwcx. r7,0,r3 By allowing GCC to choose between immediate or regular operation, we get: c001bb2c: 7d 20 18 28 lwarx r9,0,r3 c001bb30: 39 49 ff ff addi r10,r9,-1 c001bb34: 7d 40 19 2d stwcx. r10,0,r3 -- c001c7a4: 7d 40 18 28 lwarx r10,0,r3 c001c7a8: 39 0a 00 01 addi r8,r10,1 c001c7ac: 7d 00 19 2d stwcx. r8,0,r3 For "and", the dot form has to be used because "andi" doesn't exist. For logical operations we use unsigned 16 bits immediate. For arithmetic operations we use signed 16 bits immediate. On pmac32_defconfig, it reduces the text by approx another 8 kbytes. Signed-off-by: Christophe Leroy Acked-by: Segher Boessenkool Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2ec558d44db8045752fe9dbd29c9ba84bab6030b.1632236981.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/atomic.h | 56 +++++++++++++++---------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index fd594fdbd84d..45f564dbaef5 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -37,62 +37,62 @@ static __inline__ void arch_atomic_set(atomic_t *v, int i) __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m<>"(v->counter) : "r"(i)); } -#define ATOMIC_OP(op, asm_op) \ +#define ATOMIC_OP(op, asm_op, suffix, sign, ...) \ static __inline__ void arch_atomic_##op(int a, atomic_t *v) \ { \ int t; \ \ __asm__ __volatile__( \ "1: lwarx %0,0,%3 # atomic_" #op "\n" \ - #asm_op " %0,%2,%0\n" \ + #asm_op "%I2" suffix " %0,%0,%2\n" \ " stwcx. 
%0,0,%3 \n" \ " bne- 1b\n" \ : "=&r" (t), "+m" (v->counter) \ - : "r" (a), "r" (&v->counter) \ - : "cc"); \ + : "r"#sign (a), "r" (&v->counter) \ + : "cc", ##__VA_ARGS__); \ } \ -#define ATOMIC_OP_RETURN_RELAXED(op, asm_op) \ +#define ATOMIC_OP_RETURN_RELAXED(op, asm_op, suffix, sign, ...) \ static inline int arch_atomic_##op##_return_relaxed(int a, atomic_t *v) \ { \ int t; \ \ __asm__ __volatile__( \ "1: lwarx %0,0,%3 # atomic_" #op "_return_relaxed\n" \ - #asm_op " %0,%2,%0\n" \ + #asm_op "%I2" suffix " %0,%0,%2\n" \ " stwcx. %0,0,%3\n" \ " bne- 1b\n" \ : "=&r" (t), "+m" (v->counter) \ - : "r" (a), "r" (&v->counter) \ - : "cc"); \ + : "r"#sign (a), "r" (&v->counter) \ + : "cc", ##__VA_ARGS__); \ \ return t; \ } -#define ATOMIC_FETCH_OP_RELAXED(op, asm_op) \ +#define ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign, ...) \ static inline int arch_atomic_fetch_##op##_relaxed(int a, atomic_t *v) \ { \ int res, t; \ \ __asm__ __volatile__( \ "1: lwarx %0,0,%4 # atomic_fetch_" #op "_relaxed\n" \ - #asm_op " %1,%3,%0\n" \ + #asm_op "%I3" suffix " %1,%0,%3\n" \ " stwcx. %1,0,%4\n" \ " bne- 1b\n" \ : "=&r" (res), "=&r" (t), "+m" (v->counter) \ - : "r" (a), "r" (&v->counter) \ - : "cc"); \ + : "r"#sign (a), "r" (&v->counter) \ + : "cc", ##__VA_ARGS__); \ \ return res; \ } -#define ATOMIC_OPS(op, asm_op) \ - ATOMIC_OP(op, asm_op) \ - ATOMIC_OP_RETURN_RELAXED(op, asm_op) \ - ATOMIC_FETCH_OP_RELAXED(op, asm_op) +#define ATOMIC_OPS(op, asm_op, suffix, sign, ...) \ + ATOMIC_OP(op, asm_op, suffix, sign, ##__VA_ARGS__) \ + ATOMIC_OP_RETURN_RELAXED(op, asm_op, suffix, sign, ##__VA_ARGS__)\ + ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign, ##__VA_ARGS__) -ATOMIC_OPS(add, add) -ATOMIC_OPS(sub, subf) +ATOMIC_OPS(add, add, "c", I, "xer") +ATOMIC_OPS(sub, sub, "c", I, "xer") #define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed #define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed @@ -101,13 +101,13 @@ ATOMIC_OPS(sub, subf) #define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed #undef ATOMIC_OPS -#define ATOMIC_OPS(op, asm_op) \ - ATOMIC_OP(op, asm_op) \ - ATOMIC_FETCH_OP_RELAXED(op, asm_op) +#define ATOMIC_OPS(op, asm_op, suffix, sign) \ + ATOMIC_OP(op, asm_op, suffix, sign) \ + ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign) -ATOMIC_OPS(and, and) -ATOMIC_OPS(or, or) -ATOMIC_OPS(xor, xor) +ATOMIC_OPS(and, and, ".", K) +ATOMIC_OPS(or, or, "", K) +ATOMIC_OPS(xor, xor, "", K) #define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed #define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed @@ -241,15 +241,15 @@ static __inline__ int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) "1: lwarx %0,0,%1 # atomic_fetch_add_unless\n\ cmpw 0,%0,%3 \n\ beq 2f \n\ - add %0,%2,%0 \n" + add%I2c %0,%0,%2 \n" " stwcx. %0,0,%1 \n\ bne- 1b \n" PPC_ATOMIC_EXIT_BARRIER -" subf %0,%2,%0 \n\ +" sub%I2c %0,%0,%2 \n\ 2:" : "=&r" (t) - : "r" (&v->counter), "r" (a), "r" (u) - : "cc", "memory"); + : "r" (&v->counter), "rI" (a), "r" (u) + : "cc", "memory", "xer"); return t; } From f05cab0034babaa9b3dfaf6003ee6493496a8180 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 21 Sep 2021 17:09:49 +0200 Subject: [PATCH 110/240] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends Now that atomic_add() and atomic_sub() handle immediate operands, atomic_inc() and atomic_dec() have no added value compared to the generic fallback which calls atomic_add(1) and atomic_sub(1). 
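For reference, the generic fallbacks that take over are roughly shaped like this (a condensed sketch of include/linux/atomic/atomic-arch-fallback.h, not the verbatim kernel code):

	static __always_inline void
	arch_atomic_inc(atomic_t *v)
	{
		/* With the previous patch, the constant 1 satisfies the
		 * immediate constraint, so this compiles to the same
		 * lwarx/addic/stwcx. loop the hand-written helper produced. */
		arch_atomic_add(1, v);
	}

	static __always_inline void
	arch_atomic_dec(atomic_t *v)
	{
		arch_atomic_sub(1, v);
	}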
Also remove atomic_inc_not_zero() which falls back to atomic_add_unless() which itself falls back to atomic_fetch_add_unless() which now handles immediate operands. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0bc64a2f18726055093dbb2e479cefc60a409cfd.1632236981.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/atomic.h | 95 ------------------------------- 1 file changed, 95 deletions(-) diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 45f564dbaef5..853dc86864f4 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -118,71 +118,6 @@ ATOMIC_OPS(xor, xor, "", K) #undef ATOMIC_OP_RETURN_RELAXED #undef ATOMIC_OP -static __inline__ void arch_atomic_inc(atomic_t *v) -{ - int t; - - __asm__ __volatile__( -"1: lwarx %0,0,%2 # atomic_inc\n\ - addic %0,%0,1\n" -" stwcx. %0,0,%2 \n\ - bne- 1b" - : "=&r" (t), "+m" (v->counter) - : "r" (&v->counter) - : "cc", "xer"); -} -#define arch_atomic_inc arch_atomic_inc - -static __inline__ int arch_atomic_inc_return_relaxed(atomic_t *v) -{ - int t; - - __asm__ __volatile__( -"1: lwarx %0,0,%2 # atomic_inc_return_relaxed\n" -" addic %0,%0,1\n" -" stwcx. %0,0,%2\n" -" bne- 1b" - : "=&r" (t), "+m" (v->counter) - : "r" (&v->counter) - : "cc", "xer"); - - return t; -} - -static __inline__ void arch_atomic_dec(atomic_t *v) -{ - int t; - - __asm__ __volatile__( -"1: lwarx %0,0,%2 # atomic_dec\n\ - addic %0,%0,-1\n" -" stwcx. %0,0,%2\n\ - bne- 1b" - : "=&r" (t), "+m" (v->counter) - : "r" (&v->counter) - : "cc", "xer"); -} -#define arch_atomic_dec arch_atomic_dec - -static __inline__ int arch_atomic_dec_return_relaxed(atomic_t *v) -{ - int t; - - __asm__ __volatile__( -"1: lwarx %0,0,%2 # atomic_dec_return_relaxed\n" -" addic %0,%0,-1\n" -" stwcx. %0,0,%2\n" -" bne- 1b" - : "=&r" (t), "+m" (v->counter) - : "r" (&v->counter) - : "cc", "xer"); - - return t; -} - -#define arch_atomic_inc_return_relaxed arch_atomic_inc_return_relaxed -#define arch_atomic_dec_return_relaxed arch_atomic_dec_return_relaxed - #define arch_atomic_cmpxchg(v, o, n) \ (arch_cmpxchg(&((v)->counter), (o), (n))) #define arch_atomic_cmpxchg_relaxed(v, o, n) \ @@ -255,36 +190,6 @@ static __inline__ int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u) } #define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless -/** - * atomic_inc_not_zero - increment unless the number is zero - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1, so long as @v is non-zero. - * Returns non-zero if @v was non-zero, and zero otherwise. - */ -static __inline__ int arch_atomic_inc_not_zero(atomic_t *v) -{ - int t1, t2; - - __asm__ __volatile__ ( - PPC_ATOMIC_ENTRY_BARRIER -"1: lwarx %0,0,%2 # atomic_inc_not_zero\n\ - cmpwi 0,%0,0\n\ - beq- 2f\n\ - addic %1,%0,1\n" -" stwcx. %1,0,%2\n\ - bne- 1b\n" - PPC_ATOMIC_EXIT_BARRIER - "\n\ -2:" - : "=&r" (t1), "=&r" (t2) - : "r" (&v->counter) - : "cc", "xer", "memory"); - - return t1; -} -#define arch_atomic_inc_not_zero(v) arch_atomic_inc_not_zero((v)) - /* * Atomically test *v and decrement if it is greater than 0.
* The function returns the old value of *v minus 1, even if From 2c9ac51b850d84ee496b0a5d832ce66d411ae552 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Wed, 21 Jul 2021 01:48:29 -0400 Subject: [PATCH 111/240] powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC Running perf fuzzer showed below in dmesg logs: "Can't find PMC that caused IRQ" This means a PMU exception happened, but none of the PMC's (Performance Monitor Counter) were found to be overflown. There are some corner cases that clears the PMCs after PMI gets masked. In such cases, the perf interrupt handler will not find the active PMC values that had caused the overflow and thus leads to this message while replaying. Case 1: PMU Interrupt happens during replay of other interrupts and counter values gets cleared by PMU callbacks before replay: During replay of interrupts like timer, __do_irq() and doorbell exception, we conditionally enable interrupts via may_hard_irq_enable(). This could potentially create a window to generate a PMI. Since irq soft mask is set to ALL_DISABLED, the PMI will get masked here. We could get IPIs run before perf interrupt is replayed and the PMU events could be deleted or stopped. This will change the PMU SPR values and resets the counters. Snippet of ftrace log showing PMU callbacks invoked in __do_irq(): -0 [051] dns. 132025441306354: __do_irq <-call_do_irq -0 [051] dns. 132025441306430: irq_enter <-__do_irq -0 [051] dns. 132025441306503: irq_enter_rcu <-__do_irq -0 [051] dnH. 132025441306599: xive_get_irq <-__do_irq <<>> -0 [051] dnH. 132025441307770: generic_smp_call_function_single_interrupt <-smp_ipi_demux_relaxed -0 [051] dnH. 132025441307839: flush_smp_call_function_queue <-smp_ipi_demux_relaxed -0 [051] dnH. 132025441308057: _raw_spin_lock <-event_function -0 [051] dnH. 132025441308206: power_pmu_disable <-perf_pmu_disable -0 [051] dnH. 132025441308337: power_pmu_del <-event_sched_out -0 [051] dnH. 132025441308407: power_pmu_read <-power_pmu_del -0 [051] dnH. 132025441308477: read_pmc <-power_pmu_read -0 [051] dnH. 132025441308590: isa207_disable_pmc <-power_pmu_del -0 [051] dnH. 132025441308663: write_pmc <-power_pmu_del -0 [051] dnH. 132025441308787: power_pmu_event_idx <-perf_event_update_userpage -0 [051] dnH. 132025441308859: rcu_read_unlock_strict <-perf_event_update_userpage -0 [051] dnH. 132025441308975: power_pmu_enable <-perf_pmu_enable <<>> -0 [051] dnH. 132025441311108: irq_exit <-__do_irq -0 [051] dns. 132025441311319: performance_monitor_exception <-replay_soft_interrupts Case 2: PMI's masked during local_* operations, example local_add(). If the local_add() operation happens within a local_irq_save(), replay of PMI will be during local_irq_restore(). Similar to case 1, this could also create a window before replay where PMU events gets deleted or stopped. Fix it by updating the PMU callback function power_pmu_disable() to check for pending perf interrupt. If there is an overflown PMC and pending perf interrupt indicated in paca, clear the PMI bit in paca to drop that sample. Clearing of PMI bit is done in power_pmu_disable() since disable is invoked before any event gets deleted/stopped. With this fix, if there are more than one event running in the PMU, there is a chance that we clear the PMI bit for the event which is not getting deleted/stopped. The other events may still remain active. Hence to make sure we don't drop valid sample in such cases, another check is added in power_pmu_enable. 
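In outline, the two sides of the fix behave as follows (a condensed sketch of the hunks in the diff below; locking and the surrounding MMCR0 handling are omitted):

	/* power_pmu_disable(): decide what to do with a PMI that was masked */
	if (any_pmc_overflown(cpuhw))
		clear_pmi_irq_pending();	/* drop it, the overflown PMC is being reset */
	else
		WARN_ON(pmi_irq_pending());	/* pending PMI but no overflown PMC */

	/* power_pmu_enable(): events are being re-enabled unchanged */
	hard_irq_disable();
	if (any_pmc_overflown(cpuhw))
		set_pmi_irq_pending();		/* an active event still holds a valid sample */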
This checks if there is an overflown PMC found among the active events and if so enable back the PMI bit. Two new helper functions are introduced to clear/set the PMI, ie clear_pmi_irq_pending() and set_pmi_irq_pending(). Helper function pmi_irq_pending() is introduced to give a warning if there is pending PMI bit in paca, but no PMC is overflown. Also there are corner cases which result in performance monitor interrupts being triggered during power_pmu_disable(). This happens since PMXE bit is not cleared along with disabling of other MMCR0 bits in the pmu_disable. Such PMI's could leave the PMU running and could trigger PMI again which will set MMCR0 PMAO bit. This could lead to spurious interrupts in some corner cases. Example, a timer after power_pmu_del() which will re-enable interrupts and triggers a PMI again since PMAO bit is still set. But fails to find valid overflow since PMC was cleared in power_pmu_del(). Fix that by disabling PMXE along with disabling of other MMCR0 bits in power_pmu_disable(). We can't just replay PMI any time. Hence this approach is preferred rather than replaying PMI before resetting overflown PMC. Patch also documents core-book3s on a race condition which can trigger these PMC messages during idle path in PowerNV. Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and replay them") Reported-by: Nageswara R Sastry Suggested-by: Nicholas Piggin Suggested-by: Madhavan Srinivasan Signed-off-by: Athira Rajeev Tested-by: Nageswara R Sastry Reviewed-by: Nicholas Piggin [mpe: Make pmi_irq_pending() return bool, reflow/reword some comments] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1626846509-1350-2-git-send-email-atrajeev@linux.vnet.ibm.com --- arch/powerpc/include/asm/hw_irq.h | 40 +++++++++++++++++++++ arch/powerpc/perf/core-book3s.c | 58 ++++++++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 21cc571ea9c2..5c98a950eca0 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -224,6 +224,42 @@ static inline bool arch_irqs_disabled(void) return arch_irqs_disabled_flags(arch_local_save_flags()); } +static inline void set_pmi_irq_pending(void) +{ + /* + * Invoked from PMU callback functions to set PMI bit in the paca. + * This has to be called with irq's disabled (via hard_irq_disable()). + */ + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(mfmsr() & MSR_EE); + + get_paca()->irq_happened |= PACA_IRQ_PMI; +} + +static inline void clear_pmi_irq_pending(void) +{ + /* + * Invoked from PMU callback functions to clear the pending PMI bit + * in the paca. + */ + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(mfmsr() & MSR_EE); + + get_paca()->irq_happened &= ~PACA_IRQ_PMI; +} + +static inline bool pmi_irq_pending(void) +{ + /* + * Invoked from PMU callback functions to check if there is a pending + * PMI bit in the paca. 
+ */ + if (get_paca()->irq_happened & PACA_IRQ_PMI) + return true; + + return false; +} + #ifdef CONFIG_PPC_BOOK3S /* * To support disabling and enabling of irq with PMI, set of @@ -408,6 +444,10 @@ static inline void do_hard_irq_enable(void) BUILD_BUG(); } +static inline void clear_pmi_irq_pending(void) { } +static inline void set_pmi_irq_pending(void) { } +static inline bool pmi_irq_pending(void) { return false; } + static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs, unsigned long val) { } diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 8d4ff93462fb..1f1ded29a06e 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -857,6 +857,19 @@ static void write_pmc(int idx, unsigned long val) } } +static int any_pmc_overflown(struct cpu_hw_events *cpuhw) +{ + int i, idx; + + for (i = 0; i < cpuhw->n_events; i++) { + idx = cpuhw->event[i]->hw.idx; + if ((idx) && ((int)read_pmc(idx) < 0)) + return idx; + } + + return 0; +} + /* Called from sysrq_handle_showregs() */ void perf_event_print_debug(void) { @@ -1281,11 +1294,13 @@ static void power_pmu_disable(struct pmu *pmu) /* * Set the 'freeze counters' bit, clear EBE/BHRBA/PMCC/PMAO/FC56 + * Also clear PMXE to disable PMI's getting triggered in some + * corner cases during PMU disable. */ val = mmcr0 = mfspr(SPRN_MMCR0); val |= MMCR0_FC; val &= ~(MMCR0_EBE | MMCR0_BHRBA | MMCR0_PMCC | MMCR0_PMAO | - MMCR0_FC56); + MMCR0_PMXE | MMCR0_FC56); /* Set mmcr0 PMCCEXT for p10 */ if (ppmu->flags & PPMU_ARCH_31) val |= MMCR0_PMCCEXT; @@ -1299,6 +1314,23 @@ static void power_pmu_disable(struct pmu *pmu) mb(); isync(); + /* + * Some corner cases could clear the PMU counter overflow + * while a masked PMI is pending. One such case is when + * a PMI happens during interrupt replay and perf counter + * values are cleared by PMU callbacks before replay. + * + * If any PMC corresponding to the active PMU events are + * overflown, disable the interrupt by clearing the paca + * bit for PMI since we are disabling the PMU now. + * Otherwise provide a warning if there is PMI pending, but + * no counter is found overflown. + */ + if (any_pmc_overflown(cpuhw)) + clear_pmi_irq_pending(); + else + WARN_ON(pmi_irq_pending()); + val = mmcra = cpuhw->mmcr.mmcra; /* @@ -1390,6 +1422,15 @@ static void power_pmu_enable(struct pmu *pmu) * (possibly updated for removal of events). */ if (!cpuhw->n_added) { + /* + * If there is any active event with an overflown PMC + * value, set back PACA_IRQ_PMI which would have been + * cleared in power_pmu_disable(). + */ + hard_irq_disable(); + if (any_pmc_overflown(cpuhw)) + set_pmi_irq_pending(); + mtspr(SPRN_MMCRA, cpuhw->mmcr.mmcra & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr.mmcr1); if (ppmu->flags & PPMU_ARCH_31) @@ -2337,6 +2378,14 @@ static void __perf_event_interrupt(struct pt_regs *regs) break; } } + + /* + * Clear PACA_IRQ_PMI in case it was set by + * set_pmi_irq_pending() when PMU was enabled + * after accounting for interrupts. + */ + clear_pmi_irq_pending(); + if (!active) /* reset non active counters that have overflowed */ write_pmc(i + 1, 0); @@ -2356,6 +2405,13 @@ static void __perf_event_interrupt(struct pt_regs *regs) } } } + + /* + * During system wide profiling or while a specific CPU is monitored for an + * event, some corner cases could cause PMC to overflow in idle path. This + * will trigger a PMI after waking up from idle.
Since counter values are _not_ + * saved/restored in idle path, can lead to below "Can't find PMC" message. + */ if (unlikely(!found) && !arch_irq_disabled_regs(regs)) printk_ratelimited(KERN_WARNING "Can't find PMC that caused IRQ\n"); From 5402e239d09feea482d25d60df9b908cfaf9ec3c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 29 Nov 2021 13:09:15 +1000 Subject: [PATCH 112/240] powerpc/64s: Get LPID bit width from device tree Allow the LPID bit width and partition table size to be set at runtime from the device tree. Move the PID bit width detection into the same place. KVM does not support using the extra bits yet, this is mainly required to get the PTCR register values correct (so KVM will run but it will not allocate > 4096 LPIDs). OPAL firmware provides this property for POWER10 CPUs since skiboot commit 9b85f7d961f2 ("hdata: add mmu-pid-bits and mmu-lpid-bits for POWER10 CPUs"). Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211129030915.1888332-1-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 9 ++--- arch/powerpc/mm/book3s64/pgtable.c | 5 --- arch/powerpc/mm/book3s64/radix_pgtable.c | 13 +------ arch/powerpc/mm/init_64.c | 46 +++++++++++++++++++++++- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index c02f42d1031e..8c500dd6fee4 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -62,6 +62,9 @@ extern struct patb_entry *partition_tb; #define PRTS_MASK 0x1f /* process table size field */ #define PRTB_MASK 0x0ffffffffffff000UL +/* Number of supported LPID bits */ +extern unsigned int mmu_lpid_bits; + /* Number of supported PID bits */ extern unsigned int mmu_pid_bits; @@ -76,10 +79,8 @@ extern unsigned long __ro_after_init radix_mem_block_size; #define PRTB_SIZE_SHIFT (mmu_pid_bits + 4) #define PRTB_ENTRIES (1ul << mmu_pid_bits) -/* - * Power9 currently only support 64K partition table size. - */ -#define PATB_SIZE_SHIFT 16 +#define PATB_SIZE_SHIFT (mmu_lpid_bits + 4) +#define PATB_ENTRIES (1ul << mmu_lpid_bits) typedef unsigned long mm_context_id_t; struct spinlock; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9e16c7b1a6c5..13d1fbddecb9 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -207,17 +207,12 @@ void __init mmu_partition_table_init(void) unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; unsigned long ptcr; - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); /* Initialize the Partition Table with no entries */ partition_tb = memblock_alloc(patb_size, patb_size); if (!partition_tb) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, patb_size, patb_size); - /* - * update partition table control register, - * 64 K size. 
- */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 77820036c722..1f4afc37843d 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -33,7 +33,6 @@ #include -unsigned int mmu_pid_bits; unsigned int mmu_base_pid; unsigned long radix_mem_block_size __ro_after_init; @@ -357,18 +356,13 @@ static void __init radix_init_pgtable(void) -1, PAGE_KERNEL)); } - /* Find out how many PID bits are supported */ if (!cpu_has_feature(CPU_FTR_HVMODE) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { /* * Older versions of KVM on these machines prefer if the * guest only uses the low 19 PID bits. */ - if (!mmu_pid_bits) - mmu_pid_bits = 19; - } else { - if (!mmu_pid_bits) - mmu_pid_bits = 20; + mmu_pid_bits = 19; } mmu_base_pid = 1; @@ -449,11 +443,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; - /* Find MMU PID size */ - prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); - if (prop && size == 4) - mmu_pid_bits = be32_to_cpup(prop); - /* Grab page size encodings */ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); if (!prop) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 386be136026e..3e5f9ac9dded 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -370,6 +370,9 @@ void register_page_bootmem_memmap(unsigned long section_nr, #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 +unsigned int mmu_lpid_bits; +unsigned int mmu_pid_bits; + static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); static int __init parse_disable_radix(char *p) @@ -437,19 +440,60 @@ static void __init early_check_vec5(void) } } +static int __init dt_scan_mmu_pid_width(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + /* Find MMU LPID, PID register size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size); + if (prop && size == 4) + mmu_lpid_bits = be32_to_cpup(prop); + + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + if (!mmu_pid_bits && !mmu_lpid_bits) + return 0; + + return 1; +} + void __init mmu_early_init_devtree(void) { + bool hvmode = !!(mfmsr() & MSR_HV); + /* Disable radix mode based on kernel command line. */ if (disable_radix) cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + of_scan_flat_dt(dt_scan_mmu_pid_width, NULL); + if (hvmode && !mmu_lpid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + mmu_lpid_bits = 12; /* POWER8-10 */ + else + mmu_lpid_bits = 10; /* POWER7 */ + } + if (!mmu_pid_bits) { + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + mmu_pid_bits = 20; /* POWER9-10 */ + } + /* * Check /chosen/ibm,architecture-vec-5 if running as a guest. * When running bare-metal, we can use radix if we like * even though the ibm,architecture-vec-5 property created by * skiboot doesn't have the necessary bits set.
*/ - if (!(mfmsr() & MSR_HV)) + if (!hvmode) early_check_vec5(); if (early_radix_enabled()) { From f1797e4de1146009c888bcf8b6bb6648d55394f1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Nov 2021 11:10:43 +0100 Subject: [PATCH 113/240] powerpc/modules: Don't WARN on first module allocation attempt module_alloc() first tries to allocate module text within 24-bit direct-jump range of the kernel text, and tries a wider allocation if the first one fails. When the first allocation fails, the following is observed in the kernel logs: vmap allocation for size 2400256 failed: use vmalloc= to increase size systemd-udevd: vmalloc error: size 2395133, vm_struct allocation failed, mode:0xcc0(GFP_KERNEL), nodemask=(null) CPU: 0 PID: 127 Comm: systemd-udevd Tainted: G W 5.15.5-gentoo-PowerMacG4 #9 Call Trace: [e2a53a50] [c0ba0048] dump_stack_lvl+0x80/0xb0 (unreliable) [e2a53a70] [c0540128] warn_alloc+0x11c/0x2b4 [e2a53b50] [c0531be8] __vmalloc_node_range+0xd8/0x64c [e2a53c10] [c00338c0] module_alloc+0xa0/0xac [e2a53c40] [c027a368] load_module+0x2ae0/0x8148 [e2a53e30] [c027fc78] sys_finit_module+0xfc/0x130 [e2a53f30] [c0035098] ret_from_syscall+0x0/0x28 ... Add the __GFP_NOWARN flag to the first allocation so that no warning appears when it fails. Reported-by: Erhard Furtner Fixes: 2ec13df16704 ("powerpc/modules: Load modules closer to kernel text") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/93c9b84d6ec76aaf7b4f03468e22433a6d308674.1638267035.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/module.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index ed04a3ba66fe..40a583e9d3c7 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -90,16 +90,17 @@ int module_finalize(const Elf_Ehdr *hdr, } static __always_inline void * -__module_alloc(unsigned long size, unsigned long start, unsigned long end) +__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn) { pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC; + gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0); /* * Don't do huge page allocations for modules yet until more testing * is done. STRICT_MODULE_RWX may require extra work to support this * too. */ - return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL, prot, + return __vmalloc_node_range(size, 1, start, end, gfp, prot, VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } @@ -114,13 +115,13 @@ void *module_alloc(unsigned long size) /* First try within 32M limit from _etext to avoid branch trampolines */ if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) - ptr = __module_alloc(size, limit, MODULES_END); + ptr = __module_alloc(size, limit, MODULES_END, true); if (!ptr) - ptr = __module_alloc(size, MODULES_VADDR, MODULES_END); + ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false); return ptr; #else - return __module_alloc(size, VMALLOC_START, VMALLOC_END); + return __module_alloc(size, VMALLOC_START, VMALLOC_END, false); #endif } From df1f679d19edb9eeb67cc2f96b29375f21991945 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Nov 2021 10:32:42 +0100 Subject: [PATCH 114/240] powerpc/powermac: Add missing lockdep_register_key() KeyWest i2c @0xf8001003 irq 42 /uni-n@f8000000/i2c@f8001000 BUG: key c2d00cbc has not been registered!
------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(1) WARNING: CPU: 0 PID: 1 at kernel/locking/lockdep.c:4801 lockdep_init_map_type+0x4c0/0xb4c Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.15.5-gentoo-PowerMacG4 #9 NIP: c01a9428 LR: c01a9428 CTR: 00000000 REGS: e1033cf0 TRAP: 0700 Not tainted (5.15.5-gentoo-PowerMacG4) MSR: 00029032 CR: 24002002 XER: 00000000 GPR00: c01a9428 e1033db0 c2d1cf20 00000016 00000004 00000001 c01c0630 e1033a73 GPR08: 00000000 00000000 00000000 e1033db0 24002004 00000000 f8729377 00000003 GPR16: c1829a9c 00000000 18305357 c1416fc0 c1416f80 c006ac60 c2d00ca8 c1416f00 GPR24: 00000000 c21586f0 c2160000 00000000 c2d00cbc c2170000 c216e1a0 c2160000 NIP [c01a9428] lockdep_init_map_type+0x4c0/0xb4c LR [c01a9428] lockdep_init_map_type+0x4c0/0xb4c Call Trace: [e1033db0] [c01a9428] lockdep_init_map_type+0x4c0/0xb4c (unreliable) [e1033df0] [c1c177b8] kw_i2c_add+0x334/0x424 [e1033e20] [c1c18294] pmac_i2c_init+0x9ec/0xa9c [e1033e80] [c1c1a790] smp_core99_probe+0xbc/0x35c [e1033eb0] [c1c03cb0] kernel_init_freeable+0x190/0x5a4 [e1033f10] [c000946c] kernel_init+0x28/0x154 [e1033f30] [c0035148] ret_from_kernel_thread+0x14/0x1c Add missing lockdep_register_key() Reported-by: Erhard Furtner Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/69e4f55565bb45ebb0843977801b245af0c666fe.1638264741.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/powermac/low_i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index f77a59b5c2e1..de34fa34c42d 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -582,6 +582,7 @@ static void __init kw_i2c_add(struct pmac_i2c_host_kw *host, bus->close = kw_i2c_close; bus->xfer = kw_i2c_xfer; mutex_init(&bus->mutex); + lockdep_register_key(&bus->lock_key); lockdep_set_class(&bus->mutex, &bus->lock_key); if (controller == busnode) bus->flags = pmac_i2c_multibus; From af11dee4361b3519981fa04d014873f9d9edd6ac Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Nov 2021 09:42:37 +0100 Subject: [PATCH 115/240] powerpc/32s: Fix shift-out-of-bounds in KASAN init ================================================================================ UBSAN: shift-out-of-bounds in arch/powerpc/mm/kasan/book3s_32.c:22:23 shift exponent -1 is negative CPU: 0 PID: 0 Comm: swapper Not tainted 5.15.5-gentoo-PowerMacG4 #9 Call Trace: [c214be60] [c0ba0048] dump_stack_lvl+0x80/0xb0 (unreliable) [c214be80] [c0b99288] ubsan_epilogue+0x10/0x5c [c214be90] [c0b98fe0] __ubsan_handle_shift_out_of_bounds+0x94/0x138 [c214bf00] [c1c0f010] kasan_init_region+0xd8/0x26c [c214bf30] [c1c0ed84] kasan_init+0xc0/0x198 [c214bf70] [c1c08024] setup_arch+0x18/0x54c [c214bfc0] [c1c037f0] start_kernel+0x90/0x33c [c214bff0] [00003610] 0x3610 ================================================================================ This happens when the directly mapped memory is a power of 2. 
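To make the failure concrete, here is a minimal user-space sketch of the arithmetic; the variable names mirror kasan_init_region() but the values are illustrative, not the kernel's. When the directly mapped region is an exact power of 2, k_size equals k_size_base, ffs(0) returns 0, and the old expression shifts by -1:

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	int main(void)
	{
		unsigned int k_size = 1U << 23;		/* power-of-2 region ... */
		unsigned int k_size_base = 1U << 23;	/* ... exactly covered by the base size */
		int shift = ffs(k_size - k_size_base);	/* ffs(0) == 0 */

		/* Old code did 1 << (shift - 1), i.e. a shift by -1 here (UB). */
		int k_size_more = shift ? 1 << (shift - 1) : 0;

		printf("shift=%d k_size_more=%d\n", shift, k_size_more);
		return 0;
	}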
Fix it by checking the shift and setting the result to 0 when the shift is -1. Fixes: 7974c4732642 ("powerpc/32s: Implement dedicated kasan_init_region()") Reported-by: Erhard Furtner Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=215169 Link: https://lore.kernel.org/r/15cbc3439d4ad988b225e2119ec99502a5cc6ad3.1638261744.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/kasan/book3s_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c index 202bd260a009..35b287b0a8da 100644 --- a/arch/powerpc/mm/kasan/book3s_32.c +++ b/arch/powerpc/mm/kasan/book3s_32.c @@ -19,7 +19,8 @@ int __init kasan_init_region(void *start, size_t size) block = memblock_alloc(k_size, k_size_base); if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) { - int k_size_more = 1 << (ffs(k_size - k_size_base) - 1); + int shift = ffs(k_size - k_size_base); + int k_size_more = shift ? 1 << (shift - 1) : 0; setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL); if (k_size_more >= SZ_128K) From 62ea67e31981bca95ec16c37e2a1fba68f3dd8c5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Nov 2021 12:36:04 -0800 Subject: [PATCH 116/240] powerpc/signal32: Use struct_group() to zero spe regs In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memset(), avoid intentionally writing across neighboring fields. Add a struct_group() for the spe registers so that memset() can correctly reason about the size: In function 'fortify_memset_chk', inlined from 'restore_user_regs.part.0' at arch/powerpc/kernel/signal_32.c:539:3: >> include/linux/fortify-string.h:195:4: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()?
[-Werror=attribute-warning] 195 | __write_overflow_field(); | ^~~~~~~~~~~~~~~~~~~~~~~~ Reported-by: kernel test robot Signed-off-by: Kees Cook Reviewed-by: Christophe Leroy Acked-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211118203604.1288379-1-keescook@chromium.org --- arch/powerpc/include/asm/processor.h | 6 ++++-- arch/powerpc/kernel/signal_32.c | 14 +++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e39bd0ff69f3..978a80308466 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -191,8 +191,10 @@ struct thread_struct { int used_vsr; /* set if process has used VSX */ #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE - unsigned long evr[32]; /* upper 32-bits of SPE regs */ - u64 acc; /* Accumulator */ + struct_group(spe, + unsigned long evr[32]; /* upper 32-bits of SPE regs */ + u64 acc; /* Accumulator */ + ); unsigned long spefscr; /* SPE & eFP status */ unsigned long spefscr_last; /* SPEFSCR value on last prctl call or trap return */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 3e053e2fd6b6..d84c434b2b78 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -527,16 +527,20 @@ static long restore_user_regs(struct pt_regs *regs, regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1)); #ifdef CONFIG_SPE - /* force the process to reload the spe registers from - current->thread when it next does spe instructions */ + /* + * Force the process to reload the spe registers from + * current->thread when it next does spe instructions. + * Since this is user ABI, we must enforce the sizing. + */ + BUILD_BUG_ON(sizeof(current->thread.spe) != ELF_NEVRREG * sizeof(u32)); regs_set_return_msr(regs, regs->msr & ~MSR_SPE); if (msr & MSR_SPE) { /* restore spe registers from the stack */ - unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs, - ELF_NEVRREG * sizeof(u32), failed); + unsafe_copy_from_user(&current->thread.spe, &sr->mc_vregs, + sizeof(current->thread.spe), failed); current->thread.used_spe = true; } else if (current->thread.used_spe) - memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); + memset(&current->thread.spe, 0, sizeof(current->thread.spe)); /* Always get SPEFSCR back */ unsafe_get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG, failed); From 2a2ac8a7018b953cd23d770ebd28f8e1ea365df4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Dec 2021 17:54:18 +0100 Subject: [PATCH 117/240] powerpc/xive: Fix compile when !CONFIG_PPC_POWERNV. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The automatic "save & restore" of interrupt context is a POWER10/XIVE2 feature exploited by KVM under the PowerNV platform. It is not available under pSeries and the associated toggle should not be exposed under the XIVE debugfs directory. Introduce a platform handler for debugfs initialization and move the 'save-restore' entry under the native (PowerNV) backend to fix compile when !CONFIG_PPC_POWERNV.
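The change below follows a common kernel shape: the shared core stays backend-agnostic and each backend opts into extra behaviour through a function pointer in its ops structure. A condensed, self-contained sketch of that pattern, with simplified types (the real fields live in xive-internal.h):

	#include <stdio.h>

	struct dentry;			/* opaque debugfs directory handle */

	struct xive_ops {
		const char *name;
		/* Optional hook: only backends with extra debugfs entries set it. */
		int (*debug_create)(struct dentry *xive_dir);
	};

	static int native_debug_create(struct dentry *xive_dir)
	{
		/* The PowerNV-only 'save-restore' toggle would be created here. */
		printf("native: create save-restore entry\n");
		return 0;
	}

	static const struct xive_ops native_ops = {
		.name = "native",
		.debug_create = native_debug_create,
	};

	static const struct xive_ops spapr_ops = {
		.name = "spapr",	/* no hook: nothing platform-specific exposed */
	};

	static void core_debugfs_create(const struct xive_ops *ops, struct dentry *dir)
	{
		/* Common entries are created unconditionally; extras only if hooked. */
		if (ops->debug_create)
			ops->debug_create(dir);
	}

	int main(void)
	{
		core_debugfs_create(&native_ops, NULL);	/* prints */
		core_debugfs_create(&spapr_ops, NULL);	/* silently skips */
		return 0;
	}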
Fixes: 1e7684dc4fc7 ("powerpc/xive: Add a debugfs toggle for save-restore") Reported-by: kernel test robot Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201165418.1041842-1-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 4 +++- arch/powerpc/sysdev/xive/native.c | 11 +++++++++++ arch/powerpc/sysdev/xive/xive-internal.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 43f7f7df6407..1ca5564bda9d 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1847,7 +1847,9 @@ static void xive_core_debugfs_create(void) &xive_eq_debug_fops); } debugfs_create_bool("store-eoi", 0600, xive_dir, &xive_store_eoi); - debugfs_create_bool("save-restore", 0600, xive_dir, &xive_has_save_restore); + + if (xive_ops->debug_create) + xive_ops->debug_create(xive_dir); } #else static inline void xive_core_debugfs_create(void) { } diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index d6a091dc1bce..d4243dab230e 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -461,6 +461,14 @@ void xive_native_sync_queue(u32 hw_irq) } EXPORT_SYMBOL_GPL(xive_native_sync_queue); +#ifdef CONFIG_DEBUG_FS +static int xive_native_debug_create(struct dentry *xive_dir) +{ + debugfs_create_bool("save-restore", 0600, xive_dir, &xive_has_save_restore); + return 0; +} +#endif + static const struct xive_ops xive_native_ops = { .populate_irq_data = xive_native_populate_irq_data, .configure_irq = xive_native_configure_irq, @@ -478,6 +486,9 @@ static const struct xive_ops xive_native_ops = { .get_ipi = xive_native_get_ipi, .put_ipi = xive_native_put_ipi, #endif /* CONFIG_SMP */ +#ifdef CONFIG_DEBUG_FS + .debug_create = xive_native_debug_create, +#endif /* CONFIG_DEBUG_FS */ .name = "native", }; diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h index e0941bc64430..fe6d95d54af9 100644 --- a/arch/powerpc/sysdev/xive/xive-internal.h +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -58,6 +58,7 @@ struct xive_ops { void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); #endif int (*debug_show)(struct seq_file *m, void *private); + int (*debug_create)(struct dentry *xive_dir); const char *name; }; From f6a1987773a5908bae7bcadbeec0bcab25df7b20 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 1 Dec 2021 15:21:12 +1000 Subject: [PATCH 118/240] KVM: PPC: Book3S HV P9: Remove unused ri_set local variable ri_set is set and never used. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201052112.2137167-1-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_p9_entry.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index ebb4781859e2..a28e5b3daabd 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -768,7 +768,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc s64 hdec, dec; u64 purr, spurr; u64 *exsave; - bool ri_set; int trap; unsigned long msr; unsigned long host_hfscr; @@ -968,18 +967,12 @@ tm_return_to_guest: /* 0x2 bit for HSRR is only used by PR and P7/8 HV paths, clear it */ trap = local_paca->kvm_hstate.scratch0 & ~0x2; - /* HSRR interrupts leave MSR[RI] unchanged, SRR interrupts clear it. 
*/ - ri_set = false; - if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) { - if (trap != BOOK3S_INTERRUPT_SYSCALL && - (vcpu->arch.shregs.msr & MSR_RI)) - ri_set = true; + if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) exsave = local_paca->exgen; - } else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) { + else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) exsave = local_paca->exnmi; - } else { /* trap == 0x200 */ + else /* trap == 0x200 */ exsave = local_paca->exmc; - } vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1; vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2; From 511d25d6b789fffcb20a3eb71899cf974a31bd9d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 1 Sep 2021 18:45:12 +1000 Subject: [PATCH 119/240] KVM: PPC: Book3S: Suppress warnings when allocating too big memory slots The userspace can trigger "vmalloc size %lu allocation failure: exceeds total pages" via the KVM_SET_USER_MEMORY_REGION ioctl. This silences the warning by checking the limit before calling vzalloc() and returning -ENOMEM on failure. This does not call the underlying vmalloc helpers as __vmalloc_node() is only exported when CONFIG_TEST_VMALLOC_MODULE and __vmalloc_node_range() is not exported at all. Spotted by syzkaller. Signed-off-by: Alexey Kardashevskiy [mpe: Use 'size' for the variable rather than 'cb'] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210901084512.1658628-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_hv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 98e90bdf1f27..7986911b873c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4872,8 +4872,12 @@ static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, unsigned long npages = mem->memory_size >> PAGE_SHIFT; if (change == KVM_MR_CREATE) { - slot->arch.rmap = vzalloc(array_size(npages, - sizeof(*slot->arch.rmap))); + unsigned long size = array_size(npages, sizeof(*slot->arch.rmap)); + + if ((size >> PAGE_SHIFT) > totalram_pages()) + return -ENOMEM; + + slot->arch.rmap = vzalloc(size); if (!slot->arch.rmap) return -ENOMEM; } From 792020907b11c6f9246c21977cab3bad985ae4b6 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 1 Sep 2021 18:45:50 +1000 Subject: [PATCH 120/240] KVM: PPC: Book3S: Suppress failed alloc warning in H_COPY_TOFROM_GUEST H_COPY_TOFROM_GUEST is an hcall for an upper level VM to access its nested VMs' memory. The userspace can trigger WARN_ON_ONCE(!(gfp & __GFP_NOWARN)) in __alloc_pages() by constructing a tiny VM which only does H_COPY_TOFROM_GUEST with a too-big GPR9 (the number of bytes to copy). This silences the warning by adding __GFP_NOWARN. Spotted by syzkaller.
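This fix and the previous one apply the same rule of thumb to allocations whose size comes from untrusted guest or user input: bound the size up front where a sensible limit exists, and otherwise pass __GFP_NOWARN so an absurd request fails quietly with an error rather than a warning splat. A rough user-space analogue of the bounding half; bounded_zalloc and the RAM constant are illustrative stand-ins, not kernel APIs:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	/* Illustrative cap, standing in for the totalram_pages() check. */
	#define FAKE_TOTAL_RAM_BYTES	(256UL * 1024 * 1024)

	/* Reject an attacker-controlled size before allocating; fail quietly. */
	static void *bounded_zalloc(size_t n)
	{
		if (n == 0 || n > FAKE_TOTAL_RAM_BYTES)
			return NULL;	/* the kernel versions return -ENOMEM */
		return calloc(1, n);
	}

	int main(void)
	{
		void *ok = bounded_zalloc(4096);
		void *bad = bounded_zalloc(SIZE_MAX);	/* hostile size: NULL, no splat */

		printf("ok=%p bad=%p\n", ok, bad);
		free(ok);
		return 0;
	}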
Signed-off-by: Alexey Kardashevskiy Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210901084550.1658699-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index e57c08b968c0..a2e34efb8d31 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -580,7 +580,7 @@ long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu) if (eaddr & (0xFFFUL << 52)) return H_PARAMETER; - buf = kzalloc(n, GFP_KERNEL); + buf = kzalloc(n, GFP_KERNEL | __GFP_NOWARN); if (!buf) return H_NO_MEM; From 79b74a68486765a4fe685ac4069bc71366c538f5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:36 +1000 Subject: [PATCH 121/240] powerpc: Remove unused FW_FEATURE_NATIVE references FW_FEATURE_NATIVE_ALWAYS and FW_FEATURE_NATIVE_POSSIBLE are always zero and never do anything. Remove them. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-2-npiggin@gmail.com --- arch/powerpc/include/asm/firmware.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 97a3bd9ffeb9..9b702d2b80fb 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -80,8 +80,6 @@ enum { FW_FEATURE_POWERNV_ALWAYS = 0, FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, - FW_FEATURE_NATIVE_POSSIBLE = 0, - FW_FEATURE_NATIVE_ALWAYS = 0, FW_FEATURE_POSSIBLE = #ifdef CONFIG_PPC_PSERIES FW_FEATURE_PSERIES_POSSIBLE | @@ -91,9 +89,6 @@ enum { #endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_POSSIBLE | -#endif -#ifdef CONFIG_PPC_NATIVE - FW_FEATURE_NATIVE_ALWAYS | #endif 0, FW_FEATURE_ALWAYS = @@ -105,9 +100,6 @@ enum { #endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_ALWAYS & -#endif -#ifdef CONFIG_PPC_NATIVE - FW_FEATURE_NATIVE_ALWAYS & #endif FW_FEATURE_POSSIBLE, From 7ebc49031d0418dc9ca8475b8133a3a161221ef5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:37 +1000 Subject: [PATCH 122/240] powerpc: Rename PPC_NATIVE to PPC_HASH_MMU_NATIVE PPC_NATIVE now only controls the native HPT code, so rename it to be more descriptive. Restrict it to Book3S only. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-3-npiggin@gmail.com --- arch/powerpc/mm/book3s64/Makefile | 2 +- arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/platforms/52xx/Kconfig | 2 +- arch/powerpc/platforms/Kconfig | 4 ++-- arch/powerpc/platforms/cell/Kconfig | 2 +- arch/powerpc/platforms/chrp/Kconfig | 2 +- arch/powerpc/platforms/embedded6xx/Kconfig | 2 +- arch/powerpc/platforms/maple/Kconfig | 2 +- arch/powerpc/platforms/microwatt/Kconfig | 2 +- arch/powerpc/platforms/pasemi/Kconfig | 2 +- arch/powerpc/platforms/powermac/Kconfig | 2 +- arch/powerpc/platforms/powernv/Kconfig | 2 +- arch/powerpc/platforms/pseries/Kconfig | 2 +- 13 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 1b56d3af47d4..319f4b7f3357 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -6,7 +6,7 @@ CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) obj-y += hash_pgtable.o hash_utils.o slb.o \ mmu_context.o pgtable.o hash_tlb.o -obj-$(CONFIG_PPC_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index cfd45245d009..92680da5229a 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1091,7 +1091,7 @@ void __init hash__early_init_mmu(void) ps3_early_mm_init(); else if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_pseries(); - else if (IS_ENABLED(CONFIG_PPC_NATIVE)) + else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE)) hpte_init_native(); if (!mmu_hash_ops.hpte_insert) diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig index 99d60acc20c8..b72ed2950ca8 100644 --- a/arch/powerpc/platforms/52xx/Kconfig +++ b/arch/powerpc/platforms/52xx/Kconfig @@ -34,7 +34,7 @@ config PPC_EFIKA bool "bPlan Efika 5k2. MPC5200B based computer" depends on PPC_MPC52xx select PPC_RTAS - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE config PPC_LITE5200 bool "Freescale Lite5200 Eval Board" diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index e02d29a9d12f..d41dad227de8 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -40,9 +40,9 @@ config EPAPR_PARAVIRT In case of doubt, say Y -config PPC_NATIVE +config PPC_HASH_MMU_NATIVE bool - depends on PPC_BOOK3S_32 || PPC64 + depends on PPC_BOOK3S help Support for running natively on the hardware, i.e. without a hypervisor. 
This option is not user-selectable but should diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index cb70c5f25bc6..db4465c51b56 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -8,7 +8,7 @@ config PPC_CELL_COMMON select PPC_DCR_MMIO select PPC_INDIRECT_PIO select PPC_INDIRECT_MMIO - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select PPC_RTAS select IRQ_EDGE_EOI_HANDLER diff --git a/arch/powerpc/platforms/chrp/Kconfig b/arch/powerpc/platforms/chrp/Kconfig index 9b5c5505718a..ff30ed579a39 100644 --- a/arch/powerpc/platforms/chrp/Kconfig +++ b/arch/powerpc/platforms/chrp/Kconfig @@ -11,6 +11,6 @@ config PPC_CHRP select RTAS_ERROR_LOGGING select PPC_MPC106 select PPC_UDBG_16550 - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select FORCE_PCI default y diff --git a/arch/powerpc/platforms/embedded6xx/Kconfig b/arch/powerpc/platforms/embedded6xx/Kconfig index 4c6d703a4284..c54786f8461e 100644 --- a/arch/powerpc/platforms/embedded6xx/Kconfig +++ b/arch/powerpc/platforms/embedded6xx/Kconfig @@ -55,7 +55,7 @@ config MVME5100 select FORCE_PCI select PPC_INDIRECT_PCI select PPC_I8259 - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select PPC_UDBG_16550 help This option enables support for the Motorola (now Emerson) MVME5100 diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig index 86ae210bee9a..7fd84311ade5 100644 --- a/arch/powerpc/platforms/maple/Kconfig +++ b/arch/powerpc/platforms/maple/Kconfig @@ -9,7 +9,7 @@ config PPC_MAPLE select GENERIC_TBSYNC select PPC_UDBG_16550 select PPC_970_NAP - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select PPC_RTAS select MMIO_NVRAM select ATA_NONSTANDARD if ATA diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig index 8f6a81978461..62b51e37fc05 100644 --- a/arch/powerpc/platforms/microwatt/Kconfig +++ b/arch/powerpc/platforms/microwatt/Kconfig @@ -5,7 +5,7 @@ config PPC_MICROWATT select PPC_XICS select PPC_ICS_NATIVE select PPC_ICP_NATIVE - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select PPC_UDBG_16550 select ARCH_RANDOM help diff --git a/arch/powerpc/platforms/pasemi/Kconfig b/arch/powerpc/platforms/pasemi/Kconfig index c52731a7773f..bc7137353a7f 100644 --- a/arch/powerpc/platforms/pasemi/Kconfig +++ b/arch/powerpc/platforms/pasemi/Kconfig @@ -5,7 +5,7 @@ config PPC_PASEMI select MPIC select FORCE_PCI select PPC_UDBG_16550 - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select MPIC_BROKEN_REGREAD help This option enables support for PA Semi's PWRficient line diff --git a/arch/powerpc/platforms/powermac/Kconfig b/arch/powerpc/platforms/powermac/Kconfig index b97bf12801eb..2b56df145b82 100644 --- a/arch/powerpc/platforms/powermac/Kconfig +++ b/arch/powerpc/platforms/powermac/Kconfig @@ -6,7 +6,7 @@ config PPC_PMAC select FORCE_PCI select PPC_INDIRECT_PCI if PPC32 select PPC_MPC106 if PPC32 - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select ZONE_DMA if PPC32 default y diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 043eefbbdd28..cd754e116184 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -2,7 +2,7 @@ config PPC_POWERNV depends on PPC64 && PPC_BOOK3S bool "IBM PowerNV (Non-Virtualized) platform support" - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select PPC_XICS select PPC_ICP_NATIVE select PPC_XIVE_NATIVE diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig index 9bd542164128..30618750bd98 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -17,7 +17,7 @@ config PPC_PSERIES select PPC_RTAS_DAEMON select RTAS_ERROR_LOGGING select PPC_UDBG_16550 - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE select PPC_DOORBELL select HOTPLUG_CPU select ARCH_RANDOM From a4135cbebde8375e2a9d91261b4546ce3f3b9b0f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:38 +1000 Subject: [PATCH 123/240] powerpc/pseries: Stop selecting PPC_HASH_MMU_NATIVE The pseries platform does not use the native hash code but the PAPR virtualised hash interfaces, so remove PPC_HASH_MMU_NATIVE. This requires moving tlbiel code from hash_native.c to hash_utils.c. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-4-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/tlbflush.h | 4 - arch/powerpc/mm/book3s64/hash_native.c | 104 ------------------ arch/powerpc/mm/book3s64/hash_utils.c | 104 ++++++++++++++++++ arch/powerpc/platforms/pseries/Kconfig | 1 - 4 files changed, 104 insertions(+), 109 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 215973b4cb26..d2e80f178b6d 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -14,7 +14,6 @@ enum { TLB_INVAL_SCOPE_LPID = 1, /* invalidate TLBs for current LPID */ }; -#ifdef CONFIG_PPC_NATIVE static inline void tlbiel_all(void) { /* @@ -30,9 +29,6 @@ static inline void tlbiel_all(void) else hash__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL); } -#else -static inline void tlbiel_all(void) { BUG(); } -#endif static inline void tlbiel_all_lpid(bool radix) { diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index d8279bfe68ea..d2a320828c0b 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,110 +43,6 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); -static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) -{ - unsigned long rb; - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - - asm volatile("tlbiel %0" : : "r" (rb)); -} - -/* - * tlbiel instruction for hash, set invalidation - * i.e., r=1 and is=01 or is=10 or is=11 - */ -static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, - unsigned int pid, - unsigned int ric, unsigned int prs) -{ - unsigned long rb; - unsigned long rs; - unsigned int r = 0; /* hash format */ - - rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); - rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); - - asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) - : "memory"); -} - - -static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa206(set, is); - - ppc_after_tlbiel_barrier(); -} - -static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) -{ - unsigned int set; - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the partition table cache if this is HV mode. - */ - if (early_cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - - /* - * Now invalidate the process table cache. 
UPRT=0 HPT modes (what - * current hardware implements) do not use the process table, but - * add the flushes anyway. - * - * From ISA v3.0B p. 1078: - * The following forms are invalid. - * * PRS=1, R=0, and RIC!=2 (The only process-scoped - * HPT caching is of the Process Table.) - */ - tlbiel_hash_set_isa300(0, is, 0, 2, 1); - - /* - * Then flush the sets of the TLB proper. Hash mode uses - * partition scoped TLB translations, which may be flushed - * in !HV mode. - */ - for (set = 0; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); - - ppc_after_tlbiel_barrier(); - - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -void hash__tlbiel_all(unsigned int action) -{ - unsigned int is; - - switch (action) { - case TLB_INVAL_SCOPE_GLOBAL: - is = 3; - break; - case TLB_INVAL_SCOPE_LPID: - is = 2; - break; - default: - BUG(); - } - - if (early_cpu_has_feature(CPU_FTR_ARCH_300)) - tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) - tlbiel_all_isa206(POWER8_TLB_SETS, is); - else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) - tlbiel_all_isa206(POWER7_TLB_SETS, is); - else - WARN(1, "%s called on pre-POWER7 CPU\n", __func__); -} - static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 92680da5229a..97a36fa3940e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -175,6 +175,110 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = { }, }; +static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is) +{ + unsigned long rb; + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + + asm volatile("tlbiel %0" : : "r" (rb)); +} + +/* + * tlbiel instruction for hash, set invalidation + * i.e., r=1 and is=01 or is=10 or is=11 + */ +static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, + unsigned int pid, + unsigned int ric, unsigned int prs) +{ + unsigned long rb; + unsigned long rs; + unsigned int r = 0; /* hash format */ + + rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53)); + rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) + : "memory"); +} + + +static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa206(set, is); + + ppc_after_tlbiel_barrier(); +} + +static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) +{ + unsigned int set; + + asm volatile("ptesync": : :"memory"); + + /* + * Flush the partition table cache if this is HV mode. + */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); + + /* + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. + * + * From ISA v3.0B p. 1078: + * The following forms are invalid. + * * PRS=1, R=0, and RIC!=2 (The only process-scoped + * HPT caching is of the Process Table.) + */ + tlbiel_hash_set_isa300(0, is, 0, 2, 1); + + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. 
+ */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + + ppc_after_tlbiel_barrier(); + + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); +} + +void hash__tlbiel_all(unsigned int action) +{ + unsigned int is; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + is = 3; + break; + case TLB_INVAL_SCOPE_LPID: + is = 2; + break; + default: + BUG(); + } + + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) + tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_207S)) + tlbiel_all_isa206(POWER8_TLB_SETS, is); + else if (early_cpu_has_feature(CPU_FTR_ARCH_206)) + tlbiel_all_isa206(POWER7_TLB_SETS, is); + else + WARN(1, "%s called on pre-POWER7 CPU\n", __func__); +} + /* * 'R' and 'C' update notes: * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 30618750bd98..f7fd91d153a4 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -17,7 +17,6 @@ config PPC_PSERIES select PPC_RTAS_DAEMON select RTAS_ERROR_LOGGING select PPC_UDBG_16550 - select PPC_HASH_MMU_NATIVE select PPC_DOORBELL select HOTPLUG_CPU select ARCH_RANDOM From 935b534c24f014325b72a3619bbbdc18191f9c3d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:39 +1000 Subject: [PATCH 124/240] powerpc/64s: Move and rename do_bad_slb_fault as it is not hash specific slb.c is hash-specific SLB management, but do_bad_slb_fault deals with segment interrupts that occur with radix MMU as well. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-5-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 2 +- arch/powerpc/kernel/exceptions-64s.S | 4 ++-- arch/powerpc/mm/book3s64/slb.c | 16 ---------------- arch/powerpc/mm/fault.c | 24 ++++++++++++++++++++++++ 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index a1d238255f07..3487aab12229 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -564,7 +564,7 @@ DECLARE_INTERRUPT_HANDLER(kernel_bad_stack); /* slb.c */ DECLARE_INTERRUPT_HANDLER_RAW(do_slb_fault); -DECLARE_INTERRUPT_HANDLER(do_bad_slb_fault); +DECLARE_INTERRUPT_HANDLER(do_bad_segment_interrupt); /* hash_utils.c */ DECLARE_INTERRUPT_HANDLER_RAW(do_hash_fault); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 277eccf0f086..2acd7e66694e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1428,7 +1428,7 @@ MMU_FTR_SECTION_ELSE ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD - bl do_bad_slb_fault + bl do_bad_segment_interrupt b interrupt_return_srr @@ -1508,7 +1508,7 @@ MMU_FTR_SECTION_ELSE ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD - bl do_bad_slb_fault + bl do_bad_segment_interrupt b interrupt_return_srr diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index f0037bcc47a0..31f4cef3adac 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -868,19 +868,3 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) return err; } } - -DEFINE_INTERRUPT_HANDLER(do_bad_slb_fault) -{ - int err = regs->result; - - if (err == -EFAULT) { - if 
(user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); - else - bad_page_fault(regs, SIGSEGV); - } else if (err == -EINVAL) { - unrecoverable_exception(regs); - } else { - BUG(); - } -} diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a8d0ce85d39a..2d4a411c7c85 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -620,4 +621,27 @@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv) { bad_page_fault(regs, SIGSEGV); } + +/* + * In radix, segment interrupts indicate the EA is not addressable by the + * page table geometry, so they are always sent here. + * + * In hash, this is called if do_slb_fault returns error. Typically it is + * because the EA was outside the region allowed by software. + */ +DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt) +{ + int err = regs->result; + + if (err == -EFAULT) { + if (user_mode(regs)) + _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar); + else + bad_page_fault(regs, SIGSEGV); + } else if (err == -EINVAL) { + unrecoverable_exception(regs); + } else { + BUG(); + } +} #endif From 0c7cc15e92157c8886c8df3151eac2c43c3dfa2b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:40 +1000 Subject: [PATCH 125/240] powerpc/pseries: move process table registration away from hash-specific code This reduces ifdefs in a later change which makes hash support configurable. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-6-npiggin@gmail.com --- arch/powerpc/platforms/pseries/lpar.c | 56 +++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 3df6bdfea475..06d6a824c0dc 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -712,6 +712,34 @@ void vpa_init(int cpu) #ifdef CONFIG_PPC_BOOK3S_64 +static int pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = 0; + + if (table_size) + flags |= PROC_TABLE_NEW; + if (radix_enabled()) { + flags |= PROC_TABLE_RADIX; + if (mmu_has_feature(MMU_FTR_GTSE)) + flags |= PROC_TABLE_GTSE; + } else + flags |= PROC_TABLE_HPT_SLB; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -1680,34 +1708,6 @@ static int pseries_lpar_resize_hpt(unsigned long shift) return 0; } -static int pseries_lpar_register_process_table(unsigned long base, - unsigned long page_size, unsigned long table_size) -{ - long rc; - unsigned long flags = 0; - - if (table_size) - flags |= PROC_TABLE_NEW; - if (radix_enabled()) { - flags |= PROC_TABLE_RADIX; - if (mmu_has_feature(MMU_FTR_GTSE)) - flags |= PROC_TABLE_GTSE; - } else - flags |= PROC_TABLE_HPT_SLB; - for (;;) { - rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, - page_size, table_size); - if (!H_IS_LONG_BUSY(rc)) - break; - mdelay(get_longbusy_msecs(rc)); - } - if (rc != H_SUCCESS) { - pr_err("Failed to register process table (rc=%ld)\n", rc); - BUG(); - } - return 
rc; -} - void __init hpte_init_pseries(void) { mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; From 3d3282fd34d82caac5005d9c4d4525054eb3cac1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:41 +1000 Subject: [PATCH 126/240] powerpc/pseries: lparcfg don't include slb_size line in radix mode This avoids a change in behaviour in the later patch making hash support configurable. This is possibly a user interface change, so the alternative would be a hard-coded slb_size=0 here. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-7-npiggin@gmail.com --- arch/powerpc/platforms/pseries/lparcfg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index f71eac74ea92..3354c00914fa 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -532,7 +532,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) lppaca_shared_proc(get_lppaca())); #ifdef CONFIG_PPC_BOOK3S_64 - seq_printf(m, "slb_size=%d\n", mmu_slb_size); + if (!radix_enabled()) + seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); maxmem_data(m); From 162b0889bba6e721c33d12e15971618785ca778e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:42 +1000 Subject: [PATCH 127/240] powerpc/64s: move THP trace point creation out of hash specific file In preparation for making hash MMU support configurable, move THP trace point function definitions out of an otherwise hash-specific file. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-8-npiggin@gmail.com --- arch/powerpc/mm/book3s64/Makefile | 2 +- arch/powerpc/mm/book3s64/hash_pgtable.c | 1 - arch/powerpc/mm/book3s64/trace.c | 8 ++++++++ 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/mm/book3s64/trace.c diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 319f4b7f3357..1579e18e098d 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -5,7 +5,7 @@ ccflags-y := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) obj-y += hash_pgtable.o hash_utils.o slb.o \ - mmu_context.o pgtable.o hash_tlb.o + mmu_context.o pgtable.o hash_tlb.o trace.o obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index ad5eff097d31..7ce8914992e3 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -16,7 +16,6 @@ #include -#define CREATE_TRACE_POINTS #include #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c new file mode 100644 index 000000000000..b86e7b906257 --- /dev/null +++ b/arch/powerpc/mm/book3s64/trace.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file is for defining trace points and trace related helpers. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define CREATE_TRACE_POINTS +#include +#endif From 310dce6201fd27fda484e34bf543fb55c33d80b1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:43 +1000 Subject: [PATCH 128/240] powerpc/64s: Make flush_and_reload_slb a no-op when radix is enabled The radix test can exclude slb_flush_all_realmode() from being called because flush_and_reload_slb() is only expected to flush ERAT when called by flush_erat(), which is only on pre-ISA v3.0 CPUs that do not support radix. This helps the later change to make hash support configurable to not introduce runtime changes to radix mode behaviour. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-9-npiggin@gmail.com --- arch/powerpc/kernel/mce_power.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index c2f55fe7092d..cf5263b648fc 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -80,12 +80,12 @@ static bool mce_in_guest(void) #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void) { - /* Invalidate all SLBs */ - slb_flush_all_realmode(); - if (early_radix_enabled()) return; + /* Invalidate all SLBs */ + slb_flush_all_realmode(); + /* * This probably shouldn't happen, but it may be possible it's * called in early boot before SLB shadows are allocated. From bdad5d57dfcc6d2b2f8d0bc9d7e85ee794d1d50e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:44 +1000 Subject: [PATCH 129/240] powerpc/64s: move page size definitions from hash specific file The radix code uses some of the psize variables. Move the common ones from hash_utils.c to pgtable.c. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-10-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 5 ----- arch/powerpc/mm/book3s64/pgtable.c | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 97a36fa3940e..eced266dc5e9 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -99,8 +99,6 @@ */ static unsigned long _SDR1; -struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -EXPORT_SYMBOL_GPL(mmu_psize_defs); u8 hpte_page_sizes[1 << LP_BITS]; EXPORT_SYMBOL_GPL(hpte_page_sizes); @@ -114,9 +112,6 @@ EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; EXPORT_SYMBOL_GPL(mmu_vmalloc_psize); -#ifdef CONFIG_SPARSEMEM_VMEMMAP -int mmu_vmemmap_psize = MMU_PAGE_4K; -#endif int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; EXPORT_SYMBOL_GPL(mmu_kernel_ssize); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 13d1fbddecb9..0e6254160673 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -22,6 +22,13 @@ #include "internal.h" +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif + unsigned long __pmd_frag_nr; EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; From f43d2ffb47c9e86f5ec24e1de6ce6da6808634a2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:45 +1000 Subject: [PATCH 130/240] powerpc/64s: Rename hash_hugetlbpage.c to hugetlbpage.c This file contains functions and data common to radix, so rename it to remove the hash_ prefix. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-11-npiggin@gmail.com --- arch/powerpc/mm/book3s64/Makefile | 2 +- arch/powerpc/mm/book3s64/{hash_hugetlbpage.c => hugetlbpage.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename arch/powerpc/mm/book3s64/{hash_hugetlbpage.c => hugetlbpage.c} (100%) diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 1579e18e098d..501efadb287f 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o -obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c similarity index 100% rename from arch/powerpc/mm/book3s64/hash_hugetlbpage.c rename to arch/powerpc/mm/book3s64/hugetlbpage.c From ffbe5d21d10f9c7890c07fca17db772f941385bf Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:46 +1000 Subject: [PATCH 131/240] powerpc/64: pcpu setup avoid reading mmu_linear_psize on 64e or radix Radix never sets mmu_linear_psize so it's always 4K, which causes pcpu atom_size to always be PAGE_SIZE. 64e sets it to 1GB always. Make paths for these platforms to be explicit about what value they set atom_size to. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-12-npiggin@gmail.com --- arch/powerpc/kernel/setup_64.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6052f5d5ded3..9a493796ce66 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -880,14 +880,23 @@ void __init setup_per_cpu_areas(void) int rc = -EINVAL; /* - * Linear mapping is one of 4K, 1M and 16M. For 4K, no need - * to group units. For larger mappings, use 1M atom which - * should be large enough to contain a number of units. + * BookE and BookS radix are historical values and should be revisited. */ - if (mmu_linear_psize == MMU_PAGE_4K) + if (IS_ENABLED(CONFIG_PPC_BOOK3E)) { + atom_size = SZ_1M; + } else if (radix_enabled()) { atom_size = PAGE_SIZE; - else - atom_size = 1 << 20; + } else { + /* + * Linear mapping is one of 4K, 1M and 16M. For 4K, no need + * to group units. For larger mappings, use 1M atom which + * should be large enough to contain a number of units. + */ + if (mmu_linear_psize == MMU_PAGE_4K) + atom_size = PAGE_SIZE; + else + atom_size = SZ_1M; + } if (pcpu_chosen_fc != PCPU_FC_PAGE) { rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, From 20626177c9de726c48802c15e8635cc154645588 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:47 +1000 Subject: [PATCH 132/240] powerpc: make memremap_compat_align 64s-only memremap_compat_align is only relevant when ZONE_DEVICE is selected. ZONE_DEVICE depends on ARCH_HAS_PTE_DEVMAP, which is only selected by PPC_BOOK3S_64. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-13-npiggin@gmail.com --- arch/powerpc/Kconfig | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 20 ++++++++++++++++++++ arch/powerpc/mm/ioremap.c | 20 -------------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5c61f3511d5a..e3e281a35327 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,7 +129,7 @@ config PPC select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE - select ARCH_HAS_MEMREMAP_COMPAT_ALIGN + select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_BOOK3S_64 select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PHYS_TO_DMA diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 0e6254160673..d3b01f6ba530 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -528,3 +528,23 @@ static int __init pgtable_debugfs_setup(void) return 0; } arch_initcall(pgtable_debugfs_setup); + +#ifdef CONFIG_ZONE_DEVICE +/* + * Override the generic version in mm/memremap.c. + * + * With hash translation, the direct-map range is mapped with just one + * page size selected by htab_init_page_sizes(). Consult + * mmu_psize_defs[] to determine the minimum page size alignment. 
+*/ +unsigned long memremap_compat_align(void) +{ + if (!radix_enabled()) { + unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; + return max(SUBSECTION_SIZE, 1UL << shift); + } + + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 57342154d2b0..4f12504fb405 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -98,23 +98,3 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, return NULL; } - -#ifdef CONFIG_ZONE_DEVICE -/* - * Override the generic version in mm/memremap.c. - * - * With hash translation, the direct-map range is mapped with just one - * page size selected by htab_init_page_sizes(). Consult - * mmu_psize_defs[] to determine the minimum page size alignment. -*/ -unsigned long memremap_compat_align(void) -{ - unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift; - - if (radix_enabled()) - return SUBSECTION_SIZE; - return max(SUBSECTION_SIZE, 1UL << shift); - -} -EXPORT_SYMBOL_GPL(memremap_compat_align); -#endif From 8dbfc0092b5c8c50f011509893bf0396253cd2ab Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:48 +1000 Subject: [PATCH 133/240] powerpc/64e: remove mmu_linear_psize mmu_linear_psize is only set at boot once on 64e, is not necessarily the correct size of the linear map pages, and is never used anywhere. Remove it. Signed-off-by: Nicholas Piggin [mpe: Retain the extern, so we can use IS_ENABLED() for related code] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-14-npiggin@gmail.com --- arch/powerpc/mm/nohash/tlb.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 647bf454a0fa..311281063d48 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -150,7 +150,6 @@ static inline int mmu_get_tsize(int psize) */ #ifdef CONFIG_PPC64 -int mmu_linear_psize; /* Page size used for the linear mapping */ int mmu_pte_psize; /* Page size used for PTE pages */ int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ @@ -657,14 +656,6 @@ static void early_init_this_mmu(void) static void __init early_init_mmu_global(void) { - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - /* XXX This should be decided at runtime based on supported * page sizes in the TLB, but for now let's assume 16M is * always there and a good fit (which it probably is) From af3a0ea41cbf38e967611e262126357d2fd23955 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:49 +1000 Subject: [PATCH 134/240] powerpc/64s: Fix radix MMU when MMU_FTR_HPTE_TABLE is clear There are a few places that require MMU_FTR_HPTE_TABLE to be set even when running in radix mode. Fix those up. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-15-npiggin@gmail.com --- arch/powerpc/mm/pgtable.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index ce9482383144..abb3198bd277 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -81,9 +81,6 @@ static struct page *maybe_pte_to_page(pte_t pte) static pte_t set_pte_filter_hash(pte_t pte) { - if (radix_enabled()) - return pte; - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { @@ -112,6 +109,9 @@ static inline pte_t set_pte_filter(pte_t pte) { struct page *pg; + if (radix_enabled()) + return pte; + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) return set_pte_filter_hash(pte); @@ -144,6 +144,9 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, { struct page *pg; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return pte; + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) return pte; From debeda017189e40bff23d1c3d2e4567ca8541aed Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:50 +1000 Subject: [PATCH 135/240] powerpc/64s: Always define arch unmapped area calls To avoid any functional changes to radix paths when building with hash MMU support disabled (and CONFIG_PPC_MM_SLICES=n), always define the arch get_unmapped_area calls on 64s platforms. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-16-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/hash.h | 4 --- arch/powerpc/include/asm/book3s/64/mmu.h | 6 ++++ arch/powerpc/mm/hugetlbpage.c | 16 ++++++--- arch/powerpc/mm/mmap.c | 40 +++++++++++++++++++---- arch/powerpc/mm/slice.c | 20 ------------ 5 files changed, 51 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 674fe0e890dc..a7a0572f3846 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -99,10 +99,6 @@ * Defines the address of the vmemap area, in its own region on * hash table CPUs. 
*/ -#ifdef CONFIG_PPC_MM_SLICES -#define HAVE_ARCH_UNMAPPED_AREA -#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#endif /* CONFIG_PPC_MM_SLICES */ /* PTEIDX nibble */ #define _PTEIDX_SECONDARY 0x8 diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 8c500dd6fee4..a265c5618ce8 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -4,6 +4,12 @@ #include +#ifdef CONFIG_HUGETLB_PAGE +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#endif +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + #ifndef __ASSEMBLY__ /* * Page size definition diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 82d8b368ca6d..ddead41e2194 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -542,20 +542,26 @@ retry: return page; } -#ifdef CONFIG_PPC_MM_SLICES +#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static inline int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + return shift_to_mmu_psize(huge_page_shift(hstate)); +} + unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct hstate *hstate = hstate_file(file); - int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - #ifdef CONFIG_PPC_RADIX_MMU if (radix_enabled()) return radix__hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); #endif - return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); +#ifdef CONFIG_PPC_MM_SLICES + return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); +#endif + BUG(); } #endif diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index ae683fdc716c..c475cf810aa8 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -80,6 +80,7 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); } +#ifdef HAVE_ARCH_UNMAPPED_AREA #ifdef CONFIG_PPC_RADIX_MMU /* * Same function as generic code used only for radix, because we don't need to overload @@ -181,11 +182,42 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, */ return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } +#endif + +unsigned long arch_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ +#ifdef CONFIG_PPC_MM_SLICES + return slice_get_unmapped_area(addr, len, flags, + mm_ctx_user_psize(¤t->mm->context), 0); +#else + BUG(); +#endif +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ +#ifdef CONFIG_PPC_MM_SLICES + return slice_get_unmapped_area(addr0, len, flags, + mm_ctx_user_psize(¤t->mm->context), 1); +#else + BUG(); +#endif +} +#endif /* HAVE_ARCH_UNMAPPED_AREA */ static void radix__arch_pick_mmap_layout(struct mm_struct *mm, unsigned long random_factor, struct rlimit *rlim_stack) { +#ifdef CONFIG_PPC_RADIX_MMU if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = radix__arch_get_unmapped_area; @@ -193,13 +225,9 @@ static void radix__arch_pick_mmap_layout(struct mm_struct *mm, mm->mmap_base = mmap_base(random_factor, rlim_stack); mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; } -} -#else -/* dummy */ -extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, - unsigned long random_factor, - 
struct rlimit *rlim_stack); #endif +} + /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 82b45b1cb973..f42711f865f3 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -639,26 +639,6 @@ return_addr: } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); -unsigned long arch_get_unmapped_area(struct file *filp, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - return slice_get_unmapped_area(addr, len, flags, - mm_ctx_user_psize(¤t->mm->context), 0); -} - -unsigned long arch_get_unmapped_area_topdown(struct file *filp, - const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - return slice_get_unmapped_area(addr0, len, flags, - mm_ctx_user_psize(¤t->mm->context), 1); -} - unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) { unsigned char *psizes; From c28573744b74eb6de19add503d6a986795c4c137 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:51 +1000 Subject: [PATCH 136/240] powerpc/64s: Make hash MMU support configurable This adds Kconfig selection which allows 64s hash MMU support to be disabled. It can be disabled if radix support is enabled, the minimum supported CPU type is POWER9 (or higher), and KVM is not selected. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-17-npiggin@gmail.com --- arch/powerpc/Kconfig | 3 ++- arch/powerpc/include/asm/mmu.h | 16 +++++++++++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 14 ++++++++---- arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/mm/init_64.c | 13 +++++++++-- arch/powerpc/platforms/Kconfig.cputype | 28 ++++++++++++++++++++++-- arch/powerpc/platforms/cell/Kconfig | 1 + arch/powerpc/platforms/maple/Kconfig | 1 + arch/powerpc/platforms/microwatt/Kconfig | 2 +- arch/powerpc/platforms/pasemi/Kconfig | 1 + arch/powerpc/platforms/powermac/Kconfig | 1 + arch/powerpc/platforms/powernv/Kconfig | 2 +- drivers/misc/cxl/Kconfig | 1 + drivers/misc/lkdtm/Makefile | 2 +- 14 files changed, 71 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e3e281a35327..2555563efff0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -846,7 +846,7 @@ config FORCE_MAX_ZONEORDER config PPC_SUBPAGE_PROT bool "Support setting protections for 4k subpages (subpage_prot syscall)" default n - depends on PPC_BOOK3S_64 && PPC_64K_PAGES + depends on PPC_64S_HASH_MMU && PPC_64K_PAGES help This option adds support for system call to allow user programs to set access permissions (read/write, readonly, or no access) @@ -944,6 +944,7 @@ config PPC_MEM_KEYS prompt "PowerPC Memory Protection Keys" def_bool y depends on PPC_BOOK3S_64 + depends on PPC_64S_HASH_MMU select ARCH_USES_HIGH_VMA_FLAGS select ARCH_HAS_PKEYS help diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8abe8e42e045..5f41565a1e5d 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -157,7 +157,7 @@ DECLARE_PER_CPU(int, next_tlbcam_idx); enum { MMU_FTRS_POSSIBLE = -#if defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_BOOK3S_604) +#if defined(CONFIG_PPC_BOOK3S_604) MMU_FTR_HPTE_TABLE | #endif #ifdef CONFIG_PPC_8xx @@ -184,15 +184,18 @@ enum { MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | #endif #ifdef CONFIG_PPC_BOOK3S_64 + MMU_FTR_KERNEL_RO | 
+#ifdef CONFIG_PPC_64S_HASH_MMU MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | - MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA | + MMU_FTR_68_BIT_VA | MMU_FTR_HPTE_TABLE | #endif #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | MMU_FTR_GTSE | #endif /* CONFIG_PPC_RADIX_MMU */ +#endif #ifdef CONFIG_PPC_KUAP MMU_FTR_BOOK3S_KUAP | #endif /* CONFIG_PPC_KUAP */ @@ -224,6 +227,13 @@ enum { #define MMU_FTRS_ALWAYS MMU_FTR_TYPE_FSL_E #endif +/* BOOK3S_64 options */ +#if defined(CONFIG_PPC_RADIX_MMU) && !defined(CONFIG_PPC_64S_HASH_MMU) +#define MMU_FTRS_ALWAYS MMU_FTR_TYPE_RADIX +#elif !defined(CONFIG_PPC_RADIX_MMU) && defined(CONFIG_PPC_64S_HASH_MMU) +#define MMU_FTRS_ALWAYS MMU_FTR_HPTE_TABLE +#endif + #ifndef MMU_FTRS_ALWAYS #define MMU_FTRS_ALWAYS 0 #endif @@ -329,7 +339,7 @@ static __always_inline bool radix_enabled(void) return mmu_has_feature(MMU_FTR_TYPE_RADIX); } -static inline bool early_radix_enabled(void) +static __always_inline bool early_radix_enabled(void) { return early_mmu_has_feature(MMU_FTR_TYPE_RADIX); } diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index d2b35fb9181d..1ac8d7357195 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -273,6 +273,9 @@ static int __init feat_enable_mmu_hash(struct dt_cpu_feature *f) { u64 lpcr; + if (!IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + return 0; + lpcr = mfspr(SPRN_LPCR); lpcr &= ~LPCR_ISL; @@ -292,6 +295,9 @@ static int __init feat_enable_mmu_hash_v3(struct dt_cpu_feature *f) { u64 lpcr; + if (!IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + return 0; + lpcr = mfspr(SPRN_LPCR); lpcr &= ~(LPCR_ISL | LPCR_UPRT | LPCR_HR); mtspr(SPRN_LPCR, lpcr); @@ -305,15 +311,15 @@ static int __init feat_enable_mmu_hash_v3(struct dt_cpu_feature *f) static int __init feat_enable_mmu_radix(struct dt_cpu_feature *f) { -#ifdef CONFIG_PPC_RADIX_MMU + if (!IS_ENABLED(CONFIG_PPC_RADIX_MMU)) + return 0; + + cur_cpu_spec->mmu_features |= MMU_FTR_KERNEL_RO; cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX; - cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE; cur_cpu_spec->mmu_features |= MMU_FTR_GTSE; cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU; return 1; -#endif - return 0; } static int __init feat_enable_dscr(struct dt_cpu_feature *f) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 6a58532300c5..f947b77386a9 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -69,6 +69,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + select PPC_64S_HASH_MMU select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_PSERIES || PPC_POWERNV) help Support running unmodified book3s_64 and book3s_32 guest kernels diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 3e5f9ac9dded..35f46bf54281 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -472,8 +472,12 @@ void __init mmu_early_init_devtree(void) bool hvmode = !!(mfmsr() & MSR_HV); /* Disable radix mode based on kernel command line. 
*/ - if (disable_radix) - cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + if (disable_radix) { + if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + else + pr_warn("WARNING: Ignoring cmdline option disable_radix\n"); + } of_scan_flat_dt(dt_scan_mmu_pid_width, NULL); if (hvmode && !mmu_lpid_bits) { @@ -498,6 +502,7 @@ void __init mmu_early_init_devtree(void) if (early_radix_enabled()) { radix__early_init_devtree(); + /* * We have finalized the translation we are going to use by now. * Radix mode is not limited by RMA / VRMA addressing. @@ -507,5 +512,9 @@ void __init mmu_early_init_devtree(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } else hash__early_init_devtree(); + + if (!(cur_cpu_spec->mmu_features & MMU_FTR_HPTE_TABLE) && + !(cur_cpu_spec->mmu_features & MMU_FTR_TYPE_RADIX)) + panic("kernel does not support any MMU type offered by platform"); } #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index a208997ade88..7ca07df1c374 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -105,9 +105,9 @@ config PPC_BOOK3S_64 select HAVE_MOVE_PMD select HAVE_MOVE_PUD select IRQ_WORK - select PPC_MM_SLICES select PPC_HAVE_KUEP select PPC_HAVE_KUAP + select PPC_64S_HASH_MMU if !PPC_RADIX_MMU config PPC_BOOK3E_64 bool "Embedded processors" @@ -130,11 +130,13 @@ choice config GENERIC_CPU bool "Generic (POWER4 and above)" depends on PPC64 && !CPU_LITTLE_ENDIAN + select PPC_64S_HASH_MMU if PPC_BOOK3S_64 config GENERIC_CPU bool "Generic (POWER8 and above)" depends on PPC64 && CPU_LITTLE_ENDIAN select ARCH_HAS_FAST_MULTIPLIER + select PPC_64S_HASH_MMU config GENERIC_CPU bool "Generic 32 bits powerpc" @@ -143,24 +145,29 @@ config GENERIC_CPU config CELL_CPU bool "Cell Broadband Engine" depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN + select PPC_64S_HASH_MMU config POWER5_CPU bool "POWER5" depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN + select PPC_64S_HASH_MMU config POWER6_CPU bool "POWER6" depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN + select PPC_64S_HASH_MMU config POWER7_CPU bool "POWER7" depends on PPC_BOOK3S_64 select ARCH_HAS_FAST_MULTIPLIER + select PPC_64S_HASH_MMU config POWER8_CPU bool "POWER8" depends on PPC_BOOK3S_64 select ARCH_HAS_FAST_MULTIPLIER + select PPC_64S_HASH_MMU config POWER9_CPU bool "POWER9" @@ -364,6 +371,22 @@ config SPE If in doubt, say Y here. +config PPC_64S_HASH_MMU + bool "Hash MMU Support" + depends on PPC_BOOK3S_64 + select PPC_MM_SLICES + default y + help + Enable support for the Power ISA Hash style MMU. This is implemented + by all IBM Power and other 64-bit Book3S CPUs before ISA v3.0. The + OpenPOWER ISA does not mandate the hash MMU and some CPUs do not + implement it (e.g., Microwatt). + + Note that POWER9 PowerVM platforms only support the hash + MMU. From POWER10 radix is also supported by PowerVM. + + If you're unsure, say Y. + config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 @@ -375,7 +398,8 @@ config PPC_RADIX_MMU you can probably disable this. 
config PPC_RADIX_MMU_DEFAULT - bool "Default to using the Radix MMU when possible" + bool "Default to using the Radix MMU when possible" if PPC_64S_HASH_MMU + depends on PPC_BOOK3S_64 depends on PPC_RADIX_MMU default y help diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index db4465c51b56..34669b060f36 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 config PPC_CELL + select PPC_64S_HASH_MMU if PPC64 bool config PPC_CELL_COMMON diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig index 7fd84311ade5..4c058cc57c90 100644 --- a/arch/powerpc/platforms/maple/Kconfig +++ b/arch/powerpc/platforms/maple/Kconfig @@ -9,6 +9,7 @@ config PPC_MAPLE select GENERIC_TBSYNC select PPC_UDBG_16550 select PPC_970_NAP + select PPC_64S_HASH_MMU select PPC_HASH_MMU_NATIVE select PPC_RTAS select MMIO_NVRAM diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig index 62b51e37fc05..823192e9d38a 100644 --- a/arch/powerpc/platforms/microwatt/Kconfig +++ b/arch/powerpc/platforms/microwatt/Kconfig @@ -5,7 +5,7 @@ config PPC_MICROWATT select PPC_XICS select PPC_ICS_NATIVE select PPC_ICP_NATIVE - select PPC_HASH_MMU_NATIVE + select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU select PPC_UDBG_16550 select ARCH_RANDOM help diff --git a/arch/powerpc/platforms/pasemi/Kconfig b/arch/powerpc/platforms/pasemi/Kconfig index bc7137353a7f..85ae18ddd911 100644 --- a/arch/powerpc/platforms/pasemi/Kconfig +++ b/arch/powerpc/platforms/pasemi/Kconfig @@ -5,6 +5,7 @@ config PPC_PASEMI select MPIC select FORCE_PCI select PPC_UDBG_16550 + select PPC_64S_HASH_MMU select PPC_HASH_MMU_NATIVE select MPIC_BROKEN_REGREAD help diff --git a/arch/powerpc/platforms/powermac/Kconfig b/arch/powerpc/platforms/powermac/Kconfig index 2b56df145b82..130707ec9f99 100644 --- a/arch/powerpc/platforms/powermac/Kconfig +++ b/arch/powerpc/platforms/powermac/Kconfig @@ -6,6 +6,7 @@ config PPC_PMAC select FORCE_PCI select PPC_INDIRECT_PCI if PPC32 select PPC_MPC106 if PPC32 + select PPC_64S_HASH_MMU if PPC64 select PPC_HASH_MMU_NATIVE select ZONE_DMA if PPC32 default y diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index cd754e116184..161dfe024085 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -2,7 +2,7 @@ config PPC_POWERNV depends on PPC64 && PPC_BOOK3S bool "IBM PowerNV (Non-Virtualized) platform support" - select PPC_HASH_MMU_NATIVE + select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU select PPC_XICS select PPC_ICP_NATIVE select PPC_XIVE_NATIVE diff --git a/drivers/misc/cxl/Kconfig b/drivers/misc/cxl/Kconfig index 51aecafdcbdf..5efc4151bf58 100644 --- a/drivers/misc/cxl/Kconfig +++ b/drivers/misc/cxl/Kconfig @@ -6,6 +6,7 @@ config CXL_BASE bool select PPC_COPRO_BASE + select PPC_64S_HASH_MMU config CXL tristate "Support for IBM Coherent Accelerators (CXL)" diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index aa12097668d3..83a7baf5df82 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -11,7 +11,7 @@ lkdtm-$(CONFIG_LKDTM) += usercopy.o lkdtm-$(CONFIG_LKDTM) += stackleak.o lkdtm-$(CONFIG_LKDTM) += cfi.o lkdtm-$(CONFIG_LKDTM) += fortify.o -lkdtm-$(CONFIG_PPC_BOOK3S_64) += powerpc.o +lkdtm-$(CONFIG_PPC_64S_HASH_MMU) += powerpc.o KASAN_SANITIZE_rodata.o := n KASAN_SANITIZE_stackleak.o := n From 
387e220a2e5e630794e1f5219ed6f11e56271c21 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:52 +1000 Subject: [PATCH 137/240] powerpc/64s: Move hash MMU support code under CONFIG_PPC_64S_HASH_MMU Compiling out hash support code when CONFIG_PPC_64S_HASH_MMU=n saves 128kB kernel image size (90kB text) on powernv_defconfig minus KVM, 350kB on pseries_defconfig minus KVM, 40kB on a tiny config. Signed-off-by: Nicholas Piggin [mpe: Fixup defined(ARCH_HAS_MEMREMAP_COMPAT_ALIGN), which needs CONFIG. Fix radix_enabled() use in setup_initial_memory_limit(). Add some stubs to reduce number of ifdefs.] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-18-npiggin@gmail.com --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 8 ++++- arch/powerpc/include/asm/book3s/64/mmu.h | 21 ++++++++++-- .../include/asm/book3s/64/tlbflush-hash.h | 6 ++++ arch/powerpc/include/asm/book3s/pgtable.h | 4 +++ arch/powerpc/include/asm/mmu_context.h | 2 ++ arch/powerpc/include/asm/paca.h | 8 +++++ arch/powerpc/kernel/asm-offsets.c | 2 ++ arch/powerpc/kernel/entry_64.S | 4 +-- arch/powerpc/kernel/exceptions-64s.S | 16 ++++++++++ arch/powerpc/kernel/mce.c | 2 +- arch/powerpc/kernel/mce_power.c | 10 ++++-- arch/powerpc/kernel/paca.c | 18 ++++------- arch/powerpc/kernel/process.c | 13 ++++---- arch/powerpc/kernel/prom.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kexec/core_64.c | 4 +-- arch/powerpc/kexec/ranges.c | 2 +- arch/powerpc/mm/book3s64/Makefile | 15 +++++---- arch/powerpc/mm/book3s64/hugetlbpage.c | 2 ++ arch/powerpc/mm/book3s64/mmu_context.c | 32 +++++++++++++++---- arch/powerpc/mm/book3s64/pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++- arch/powerpc/mm/copro_fault.c | 2 ++ arch/powerpc/mm/ptdump/Makefile | 2 +- arch/powerpc/platforms/powernv/idle.c | 2 ++ arch/powerpc/platforms/powernv/setup.c | 2 ++ arch/powerpc/platforms/pseries/lpar.c | 11 +++++-- arch/powerpc/platforms/pseries/lparcfg.c | 2 +- arch/powerpc/platforms/pseries/mobility.c | 4 +++ arch/powerpc/platforms/pseries/pseries.h | 5 +++ arch/powerpc/platforms/pseries/ras.c | 2 ++ arch/powerpc/platforms/pseries/setup.c | 6 ++-- arch/powerpc/xmon/xmon.c | 8 +++-- drivers/misc/lkdtm/core.c | 2 +- 35 files changed, 172 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2555563efff0..0631c9241af3 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,7 +129,7 @@ config PPC select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE - select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_BOOK3S_64 + select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_64S_HASH_MMU select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PHYS_TO_DMA diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 3004f3323144..21f780942911 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -523,8 +523,14 @@ void slb_save_contents(struct slb_entry *slb_ptr); void slb_dump_contents(struct slb_entry *slb_ptr); extern void slb_vmalloc_update(void); -extern void slb_set_size(u16 size); void preload_new_slb_context(unsigned long start, unsigned long sp); + +#ifdef CONFIG_PPC_64S_HASH_MMU +void slb_set_size(u16 size); +#else +static inline void slb_set_size(u16 size) { } +#endif + #endif /* __ASSEMBLY__ */ /* diff --git 
a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index a265c5618ce8..7fee46e50377 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -105,7 +105,9 @@ typedef struct { * from EA and new context ids to build the new VAs. */ mm_context_id_t id; +#ifdef CONFIG_PPC_64S_HASH_MMU mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE]; +#endif }; /* Number of bits in the mm_cpumask */ @@ -117,7 +119,9 @@ typedef struct { /* Number of user space windows opened in process mm_context */ atomic_t vas_windows; +#ifdef CONFIG_PPC_64S_HASH_MMU struct hash_mm_context *hash_context; +#endif void __user *vdso; /* @@ -140,6 +144,7 @@ typedef struct { #endif } mm_context_t; +#ifdef CONFIG_PPC_64S_HASH_MMU static inline u16 mm_ctx_user_psize(mm_context_t *ctx) { return ctx->hash_context->user_psize; @@ -200,8 +205,15 @@ static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) extern int mmu_linear_psize; extern int mmu_virtual_psize; extern int mmu_vmalloc_psize; -extern int mmu_vmemmap_psize; extern int mmu_io_psize; +#else /* CONFIG_PPC_64S_HASH_MMU */ +#ifdef CONFIG_PPC_64K_PAGES +#define mmu_virtual_psize MMU_PAGE_64K +#else +#define mmu_virtual_psize MMU_PAGE_4K +#endif +#endif +extern int mmu_vmemmap_psize; /* MMU initialization */ void mmu_early_init_devtree(void); @@ -240,8 +252,9 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, * know which translations we will pick. Hence go with hash * restrictions. */ - return hash__setup_initial_memory_limit(first_memblock_base, - first_memblock_size); + if (!early_radix_enabled()) + hash__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); } #ifdef CONFIG_PPC_PSERIES @@ -262,6 +275,7 @@ static inline void radix_init_pseries(void) { } void cleanup_cpu_mmu_context(void); #endif +#ifdef CONFIG_PPC_64S_HASH_MMU static inline int get_user_context(mm_context_t *ctx, unsigned long ea) { int index = ea >> MAX_EA_BITS_PER_CONTEXT; @@ -281,6 +295,7 @@ static inline unsigned long get_user_vsid(mm_context_t *ctx, return get_vsid(context, ea, ssize); } +#endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 3b95769739c7..8b762f282190 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -112,8 +112,14 @@ static inline void hash__flush_tlb_kernel_range(unsigned long start, struct mmu_gather; extern void hash__tlb_flush(struct mmu_gather *tlb); +void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr); + +#ifdef CONFIG_PPC_64S_HASH_MMU /* Private function for use by PCI IO mapping code */ extern void __flush_hash_table_range(unsigned long start, unsigned long end); extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr); +#else +static inline void __flush_hash_table_range(unsigned long start, unsigned long end) { } +#endif #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index ad130e15a126..e8269434ecbe 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -25,6 +25,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define 
__HAVE_PHYS_MEM_ACCESS_PROT +#if defined(CONFIG_PPC32) || defined(CONFIG_PPC_64S_HASH_MMU) /* * This gets called at the end of handling a page fault, when * the kernel has put a new PTE into the page table for the process. @@ -35,6 +36,9 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, * waiting for the inevitable extra hash-table miss exception. */ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); +#else +static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {} +#endif #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 9ba6b585337f..e46394d27785 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -75,6 +75,7 @@ extern void hash__reserve_context_id(int id); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } +#ifdef CONFIG_PPC_64S_HASH_MMU static inline int alloc_extended_context(struct mm_struct *mm, unsigned long ea) { @@ -100,6 +101,7 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) return true; return false; } +#endif #else extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index dc05a862e72a..295573a82c66 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -97,7 +97,9 @@ struct paca_struct { /* this becomes non-zero. */ u8 kexec_state; /* set when kexec down has irqs off */ #ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU struct slb_shadow *slb_shadow_ptr; +#endif struct dtl_entry *dispatch_log; struct dtl_entry *dispatch_log_end; #endif @@ -110,6 +112,7 @@ struct paca_struct { /* used for most interrupts/exceptions */ u64 exgen[EX_SIZE] __attribute__((aligned(0x80))); +#ifdef CONFIG_PPC_64S_HASH_MMU /* SLB related definitions */ u16 vmalloc_sllp; u8 slb_cache_ptr; @@ -120,6 +123,7 @@ struct paca_struct { u32 slb_used_bitmap; /* Bitmaps for first 32 SLB entries. */ u32 slb_kern_bitmap; u32 slb_cache[SLB_CACHE_ENTRIES]; +#endif #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_BOOK3E @@ -149,6 +153,7 @@ struct paca_struct { #endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PPC_BOOK3S +#ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_PPC_MM_SLICES unsigned char mm_ctx_low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; @@ -156,6 +161,7 @@ struct paca_struct { u16 mm_ctx_user_psize; u16 mm_ctx_sllp; #endif +#endif #endif /* @@ -268,9 +274,11 @@ struct paca_struct { #endif /* CONFIG_PPC_PSERIES */ #ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* Capture SLB related old contents in MCE handler. 
*/ struct slb_entry *mce_faulty_slbs; u16 slb_save_cache_ptr; +#endif #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_STACKPROTECTOR unsigned long canary; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index cc05522f50bf..b823f484c640 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -218,10 +218,12 @@ int main(void) OFFSET(PACA_EXGEN, paca_struct, exgen); OFFSET(PACA_EXMC, paca_struct, exmc); OFFSET(PACA_EXNMI, paca_struct, exnmi); +#ifdef CONFIG_PPC_64S_HASH_MMU OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr); OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid); OFFSET(SLBSHADOW_STACKESID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid); OFFSET(SLBSHADOW_SAVEAREA, slb_shadow, save_area); +#endif OFFSET(LPPACA_PMCINUSE, lppaca, pmcregs_in_use); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE OFFSET(PACA_PMCINUSE, paca_struct, pmcregs_in_use); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 70cff7b49e17..9581906b5ee9 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -180,7 +180,7 @@ _GLOBAL(_switch) #endif ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION b 2f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) @@ -232,7 +232,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) slbmte r7,r0 isync 2: -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_64S_HASH_MMU */ clrrdi r7, r8, THREAD_SHIFT /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2acd7e66694e..a30f563bc7a8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1367,11 +1367,15 @@ EXC_COMMON_BEGIN(data_access_common) addi r3,r1,STACK_FRAME_OVERHEAD andis. 
r0,r4,DSISR_DABRMATCH@h bne- 1f +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION bl do_hash_fault MMU_FTR_SECTION_ELSE bl do_page_fault ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + bl do_page_fault +#endif b interrupt_return_srr 1: bl do_break @@ -1414,6 +1418,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) EXC_COMMON_BEGIN(data_access_slb_common) GEN_COMMON data_access_slb +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ addi r3,r1,STACK_FRAME_OVERHEAD @@ -1426,6 +1431,9 @@ MMU_FTR_SECTION_ELSE /* Radix case, access is outside page table range */ li r3,-EFAULT ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + li r3,-EFAULT +#endif std r3,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_segment_interrupt @@ -1460,11 +1468,15 @@ EXC_VIRT_END(instruction_access, 0x4400, 0x80) EXC_COMMON_BEGIN(instruction_access_common) GEN_COMMON instruction_access addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION bl do_hash_fault MMU_FTR_SECTION_ELSE bl do_page_fault ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + bl do_page_fault +#endif b interrupt_return_srr @@ -1494,6 +1506,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) EXC_COMMON_BEGIN(instruction_access_slb_common) GEN_COMMON instruction_access_slb +#ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ addi r3,r1,STACK_FRAME_OVERHEAD @@ -1506,6 +1519,9 @@ MMU_FTR_SECTION_ELSE /* Radix case, access is outside page table range */ li r3,-EFAULT ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) +#else + li r3,-EFAULT +#endif std r3,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_segment_interrupt diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index fd829f7f25a4..2503dd4713b9 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -586,7 +586,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, mc_error_class[evt->error_class] : "Unknown"; printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* Display faulty slb contents for SLB errors. 
*/ if (evt->error_type == MCE_ERROR_TYPE_SLB && !in_guest) slb_dump_contents(local_paca->mce_faulty_slbs); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index cf5263b648fc..a48ff18d6d65 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -77,7 +77,7 @@ static bool mce_in_guest(void) } /* flush SLBs and reload */ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU void flush_and_reload_slb(void) { if (early_radix_enabled()) @@ -99,7 +99,7 @@ void flush_and_reload_slb(void) void flush_erat(void) { -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { flush_and_reload_slb(); return; @@ -114,7 +114,7 @@ void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; @@ -499,8 +499,10 @@ static int mce_handle_ierror(struct pt_regs *regs, unsigned long srr1, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: +#ifdef CONFIG_PPC_64S_HASH_MMU if (local_paca->in_mce == 1) slb_save_contents(local_paca->mce_faulty_slbs); +#endif handled = mce_flush(MCE_FLUSH_SLB); break; case MCE_ERROR_TYPE_ERAT: @@ -588,8 +590,10 @@ static int mce_handle_derror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: +#ifdef CONFIG_PPC_64S_HASH_MMU if (local_paca->in_mce == 1) slb_save_contents(local_paca->mce_faulty_slbs); +#endif if (mce_flush(MCE_FLUSH_SLB)) handled = 1; break; diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 4208b4044d12..39da688a9455 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -139,8 +139,7 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) } #endif /* CONFIG_PPC_PSERIES */ -#ifdef CONFIG_PPC_BOOK3S_64 - +#ifdef CONFIG_PPC_64S_HASH_MMU /* * 3 persistent SLBs are allocated here. The buffer will be zero * initially, hence will all be invaild until we actually write them. 
@@ -169,8 +168,7 @@ static struct slb_shadow * __init new_slb_shadow(int cpu, unsigned long limit) return s; } - -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_64S_HASH_MMU */ #ifdef CONFIG_PPC_PSERIES /** @@ -226,7 +224,7 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU new_paca->slb_shadow_ptr = NULL; #endif @@ -307,7 +305,7 @@ void __init allocate_paca(int cpu) #ifdef CONFIG_PPC_PSERIES paca->lppaca_ptr = new_lppaca(cpu, limit); #endif -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU paca->slb_shadow_ptr = new_slb_shadow(cpu, limit); #endif #ifdef CONFIG_PPC_PSERIES @@ -328,7 +326,7 @@ void __init free_unused_pacas(void) paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = new_ptrs_size; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (early_radix_enabled()) { /* Ugly fixup, see new_slb_shadow() */ memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr), @@ -341,9 +339,9 @@ void __init free_unused_pacas(void) paca_ptrs_size + paca_struct_size, nr_cpu_ids); } +#ifdef CONFIG_PPC_64S_HASH_MMU void copy_mm_to_paca(struct mm_struct *mm) { -#ifdef CONFIG_PPC_BOOK3S mm_context_t *context = &mm->context; #ifdef CONFIG_PPC_MM_SLICES @@ -356,7 +354,5 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; #endif -#else /* !CONFIG_PPC_BOOK3S */ - return; -#endif } +#endif /* CONFIG_PPC_64S_HASH_MMU */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5d2333d2a283..a64cfbb85ca2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1240,7 +1240,7 @@ struct task_struct *__switch_to(struct task_struct *prev, { struct thread_struct *new_thread, *old_thread; struct task_struct *last; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU struct ppc64_tlb_batch *batch; #endif @@ -1249,7 +1249,7 @@ struct task_struct *__switch_to(struct task_struct *prev, WARN_ON(!irqs_disabled()); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1328,6 +1328,7 @@ struct task_struct *__switch_to(struct task_struct *prev, */ #ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* * This applies to a process that was context switched while inside * arch_enter_lazy_mmu_mode(), to re-activate the batch that was @@ -1339,6 +1340,7 @@ struct task_struct *__switch_to(struct task_struct *prev, batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } +#endif /* * Math facilities are masked out of the child MSR in copy_thread. @@ -1689,7 +1691,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -2333,10 +2335,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * the heap, we can put it above 1TB so it is backed by a 1TB * segment. Otherwise the heap will be in the bottom 1TB * which always uses 256MB segments and this may result in a - * performance penalty. We don't need to worry about radix. For - * radix, mmu_highuser_ssize remains unchanged from 256MB. 
+ * performance penalty. */ - if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) + if (!radix_enabled() && !is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); #endif diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 2e67588f6f6e..75678ff04dd7 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -231,7 +231,7 @@ static void __init check_cpu_pa_features(unsigned long node) ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 9a493796ce66..703a2e6ab08d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -886,7 +886,7 @@ void __init setup_per_cpu_areas(void) atom_size = SZ_1M; } else if (radix_enabled()) { atom_size = PAGE_SIZE; - } else { + } else if (IS_ENABLED(CONFIG_PPC_64S_HASH_MMU)) { /* * Linear mapping is one of 4K, 1M and 16M. For 4K, no need * to group units. For larger mappings, use 1M atom which diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 66678518b938..635b5fc30b53 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -378,7 +378,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -420,4 +420,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_64S_HASH_MMU */ diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index 6b81c852feab..563e9989a5bf 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -296,7 +296,7 @@ int add_initrd_mem_range(struct crash_mem **mem_ranges) return ret; } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /** * add_htab_mem_range - Adds htab range to the given memory ranges list, * if it exists diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index 501efadb287f..2d50cac499c5 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -2,20 +2,23 @@ ccflags-y := $(NO_MINIMAL_TOC) +obj-y += mmu_context.o pgtable.o trace.o +ifdef CONFIG_PPC_64S_HASH_MMU CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - -obj-y += hash_pgtable.o hash_utils.o slb.o \ - mmu_context.o pgtable.o hash_tlb.o trace.o +obj-y += hash_pgtable.o hash_utils.o hash_tlb.o slb.o obj-$(CONFIG_PPC_HASH_MMU_NATIVE) += hash_native.o -obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +endif + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o ifdef CONFIG_HUGETLB_PAGE obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o obj-$(CONFIG_PPC_PKEY) += pkeys.o diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c 
b/arch/powerpc/mm/book3s64/hugetlbpage.c index a688e1324ae5..95b2a283fd6e 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -16,6 +16,7 @@ unsigned int hpage_shift; EXPORT_SYMBOL(hpage_shift); +#ifdef CONFIG_PPC_64S_HASH_MMU int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, unsigned int shift, unsigned int mmu_psize) @@ -122,6 +123,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } +#endif pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c10fc8a72fb3..24aa953c9311 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -31,6 +31,7 @@ static int alloc_context_id(int min_id, int max_id) return ida_alloc_range(&mmu_context_ida, min_id, max_id, GFP_KERNEL); } +#ifdef CONFIG_PPC_64S_HASH_MMU void hash__reserve_context_id(int id) { int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); @@ -50,7 +51,9 @@ int hash__alloc_context_id(void) return alloc_context_id(MIN_USER_CONTEXT, max); } EXPORT_SYMBOL_GPL(hash__alloc_context_id); +#endif +#ifdef CONFIG_PPC_64S_HASH_MMU static int realloc_context_ids(mm_context_t *ctx) { int i, id; @@ -150,6 +153,13 @@ void hash__setup_new_exec(void) slb_setup_new_exec(); } +#else +static inline int hash__init_new_context(struct mm_struct *mm) +{ + BUILD_BUG(); + return 0; +} +#endif static int radix__init_new_context(struct mm_struct *mm) { @@ -175,7 +185,9 @@ static int radix__init_new_context(struct mm_struct *mm) */ asm volatile("ptesync;isync" : : : "memory"); +#ifdef CONFIG_PPC_64S_HASH_MMU mm->context.hash_context = NULL; +#endif return index; } @@ -213,14 +225,22 @@ EXPORT_SYMBOL_GPL(__destroy_context); static void destroy_contexts(mm_context_t *ctx) { - int index, context_id; + if (radix_enabled()) { + ida_free(&mmu_context_ida, ctx->id); + } else { +#ifdef CONFIG_PPC_64S_HASH_MMU + int index, context_id; - for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { - context_id = ctx->extended_id[index]; - if (context_id) - ida_free(&mmu_context_ida, context_id); + for (index = 0; index < ARRAY_SIZE(ctx->extended_id); index++) { + context_id = ctx->extended_id[index]; + if (context_id) + ida_free(&mmu_context_ida, context_id); + } + kfree(ctx->hash_context); +#else + BUILD_BUG(); // radix_enabled() should be constant true +#endif } - kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index d3b01f6ba530..79ce3c22a29d 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -529,7 +529,7 @@ static int __init pgtable_debugfs_setup(void) } arch_initcall(pgtable_debugfs_setup); -#ifdef CONFIG_ZONE_DEVICE +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN) /* * Override the generic version in mm/memremap.c. 
* diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 1f4afc37843d..3c4f0ebe5df8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -334,7 +334,7 @@ static void __init radix_init_pgtable(void) u64 i; /* We don't support slb for radix */ - mmu_slb_size = 0; + slb_set_size(0); /* * Create the linear mapping @@ -565,6 +565,7 @@ void __init radix__early_init_mmu(void) { unsigned long lpcr; +#ifdef CONFIG_PPC_64S_HASH_MMU #ifdef CONFIG_PPC_64K_PAGES /* PAGE_SIZE mappings */ mmu_virtual_psize = MMU_PAGE_64K; @@ -581,6 +582,7 @@ void __init radix__early_init_mmu(void) mmu_vmemmap_psize = MMU_PAGE_2M; } else mmu_vmemmap_psize = mmu_virtual_psize; +#endif #endif /* * initialize page table size diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 8acd00178956..c1cb21a00884 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -82,6 +82,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(copro_handle_mm_fault); +#ifdef CONFIG_PPC_64S_HASH_MMU int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) { u64 vsid, vsidkey; @@ -146,3 +147,4 @@ void copro_flush_all_slbs(struct mm_struct *mm) cxl_slbia(mm); } EXPORT_SYMBOL_GPL(copro_flush_all_slbs); +#endif diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 4050cbb55acf..b533caaf0910 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -10,5 +10,5 @@ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o ifdef CONFIG_PTDUMP_DEBUGFS obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o +obj-$(CONFIG_PPC_64S_HASH_MMU) += hashpagetable.o endif diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 95458fd9572c..885ef229aba1 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -491,12 +491,14 @@ subcore_woken: mtspr(SPRN_SPRG3, local_paca->sprg_vdso); +#ifdef CONFIG_PPC_64S_HASH_MMU /* * The SLB has to be restored here, but it sometimes still * contains entries, so the __ variant must be used to prevent * multi hits. */ __slb_restore_bolted_realmode(); +#endif return srr1; } diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 5ef6b8afb3d0..f37d6524a24d 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -211,6 +211,7 @@ static void __init pnv_init(void) #endif add_preferred_console("hvc", 0, NULL); +#ifdef CONFIG_PPC_64S_HASH_MMU if (!radix_enabled()) { size_t size = sizeof(struct slb_entry) * mmu_slb_size; int i; @@ -223,6 +224,7 @@ static void __init pnv_init(void) cpu_to_node(i)); } } +#endif } static void __init pnv_init_IRQ(void) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 06d6a824c0dc..fac5d86777db 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -58,6 +58,7 @@ EXPORT_SYMBOL(plpar_hcall); EXPORT_SYMBOL(plpar_hcall9); EXPORT_SYMBOL(plpar_hcall_norets); +#ifdef CONFIG_PPC_64S_HASH_MMU /* * H_BLOCK_REMOVE supported block size for this page size in segment who's base * page size is that page size. @@ -66,6 +67,7 @@ EXPORT_SYMBOL(plpar_hcall_norets); * page size. 
*/ static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init; +#endif /* * Due to the involved complexity, and that the current hypervisor is only @@ -689,7 +691,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -702,7 +704,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_64S_HASH_MMU */ /* * Register dispatch trace log, if one has been allocated. @@ -740,6 +742,8 @@ static int pseries_lpar_register_process_table(unsigned long base, return rc; } +#ifdef CONFIG_PPC_64S_HASH_MMU + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -1730,6 +1734,7 @@ void __init hpte_init_pseries(void) if (cpu_has_feature(CPU_FTR_ARCH_300)) pseries_lpar_register_process_table(0, 0, 0); } +#endif /* CONFIG_PPC_64S_HASH_MMU */ #ifdef CONFIG_PPC_RADIX_MMU void radix_init_pseries(void) @@ -1932,6 +1937,7 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) return rc; } +#ifdef CONFIG_PPC_64S_HASH_MMU static unsigned long vsid_unscramble(unsigned long vsid, int ssize) { unsigned long protovsid; @@ -1992,6 +1998,7 @@ static int __init reserve_vrma_context_id(void) return 0; } machine_device_initcall(pseries, reserve_vrma_context_id); +#endif #ifdef CONFIG_DEBUG_FS /* debugfs file interface for vpa data */ diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 3354c00914fa..c7940fcfc911 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -531,7 +531,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (!radix_enabled()) seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 210a37a065fb..85033f392c78 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -451,11 +451,15 @@ static void prod_others(void) static u16 clamp_slb_size(void) { +#ifdef CONFIG_PPC_64S_HASH_MMU u16 prev = mmu_slb_size; slb_set_size(SLB_MIN_SIZE); return prev; +#else + return 0; +#endif } static int do_suspend(void) diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 3544778e06d0..b4c63c481f33 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -113,6 +113,11 @@ int dlpar_workqueue_init(void); extern u32 pseries_security_flavor; void pseries_setup_security_mitigations(void); + +#ifdef CONFIG_PPC_64S_HASH_MMU void pseries_lpar_read_hblkrm_characteristics(void); +#else +static inline void pseries_lpar_read_hblkrm_characteristics(void) { } +#endif #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 56092dccfdb8..74c9b1b5bc66 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -526,6 +526,7 @@ static int mce_handle_err_realmode(int disposition, u8 error_type) disposition = RTAS_DISP_FULLY_RECOVERED; break; case MC_ERROR_TYPE_SLB: +#ifdef 
CONFIG_PPC_64S_HASH_MMU /* * Store the old slb content in paca before flushing. * Print this when we go to virtual mode. @@ -538,6 +539,7 @@ static int mce_handle_err_realmode(int disposition, u8 error_type) slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; +#endif break; default: break; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 8a62af5b9c24..7f69237d4fa4 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -112,7 +112,7 @@ static void __init fwnmi_init(void) u8 *mce_data_buf; unsigned int i; int nr_cpus = num_possible_cpus(); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU struct slb_entry *slb_ptr; size_t size; #endif @@ -152,7 +152,7 @@ static void __init fwnmi_init(void) (RTAS_ERROR_LOG_MAX * i); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (!radix_enabled()) { /* Allocate per cpu area to save old slb contents during MCE */ size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; @@ -801,7 +801,9 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); pseries_setup_security_mitigations(); +#ifdef CONFIG_PPC_64S_HASH_MMU pseries_lpar_read_hblkrm_characteristics(); +#endif /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 83100c6524cc..0c65dc01c325 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1159,7 +1159,7 @@ cmds(struct pt_regs *excp) case 'P': show_tasks(); break; -#ifdef CONFIG_PPC_BOOK3S +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_64S_HASH_MMU) case 'u': dump_segments(); break; @@ -2614,7 +2614,7 @@ static void dump_tracing(void) static void dump_one_paca(int cpu) { struct paca_struct *p; -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU int i = 0; #endif @@ -2656,6 +2656,7 @@ static void dump_one_paca(int cpu) DUMP(p, cpu_start, "%#-*x"); DUMP(p, kexec_state, "%#-*x"); #ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU if (!early_radix_enabled()) { for (i = 0; i < SLB_NUM_BOLTED; i++) { u64 esid, vsid; @@ -2683,6 +2684,7 @@ static void dump_one_paca(int cpu) 22, "slb_cache", i, p->slb_cache[i]); } } +#endif DUMP(p, rfi_flush_fallback_area, "%-*px"); #endif @@ -3746,7 +3748,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU void dump_segments(void) { int i; diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index 609d9ee2acc0..82fb276f7e09 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -182,7 +182,7 @@ static const struct crashtype crashtypes[] = { CRASHTYPE(FORTIFIED_SUBOBJECT), CRASHTYPE(FORTIFIED_STRSCPY), CRASHTYPE(DOUBLE_FAULT), -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_64S_HASH_MMU CRASHTYPE(PPC_SLB_MULTIHIT), #endif }; From 31284f703db2f1605b2dbc6bb0632b04d7be13e7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 Dec 2021 00:41:53 +1000 Subject: [PATCH 138/240] powerpc/microwatt: add POWER9_CPU, clear PPC_64S_HASH_MMU Microwatt implements a subset of ISA v3.0 (which is equivalent to the POWER9_CPU option). It is radix-only, so does not require hash MMU support. This saves 20kB compressed dtbImage and 56kB vmlinux size. 
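The net effect on the defconfig is (abridged):

	CONFIG_PPC64=y
	CONFIG_POWER9_CPU=y
	# CONFIG_PPC_64S_HASH_MMU is not set

The "# CONFIG_PPC_MEM_KEYS is not set" line can be dropped, because PPC_MEM_KEYS now depends on PPC_64S_HASH_MMU and so is no longer selectable here.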
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211201144153.2456614-19-npiggin@gmail.com --- arch/powerpc/configs/microwatt_defconfig | 3 ++- arch/powerpc/platforms/microwatt/Kconfig | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index 07d87a4044b2..eff933ebbb9e 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -15,6 +15,8 @@ CONFIG_EMBEDDED=y # CONFIG_COMPAT_BRK is not set # CONFIG_SLAB_MERGE_DEFAULT is not set CONFIG_PPC64=y +CONFIG_POWER9_CPU=y +# CONFIG_PPC_64S_HASH_MMU is not set # CONFIG_PPC_KUEP is not set # CONFIG_PPC_KUAP is not set CONFIG_CPU_LITTLE_ENDIAN=y @@ -27,7 +29,6 @@ CONFIG_PPC_MICROWATT=y CONFIG_CPU_FREQ=y CONFIG_HZ_100=y CONFIG_PPC_4K_PAGES=y -# CONFIG_PPC_MEM_KEYS is not set # CONFIG_SECCOMP is not set # CONFIG_MQ_IOSCHED_KYBER is not set # CONFIG_COREDUMP is not set diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig index 823192e9d38a..5e320f49583a 100644 --- a/arch/powerpc/platforms/microwatt/Kconfig +++ b/arch/powerpc/platforms/microwatt/Kconfig @@ -5,7 +5,6 @@ config PPC_MICROWATT select PPC_XICS select PPC_ICS_NATIVE select PPC_ICP_NATIVE - select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU select PPC_UDBG_16550 select ARCH_RANDOM help From 06e7cbc29e97b4713b4ea6def04ae8501a7d1a59 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 27 Sep 2021 17:12:39 +0200 Subject: [PATCH 139/240] powerpc/40x: Map 32Mbytes of memory at startup As reported by Carlo, 16Mbytes is not enough with modern kernels that tend to be a bit big, so map another 16M page at boot. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/89b5f974a7fa5011206682cd092e2c905530ff46.1632755552.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_40x.S | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 7d72ee5ab387..e783860bea83 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -650,7 +651,7 @@ start_here: b . /* prevent prefetch past rfi */ /* Set up the initial MMU state so we can do the first level of - * kernel initialization. This maps the first 16 MBytes of memory 1:1 + * kernel initialization. This maps the first 32 MBytes of memory 1:1 * virtual to physical and more importantly sets the cache mode. */ initial_mmu: @@ -687,6 +688,12 @@ initial_mmu: tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */ tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */ + li r0,62 /* TLB slot 62 */ + addis r4,r4,SZ_16M@h + addis r3,r3,SZ_16M@h + tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */ + tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */ + isync /* Establish the exception vector base From 6c1fa60d368e6b752e1612eae9bb0970e85392b2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:12 +0200 Subject: [PATCH 140/240] Revert "powerpc: Inline setup_kup()" This reverts commit 1791ebd131c46539b024c0f2ebf12b6c88a265b9. setup_kup() was inlined to manage conflict between PPC32 marking setup_{kuap/kuep}() __init and PPC64 not marking them __init. But in fact PPC32 has removed the __init mark for all but 8xx in order to properly handle SMP. 
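For reference, the inlined helper being reverted is:

	static __always_inline void setup_kup(void)
	{
		setup_kuep(disable_kuep);
		setup_kuap(disable_kuap);
	}

The out-of-line replacement in mm/init-common.c makes the setup_kuap() call first.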
Since setup_kup() is about to grow a bit, revert the commit mentioned
above, but remove __init for 8xx as well so that we don't have to mark
setup_kup() as __ref.

Also switch the order so that KUAP is initialised before KUEP, because
on the 40x KUEP will depend on the activation of KUAP.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/7691088fd0994ee3c8db6298dc8c00259e3f6a7f.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/kup.h | 8 ++------
 arch/powerpc/mm/init-common.c  | 6 ++++++
 arch/powerpc/mm/nohash/8xx.c   | 4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 1df763002726..8699ca5884b9 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -32,6 +32,8 @@ extern bool disable_kuap;
 
 #include 
 
+void setup_kup(void);
+
 #ifdef CONFIG_PPC_KUEP
 void setup_kuep(bool disabled);
 #else
@@ -78,12 +80,6 @@ static inline void restore_user_access(unsigned long flags) { }
 #endif /* CONFIG_PPC_BOOK3S_64 */
 #endif /* CONFIG_PPC_KUAP */
 
-static __always_inline void setup_kup(void)
-{
-	setup_kuep(disable_kuep);
-	setup_kuap(disable_kuap);
-}
-
 static __always_inline void allow_read_from_user(const void __user *from, unsigned long size)
 {
 	barrier_nospec();
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 3a82f89827a5..b4f3437aee38 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -47,6 +47,12 @@ static int __init parse_nosmap(char *p)
 }
 early_param("nosmap", parse_nosmap);
 
+void setup_kup(void)
+{
+	setup_kuap(disable_kuap);
+	setup_kuep(disable_kuep);
+}
+
 #define CTOR(shift) static void ctor_##shift(void *addr) \
 {									\
 	memset(addr, 0, sizeof(void *) << (shift));			\
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 0df9fe29dd56..baa1f8a40af8 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -213,7 +213,7 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	if (disabled)
 		return;
@@ -228,7 +228,7 @@ void __init setup_kuep(bool disabled)
 struct static_key_false disable_kuap_key;
 EXPORT_SYMBOL(disable_kuap_key);
 
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	if (disabled) {
 		static_branch_enable(&disable_kuap_key);

From 13dac4e31e75ce10b2fcaad4432a24dae6c955f6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:13 +0200
Subject: [PATCH 141/240] powerpc/8xx: Activate KUEP at all time

On the 8xx, there is absolutely no runtime impact with KUEP. Protection
against execution of user code in kernel mode is set up at boot time by
configuring the groups which contain all user pages as having swapped
protection rights, namely EX for user and NA for supervisor.

Configure KUEP at startup and force selection of CONFIG_PPC_KUEP.
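For reference, the remaining APG value decodes as follows: each access
protection group takes 2 bits in MI_AP, with group 0 in the most
significant position. The helper below is a purely illustrative sketch,
not kernel code:

	/* decode access protection group n from an MI_AP value */
	#define APG(val, n)	(((val) >> (30 - 2 * (n))) & 0x3)

	/* MI_APG_INIT == 0xde000000 == 0b 11 01 11 10 00 ... */
	APG(0xde000000, 0)	/* 0b11: kernel */
	APG(0xde000000, 1)	/* 0b01: kernel+accessed */
	APG(0xde000000, 2)	/* 0b11: user */
	APG(0xde000000, 3)	/* 0b10: user+accessed, swapped rights (KUEP) */

With the swapped encoding being the only one left, MI_APG_KUEP becomes
redundant and MI_APG_INIT takes over its value, as the hunk below shows.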
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/2129e86944323ffe9ed07fffbeafdfd2e363690a.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 6 ++----
 arch/powerpc/mm/nohash/8xx.c                 | 5 -----
 arch/powerpc/platforms/Kconfig.cputype       | 1 +
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 997cec973406..0e93a4728c9e 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -39,12 +39,10 @@
 * 0 => Kernel => 11 (all accesses performed according as user iaw page definition)
 * 1 => Kernel+Accessed => 01 (all accesses performed according to page definition)
 * 2 => User => 11 (all accesses performed according as user iaw page definition)
- * 3 => User+Accessed => 00 (all accesses performed as supervisor iaw page definition) for INIT
- *                    => 10 (all accesses performed according to swaped page definition) for KUEP
+ * 3 => User+Accessed => 10 (all accesses performed according to swapped page definition) for KUEP
 * 4-15 => Not Used
 */
-#define MI_APG_INIT	0xdc000000
-#define MI_APG_KUEP	0xde000000
+#define MI_APG_INIT	0xde000000
 
 /* The effective page number register. When read, contains the information
 * about the last instruction TLB miss. When MI_RPN is written, bits in
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index baa1f8a40af8..e878e8124ee6 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -215,12 +215,7 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
 
 #ifdef CONFIG_PPC_KUEP
 void setup_kuep(bool disabled)
 {
-	if (disabled)
-		return;
-
 	pr_info("Activating Kernel Userspace Execution Prevention\n");
-
-	mtspr(SPRN_MI_AP, MI_APG_KUEP);
 }
 #endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7ca07df1c374..8b36608c7888 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -43,6 +43,7 @@ config PPC_8xx
 	select ARCH_SUPPORTS_HUGETLBFS
 	select FSL_SOC
 	select PPC_HAVE_KUEP
+	select PPC_KUEP
 	select PPC_HAVE_KUAP
 	select HAVE_ARCH_VMAP_STACK
 	select HUGETLBFS

From ee2631603fdbab6f76e86ea87f7a03ebc3a1ef85 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:14 +0200
Subject: [PATCH 142/240] powerpc/44x: Activate KUEP at all time

On 44x, KUEP is implemented by clearing the SX bit during the TLB miss
for user pages. The impact is minimal, and does not warrant either a
boot-time or a build-time selection.

Activate it at all times.
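In C terms, the TLB-miss fixup amounts to roughly the following. This is
a sketch only; user_bits_from_s_bits() is a hypothetical stand-in for the
rlwimi rotate-and-insert in the real assembly below:

	if (pte & _PAGE_USER) {
		tlb_attrib |= user_bits_from_s_bits(tlb_attrib);	/* mirror S perms into U */
		tlb_attrib &= ~PPC44x_TLB_SX;	/* user page: never executable in kernel mode */
	}

Since the SX clearing is now unconditional, the patch_site machinery that
NOP'ed it out when KUEP was disabled can go away as well.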
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/2414d662558e7fb27d1ed41c8e47c591d576acac.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/nohash/32/mmu-44x.h |  1 -
 arch/powerpc/kernel/head_44x.S               | 10 ++--------
 arch/powerpc/mm/nohash/44x.c                 |  8 +-------
 arch/powerpc/platforms/Kconfig.cputype       |  1 +
 4 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/mmu-44x.h b/arch/powerpc/include/asm/nohash/32/mmu-44x.h
index 43ceca128531..2d92a39d8f2e 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-44x.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-44x.h
@@ -113,7 +113,6 @@ typedef struct {
 
 /* patch sites */
 extern s32 patch__tlb_44x_hwater_D, patch__tlb_44x_hwater_I;
-extern s32 patch__tlb_44x_kuep, patch__tlb_47x_kuep;
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 02d2928d1e01..916f7e91c6de 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -532,10 +532,7 @@ finish_tlb_load_44x:
 	andi.	r10,r12,_PAGE_USER		/* User page ? */
 	beq	1f				/* nope, leave U bits empty */
 	rlwimi	r11,r11,3,26,28			/* yes, copy S bits to U */
-#ifdef CONFIG_PPC_KUEP
-0:	rlwinm	r11,r11,0,~PPC44x_TLB_SX	/* Clear SX if User page */
-	patch_site	0b, patch__tlb_44x_kuep
-#endif
+	rlwinm	r11,r11,0,~PPC44x_TLB_SX	/* Clear SX if User page */
 1:	tlbwe	r11,r13,PPC44x_TLB_ATTRIB	/* Write ATTRIB */
 
 	/* Done...restore registers and get out of here.
@@ -747,10 +744,7 @@ finish_tlb_load_47x:
 	andi.	r10,r12,_PAGE_USER		/* User page ? */
 	beq	1f				/* nope, leave U bits empty */
 	rlwimi	r11,r11,3,26,28			/* yes, copy S bits to U */
-#ifdef CONFIG_PPC_KUEP
-0:	rlwinm	r11,r11,0,~PPC47x_TLB2_SX	/* Clear SX if User page */
-	patch_site	0b, patch__tlb_47x_kuep
-#endif
+	rlwinm	r11,r11,0,~PPC47x_TLB2_SX	/* Clear SX if User page */
 1:	tlbwe	r11,r13,2
 
 	/* Done...restore registers and get out of here.
diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index e079f26b267e..ceb290df1fb5 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -247,12 +247,6 @@ void setup_kuep(bool disabled)
 	if (smp_processor_id() != boot_cpuid)
 		return;
 
-	if (disabled)
-		patch_instruction_site(&patch__tlb_44x_kuep, ppc_inst(PPC_RAW_NOP()));
-	else
-		pr_info("Activating Kernel Userspace Execution Prevention\n");
-
-	if (IS_ENABLED(CONFIG_PPC_47x) && disabled)
-		patch_instruction_site(&patch__tlb_47x_kuep, ppc_inst(PPC_RAW_NOP()));
+	pr_info("Activating Kernel Userspace Execution Prevention\n");
 }
 #endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 8b36608c7888..4f8774d65aa8 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -63,6 +63,7 @@ config 44x
 	select HAVE_PCI
 	select PHYS_64BIT
 	select PPC_HAVE_KUEP
+	select PPC_KUEP
 
 endchoice

From dc3a0e5b83a8806d7da1f343a7d2e0be386d16d2 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:15 +0200
Subject: [PATCH 143/240] powerpc/book3e: Activate KUEP at all time

On book3e:

- When using 64-bit PTEs: user pages don't have the SX bit defined,
  so KUEP is always active.
- When using 32-bit PTEs: implement KUEP by clearing the SX bit during
  the TLB miss for user pages. The impact is minimal and is not worth
  either a boot-time or a build-time selection.

Activate it at all times.
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/e376b114283fb94504e2aa2de846780063252cde.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/kernel/head_fsl_booke.S   | 1 +
 arch/powerpc/platforms/Kconfig.cputype | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 0a9a0f301474..4622b50a5208 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -777,6 +777,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
 	andi.	r10, r11, _PAGE_USER	/* Test for _PAGE_USER */
 	slwi	r10, r12, 1
 	or	r10, r10, r12
+	rlwinm	r10, r10, 0, ~_PAGE_EXEC	/* Clear SX on user pages */
 	iseleq	r12, r12, r10
 	rlwimi	r13, r12, 0, 20, 31	/* Get RPN from PTE, merge w/ perms */
 	mtspr	SPRN_MAS3, r13
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 4f8774d65aa8..408d8ee5bfcd 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -299,6 +299,8 @@ config PPC_FSL_BOOK3E
 	select FSL_EMB_PERFMON
 	select PPC_SMP_MUXED_IPI
 	select PPC_DOORBELL
+	select PPC_HAVE_KUEP
+	select PPC_KUEP
 	default y if FSL_BOOKE
 
 config PTE_64BIT

From df415cd758261bceff27f34a145dd8328bbfb018 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:16 +0200
Subject: [PATCH 144/240] powerpc/32s: Remove capability to disable KUEP at boottime

Disabling KUEP at boot time makes things unnecessarily complex.

Still allow disabling KUEP at build time, but when it is built in it
is always active.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/96f583f82423a29a4205c60b9721079111b35567.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/book3s/32/kup.h |  3 +--
 arch/powerpc/mm/book3s32/kuep.c          | 10 ++--------
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 9f38040f0641..fb6c39225dd1 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -12,7 +12,6 @@
 #include 
 
 extern struct static_key_false disable_kuap_key;
-extern struct static_key_false disable_kuep_key;
 
 static __always_inline bool kuap_is_disabled(void)
 {
@@ -21,7 +20,7 @@ static __always_inline bool kuap_is_disabled(void)
 
 static __always_inline bool kuep_is_disabled(void)
 {
-	return !IS_ENABLED(CONFIG_PPC_KUEP) || static_branch_unlikely(&disable_kuep_key);
+	return !IS_ENABLED(CONFIG_PPC_KUEP);
 }
 
 static inline void kuep_lock(void)
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
index c20733d6e02c..8474edce3df9 100644
--- a/arch/powerpc/mm/book3s32/kuep.c
+++ b/arch/powerpc/mm/book3s32/kuep.c
@@ -3,18 +3,12 @@
 #include 
 #include 
 
-struct static_key_false disable_kuep_key;
-
 void setup_kuep(bool disabled)
 {
-	if (!disabled)
-		kuep_lock();
+	kuep_lock();
 
 	if (smp_processor_id() != boot_cpuid)
 		return;
 
-	if (disabled)
-		static_branch_enable(&disable_kuep_key);
-	else
-		pr_info("Activating Kernel Userspace Execution Prevention\n");
+	pr_info("Activating Kernel Userspace Execution Prevention\n");
 }

From 526d4a4c77aedf1b7df1133e5cced29c70232e6e Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:17 +0200
Subject: [PATCH 145/240] powerpc/32s: Do kuep_lock() and kuep_unlock() in assembly

When interrupt and syscall entries were converted to C, KUEP locking
and unlocking was also
converted. It improved performance by unrolling the loop, and allowed
easily implementing boot-time deactivation of KUEP.

However, the null_syscall selftest shows that KUEP is still heavy
(361 cycles with KUEP, 212 cycles without).

A way to improve further is to group the 'mtsr's together, instead of
repeating 'addi' + 'mtsr' several times.

In order to do that, more registers need to be available. In C, GCC
will always be able to provide the requested number of registers, but
at the cost of saving some data on the stack, which hurts performance
here.

So let's do it in assembly, where we have full control over which
registers are used. It also has the advantage of locking earlier and
unlocking later, and it helps GCC generate less tricky code. The only
drawback is that it makes boot-time deactivation less straightforward,
requiring 'hand' instruction patching.

Group the 'mtsr's by 4.

With this change, the null_syscall selftest reports 336 cycles. Without
the change it was 361 cycles; that's a 7% reduction.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/115cb279e9b9948dfd93a065e047081c59e3a2a6.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/book3s/32/kup.h      | 34 --------
 arch/powerpc/include/asm/book3s/32/mmu-hash.h | 77 ++++++++++++++++++-
 arch/powerpc/include/asm/interrupt.h          |  6 +-
 arch/powerpc/include/asm/kup.h                |  5 --
 arch/powerpc/kernel/entry_32.S                | 31 ++++++++
 arch/powerpc/kernel/head_32.h                 |  6 ++
 arch/powerpc/kernel/head_book3s_32.S          |  4 +
 arch/powerpc/kernel/interrupt.c               |  3 -
 arch/powerpc/mm/book3s32/kuep.c               |  2 -
 9 files changed, 119 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index fb6c39225dd1..e3db5ed4b255 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -23,40 +23,6 @@ static __always_inline bool kuep_is_disabled(void)
 	return !IS_ENABLED(CONFIG_PPC_KUEP);
 }
 
-static inline void kuep_lock(void)
-{
-	if (kuep_is_disabled())
-		return;
-
-	update_user_segments(mfsr(0) | SR_NX);
-	/*
-	 * This isync() shouldn't be necessary as the kernel is not excepted to
-	 * run any instruction in userspace soon after the update of segments,
-	 * but hash based cores (at least G3) seem to exhibit a random
-	 * behaviour when the 'isync' is not there. 603 cores don't have this
-	 * behaviour so don't do the 'isync' as it saves several CPU cycles.
-	 */
-	if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
-		isync();	/* Context sync required after mtsr() */
-}
-
-static inline void kuep_unlock(void)
-{
-	if (kuep_is_disabled())
-		return;
-
-	update_user_segments(mfsr(0) & ~SR_NX);
-	/*
-	 * This isync() shouldn't be necessary as a 'rfi' will soon be executed
-	 * to return to userspace, but hash based cores (at least G3) seem to
-	 * exhibit a random behaviour when the 'isync' is not there. 603 cores
-	 * don't have this behaviour so don't do the 'isync' as it saves several
-	 * CPU cycles.
- */ - if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) - isync(); /* Context sync required after mtsr() */ -} - #ifdef CONFIG_PPC_KUAP #include diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index f5be185cbdf8..e2f7ccc13edb 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -64,7 +64,82 @@ struct ppc_bat { #define SR_KP 0x20000000 /* User key */ #define SR_KS 0x40000000 /* Supervisor key */ -#ifndef __ASSEMBLY__ +#ifdef __ASSEMBLY__ + +#include + +.macro uus_addi sr reg1 reg2 imm + .if NUM_USER_SEGMENTS > \sr + addi \reg1,\reg2,\imm + .endif +.endm + +.macro uus_mtsr sr reg1 + .if NUM_USER_SEGMENTS > \sr + mtsr \sr, \reg1 + .endif +.endm + +/* + * This isync() shouldn't be necessary as the kernel is not excepted to run + * any instruction in userspace soon after the update of segments and 'rfi' + * instruction is used to return to userspace, but hash based cores + * (at least G3) seem to exhibit a random behaviour when the 'isync' is not + * there. 603 cores don't have this behaviour so don't do the 'isync' as it + * saves several CPU cycles. + */ +.macro uus_isync +#ifdef CONFIG_PPC_BOOK3S_604 +BEGIN_MMU_FTR_SECTION + isync +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif +.endm + +.macro update_user_segments_by_4 tmp1 tmp2 tmp3 tmp4 + uus_addi 1, \tmp2, \tmp1, 0x111 + uus_addi 2, \tmp3, \tmp1, 0x222 + uus_addi 3, \tmp4, \tmp1, 0x333 + + uus_mtsr 0, \tmp1 + uus_mtsr 1, \tmp2 + uus_mtsr 2, \tmp3 + uus_mtsr 3, \tmp4 + + uus_addi 4, \tmp1, \tmp1, 0x444 + uus_addi 5, \tmp2, \tmp2, 0x444 + uus_addi 6, \tmp3, \tmp3, 0x444 + uus_addi 7, \tmp4, \tmp4, 0x444 + + uus_mtsr 4, \tmp1 + uus_mtsr 5, \tmp2 + uus_mtsr 6, \tmp3 + uus_mtsr 7, \tmp4 + + uus_addi 8, \tmp1, \tmp1, 0x444 + uus_addi 9, \tmp2, \tmp2, 0x444 + uus_addi 10, \tmp3, \tmp3, 0x444 + uus_addi 11, \tmp4, \tmp4, 0x444 + + uus_mtsr 8, \tmp1 + uus_mtsr 9, \tmp2 + uus_mtsr 10, \tmp3 + uus_mtsr 11, \tmp4 + + uus_addi 12, \tmp1, \tmp1, 0x444 + uus_addi 13, \tmp2, \tmp2, 0x444 + uus_addi 14, \tmp3, \tmp3, 0x444 + uus_addi 15, \tmp4, \tmp4, 0x444 + + uus_mtsr 12, \tmp1 + uus_mtsr 13, \tmp2 + uus_mtsr 14, \tmp3 + uus_mtsr 15, \tmp4 + + uus_isync +.endm + +#else /* * This macro defines the mapping from contexts to VSIDs (virtual diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 3487aab12229..94cc9366f3f0 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -139,12 +139,10 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup if (!arch_irq_disabled_regs(regs)) trace_hardirqs_off(); - if (user_mode(regs)) { - kuep_lock(); + if (user_mode(regs)) account_cpu_user_entry(); - } else { + else kuap_save_and_lock(regs); - } #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 8699ca5884b9..94734a8eb54d 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -40,11 +40,6 @@ void setup_kuep(bool disabled); static inline void setup_kuep(bool disabled) { } #endif /* CONFIG_PPC_KUEP */ -#ifndef CONFIG_PPC_BOOK3S_32 -static inline void kuep_lock(void) { } -static inline void kuep_unlock(void) { } -#endif - #ifdef CONFIG_PPC_KUAP void setup_kuap(bool disabled); #else diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index c62dd9815965..0756829b2f7f 100644 --- a/arch/powerpc/kernel/entry_32.S +++ 
b/arch/powerpc/kernel/entry_32.S
@@ -73,6 +73,34 @@ prepare_transfer_to_handler:
 _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
 #endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
 
+#if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32)
+	.globl	__kuep_lock
+__kuep_lock:
+	mfsr	r9,0
+	rlwinm	r9,r9,0,8,3
+	oris	r9,r9,SR_NX@h
+	update_user_segments_by_4 r9, r10, r11, r12
+	blr
+
+__kuep_unlock:
+	mfsr	r9,0
+	rlwinm	r9,r9,0,8,2
+	update_user_segments_by_4 r9, r10, r11, r12
+	blr
+
+.macro	kuep_lock
+	bl	__kuep_lock
+.endm
+.macro	kuep_unlock
+	bl	__kuep_unlock
+.endm
+#else
+.macro	kuep_lock
+.endm
+.macro	kuep_unlock
+.endm
+#endif
+
 .globl	transfer_to_syscall
 transfer_to_syscall:
 	stw	r11, GPR1(r1)
@@ -93,6 +121,7 @@ transfer_to_syscall:
 	SAVE_GPRS(3, 8, r1)
 	addi	r2,r10,-THREAD
 	SAVE_NVGPRS(r1)
+	kuep_lock
 
 	/* Calling convention has r9 = orig r0, r10 = regs */
 	addi	r10,r1,STACK_FRAME_OVERHEAD
@@ -109,6 +138,7 @@ ret_from_syscall:
 	cmplwi	cr0,r5,0
 	bne-	2f
 #endif /* CONFIG_PPC_47x */
+	kuep_unlock
 	lwz	r4,_LINK(r1)
 	lwz	r5,_CCR(r1)
 	mtlr	r4
@@ -272,6 +302,7 @@ interrupt_return:
 	beq	.Lkernel_interrupt_return
 	bl	interrupt_exit_user_prepare
 	cmpwi	r3,0
+	kuep_unlock
 	bne-	.Lrestore_nvgprs
 
 .Lfast_user_interrupt_return:
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 25887303651a..40d23a863b28 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -135,6 +135,12 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt)
 	andi.	r12,r9,MSR_PR
 	bne	777f
 	bl	prepare_transfer_to_handler
+#ifdef CONFIG_PPC_KUEP
+	b	778f
+777:
+	bl	__kuep_lock
+778:
+#endif
 777:
 #endif
 .endm
diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S
index 68e5c0a7e99d..fa84744d6b24 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -931,7 +931,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
 _GLOBAL(load_segment_registers)
 	li	r0, NUM_USER_SEGMENTS /* load up user segment register values */
 	mtctr	r0		/* for context 0 */
+#ifdef CONFIG_PPC_KUEP
+	lis	r3, SR_NX@h	/* Kp = 0, Ks = 0, VSID = 0 */
+#else
 	li	r3, 0		/* Kp = 0, Ks = 0, VSID = 0 */
+#endif
 	li	r4, 0
3:	mtsrin	r3, r4
 	addi	r3, r3, 0x111	/* increment VSID */
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 835b626cd476..75dc045bdcb8 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -81,8 +81,6 @@ notrace long system_call_exception(long r3, long r4, long r5,
 {
 	syscall_fn f;
 
-	kuep_lock();
-
 	regs->orig_gpr3 = r3;
 
 	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
@@ -406,7 +404,6 @@ again:
 
 	/* Restore user access locks last */
 	kuap_user_restore(regs);
-	kuep_unlock();
 
 	return ret;
 }
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
index 8474edce3df9..bac1420d028b 100644
--- a/arch/powerpc/mm/book3s32/kuep.c
+++ b/arch/powerpc/mm/book3s32/kuep.c
@@ -5,8 +5,6 @@
 
 void setup_kuep(bool disabled)
 {
-	kuep_lock();
-
 	if (smp_processor_id() != boot_cpuid)
 		return;

From 70428da94c7ad692d306747a04117543827292a7 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:18 +0200
Subject: [PATCH 146/240] powerpc/32s: Save content of sr0 to avoid 'mfsr'

Calling 'mfsr' to get the content of segment registers is heavy; in
addition, it requires clearing of the 'reserved' bits.

In order to avoid this operation, save the value in the mm context and
in the thread struct. The saved sr0 is the one used by the kernel, which
means that on locking entry it can be used as is.
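A sketch of where the cached value comes from, using the names this
patch introduces (simplified from the hunks below, not verbatim code):

	/* at context creation: precompute the kernel's view of segment register 0 */
	mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0);
	if (!kuep_is_disabled())
		mm->context.sr0 |= SR_NX;
	if (!kuap_is_disabled())
		mm->context.sr0 |= SR_KS;

	/* at context switch: mirror it into the thread struct */
	tsk->thread.sr0 = mm->context.sr0;

__kuep_lock() can then fetch it with a single lwz from the thread struct
instead of mfsr plus masking.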
For unlocking, the only thing to do is to clear SR_NX. This improves null_syscall selftest by 12 cycles, ie 4%. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b02baf2ed8f09bad910dfaeeb7353b2ae6830525.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 5 +++++ arch/powerpc/include/asm/processor.h | 9 +++++++++ arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_32.S | 8 +++----- arch/powerpc/mm/book3s32/kuap.c | 5 ++++- arch/powerpc/mm/book3s32/kuep.c | 1 + arch/powerpc/mm/book3s32/mmu_context.c | 15 +++++++-------- arch/powerpc/mm/mmu_context.c | 3 +++ 8 files changed, 33 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index e2f7ccc13edb..7be27862329f 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -175,9 +175,14 @@ struct hash_pte { typedef struct { unsigned long id; + unsigned long sr0; void __user *vdso; } mm_context_t; +#ifdef CONFIG_PPC_KUEP +#define INIT_MM_CONTEXT(mm) .context.sr0 = SR_NX +#endif + void update_bats(void); static inline void cleanup_cpu_mmu_context(void) { } diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 978a80308466..fe1ef1d7523b 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -157,6 +157,7 @@ struct thread_struct { #ifdef CONFIG_PPC_BOOK3S_32 unsigned long r0, r3, r4, r5, r6, r8, r9, r11; unsigned long lr, ctr; + unsigned long sr0; #endif #endif /* CONFIG_PPC32 */ /* Debug Registers */ @@ -278,6 +279,12 @@ struct thread_struct { #define SPEFSCR_INIT #endif +#ifdef CONFIG_PPC_BOOK3S_32 +#define SR0_INIT .sr0 = IS_ENABLED(CONFIG_PPC_KUEP) ? 
SR_NX : 0, +#else +#define SR0_INIT +#endif + #if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) #define INIT_THREAD { \ .ksp = INIT_SP, \ @@ -285,6 +292,7 @@ struct thread_struct { .kuap = ~0UL, /* KUAP_NONE */ \ .fpexc_mode = MSR_FE0 | MSR_FE1, \ SPEFSCR_INIT \ + SR0_INIT \ } #elif defined(CONFIG_PPC32) #define INIT_THREAD { \ @@ -292,6 +300,7 @@ struct thread_struct { .pgdir = swapper_pg_dir, \ .fpexc_mode = MSR_FE0 | MSR_FE1, \ SPEFSCR_INIT \ + SR0_INIT \ } #else #define INIT_THREAD { \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b823f484c640..cf3436b7b166 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -139,6 +139,7 @@ int main(void) OFFSET(THR11, thread_struct, r11); OFFSET(THLR, thread_struct, lr); OFFSET(THCTR, thread_struct, ctr); + OFFSET(THSR0, thread_struct, sr0); #endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0756829b2f7f..035bf4f3eb5d 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -76,15 +76,13 @@ _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler) #if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32) .globl __kuep_lock __kuep_lock: - mfsr r9,0 - rlwinm r9,r9,0,8,3 - oris r9,r9,SR_NX@h + lwz r9, THREAD+THSR0(r2) update_user_segments_by_4 r9, r10, r11, r12 blr __kuep_unlock: - mfsr r9,0 - rlwinm r9,r9,0,8,2 + lwz r9, THREAD+THSR0(r2) + rlwinm r9,r9,0,~SR_NX update_user_segments_by_4 r9, r10, r11, r12 blr diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c index 0f920f09af57..28676cabb005 100644 --- a/arch/powerpc/mm/book3s32/kuap.c +++ b/arch/powerpc/mm/book3s32/kuap.c @@ -20,8 +20,11 @@ EXPORT_SYMBOL(kuap_unlock_all_ool); void setup_kuap(bool disabled) { - if (!disabled) + if (!disabled) { kuap_lock_all_ool(); + init_mm.context.sr0 |= SR_KS; + current->thread.sr0 |= SR_KS; + } if (smp_processor_id() != boot_cpuid) return; diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c index bac1420d028b..78fc48eee510 100644 --- a/arch/powerpc/mm/book3s32/kuep.c +++ b/arch/powerpc/mm/book3s32/kuep.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include diff --git a/arch/powerpc/mm/book3s32/mmu_context.c b/arch/powerpc/mm/book3s32/mmu_context.c index e2708e387dc3..269a3eb25a73 100644 --- a/arch/powerpc/mm/book3s32/mmu_context.c +++ b/arch/powerpc/mm/book3s32/mmu_context.c @@ -69,6 +69,12 @@ EXPORT_SYMBOL_GPL(__init_new_context); int init_new_context(struct task_struct *t, struct mm_struct *mm) { mm->context.id = __init_new_context(); + mm->context.sr0 = CTX_TO_VSID(mm->context.id, 0); + + if (!kuep_is_disabled()) + mm->context.sr0 |= SR_NX; + if (!kuap_is_disabled()) + mm->context.sr0 |= SR_KS; return 0; } @@ -108,20 +114,13 @@ void __init mmu_context_init(void) void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { long id = next->context.id; - unsigned long val; if (id < 0) panic("mm_struct %p has no context ID", next); isync(); - val = CTX_TO_VSID(id, 0); - if (!kuep_is_disabled()) - val |= SR_NX; - if (!kuap_is_disabled()) - val |= SR_KS; - - update_user_segments(val); + update_user_segments(next->context.sr0); if (IS_ENABLED(CONFIG_BDI_SWITCH)) abatron_pteptrs[1] = next->pgd; diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 74246536b832..e618d5442a28 100644 --- 
a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -18,6 +18,9 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
 {
 	/* 32-bit keeps track of the current PGDIR in the thread struct */
 	tsk->thread.pgdir = mm->pgd;
+#ifdef CONFIG_PPC_BOOK3S_32
+	tsk->thread.sr0 = mm->context.sr0;
+#endif
 }
 #elif defined(CONFIG_PPC_BOOK3E_64)
 static inline void switch_mm_pgdir(struct task_struct *tsk,

From 6754862249d324b11f1361a5353e234325d805ec Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:19 +0200
Subject: [PATCH 147/240] powerpc/kuep: Remove 'nosmep' boot time parameter except for book3s/64

Deactivating KUEP at boot time is irrelevant for PPC32 and BOOK3E/64.
Remove it.

This allows refactoring setup_kuep() via a __weak function that only
PPC64s will override for now.

Signed-off-by: Christophe Leroy
[mpe: Fix CONFIG_PPC_BOOKS_64 -> CONFIG_PPC_BOOK3S_64 typo]
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/4c36df18b41c988c4512f45d96220486adbe4c99.1634627931.git.christophe.leroy@csgroup.eu
---
 Documentation/admin-guide/kernel-parameters.txt |  2 +-
 arch/powerpc/include/asm/kup.h                  |  5 -----
 arch/powerpc/mm/book3s32/Makefile               |  1 -
 arch/powerpc/mm/book3s32/kuep.c                 | 13 -------------
 arch/powerpc/mm/init-common.c                   | 15 +++++++++++++++
 arch/powerpc/mm/nohash/44x.c                    | 10 ----------
 arch/powerpc/mm/nohash/8xx.c                    |  7 -------
 7 files changed, 16 insertions(+), 37 deletions(-)
 delete mode 100644 arch/powerpc/mm/book3s32/kuep.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6248a061788a..83d558de56d4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3378,7 +3378,7 @@
 			Disable SMAP (Supervisor Mode Access Prevention)
 			even if it is supported by processor.
 
-	nosmep		[X86,PPC]
+	nosmep		[X86,PPC64s]
 			Disable SMEP (Supervisor Mode Execution Prevention)
 			even if it is supported by processor.
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 94734a8eb54d..fa8513b7acca 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -33,12 +33,7 @@ extern bool disable_kuap; #include void setup_kup(void); - -#ifdef CONFIG_PPC_KUEP void setup_kuep(bool disabled); -#else -static inline void setup_kuep(bool disabled) { } -#endif /* CONFIG_PPC_KUEP */ #ifdef CONFIG_PPC_KUAP void setup_kuap(bool disabled); diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile index 15f4773643d2..50dd8f6bdf46 100644 --- a/arch/powerpc/mm/book3s32/Makefile +++ b/arch/powerpc/mm/book3s32/Makefile @@ -9,5 +9,4 @@ endif obj-y += mmu.o mmu_context.o obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o -obj-$(CONFIG_PPC_KUEP) += kuep.o obj-$(CONFIG_PPC_KUAP) += kuap.o diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c deleted file mode 100644 index 78fc48eee510..000000000000 --- a/arch/powerpc/mm/book3s32/kuep.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include - -void setup_kuep(bool disabled) -{ - if (smp_processor_id() != boot_cpuid) - return; - - pr_info("Activating Kernel Userspace Execution Prevention\n"); -} diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index b4f3437aee38..119ef491f797 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -20,6 +20,7 @@ #include #include #include +#include phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull; EXPORT_SYMBOL_GPL(memstart_addr); @@ -33,6 +34,9 @@ bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); static int __init parse_nosmep(char *p) { + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + return 0; + disable_kuep = true; pr_warn("Disabling Kernel Userspace Execution Prevention\n"); return 0; @@ -47,6 +51,17 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); +void __weak setup_kuep(bool disabled) +{ + if (!IS_ENABLED(CONFIG_PPC_KUEP) || disabled) + return; + + if (smp_processor_id() != boot_cpuid) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); +} + void setup_kup(void) { setup_kuap(disable_kuap); diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c index ceb290df1fb5..796c824acc8c 100644 --- a/arch/powerpc/mm/nohash/44x.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -240,13 +240,3 @@ void __init mmu_init_secondary(int cpu) } } #endif /* CONFIG_SMP */ - -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - if (smp_processor_id() != boot_cpuid) - return; - - pr_info("Activating Kernel Userspace Execution Prevention\n"); -} -#endif diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index e878e8124ee6..36010d1c0bc4 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -212,13 +212,6 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M)); } -#ifdef CONFIG_PPC_KUEP -void setup_kuep(bool disabled) -{ - pr_info("Activating Kernel Userspace Execution Prevention\n"); -} -#endif - #ifdef CONFIG_PPC_KUAP struct static_key_false disable_kuap_key; EXPORT_SYMBOL(disable_kuap_key); From ba454f9c8e4efcc47c772b7642a5c8c6d1343cbf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:20 +0200 Subject: [PATCH 148/240] powerpc/kuap: Add a generic intermediate layer Make the 
following functions generic to all platforms:

- bad_kuap_fault()
- kuap_assert_locked()
- kuap_save_and_lock() (PPC32 only)
- kuap_kernel_restore()
- kuap_get_and_assert_locked()

And for all platforms except book3s/64:

- allow_user_access()
- prevent_user_access()
- prevent_user_access_return()
- restore_user_access()

Prepend __ to the names of the platform-specific ones.

For now the generic version just calls the platform-specific one, but
the next patch will move the redundant parts of the specific functions
into the generic one.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/eaef143a8dae7288cd34565ffa7b49c16aee1ec3.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/book3s/32/kup.h     | 22 +++---
 arch/powerpc/include/asm/book3s/64/kup.h     | 10 ++-
 arch/powerpc/include/asm/kup.h               | 71 +++++++++++++++++---
 arch/powerpc/include/asm/nohash/32/kup-8xx.h | 20 +++---
 4 files changed, 86 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index e3db5ed4b255..9e9b2692070c 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -77,7 +77,7 @@ static inline void kuap_unlock(unsigned long addr, bool ool)
 		kuap_unlock_all_ool();
 }
 
-static inline void kuap_save_and_lock(struct pt_regs *regs)
+static inline void __kuap_save_and_lock(struct pt_regs *regs)
 {
 	unsigned long kuap = current->thread.kuap;
 
@@ -96,7 +96,7 @@ static inline void kuap_user_restore(struct pt_regs *regs)
 {
 }
 
-static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
+static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
 {
 	if (kuap_is_disabled())
 		return;
@@ -114,7 +114,7 @@ static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
 	kuap_unlock(regs->kuap, false);
 }
 
-static inline unsigned long kuap_get_and_assert_locked(void)
+static inline unsigned long __kuap_get_and_assert_locked(void)
 {
 	unsigned long kuap = current->thread.kuap;
 
@@ -126,13 +126,13 @@ static inline unsigned long kuap_get_and_assert_locked(void)
 	return kuap;
 }
 
-static inline void kuap_assert_locked(void)
+static inline void __kuap_assert_locked(void)
 {
-	kuap_get_and_assert_locked();
+	__kuap_get_and_assert_locked();
 }
 
-static __always_inline void allow_user_access(void __user *to, const void __user *from,
-					      u32 size, unsigned long dir)
+static __always_inline void __allow_user_access(void __user *to, const void __user *from,
+						u32 size, unsigned long dir)
 {
 	if (kuap_is_disabled())
 		return;
@@ -146,7 +146,7 @@ static __always_inline void allow_user_access(void __user *to, const void __user
 	kuap_unlock_one((__force u32)to);
 }
 
-static __always_inline void prevent_user_access(unsigned long dir)
+static __always_inline void __prevent_user_access(unsigned long dir)
 {
 	u32 kuap = current->thread.kuap;
 
@@ -162,7 +162,7 @@ static __always_inline void prevent_user_access(unsigned long dir)
 	kuap_lock(kuap, true);
 }
 
-static inline unsigned long prevent_user_access_return(void)
+static inline unsigned long __prevent_user_access_return(void)
 {
 	unsigned long flags = current->thread.kuap;
 
@@ -177,7 +177,7 @@ static inline unsigned long prevent_user_access_return(void)
 	return flags;
 }
 
-static inline void restore_user_access(unsigned long flags)
+static inline void __restore_user_access(unsigned long flags)
 {
 	if (kuap_is_disabled())
 		return;
@@ -189,7 +189,7 @@ static inline void restore_user_access(unsigned long flags)
 }
 
 static inline bool
-bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) +__bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { unsigned long kuap = regs->kuap; diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 170339969b7c..03d61c5205a4 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -268,8 +268,7 @@ static inline void kuap_user_restore(struct pt_regs *regs) */ } -static inline void kuap_kernel_restore(struct pt_regs *regs, - unsigned long amr) +static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) { if (unlikely(regs->amr != amr)) { @@ -287,7 +286,7 @@ static inline void kuap_kernel_restore(struct pt_regs *regs, */ } -static inline unsigned long kuap_get_and_assert_locked(void) +static inline unsigned long __kuap_get_and_assert_locked(void) { if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) { unsigned long amr = mfspr(SPRN_AMR); @@ -298,7 +297,7 @@ static inline unsigned long kuap_get_and_assert_locked(void) return 0; } -static inline void kuap_assert_locked(void) +static inline void __kuap_assert_locked(void) { if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED); @@ -339,8 +338,7 @@ static inline void set_kuap(unsigned long value) isync(); } -static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, - bool is_write) +static inline bool __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) return false; diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index fa8513b7acca..f2a6fdb45d33 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -41,17 +41,17 @@ void setup_kuap(bool disabled); static inline void setup_kuap(bool disabled) { } static inline bool -bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) +__bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { return false; } -static inline void kuap_assert_locked(void) { } -static inline void kuap_save_and_lock(struct pt_regs *regs) { } +static inline void __kuap_assert_locked(void) { } +static inline void __kuap_save_and_lock(struct pt_regs *regs) { } static inline void kuap_user_restore(struct pt_regs *regs) { } -static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { } +static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { } -static inline unsigned long kuap_get_and_assert_locked(void) +static inline unsigned long __kuap_get_and_assert_locked(void) { return 0; } @@ -62,14 +62,65 @@ static inline unsigned long kuap_get_and_assert_locked(void) * platforms. 
*/ #ifndef CONFIG_PPC_BOOK3S_64 -static inline void allow_user_access(void __user *to, const void __user *from, - unsigned long size, unsigned long dir) { } -static inline void prevent_user_access(unsigned long dir) { } -static inline unsigned long prevent_user_access_return(void) { return 0UL; } -static inline void restore_user_access(unsigned long flags) { } +static inline void __allow_user_access(void __user *to, const void __user *from, + unsigned long size, unsigned long dir) { } +static inline void __prevent_user_access(unsigned long dir) { } +static inline unsigned long __prevent_user_access_return(void) { return 0UL; } +static inline void __restore_user_access(unsigned long flags) { } #endif /* CONFIG_PPC_BOOK3S_64 */ #endif /* CONFIG_PPC_KUAP */ +static __always_inline bool +bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) +{ + return __bad_kuap_fault(regs, address, is_write); +} + +static __always_inline void kuap_assert_locked(void) +{ + __kuap_assert_locked(); +} + +#ifdef CONFIG_PPC32 +static __always_inline void kuap_save_and_lock(struct pt_regs *regs) +{ + __kuap_save_and_lock(regs); +} +#endif + +static __always_inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) +{ + __kuap_kernel_restore(regs, amr); +} + +static __always_inline unsigned long kuap_get_and_assert_locked(void) +{ + return __kuap_get_and_assert_locked(); +} + +#ifndef CONFIG_PPC_BOOK3S_64 +static __always_inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size, unsigned long dir) +{ + __allow_user_access(to, from, size, dir); +} + +static __always_inline void prevent_user_access(unsigned long dir) +{ + __prevent_user_access(dir); +} + +static __always_inline unsigned long prevent_user_access_return(void) +{ + return __prevent_user_access_return(); +} + +static __always_inline void restore_user_access(unsigned long flags) +{ + __restore_user_access(flags); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + static __always_inline void allow_read_from_user(const void __user *from, unsigned long size) { barrier_nospec(); diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 882a0bc7887a..a5db84164afd 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -20,7 +20,7 @@ static __always_inline bool kuap_is_disabled(void) return static_branch_unlikely(&disable_kuap_key); } -static inline void kuap_save_and_lock(struct pt_regs *regs) +static inline void __kuap_save_and_lock(struct pt_regs *regs) { if (kuap_is_disabled()) return; @@ -33,7 +33,7 @@ static inline void kuap_user_restore(struct pt_regs *regs) { } -static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) +static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) { if (kuap_is_disabled()) return; @@ -41,7 +41,7 @@ static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) mtspr(SPRN_MD_AP, regs->kuap); } -static inline unsigned long kuap_get_and_assert_locked(void) +static inline unsigned long __kuap_get_and_assert_locked(void) { unsigned long kuap; @@ -56,14 +56,14 @@ static inline unsigned long kuap_get_and_assert_locked(void) return kuap; } -static inline void kuap_assert_locked(void) +static inline void __kuap_assert_locked(void) { if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && !kuap_is_disabled()) kuap_get_and_assert_locked(); } -static inline void allow_user_access(void __user *to, const void __user 
*from, - unsigned long size, unsigned long dir) +static inline void __allow_user_access(void __user *to, const void __user *from, + unsigned long size, unsigned long dir) { if (kuap_is_disabled()) return; @@ -71,7 +71,7 @@ static inline void allow_user_access(void __user *to, const void __user *from, mtspr(SPRN_MD_AP, MD_APG_INIT); } -static inline void prevent_user_access(unsigned long dir) +static inline void __prevent_user_access(unsigned long dir) { if (kuap_is_disabled()) return; @@ -79,7 +79,7 @@ static inline void prevent_user_access(unsigned long dir) mtspr(SPRN_MD_AP, MD_APG_KUAP); } -static inline unsigned long prevent_user_access_return(void) +static inline unsigned long __prevent_user_access_return(void) { unsigned long flags; @@ -93,7 +93,7 @@ static inline unsigned long prevent_user_access_return(void) return flags; } -static inline void restore_user_access(unsigned long flags) +static inline void __restore_user_access(unsigned long flags) { if (kuap_is_disabled()) return; @@ -102,7 +102,7 @@ static inline void restore_user_access(unsigned long flags) } static inline bool -bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) +__bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { if (kuap_is_disabled()) return false; From c252f3846d3114542c606618995e3cbc11775357 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:21 +0200 Subject: [PATCH 149/240] powerpc/kuap: Check KUAP activation in generic functions Today, every platform checks that KUAP is not de-activated before doing the real job. Move the verification out of platform specific functions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/894f110397fcd248e125fb855d1e863e4e633a0d.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/kup.h | 34 +++------------- arch/powerpc/include/asm/book3s/64/kup.h | 41 ++++++++++---------- arch/powerpc/include/asm/kup.h | 29 ++++++++++++++ arch/powerpc/include/asm/nohash/32/kup-8xx.h | 28 +------------ 4 files changed, 56 insertions(+), 76 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 9e9b2692070c..35ca48f7c293 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -13,11 +13,6 @@ extern struct static_key_false disable_kuap_key; -static __always_inline bool kuap_is_disabled(void) -{ - return !IS_ENABLED(CONFIG_PPC_KUAP) || static_branch_unlikely(&disable_kuap_key); -} - static __always_inline bool kuep_is_disabled(void) { return !IS_ENABLED(CONFIG_PPC_KUEP); @@ -30,6 +25,11 @@ static __always_inline bool kuep_is_disabled(void) #define KUAP_NONE (~0UL) #define KUAP_ALL (~1UL) +static __always_inline bool kuap_is_disabled(void) +{ + return static_branch_unlikely(&disable_kuap_key); +} + static inline void kuap_lock_one(unsigned long addr) { mtsr(mfsr(addr) | SR_KS, addr); @@ -81,9 +81,6 @@ static inline void __kuap_save_and_lock(struct pt_regs *regs) { unsigned long kuap = current->thread.kuap; - if (kuap_is_disabled()) - return; - regs->kuap = kuap; if (unlikely(kuap == KUAP_NONE)) return; @@ -98,9 +95,6 @@ static inline void kuap_user_restore(struct pt_regs *regs) static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) { - if (kuap_is_disabled()) - return; - if (unlikely(kuap != KUAP_NONE)) { current->thread.kuap = KUAP_NONE; kuap_lock(kuap, false); @@ -118,9 +112,6 @@ static inline unsigned long 
__kuap_get_and_assert_locked(void) { unsigned long kuap = current->thread.kuap; - if (kuap_is_disabled()) - return KUAP_NONE; - WARN_ON_ONCE(IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && kuap != KUAP_NONE); return kuap; @@ -134,9 +125,6 @@ static inline void __kuap_assert_locked(void) static __always_inline void __allow_user_access(void __user *to, const void __user *from, u32 size, unsigned long dir) { - if (kuap_is_disabled()) - return; - BUILD_BUG_ON(!__builtin_constant_p(dir)); if (!(dir & KUAP_WRITE)) @@ -150,9 +138,6 @@ static __always_inline void __prevent_user_access(unsigned long dir) { u32 kuap = current->thread.kuap; - if (kuap_is_disabled()) - return; - BUILD_BUG_ON(!__builtin_constant_p(dir)); if (!(dir & KUAP_WRITE)) @@ -166,9 +151,6 @@ static inline unsigned long __prevent_user_access_return(void) { unsigned long flags = current->thread.kuap; - if (kuap_is_disabled()) - return KUAP_NONE; - if (flags != KUAP_NONE) { current->thread.kuap = KUAP_NONE; kuap_lock(flags, true); @@ -179,9 +161,6 @@ static inline unsigned long __prevent_user_access_return(void) static inline void __restore_user_access(unsigned long flags) { - if (kuap_is_disabled()) - return; - if (flags != KUAP_NONE) { current->thread.kuap = flags; kuap_unlock(flags, true); @@ -193,9 +172,6 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { unsigned long kuap = regs->kuap; - if (kuap_is_disabled()) - return false; - if (!is_write || kuap == KUAP_ALL) return false; if (kuap == KUAP_NONE) diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 03d61c5205a4..9f2099790658 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -229,6 +229,11 @@ static inline u64 current_thread_iamr(void) #ifdef CONFIG_PPC_KUAP +static __always_inline bool kuap_is_disabled(void) +{ + return !mmu_has_feature(MMU_FTR_BOOK3S_KUAP); +} + static inline void kuap_user_restore(struct pt_regs *regs) { bool restore_amr = false, restore_iamr = false; @@ -270,36 +275,32 @@ static inline void kuap_user_restore(struct pt_regs *regs) static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { - if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) { - if (unlikely(regs->amr != amr)) { - isync(); - mtspr(SPRN_AMR, regs->amr); - /* - * No isync required here because we are about to rfi - * back to previous context before any user accesses - * would be made, which is a CSI. - */ - } - } + if (likely(regs->amr == amr)) + return; + + isync(); + mtspr(SPRN_AMR, regs->amr); /* + * No isync required here because we are about to rfi + * back to previous context before any user accesses + * would be made, which is a CSI. + * * No need to restore IAMR when returning to kernel space. 
*/ } static inline unsigned long __kuap_get_and_assert_locked(void) { - if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) { - unsigned long amr = mfspr(SPRN_AMR); - if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */ - WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED); - return amr; - } - return 0; + unsigned long amr = mfspr(SPRN_AMR); + + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */ + WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED); + return amr; } static inline void __kuap_assert_locked(void) { - if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED); } @@ -340,8 +341,6 @@ static inline void set_kuap(unsigned long value) static inline bool __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { - if (!mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) - return false; /* * For radix this will be a storage protection fault (DSISR_PROTFAULT). * For hash this will be a key fault (DSISR_KEYFAULT) diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index f2a6fdb45d33..33e93a6c5d19 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -40,6 +40,8 @@ void setup_kuap(bool disabled); #else static inline void setup_kuap(bool disabled) { } +static __always_inline bool kuap_is_disabled(void) { return true; } + static inline bool __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { @@ -73,28 +75,43 @@ static inline void __restore_user_access(unsigned long flags) { } static __always_inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { + if (kuap_is_disabled()) + return false; + return __bad_kuap_fault(regs, address, is_write); } static __always_inline void kuap_assert_locked(void) { + if (kuap_is_disabled()) + return; + __kuap_assert_locked(); } #ifdef CONFIG_PPC32 static __always_inline void kuap_save_and_lock(struct pt_regs *regs) { + if (kuap_is_disabled()) + return; + __kuap_save_and_lock(regs); } #endif static __always_inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { + if (kuap_is_disabled()) + return; + __kuap_kernel_restore(regs, amr); } static __always_inline unsigned long kuap_get_and_assert_locked(void) { + if (kuap_is_disabled()) + return 0; + return __kuap_get_and_assert_locked(); } @@ -102,21 +119,33 @@ static __always_inline unsigned long kuap_get_and_assert_locked(void) static __always_inline void allow_user_access(void __user *to, const void __user *from, unsigned long size, unsigned long dir) { + if (kuap_is_disabled()) + return; + __allow_user_access(to, from, size, dir); } static __always_inline void prevent_user_access(unsigned long dir) { + if (kuap_is_disabled()) + return; + __prevent_user_access(dir); } static __always_inline unsigned long prevent_user_access_return(void) { + if (kuap_is_disabled()) + return 0; + return __prevent_user_access_return(); } static __always_inline void restore_user_access(unsigned long flags) { + if (kuap_is_disabled()) + return; + __restore_user_access(flags); } #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index a5db84164afd..74f15c386476 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -22,9 +22,6 @@ static __always_inline bool kuap_is_disabled(void) static inline void __kuap_save_and_lock(struct pt_regs *regs) { - if 
(kuap_is_disabled()) - return; - regs->kuap = mfspr(SPRN_MD_AP); mtspr(SPRN_MD_AP, MD_APG_KUAP); } @@ -35,9 +32,6 @@ static inline void kuap_user_restore(struct pt_regs *regs) static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) { - if (kuap_is_disabled()) - return; - mtspr(SPRN_MD_AP, regs->kuap); } @@ -45,9 +39,6 @@ static inline unsigned long __kuap_get_and_assert_locked(void) { unsigned long kuap; - if (kuap_is_disabled()) - return MD_APG_INIT; - kuap = mfspr(SPRN_MD_AP); if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) @@ -58,24 +49,18 @@ static inline unsigned long __kuap_get_and_assert_locked(void) static inline void __kuap_assert_locked(void) { - if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && !kuap_is_disabled()) - kuap_get_and_assert_locked(); + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) + __kuap_get_and_assert_locked(); } static inline void __allow_user_access(void __user *to, const void __user *from, unsigned long size, unsigned long dir) { - if (kuap_is_disabled()) - return; - mtspr(SPRN_MD_AP, MD_APG_INIT); } static inline void __prevent_user_access(unsigned long dir) { - if (kuap_is_disabled()) - return; - mtspr(SPRN_MD_AP, MD_APG_KUAP); } @@ -83,9 +68,6 @@ static inline unsigned long __prevent_user_access_return(void) { unsigned long flags; - if (kuap_is_disabled()) - return MD_APG_INIT; - flags = mfspr(SPRN_MD_AP); mtspr(SPRN_MD_AP, MD_APG_KUAP); @@ -95,18 +77,12 @@ static inline unsigned long __prevent_user_access_return(void) static inline void __restore_user_access(unsigned long flags) { - if (kuap_is_disabled()) - return; - mtspr(SPRN_MD_AP, flags); } static inline bool __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { - if (kuap_is_disabled()) - return false; - return !((regs->kuap ^ MD_APG_KUAP) & 0xff000000); } From 2341964e27b02b2ca1deef8a18df59d1db7b9085 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:22 +0200 Subject: [PATCH 150/240] powerpc/kuap: Remove __kuap_assert_locked() __kuap_assert_locked() is redundant with __kuap_get_and_assert_locked(). Move the verification of CONFIG_PPC_KUAP_DEBUG in kuap_assert_locked() and make it call __kuap_get_and_assert_locked() directly. 
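The generic wrapper then carries the whole check itself; a sketch of the
result, matching the kup.h hunk below:

	static __always_inline void kuap_assert_locked(void)
	{
		if (kuap_is_disabled())
			return;

		if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
			__kuap_get_and_assert_locked();
	}

so the per-platform __kuap_assert_locked() implementations can simply be
deleted.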
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/1a60198a25d2ba38a37f1b92bc7d096435df4224.1634627931.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/book3s/32/kup.h     | 5 -----
 arch/powerpc/include/asm/book3s/64/kup.h     | 6 ------
 arch/powerpc/include/asm/kup.h               | 3 ++-
 arch/powerpc/include/asm/nohash/32/kup-8xx.h | 6 ------
 4 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 35ca48f7c293..bc245e9f0bcc 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -117,11 +117,6 @@ static inline unsigned long __kuap_get_and_assert_locked(void)
 	return kuap;
 }
 
-static inline void __kuap_assert_locked(void)
-{
-	__kuap_get_and_assert_locked();
-}
-
 static __always_inline void __allow_user_access(void __user *to, const void __user *from,
 						u32 size, unsigned long dir)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index 9f2099790658..503828709d55 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -298,12 +298,6 @@ static inline unsigned long __kuap_get_and_assert_locked(void)
 	return amr;
 }
 
-static inline void __kuap_assert_locked(void)
-{
-	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
-		WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
-}
-
 /*
 * We support individually allowing read or write, but we don't support nesting
 * because that would require an expensive read/modify write of the AMR.
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 33e93a6c5d19..5d3c1e8060f9 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -86,7 +86,8 @@ static __always_inline void kuap_assert_locked(void)
 	if (kuap_is_disabled())
 		return;
 
-	__kuap_assert_locked();
+	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+		__kuap_get_and_assert_locked();
 }
 
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 74f15c386476..37fe4b32b658 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -47,12 +47,6 @@ static inline unsigned long __kuap_get_and_assert_locked(void)
 	return kuap;
 }
 
-static inline void __kuap_assert_locked(void)
-{
-	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
-		__kuap_get_and_assert_locked();
-}
-
 static inline void __allow_user_access(void __user *to, const void __user *from,
 				       unsigned long size, unsigned long dir)
 {

From 937fb7003ee1f37faed1f1a4ece46e8a14863d92 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 19 Oct 2021 09:29:23 +0200
Subject: [PATCH 151/240] powerpc/kuap: Add kuap_lock()

Add kuap_lock() and call it when entering interrupts from user.

It is called kuap_lock() as it is similar to kuap_save_and_lock()
without the save.

However, book3s/32 already has a kuap_lock(); rename that one
kuap_lock_addr().
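The interrupt entry code then distinguishes the two cases along these
lines (a simplified sketch of the interrupt_enter_prepare() change
below):

	if (user_mode(regs))
		kuap_lock();			/* coming from user: nothing to save */
	else
		kuap_save_and_lock(regs);	/* coming from kernel: save state, then lock */

On platforms where entry from user already implies a locked state, the
new __kuap_lock() hook stays an empty stub.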
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4437e2deb9f6f549f7089d45e9c6f96a7e77905a.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/kup.h | 14 +++++++++----- arch/powerpc/include/asm/interrupt.h | 5 ++++- arch/powerpc/include/asm/kup.h | 9 +++++++++ arch/powerpc/include/asm/nohash/32/kup-8xx.h | 4 ++++ arch/powerpc/kernel/interrupt.c | 2 ++ 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index bc245e9f0bcc..678f9c9d89b6 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -57,7 +57,7 @@ static inline void kuap_unlock_all(void) void kuap_lock_all_ool(void); void kuap_unlock_all_ool(void); -static inline void kuap_lock(unsigned long addr, bool ool) +static inline void kuap_lock_addr(unsigned long addr, bool ool) { if (likely(addr != KUAP_ALL)) kuap_lock_one(addr); @@ -77,6 +77,10 @@ static inline void kuap_unlock(unsigned long addr, bool ool) kuap_unlock_all_ool(); } +static inline void __kuap_lock(void) +{ +} + static inline void __kuap_save_and_lock(struct pt_regs *regs) { unsigned long kuap = current->thread.kuap; @@ -86,7 +90,7 @@ static inline void __kuap_save_and_lock(struct pt_regs *regs) return; current->thread.kuap = KUAP_NONE; - kuap_lock(kuap, false); + kuap_lock_addr(kuap, false); } static inline void kuap_user_restore(struct pt_regs *regs) @@ -97,7 +101,7 @@ static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kua { if (unlikely(kuap != KUAP_NONE)) { current->thread.kuap = KUAP_NONE; - kuap_lock(kuap, false); + kuap_lock_addr(kuap, false); } if (likely(regs->kuap == KUAP_NONE)) @@ -139,7 +143,7 @@ static __always_inline void __prevent_user_access(unsigned long dir) return; current->thread.kuap = KUAP_NONE; - kuap_lock(kuap, true); + kuap_lock_addr(kuap, true); } static inline unsigned long __prevent_user_access_return(void) @@ -148,7 +152,7 @@ static inline unsigned long __prevent_user_access_return(void) if (flags != KUAP_NONE) { current->thread.kuap = KUAP_NONE; - kuap_lock(flags, true); + kuap_lock_addr(flags, true); } return flags; diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 94cc9366f3f0..50d891e4c08c 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -140,9 +140,12 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup trace_hardirqs_off(); if (user_mode(regs)) - account_cpu_user_entry(); + kuap_lock(); else kuap_save_and_lock(regs); + + if (user_mode(regs)) + account_cpu_user_entry(); #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 5d3c1e8060f9..34574a7455ce 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -49,6 +49,7 @@ __bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) } static inline void __kuap_assert_locked(void) { } +static inline void __kuap_lock(void) { } static inline void __kuap_save_and_lock(struct pt_regs *regs) { } static inline void kuap_user_restore(struct pt_regs *regs) { } static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { } @@ -91,6 +92,14 @@ static __always_inline void kuap_assert_locked(void) } #ifdef CONFIG_PPC32 +static __always_inline void kuap_lock(void) +{ + if (kuap_is_disabled()) + return; + + __kuap_lock(); +} + 
static __always_inline void kuap_save_and_lock(struct pt_regs *regs) { if (kuap_is_disabled()) diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 37fe4b32b658..c44d97751723 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -20,6 +20,10 @@ static __always_inline bool kuap_is_disabled(void) return static_branch_unlikely(&disable_kuap_key); } +static inline void __kuap_lock(void) +{ +} + static inline void __kuap_save_and_lock(struct pt_regs *regs) { regs->kuap = mfspr(SPRN_MD_AP); diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 75dc045bdcb8..beb55bc92ffe 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -81,6 +81,8 @@ notrace long system_call_exception(long r3, long r4, long r5, { syscall_fn f; + kuap_lock(); + regs->orig_gpr3 = r3; if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) From 25ae981fafaa140a12e4c830992b4fe997071124 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:24 +0200 Subject: [PATCH 152/240] powerpc/nohash: Move setup_kuap out of 8xx.c In order to reuse it on booke/4xx, move the KUAP setup routine out of 8xx.c. Make it usable on SMP by removing the __init tag, as it is called for each CPU. And use __prevent_user_access() instead of hard-coding the initial lock. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ae35eec3426509efc2b8ae69586c822e2fe2642a.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/nohash/8xx.c | 21 --------------------- arch/powerpc/mm/nohash/Makefile | 2 +- arch/powerpc/mm/nohash/kup.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 arch/powerpc/mm/nohash/kup.c diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 36010d1c0bc4..27f9186ae374 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -8,11 +8,7 @@ */ #include -#include #include -#include -#include -#include #include @@ -212,23 +208,6 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M)); } -#ifdef CONFIG_PPC_KUAP -struct static_key_false disable_kuap_key; -EXPORT_SYMBOL(disable_kuap_key); - -void setup_kuap(bool disabled) -{ - if (disabled) { - static_branch_enable(&disable_kuap_key); - return; - } - - pr_info("Activating Kernel Userspace Access Protection\n"); - - mtspr(SPRN_MD_AP, MD_APG_KUAP); -} -#endif - int pud_clear_huge(pud_t *pud) { return 0; diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b1f630d423d8..b467a25ee155 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -2,7 +2,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y += mmu_context.o tlb.o tlb_low.o +obj-y += mmu_context.o tlb.o tlb_low.o kup.o obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_40x) += 40x.o obj-$(CONFIG_44x) += 44x.o diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c new file mode 100644 index 000000000000..eaea52231dd6 --- /dev/null +++ b/arch/powerpc/mm/nohash/kup.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains the routines for initializing kernel userspace protection + */ + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_PPC_KUAP +struct 
static_key_false disable_kuap_key; +EXPORT_SYMBOL(disable_kuap_key); + +void setup_kuap(bool disabled) +{ + if (disabled) { + if (smp_processor_id() == boot_cpuid) + static_branch_enable(&disable_kuap_key); + return; + } + + pr_info("Activating Kernel Userspace Access Protection\n"); + + __prevent_user_access(KUAP_READ_WRITE); +} +#endif From 047a6fd40199eb55ffd18091f7ceae9743d972bf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:25 +0200 Subject: [PATCH 153/240] powerpc/config: Add CONFIG_BOOKE_OR_40x We have many functionalities common to 40x and BOOKE, which leads to many places with #if defined(CONFIG_BOOKE) || defined(CONFIG_40x). We are going to add a few more with KUAP for booke/40x, so create a new symbol which is defined when either BOOKE or 40x is defined. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9a3dbd60924cb25c9f944d3d8205ac5a0d15e229.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 8 ++++---- arch/powerpc/include/asm/irq.h | 2 +- arch/powerpc/include/asm/ptrace.h | 2 +- arch/powerpc/include/asm/reg.h | 4 ++-- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/kgdb.c | 4 ++-- arch/powerpc/kernel/setup.h | 2 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/platforms/Kconfig.cputype | 5 +++++ 12 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 5c98a950eca0..7a2690e97b0e 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -61,7 +61,7 @@ static inline void __hard_irq_enable(void) { - if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x)) + if (IS_ENABLED(CONFIG_BOOKE_OR_40x)) wrtee(MSR_EE); else if (IS_ENABLED(CONFIG_PPC_8xx)) wrtspr(SPRN_EIE); @@ -73,7 +73,7 @@ static inline void __hard_irq_enable(void) static inline void __hard_irq_disable(void) { - if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x)) + if (IS_ENABLED(CONFIG_BOOKE_OR_40x)) wrtee(0); else if (IS_ENABLED(CONFIG_PPC_8xx)) wrtspr(SPRN_EID); @@ -85,7 +85,7 @@ static inline void __hard_irq_disable(void) static inline void __hard_EE_RI_disable(void) { - if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x)) + if (IS_ENABLED(CONFIG_BOOKE_OR_40x)) wrtee(0); else if (IS_ENABLED(CONFIG_PPC_8xx)) wrtspr(SPRN_NRI); @@ -97,7 +97,7 @@ static inline void __hard_EE_RI_disable(void) static inline void __hard_RI_enable(void) { - if (IS_ENABLED(CONFIG_BOOKE) || IS_ENABLED(CONFIG_40x)) + if (IS_ENABLED(CONFIG_BOOKE_OR_40x)) return; if (IS_ENABLED(CONFIG_PPC_8xx)) diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 2b3278534bc1..13f0409dd617 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -36,7 +36,7 @@ extern int distribute_irqs; struct pt_regs; -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x /* * Per-cpu stacks for handling critical, debug and machine check * level interrupts. 
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 6e560f035614..42f89e2d8f04 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -291,7 +291,7 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) static inline bool cpu_has_msr_ri(void) { - return !IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x); + return !IS_ENABLED(CONFIG_BOOKE_OR_40x); } static inline bool regs_is_unrecoverable(struct pt_regs *regs) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e9d27265253b..50478738c8f1 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -18,9 +18,9 @@ #include /* Pickup Book E specific registers. */ -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x #include -#endif /* CONFIG_BOOKE || CONFIG_40x */ +#endif #ifdef CONFIG_FSL_EMB_PERFMON #include diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index cf3436b7b166..7582f3e3a330 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -54,7 +54,7 @@ #endif #ifdef CONFIG_PPC32 -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x #include "head_booke.h" #endif #endif diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 035bf4f3eb5d..7748c278d13c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -105,7 +105,7 @@ transfer_to_syscall: stw r11, 0(r1) mflr r12 stw r12, _LINK(r1) -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #endif lis r12,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c4f1d6b7d992..8207f97d51e8 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -811,7 +811,7 @@ void __init init_IRQ(void) ppc_md.init_IRQ(); } -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x void *critirq_ctx[NR_CPUS] __read_mostly; void *dbgirq_ctx[NR_CPUS] __read_mostly; void *mcheckirq_ctx[NR_CPUS] __read_mostly; diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index bdee7262c080..9f8d0fa7b718 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -48,7 +48,7 @@ static struct hard_trap_info { 0x0800, 0x08 /* SIGFPE */ }, /* fp unavailable */ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */ { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */ -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +#ifdef CONFIG_BOOKE_OR_40x { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */ #if defined(CONFIG_FSL_BOOKE) { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */ @@ -67,7 +67,7 @@ static struct hard_trap_info { 0x2010, 0x08 /* SIGFPE */ }, /* fp unavailable */ { 0x2020, 0x08 /* SIGFPE */ }, /* ap unavailable */ #endif -#else /* ! 
(defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */ +#else /* !CONFIG_BOOKE_OR_40x */ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ #if defined(CONFIG_PPC_8xx) { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index 84058bbc8fe9..93f22da12abe 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -29,7 +29,7 @@ void setup_tlb_core_data(void); static inline void setup_tlb_core_data(void) { } #endif -#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x void exc_lvl_early_init(void); #else static inline void exc_lvl_early_init(void) { } diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 7ec5c47fce0e..15e7386584f9 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -175,7 +175,7 @@ void __init emergency_stack_init(void) } #endif -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x void __init exc_lvl_early_init(void) { unsigned int i, hw_cpu; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index f7cddb82938f..42df9dd7fb41 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -770,7 +770,7 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#ifdef CONFIG_BOOKE_OR_40x unsigned int tcr; /* Clear any pending timer interrupts */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 408d8ee5bfcd..24b6ee689239 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -287,6 +287,11 @@ config BOOKE depends on E500 || 44x || PPC_BOOK3E default y +config BOOKE_OR_40x + bool + depends on BOOKE || 40x + default y + config FSL_BOOKE bool depends on E500 && PPC32 From 42e03bc5240b75007682d9941ef672d12828fc70 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:26 +0200 Subject: [PATCH 154/240] powerpc/kuap: Prepare for supporting KUAP on BOOK3E/64 Also call kuap_lock() and kuap_save_and_lock() from the interrupt functions when CONFIG_PPC64 is set. For book3s/64 we keep them empty, as this is done in assembly. Also do the locked assert when switching tasks, unless it is book3s/64. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1cbf94e26e6d6e2e028fd687588a7e6622d454a6.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/kup.h | 9 +++++++++ arch/powerpc/include/asm/interrupt.h | 2 ++ arch/powerpc/include/asm/kup.h | 2 -- arch/powerpc/kernel/process.c | 6 +++--- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 503828709d55..69fcf63eec94 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -298,6 +298,15 @@ static inline unsigned long __kuap_get_and_assert_locked(void) return amr; } +/* Do nothing, book3s/64 does that in ASM */ +static inline void __kuap_lock(void) +{ +} + +static inline void __kuap_save_and_lock(struct pt_regs *regs) +{ +} + /* * We support individually allowing read or write, but we don't support nesting * because that would require an expensive read/modify write of the AMR. 
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 50d891e4c08c..6d414ddc8e24 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -154,12 +154,14 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup local_paca->irq_happened |= PACA_IRQ_HARD_DIS; if (user_mode(regs)) { + kuap_lock(); CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit_irqoff(); account_cpu_user_entry(); account_stolen_time(); } else { + kuap_save_and_lock(regs); /* * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 34574a7455ce..656e6f1d6b6f 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -91,7 +91,6 @@ static __always_inline void kuap_assert_locked(void) __kuap_get_and_assert_locked(); } -#ifdef CONFIG_PPC32 static __always_inline void kuap_lock(void) { if (kuap_is_disabled()) @@ -107,7 +106,6 @@ static __always_inline void kuap_save_and_lock(struct pt_regs *regs) __kuap_save_and_lock(regs); } -#endif static __always_inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a64cfbb85ca2..afdcc2d3d470 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1315,9 +1315,9 @@ struct task_struct *__switch_to(struct task_struct *prev, set_return_regs_changed(); /* _switch changes stack (and regs) */ -#ifdef CONFIG_PPC32 - kuap_assert_locked(); -#endif + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + kuap_assert_locked(); + last = _switch(old_thread, new_thread); /* From e3c02f25b4296c48376b8edb6aadcec460e803bc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:27 +0200 Subject: [PATCH 155/240] powerpc/kuap: Make PPC_KUAP_DEBUG depend on PPC_KUAP only PPC_KUAP_DEBUG is supported by all platforms doing PPC_KUAP; it doesn't depend on Radix on book3s/64. This will avoid adding one more dependency when implementing KUAP on book3e/64. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a5ff6228a36e51783b83d8c10d058db76e450f63.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/Kconfig.cputype | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 24b6ee689239..95eb7308fdd9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -446,7 +446,7 @@ config PPC_KUAP config PPC_KUAP_DEBUG bool "Extra debugging for Kernel Userspace Access Protection" - depends on PPC_KUAP && (PPC_RADIX_MMU || PPC32) + depends on PPC_KUAP help Add extra debugging for Kernel Userspace Access Protection (KUAP) If you're unsure, say N. From 43afcf8f0101279cf4243bb4f9f9b249ddd8613c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:28 +0200 Subject: [PATCH 156/240] powerpc: Add KUAP support for BOOKE and 40x On booke/40x we don't have segments like book3s/32. On booke/40x we don't have access protection groups like 8xx. Use the PID register to provide user access protection. Kernel address space can be accessed with any PID. User address space has to be accessed with the PID of the user. The user PID is never null. Every time the kernel is entered, set the PID register to 0 and restore the PID register when returning to user. 
Every time the kernel needs to access user data, the PID is restored for the access. In TLB miss handlers, check the PID and bail out to the data storage exception when the PID is 0 and the accessed address is in user space. Note that this also forbids execution of user text by the kernel except when user access is unlocked. But this shouldn't be a problem, as the kernel is not supposed to ever run user text. This patch prepares the infrastructure, but the real activation of KUAP is done by the following patches, one processor type at a time. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5d65576a8e31e9480415785a180c92dd4e72306d.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/kup.h | 4 + arch/powerpc/include/asm/nohash/kup-booke.h | 110 ++++++++++++++++++++ arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/kernel/process.c | 3 + arch/powerpc/mm/mmu_context.c | 6 ++ arch/powerpc/mm/nohash/mmu_context.c | 6 +- 6 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/nohash/kup-booke.h diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 656e6f1d6b6f..fb2237809d63 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -14,6 +14,10 @@ #include #endif +#ifdef CONFIG_BOOKE_OR_40x +#include +#endif + #ifdef CONFIG_PPC_BOOK3S_32 #include #endif diff --git a/arch/powerpc/include/asm/nohash/kup-booke.h b/arch/powerpc/include/asm/nohash/kup-booke.h new file mode 100644 index 000000000000..49bb41ed0816 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/kup-booke.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_BOOKE_H_ +#define _ASM_POWERPC_KUP_BOOKE_H_ + +#include + +#ifdef CONFIG_PPC_KUAP + +#ifdef __ASSEMBLY__ + +.macro kuap_check_amr gpr1, gpr2 +.endm + +#else + +#include +#include + +#include + +extern struct static_key_false disable_kuap_key; + +static __always_inline bool kuap_is_disabled(void) +{ + return static_branch_unlikely(&disable_kuap_key); +} + +static inline void __kuap_lock(void) +{ + mtspr(SPRN_PID, 0); + isync(); +} + +static inline void __kuap_save_and_lock(struct pt_regs *regs) +{ + regs->kuap = mfspr(SPRN_PID); + mtspr(SPRN_PID, 0); + isync(); +} + +static inline void kuap_user_restore(struct pt_regs *regs) +{ + if (kuap_is_disabled()) + return; + + mtspr(SPRN_PID, current->thread.pid); + + /* Context synchronisation is performed by rfi */ +} + +static inline void __kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap) +{ + if (regs->kuap) + mtspr(SPRN_PID, current->thread.pid); + + /* Context synchronisation is performed by rfi */ +} + +static inline unsigned long __kuap_get_and_assert_locked(void) +{ + unsigned long kuap = mfspr(SPRN_PID); + + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) + WARN_ON_ONCE(kuap); + + return kuap; +} + +static inline void __allow_user_access(void __user *to, const void __user *from, + unsigned long size, unsigned long dir) +{ + mtspr(SPRN_PID, current->thread.pid); + isync(); +} + +static inline void __prevent_user_access(unsigned long dir) +{ + mtspr(SPRN_PID, 0); + isync(); +} + +static inline unsigned long __prevent_user_access_return(void) +{ + unsigned long flags = mfspr(SPRN_PID); + + mtspr(SPRN_PID, 0); + isync(); + + return flags; +} + +static inline void __restore_user_access(unsigned long flags) +{ + if (flags) { + mtspr(SPRN_PID, current->thread.pid); + isync(); + } +} + +static inline bool +__bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool 
is_write) +{ + return !regs->kuap; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* _ASM_POWERPC_KUP_BOOKE_H_ */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fe1ef1d7523b..2c8686d9e964 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -160,6 +160,9 @@ struct thread_struct { unsigned long sr0; #endif #endif /* CONFIG_PPC32 */ +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + unsigned long pid; /* value written in PID reg. at interrupt exit */ +#endif /* Debug Registers */ struct debug_reg debug; #ifdef CONFIG_PPC_FPU_REGS diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index afdcc2d3d470..790790dfb390 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1803,6 +1803,9 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, #if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) p->thread.kuap = KUAP_NONE; #endif +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + p->thread.pid = MMU_NO_CONTEXT; +#endif setup_ksp_vsid(p, sp); diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index e618d5442a28..735c36f26388 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -21,6 +21,9 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, #ifdef CONFIG_PPC_BOOK3S_32 tsk->thread.sr0 = mm->context.sr0; #endif +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = mm->context.id; +#endif } #elif defined(CONFIG_PPC_BOOK3E_64) static inline void switch_mm_pgdir(struct task_struct *tsk, @@ -28,6 +31,9 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, { /* 64-bit Book3E keeps track of current PGD in the PACA */ get_paca()->pgd = mm->pgd; +#ifdef CONFIG_PPC_KUAP + tsk->thread.pid = mm->context.id; +#endif } #else static inline void switch_mm_pgdir(struct task_struct *tsk, diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 44b2b5e7cabe..85b048f04c56 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -217,7 +218,7 @@ static void set_context(unsigned long id, pgd_t *pgd) /* sync */ mb(); - } else { + } else if (kuap_is_disabled()) { if (IS_ENABLED(CONFIG_40x)) mb(); /* sync */ @@ -305,6 +306,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, if (IS_ENABLED(CONFIG_BDI_SWITCH)) abatron_pteptrs[1] = next->pgd; set_context(id, next->pgd); +#if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP) + tsk->thread.pid = id; +#endif raw_spin_unlock(&context_lock); } From f6fad4fb55936f0d613cea08341d187d691d6440 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:29 +0200 Subject: [PATCH 157/240] powerpc/kuap: Wire-up KUAP on 44x This adds KUAP support to 44x. This is done by checking the content of SPRN_PID at the time it is read and written into SPRN_MMUCR. 
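A minimal user-space model of the PID-based scheme the two patches above describe may help: PID 0 locks user access, the user's non-null PID opens it, and a user-address TLB miss seen with PID 0 is treated as a KUAP fault. Everything below (the address split, the variables, the _model suffixed functions) is a hypothetical stand-in, not kernel code:

#include <stdbool.h>
#include <stdio.h>

#define USER_SPACE_TOP 0x80000000UL	/* illustrative user/kernel split */

static unsigned long spr_pid;		/* stands in for the PID SPR */
static unsigned long current_pid = 42;	/* the user context PID, never 0 */

static void kuap_lock_model(void)	{ spr_pid = 0; }	   /* kernel entry */
static void allow_user_model(void)	{ spr_pid = current_pid; } /* open window */
static void prevent_user_model(void)	{ spr_pid = 0; }	   /* close window */

/* The TLB-miss-time check: a user address with PID 0 is a KUAP fault. */
static bool tlb_miss_is_kuap_fault(unsigned long ea)
{
	return ea < USER_SPACE_TOP && spr_pid == 0;
}

int main(void)
{
	kuap_lock_model();
	printf("locked, access to 0x1000: fault=%d\n",
	       tlb_miss_is_kuap_fault(0x1000));
	allow_user_model();
	printf("open, access to 0x1000: fault=%d\n",
	       tlb_miss_is_kuap_fault(0x1000));
	prevent_user_model();
	return 0;
}

Because the user PID is never null, a zero PID unambiguously means "user access locked", which is what lets the TLB miss handlers use a simple compare-with-zero.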
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7d6c3f1978a26feada74b084f651e8cf1e3b3a47.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_44x.S | 16 ++++++++++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 17 insertions(+) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 916f7e91c6de..b73a56466903 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -334,6 +334,10 @@ interrupt_base: mfspr r12,SPRN_MMUCR mfspr r13,SPRN_PID /* Get PID */ rlwimi r12,r13,0,24,31 /* Set TID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r13,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 @@ -444,6 +448,10 @@ interrupt_base: mfspr r12,SPRN_MMUCR mfspr r13,SPRN_PID /* Get PID */ rlwimi r12,r13,0,24,31 /* Set TID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r13,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 @@ -572,6 +580,10 @@ finish_tlb_load_44x: 3: mfspr r11,SPRN_SPRG3 lwz r11,PGDIR(r11) mfspr r12,SPRN_PID /* Get PID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r12,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ /* Mask of required permission bits. Note that while we @@ -669,6 +681,10 @@ finish_tlb_load_44x: 3: mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) mfspr r12,SPRN_PID /* Get PID */ +#ifdef CONFIG_PPC_KUAP + cmpwi r12,0 + beq 2f /* KUAP Fault */ +#endif 4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ /* Make up the required permissions */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 95eb7308fdd9..3f00e75edf70 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -64,6 +64,7 @@ config 44x select PHYS_64BIT select PPC_HAVE_KUEP select PPC_KUEP + select PPC_HAVE_KUAP endchoice From fcf9bb6d32f8a268bc3daf3281e3beefabec4e7c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:30 +0200 Subject: [PATCH 158/240] powerpc/kuap: Wire-up KUAP on 40x This adds KUAP support to 40x. This is done by checking the content of SPRN_PID at the time the user pgtable is loaded. 40x doesn't have KUEP, but KUAP implies KUEP because when the PID doesn't match the page's PID, the page can be neither read nor executed. So KUEP is now automatically selected when KUAP is selected, and disabled when KUAP is disabled. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/aaefa91897ddc42ac11019dc0e1d1a525bd08e90.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_40x.S | 8 ++++++++ arch/powerpc/mm/nohash/kup.c | 2 ++ arch/powerpc/platforms/Kconfig.cputype | 7 +++++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index e783860bea83..b6c6d1de5fd5 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -298,6 +298,10 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) 3: mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + rlwinm. r9, r9, 0, 0xff + beq 5f /* Kuap fault */ +#endif 4: tophys(r11, r11) rlwimi r11, r10, 12, 20, 29 /* Create L1 (pgdir/pmd) address */ @@ -378,6 +382,10 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) 3: mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + rlwinm. 
r9, r9, 0, 0xff + beq 5f /* Kuap fault */ +#endif 4: tophys(r11, r11) rlwimi r11, r10, 12, 20, 29 /* Create L1 (pgdir/pmd) address */ diff --git a/arch/powerpc/mm/nohash/kup.c b/arch/powerpc/mm/nohash/kup.c index eaea52231dd6..552becf90e97 100644 --- a/arch/powerpc/mm/nohash/kup.c +++ b/arch/powerpc/mm/nohash/kup.c @@ -19,6 +19,8 @@ EXPORT_SYMBOL(disable_kuap_key); void setup_kuap(bool disabled) { if (disabled) { + if (IS_ENABLED(CONFIG_40x)) + disable_kuep = true; if (smp_processor_id() == boot_cpuid) static_branch_enable(&disable_kuap_key); return; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3f00e75edf70..95e034e061c2 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -54,6 +54,9 @@ config 40x select PPC_UDBG_16550 select 4xx_SOC select HAVE_PCI + select PPC_HAVE_KUAP + select PPC_HAVE_KUEP + select PPC_KUEP if PPC_KUAP config 44x bool "AMCC 44x, 46x or 47x" @@ -425,9 +428,9 @@ config PPC_HAVE_KUEP bool config PPC_KUEP - bool "Kernel Userspace Execution Prevention" + bool "Kernel Userspace Execution Prevention" if !40x depends on PPC_HAVE_KUEP - default y + default y if !40x help Enable support for Kernel Userspace Execution Prevention (KUEP) From 4f6a025201a290316b28a2a0ef9950398bd75088 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:31 +0200 Subject: [PATCH 159/240] powerpc/kuap: Wire-up KUAP on 85xx in 32-bit mode. This adds KUAP support to 85xx in 32-bit mode. This is done by reading the content of SPRN_MAS1 and checking the TID at the time the user pgtable is loaded. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f8696f8980ca1532ada3a2f0e0a03e756269c7fe.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_fsl_booke.S | 12 ++++++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 13 insertions(+) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 4622b50a5208..ac2b4dcf5fd3 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -462,6 +462,12 @@ END_BTB_FLUSH_SECTION mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + mfspr r12, SPRN_MAS1 + rlwinm. r12,r12,0,0x3fff0000 + beq 2f /* KUAP fault */ +#endif + 4: /* Mask of required permission bits. Note that while we * do copy ESR:ST to _PAGE_RW position as trying to write @@ -571,6 +577,12 @@ END_BTB_FLUSH_SECTION mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) +#ifdef CONFIG_PPC_KUAP + mfspr r12, SPRN_MAS1 + rlwinm. r12,r12,0,0x3fff0000 + beq 2f /* KUAP fault */ +#endif + /* Make up the required permissions for user code */ #ifdef CONFIG_PTE_64BIT li r13,_PAGE_PRESENT | _PAGE_BAP_UX diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 95e034e061c2..3ad2f3fc67a4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -37,6 +37,7 @@ config PPC_BOOK3S_32 config PPC_85xx bool "Freescale 85xx" select E500 + select PPC_HAVE_KUAP config PPC_8xx bool "Freescale 8xx" From 57bc963837f5f1753a1d51fada54a32b8a84fdc3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:32 +0200 Subject: [PATCH 160/240] powerpc/kuap: Wire-up KUAP on book3e/64 This adds KUAP support to book3e/64. This is done by reading the content of SPRN_MAS1 and checking the TID at the time the user pgtable is loaded. 
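The recurring assembly test in these e500/book3e patches, rlwinm. rN,rN,0,0x3fff0000 followed by a branch on zero, masks the TID field out of MAS1 and faults when it is zero. A hypothetical C rendering of just that predicate (the real check is the asm shown in the diffs; the mask value is the one the patches use):

#include <stdbool.h>
#include <stdio.h>

#define MAS1_TID_MASK 0x3fff0000UL	/* TID field tested by the rlwinm. */

/* rlwinm. rX,rX,0,0x3fff0000 ; beq- fault  ==  "TID 0 means KUAP fault" */
static bool user_tlb_miss_is_kuap_fault(unsigned long mas1)
{
	return (mas1 & MAS1_TID_MASK) == 0;
}

int main(void)
{
	printf("TID 0 (user access locked): fault=%d\n",
	       user_tlb_miss_is_kuap_fault(0x00000000));
	printf("TID 42 (user access open): fault=%d\n",
	       user_tlb_miss_is_kuap_fault(42UL << 16));
	return 0;
}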
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e2c2c9375afd4bbc06aa904d0103a5f5102a2b1a.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/nohash/tlb_low_64e.S | 40 ++++++++++++++++++++++---- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 9235e720e357..8b97c4acfebf 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -128,6 +128,13 @@ END_BTB_FLUSH_SECTION bne tlb_miss_kernel_bolted +tlb_miss_user_bolted: +#ifdef CONFIG_PPC_KUAP + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- tlb_miss_fault_bolted /* KUAP fault */ +#endif + tlb_miss_common_bolted: /* * This is the guts of the TLB miss handler for bolted-linear. @@ -246,7 +253,7 @@ itlb_miss_fault_bolted: cmpldi cr0,r15,0 /* Check for user region */ oris r11,r11,_PAGE_ACCESSED@h - beq tlb_miss_common_bolted + beq tlb_miss_user_bolted b itlb_miss_kernel_bolted #ifdef CONFIG_PPC_FSL_BOOK3E @@ -676,6 +683,11 @@ finish_normal_tlb_miss: /* Check if required permissions are met */ andc. r15,r11,r14 bne- normal_tlb_miss_access_fault +#ifdef CONFIG_PPC_KUAP + mfspr r11,SPRN_MAS1 + rlwinm. r10,r11,0,0x3fff0000 + beq- normal_tlb_miss_access_fault /* KUAP fault */ +#endif /* Now we build the MAS: * @@ -689,15 +701,17 @@ finish_normal_tlb_miss: * * TODO: mix up code below for better scheduling */ - clrrdi r11,r16,12 /* Clear low crap in EA */ - rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ - mtspr SPRN_MAS2,r11 + clrrdi r10,r16,12 /* Clear low crap in EA */ + rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r10 /* Check page size, if not standard, update MAS1 */ - rldicl r11,r14,64-8,64-8 - cmpldi cr0,r11,BOOK3E_PAGESZ_4K + rldicl r10,r14,64-8,64-8 + cmpldi cr0,r10,BOOK3E_PAGESZ_4K beq- 1f +#ifndef CONFIG_PPC_KUAP mfspr r11,SPRN_MAS1 +#endif rlwimi r11,r14,31,21,24 rlwinm r11,r11,0,21,19 mtspr SPRN_MAS1,r11 @@ -786,7 +800,16 @@ virt_page_table_tlb_miss: mfspr r10,SPRN_MAS1 rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 +#ifdef CONFIG_PPC_KUAP + b 2f 1: + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- virt_page_table_tlb_miss_fault /* KUAP fault */ +2: +#else +1: +#endif BEGIN_MMU_FTR_SECTION /* Search if we already have a TLB entry for that virtual address, and * if we do, bail out. @@ -1027,6 +1050,11 @@ virt_page_table_tlb_miss_whacko_fault: * avoid too much complication, it will save/restore things for us */ htw_tlb_miss: +#ifdef CONFIG_PPC_KUAP + mfspr r10,SPRN_MAS1 + rlwinm. r10,r10,0,0x3fff0000 + beq- htw_tlb_miss_fault /* KUAP fault */ +#endif /* Search if we already have a TLB entry for that virtual address, and * if we do, bail out. * diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3ad2f3fc67a4..172f28edb363 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -122,6 +122,7 @@ config PPC_BOOK3E_64 select PPC_SMP_MUXED_IPI select PPC_DOORBELL select ZONE_DMA + select PPC_HAVE_KUAP endchoice From dede19be5163cdc5b5d65a2ce7e7f6eedcb666ff Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 19 Oct 2021 09:29:33 +0200 Subject: [PATCH 161/240] powerpc: Remove CONFIG_PPC_HAVE_KUAP and CONFIG_PPC_HAVE_KUEP All platforms now have KUAP and KUEP so remove CONFIG_PPC_HAVE_KUAP and CONFIG_PPC_HAVE_KUEP. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a3c007ad0951965199e6ab2ef1035966bc66e771.1634627931.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/Kconfig.cputype | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 172f28edb363..87bc1929ee5a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -30,22 +30,17 @@ config PPC_BOOK3S_32 bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" imply PPC_FPU select PPC_HAVE_PMU_SUPPORT - select PPC_HAVE_KUEP - select PPC_HAVE_KUAP select HAVE_ARCH_VMAP_STACK config PPC_85xx bool "Freescale 85xx" select E500 - select PPC_HAVE_KUAP config PPC_8xx bool "Freescale 8xx" select ARCH_SUPPORTS_HUGETLBFS select FSL_SOC - select PPC_HAVE_KUEP select PPC_KUEP - select PPC_HAVE_KUAP select HAVE_ARCH_VMAP_STACK select HUGETLBFS @@ -55,8 +50,6 @@ config 40x select PPC_UDBG_16550 select 4xx_SOC select HAVE_PCI - select PPC_HAVE_KUAP - select PPC_HAVE_KUEP select PPC_KUEP if PPC_KUAP config 44x @@ -66,9 +59,7 @@ config 44x select 4xx_SOC select HAVE_PCI select PHYS_64BIT - select PPC_HAVE_KUEP select PPC_KUEP - select PPC_HAVE_KUAP endchoice @@ -112,8 +103,6 @@ config PPC_BOOK3S_64 select HAVE_MOVE_PMD select HAVE_MOVE_PUD select IRQ_WORK - select PPC_HAVE_KUEP - select PPC_HAVE_KUAP select PPC_64S_HASH_MMU if !PPC_RADIX_MMU config PPC_BOOK3E_64 @@ -122,7 +111,6 @@ config PPC_BOOK3E_64 select PPC_SMP_MUXED_IPI select PPC_DOORBELL select ZONE_DMA - select PPC_HAVE_KUAP endchoice @@ -310,7 +298,6 @@ config PPC_FSL_BOOK3E select FSL_EMB_PERFMON select PPC_SMP_MUXED_IPI select PPC_DOORBELL - select PPC_HAVE_KUEP select PPC_KUEP default y if FSL_BOOKE @@ -426,24 +413,16 @@ config PPC_RADIX_MMU_DEFAULT If you're unsure, say Y. -config PPC_HAVE_KUEP - bool - config PPC_KUEP bool "Kernel Userspace Execution Prevention" if !40x - depends on PPC_HAVE_KUEP default y if !40x help Enable support for Kernel Userspace Execution Prevention (KUEP) If you're unsure, say Y. -config PPC_HAVE_KUAP - bool - config PPC_KUAP bool "Kernel Userspace Access Protection" - depends on PPC_HAVE_KUAP default y help Enable support for Kernel Userspace Access Protection (KUAP) From 37eb7ca91b692e8e49e7dd50158349a6c8fb5b09 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Nov 2021 13:40:35 +0100 Subject: [PATCH 162/240] powerpc/32s: Allocate one 256k IBAT instead of two consecutives 128k IBATs Today we have the following IBATs allocated: ---[ Instruction Block Address Translation ]--- 0: 0xc0000000-0xc03fffff 0x00000000 4M Kernel x m 1: 0xc0400000-0xc05fffff 0x00400000 2M Kernel x m 2: 0xc0600000-0xc06fffff 0x00600000 1M Kernel x m 3: 0xc0700000-0xc077ffff 0x00700000 512K Kernel x m 4: 0xc0780000-0xc079ffff 0x00780000 128K Kernel x m 5: 0xc07a0000-0xc07bffff 0x007a0000 128K Kernel x m 6: - 7: - The two 128K should be a single 256K instead. When _etext is not aligned to 128Kbytes, the system will allocate all necessary BATs to the lower 128Kbytes boundary, then allocate an additional 128Kbytes BAT for the remaining block. 
Instead, align the top to 128Kbytes so that the function directly allocates a 256Kbytes last block: ---[ Instruction Block Address Translation ]--- 0: 0xc0000000-0xc03fffff 0x00000000 4M Kernel x m 1: 0xc0400000-0xc05fffff 0x00400000 2M Kernel x m 2: 0xc0600000-0xc06fffff 0x00600000 1M Kernel x m 3: 0xc0700000-0xc077ffff 0x00700000 512K Kernel x m 4: 0xc0780000-0xc07bffff 0x00780000 256K Kernel x m 5: - 6: - 7: - Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ab58b296832b0ec650e2203200e060adbcb2677d.1637930421.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/book3s32/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 27061583a010..33ab63d56435 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -196,18 +196,17 @@ void mmu_mark_initmem_nx(void) int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; unsigned long base = (unsigned long)_stext - PAGE_OFFSET; - unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K); unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; unsigned long size; - for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { + for (i = 0; i < nb - 1 && base < top;) { size = block_size(base, top); setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); base += size; } if (base < top) { size = block_size(base, top); - size = max(size, 128UL << 10); if ((top - base) > size) { size <<= 1; if (strict_kernel_rwx_enabled() && base + size > border) From 3261d99adba269a024d0e55737beeedec5eba00e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 Nov 2021 18:49:37 +0100 Subject: [PATCH 163/240] powerpc/inst: Refactor ___get_user_instr() PPC64 version of ___get_user_instr() can be used for PPC32 as well, by simply disabling the suffix part with IS_ENABLED(CONFIG_PPC64). 
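The trick here relies on IS_ENABLED() being a compile-time constant: when it evaluates to 0, the prefix test folds to false and the suffix handling is discarded by the compiler, so a single definition serves both PPC32 and PPC64. A minimal user-space model of that folding (the macro names and values below are hypothetical stand-ins, except OP_PREFIX, whose value 1 matches the prefixed-instruction opcode used in the patch):

#include <stdio.h>

#define CONFIG_IS_64BIT 0	/* flip to 1 to model the PPC64 build */
#define OP_PREFIX 1		/* primary opcode of a prefixed instruction */

static unsigned int fetch_instr(const unsigned int *p)
{
	unsigned int prefix = p[0];

	/*
	 * With CONFIG_IS_64BIT == 0 the condition is constant-false, so
	 * the compiler drops the suffix read entirely - one definition
	 * serves both builds, like IS_ENABLED(CONFIG_PPC64) above.
	 */
	if (CONFIG_IS_64BIT && (prefix >> 26) == OP_PREFIX)
		return p[1];	/* would combine prefix and suffix here */

	return prefix;
}

int main(void)
{
	unsigned int words[2] = { 0x04000000, 0x12345678 };

	printf("fetched: %#x\n", fetch_instr(words));
	return 0;
}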
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1f0ede830ccb33a659119a55cb590820c27004db.1638208156.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/inst.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index b11c0e2f9639..10a5c1b76ca0 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -4,8 +4,6 @@ #include -#ifdef CONFIG_PPC64 - #define ___get_user_instr(gu_op, dest, ptr) \ ({ \ long __gui_ret; \ @@ -16,7 +14,7 @@ __chk_user_ptr(ptr); \ __gui_ret = gu_op(__prefix, __gui_ptr); \ if (__gui_ret == 0) { \ - if ((__prefix >> 26) == OP_PREFIX) { \ + if (IS_ENABLED(CONFIG_PPC64) && (__prefix >> 26) == OP_PREFIX) { \ __gui_ret = gu_op(__suffix, __gui_ptr + 1); \ __gui_inst = ppc_inst_prefix(__prefix, __suffix); \ } else { \ @@ -27,13 +25,6 @@ } \ __gui_ret; \ }) -#else /* !CONFIG_PPC64 */ -#define ___get_user_instr(gu_op, dest, ptr) \ -({ \ - __chk_user_ptr(ptr); \ - gu_op((dest).val, (u32 __user *)(ptr)); \ -}) -#endif /* CONFIG_PPC64 */ #define get_user_instr(x, ptr) ___get_user_instr(get_user, x, ptr) @@ -71,7 +62,7 @@ static inline u32 ppc_inst_suffix(struct ppc_inst x) } #else -#define ppc_inst_prefix(x, y) ppc_inst(x) +#define ppc_inst_prefix(x, y) ((void)y, ppc_inst(x)) static inline u32 ppc_inst_suffix(struct ppc_inst x) { From c545b9f040f341038d5228932140fb17e0c156e2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 Nov 2021 18:49:38 +0100 Subject: [PATCH 164/240] powerpc/inst: Define ppc_inst_t In order to stop using 'struct ppc_inst' on PPC32, define a ppc_inst_t typedef. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fe5baa2c66fea9db05a8b300b3e8d2880a42596c.1638208156.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 18 +++---- arch/powerpc/include/asm/hw_breakpoint.h | 4 +- arch/powerpc/include/asm/inst.h | 36 ++++++------- arch/powerpc/include/asm/sstep.h | 4 +- arch/powerpc/kernel/align.c | 4 +- arch/powerpc/kernel/epapr_paravirt.c | 2 +- arch/powerpc/kernel/hw_breakpoint.c | 4 +- .../kernel/hw_breakpoint_constraints.c | 4 +- arch/powerpc/kernel/kprobes.c | 4 +- arch/powerpc/kernel/mce_power.c | 2 +- arch/powerpc/kernel/optprobes.c | 4 +- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 54 +++++++++---------- arch/powerpc/kernel/vecemu.c | 2 +- arch/powerpc/lib/code-patching.c | 38 ++++++------- arch/powerpc/lib/feature-fixups.c | 4 +- arch/powerpc/lib/sstep.c | 4 +- arch/powerpc/lib/test_emulate_step.c | 10 ++-- arch/powerpc/mm/maccess.c | 2 +- arch/powerpc/perf/8xx-pmu.c | 2 +- arch/powerpc/xmon/xmon.c | 14 ++--- arch/powerpc/xmon/xmon_bpts.h | 4 +- 23 files changed, 112 insertions(+), 112 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 4ba834599c4d..46e8c5a8ce51 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -24,20 +24,20 @@ bool is_offset_in_branch_range(long offset); bool is_offset_in_cond_branch_range(long offset); -int create_branch(struct ppc_inst *instr, const u32 *addr, +int create_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int flags); -int create_cond_branch(struct ppc_inst *instr, const u32 *addr, +int create_cond_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int 
flags); int patch_branch(u32 *addr, unsigned long target, int flags); -int patch_instruction(u32 *addr, struct ppc_inst instr); -int raw_patch_instruction(u32 *addr, struct ppc_inst instr); +int patch_instruction(u32 *addr, ppc_inst_t instr); +int raw_patch_instruction(u32 *addr, ppc_inst_t instr); static inline unsigned long patch_site_addr(s32 *site) { return (unsigned long)site + *site; } -static inline int patch_instruction_site(s32 *site, struct ppc_inst instr) +static inline int patch_instruction_site(s32 *site, ppc_inst_t instr) { return patch_instruction((u32 *)patch_site_addr(site), instr); } @@ -58,11 +58,11 @@ static inline int modify_instruction_site(s32 *site, unsigned int clr, unsigned return modify_instruction((unsigned int *)patch_site_addr(site), clr, set); } -int instr_is_relative_branch(struct ppc_inst instr); -int instr_is_relative_link_branch(struct ppc_inst instr); +int instr_is_relative_branch(ppc_inst_t instr); +int instr_is_relative_link_branch(ppc_inst_t instr); unsigned long branch_target(const u32 *instr); -int translate_branch(struct ppc_inst *instr, const u32 *dest, const u32 *src); -extern bool is_conditional_branch(struct ppc_inst instr); +int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src); +bool is_conditional_branch(ppc_inst_t instr); #ifdef CONFIG_PPC_BOOK3E_64 void __patch_exception(int exc, unsigned long addr); #define patch_exception(exc, name) do { \ diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index abebfbee5b1c..88053d3c68e6 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -56,11 +56,11 @@ static inline int nr_wp_slots(void) return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1; } -bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr, +bool wp_check_constraints(struct pt_regs *regs, ppc_inst_t instr, unsigned long ea, int type, int size, struct arch_hw_breakpoint *info); -void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, +void wp_get_instr_detail(struct pt_regs *regs, ppc_inst_t *instr, int *type, int *size, unsigned long *ea); #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index 10a5c1b76ca0..b3502f21e0f4 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -8,7 +8,7 @@ ({ \ long __gui_ret; \ u32 __user *__gui_ptr = (u32 __user *)ptr; \ - struct ppc_inst __gui_inst; \ + ppc_inst_t __gui_inst; \ unsigned int __prefix, __suffix; \ \ __chk_user_ptr(ptr); \ @@ -34,29 +34,29 @@ * Instruction data type for POWER */ -struct ppc_inst { +typedef struct { u32 val; #ifdef CONFIG_PPC64 u32 suffix; #endif -} __packed; +} __packed ppc_inst_t; -static inline u32 ppc_inst_val(struct ppc_inst x) +static inline u32 ppc_inst_val(ppc_inst_t x) { return x.val; } -static inline int ppc_inst_primary_opcode(struct ppc_inst x) +static inline int ppc_inst_primary_opcode(ppc_inst_t x) { return ppc_inst_val(x) >> 26; } -#define ppc_inst(x) ((struct ppc_inst){ .val = (x) }) +#define ppc_inst(x) ((ppc_inst_t){ .val = (x) }) #ifdef CONFIG_PPC64 -#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y) }) +#define ppc_inst_prefix(x, y) ((ppc_inst_t){ .val = (x), .suffix = (y) }) -static inline u32 ppc_inst_suffix(struct ppc_inst x) +static inline u32 ppc_inst_suffix(ppc_inst_t x) { return x.suffix; } @@ -64,14 +64,14 @@ static inline u32 ppc_inst_suffix(struct ppc_inst x) #else #define ppc_inst_prefix(x, y) 
((void)y, ppc_inst(x)) -static inline u32 ppc_inst_suffix(struct ppc_inst x) +static inline u32 ppc_inst_suffix(ppc_inst_t x) { return 0; } #endif /* CONFIG_PPC64 */ -static inline struct ppc_inst ppc_inst_read(const u32 *ptr) +static inline ppc_inst_t ppc_inst_read(const u32 *ptr) { if (IS_ENABLED(CONFIG_PPC64) && (*ptr >> 26) == OP_PREFIX) return ppc_inst_prefix(*ptr, *(ptr + 1)); @@ -79,17 +79,17 @@ static inline struct ppc_inst ppc_inst_read(const u32 *ptr) return ppc_inst(*ptr); } -static inline bool ppc_inst_prefixed(struct ppc_inst x) +static inline bool ppc_inst_prefixed(ppc_inst_t x) { return IS_ENABLED(CONFIG_PPC64) && ppc_inst_primary_opcode(x) == OP_PREFIX; } -static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x) +static inline ppc_inst_t ppc_inst_swab(ppc_inst_t x) { return ppc_inst_prefix(swab32(ppc_inst_val(x)), swab32(ppc_inst_suffix(x))); } -static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y) +static inline bool ppc_inst_equal(ppc_inst_t x, ppc_inst_t y) { if (ppc_inst_val(x) != ppc_inst_val(y)) return false; @@ -98,7 +98,7 @@ static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y) return ppc_inst_suffix(x) == ppc_inst_suffix(y); } -static inline int ppc_inst_len(struct ppc_inst x) +static inline int ppc_inst_len(ppc_inst_t x) { return ppc_inst_prefixed(x) ? 8 : 4; } @@ -109,14 +109,14 @@ static inline int ppc_inst_len(struct ppc_inst x) */ static inline u32 *ppc_inst_next(u32 *location, u32 *value) { - struct ppc_inst tmp; + ppc_inst_t tmp; tmp = ppc_inst_read(value); return (void *)location + ppc_inst_len(tmp); } -static inline unsigned long ppc_inst_as_ulong(struct ppc_inst x) +static inline unsigned long ppc_inst_as_ulong(ppc_inst_t x) { if (IS_ENABLED(CONFIG_PPC32)) return ppc_inst_val(x); @@ -128,7 +128,7 @@ static inline unsigned long ppc_inst_as_ulong(struct ppc_inst x) #define PPC_INST_STR_LEN sizeof("00000000 00000000") -static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], struct ppc_inst x) +static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], ppc_inst_t x) { if (ppc_inst_prefixed(x)) sprintf(str, "%08x %08x", ppc_inst_val(x), ppc_inst_suffix(x)); @@ -145,6 +145,6 @@ static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], struct ppc_ins __str; \ }) -int copy_inst_from_kernel_nofault(struct ppc_inst *inst, u32 *src); +int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src); #endif /* _ASM_POWERPC_INST_H */ diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 1df867c2e054..50950deedb87 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -145,7 +145,7 @@ union vsx_reg { * otherwise. */ extern int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, - struct ppc_inst instr); + ppc_inst_t instr); /* * Emulate an instruction that can be executed just by updating @@ -162,7 +162,7 @@ void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op); * 0 if it could not be emulated, or -1 for an instruction that * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.). 
*/ -extern int emulate_step(struct pt_regs *regs, struct ppc_inst instr); +int emulate_step(struct pt_regs *regs, ppc_inst_t instr); /* * Emulate a load or store instruction by reading/writing the diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index bf96b954a4eb..3e37ece06739 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -105,7 +105,7 @@ static struct aligninfo spe_aligninfo[32] = { * so we don't need the address swizzling. */ static int emulate_spe(struct pt_regs *regs, unsigned int reg, - struct ppc_inst ppc_instr) + ppc_inst_t ppc_instr) { union { u64 ll; @@ -300,7 +300,7 @@ Efault_write: int fix_alignment(struct pt_regs *regs) { - struct ppc_inst instr; + ppc_inst_t instr; struct instruction_op op; int r, type; diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index 93b0f3ec8fb0..d4b8aff20815 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -37,7 +37,7 @@ static int __init early_init_dt_scan_epapr(unsigned long node, return -1; for (i = 0; i < (len / 4); i++) { - struct ppc_inst inst = ppc_inst(be32_to_cpu(insts[i])); + ppc_inst_t inst = ppc_inst(be32_to_cpu(insts[i])); patch_instruction(epapr_hypercall_start + i, inst); #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) patch_instruction(epapr_ev_idle_start + i, inst); diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 91a3be14808b..2669f80b3a49 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -523,7 +523,7 @@ static void larx_stcx_err(struct perf_event *bp, struct arch_hw_breakpoint *info static bool stepping_handler(struct pt_regs *regs, struct perf_event **bp, struct arch_hw_breakpoint **info, int *hit, - struct ppc_inst instr) + ppc_inst_t instr) { int i; int stepped; @@ -616,7 +616,7 @@ int hw_breakpoint_handler(struct die_args *args) int hit[HBP_NUM_MAX] = {0}; int nr_hit = 0; bool ptrace_bp = false; - struct ppc_inst instr = ppc_inst(0); + ppc_inst_t instr = ppc_inst(0); int type = 0; int size = 0; unsigned long ea; diff --git a/arch/powerpc/kernel/hw_breakpoint_constraints.c b/arch/powerpc/kernel/hw_breakpoint_constraints.c index 42b967e3d85c..a74623025f3a 100644 --- a/arch/powerpc/kernel/hw_breakpoint_constraints.c +++ b/arch/powerpc/kernel/hw_breakpoint_constraints.c @@ -80,7 +80,7 @@ static bool check_dawrx_constraints(struct pt_regs *regs, int type, * Return true if the event is valid wrt dawr configuration, * including extraneous exception. Otherwise return false. 
*/ -bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr, +bool wp_check_constraints(struct pt_regs *regs, ppc_inst_t instr, unsigned long ea, int type, int size, struct arch_hw_breakpoint *info) { @@ -127,7 +127,7 @@ bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr, return false; } -void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr, +void wp_get_instr_detail(struct pt_regs *regs, ppc_inst_t *instr, int *type, int *size, unsigned long *ea) { struct instruction_op op; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 86d77ff056a6..9a492fdec1df 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -124,7 +124,7 @@ int arch_prepare_kprobe(struct kprobe *p) { int ret = 0; struct kprobe *prev; - struct ppc_inst insn = ppc_inst_read(p->addr); + ppc_inst_t insn = ppc_inst_read(p->addr); if ((unsigned long)p->addr & 0x03) { printk("Attempt to register kprobe at an unaligned address\n"); @@ -244,7 +244,7 @@ NOKPROBE_SYMBOL(arch_prepare_kretprobe); static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; - struct ppc_inst insn = ppc_inst_read(p->ainsn.insn); + ppc_inst_t insn = ppc_inst_read(p->ainsn.insn); /* regs->nip is also adjusted if emulate_step returns 1 */ ret = emulate_step(regs, insn); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index a48ff18d6d65..71e8f2a92e36 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -455,7 +455,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, * in real-mode is tricky and can lead to recursive * faults */ - struct ppc_inst instr; + ppc_inst_t instr; unsigned long pfn, instr_addr; struct instruction_op op; struct pt_regs tmp = *regs; diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index ce1903064031..378db980ded3 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -153,7 +153,7 @@ static void patch_imm_load_insns(unsigned long val, int reg, kprobe_opcode_t *ad int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) { - struct ppc_inst branch_op_callback, branch_emulate_step, temp; + ppc_inst_t branch_op_callback, branch_emulate_step, temp; unsigned long op_callback_addr, emulate_step_addr; kprobe_opcode_t *buff; long b_offset; @@ -269,7 +269,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) void arch_optimize_kprobes(struct list_head *oplist) { - struct ppc_inst instr; + ppc_inst_t instr; struct optimized_kprobe *op; struct optimized_kprobe *tmp; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 790790dfb390..984813a4d5dc 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -628,7 +628,7 @@ static void do_break_handler(struct pt_regs *regs) { struct arch_hw_breakpoint null_brk = {0}; struct arch_hw_breakpoint *info; - struct ppc_inst instr = ppc_inst(0); + ppc_inst_t instr = ppc_inst(0); int type = 0; int size = 0; unsigned long ea; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 15e7386584f9..a6e9d36d7c01 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(DMA_MODE_WRITE); notrace void __init machine_init(u64 dt_ptr) { u32 *addr = (u32 *)patch_site_addr(&patch__memset_nocache); - struct ppc_inst insn; + ppc_inst_t insn; /* Configure static keys first, now that we're 
relocated. */ setup_feature_keys(); diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index faa0fa29ac20..80b6285769f2 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -41,10 +41,10 @@ #define NUM_FTRACE_TRAMPS 8 static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; -static struct ppc_inst +static ppc_inst_t ftrace_call_replace(unsigned long ip, unsigned long addr, int link) { - struct ppc_inst op; + ppc_inst_t op; addr = ppc_function_entry((void *)addr); @@ -55,9 +55,9 @@ ftrace_call_replace(unsigned long ip, unsigned long addr, int link) } static int -ftrace_modify_code(unsigned long ip, struct ppc_inst old, struct ppc_inst new) +ftrace_modify_code(unsigned long ip, ppc_inst_t old, ppc_inst_t new) { - struct ppc_inst replaced; + ppc_inst_t replaced; /* * Note: @@ -90,24 +90,24 @@ ftrace_modify_code(unsigned long ip, struct ppc_inst old, struct ppc_inst new) */ static int test_24bit_addr(unsigned long ip, unsigned long addr) { - struct ppc_inst op; + ppc_inst_t op; addr = ppc_function_entry((void *)addr); /* use the create_branch to verify that this offset can be branched */ return create_branch(&op, (u32 *)ip, addr, 0) == 0; } -static int is_bl_op(struct ppc_inst op) +static int is_bl_op(ppc_inst_t op) { return (ppc_inst_val(op) & 0xfc000003) == 0x48000001; } -static int is_b_op(struct ppc_inst op) +static int is_b_op(ppc_inst_t op) { return (ppc_inst_val(op) & 0xfc000003) == 0x48000000; } -static unsigned long find_bl_target(unsigned long ip, struct ppc_inst op) +static unsigned long find_bl_target(unsigned long ip, ppc_inst_t op) { int offset; @@ -127,7 +127,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - struct ppc_inst op, pop; + ppc_inst_t op, pop; /* read where this goes */ if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { @@ -221,7 +221,7 @@ static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - struct ppc_inst op; + ppc_inst_t op; unsigned long ip = rec->ip; unsigned long tramp, ptr; @@ -262,7 +262,7 @@ __ftrace_make_nop(struct module *mod, static unsigned long find_ftrace_tramp(unsigned long ip) { int i; - struct ppc_inst instr; + ppc_inst_t instr; /* * We have the compiler generated long_branch tramps at the end @@ -300,9 +300,9 @@ static int add_ftrace_tramp(unsigned long tramp) static int setup_mcount_compiler_tramp(unsigned long tramp) { int i; - struct ppc_inst op; + ppc_inst_t op; unsigned long ptr; - struct ppc_inst instr; + ppc_inst_t instr; static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; /* Is this a known long jump tramp? 
*/ @@ -367,7 +367,7 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) { unsigned long tramp, ip = rec->ip; - struct ppc_inst op; + ppc_inst_t op; /* Read where this goes */ if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { @@ -407,7 +407,7 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned long ip = rec->ip; - struct ppc_inst old, new; + ppc_inst_t old, new; /* * If the calling address is more that 24 bits away, @@ -460,7 +460,7 @@ int ftrace_make_nop(struct module *mod, */ #ifndef CONFIG_MPROFILE_KERNEL static int -expected_nop_sequence(void *ip, struct ppc_inst op0, struct ppc_inst op1) +expected_nop_sequence(void *ip, ppc_inst_t op0, ppc_inst_t op1) { /* * We expect to see: @@ -478,7 +478,7 @@ expected_nop_sequence(void *ip, struct ppc_inst op0, struct ppc_inst op1) } #else static int -expected_nop_sequence(void *ip, struct ppc_inst op0, struct ppc_inst op1) +expected_nop_sequence(void *ip, ppc_inst_t op0, ppc_inst_t op1) { /* look for patched "NOP" on ppc64 with -mprofile-kernel */ if (!ppc_inst_equal(op0, ppc_inst(PPC_RAW_NOP()))) @@ -490,8 +490,8 @@ expected_nop_sequence(void *ip, struct ppc_inst op0, struct ppc_inst op1) static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - struct ppc_inst op[2]; - struct ppc_inst instr; + ppc_inst_t op[2]; + ppc_inst_t instr; void *ip = (void *)rec->ip; unsigned long entry, ptr, tramp; struct module *mod = rec->arch.mod; @@ -559,7 +559,7 @@ static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { int err; - struct ppc_inst op; + ppc_inst_t op; u32 *ip = (u32 *)rec->ip; struct module *mod = rec->arch.mod; unsigned long tramp; @@ -609,7 +609,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) { - struct ppc_inst op; + ppc_inst_t op; void *ip = (void *)rec->ip; unsigned long tramp, entry, ptr; @@ -657,7 +657,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long ip = rec->ip; - struct ppc_inst old, new; + ppc_inst_t old, new; /* * If the calling address is more that 24 bits away, @@ -696,7 +696,7 @@ static int __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { - struct ppc_inst op; + ppc_inst_t op; unsigned long ip = rec->ip; unsigned long entry, ptr, tramp; struct module *mod = rec->arch.mod; @@ -790,7 +790,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { unsigned long ip = rec->ip; - struct ppc_inst old, new; + ppc_inst_t old, new; /* * If the calling address is more that 24 bits away, @@ -830,7 +830,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - struct ppc_inst old, new; + ppc_inst_t old, new; int ret; old = ppc_inst_read((u32 *)&ftrace_call); @@ -915,7 +915,7 @@ int ftrace_enable_ftrace_graph_caller(void) unsigned long ip = (unsigned long)(&ftrace_graph_call); unsigned long addr = (unsigned long)(&ftrace_graph_caller); unsigned long stub = (unsigned long)(&ftrace_graph_stub); - struct ppc_inst old, new; + ppc_inst_t old, new; old = ftrace_call_replace(ip, stub, 0); new = ftrace_call_replace(ip, addr, 0); @@ -928,7 +928,7 @@ int 
ftrace_disable_ftrace_graph_caller(void) unsigned long ip = (unsigned long)(&ftrace_graph_call); unsigned long addr = (unsigned long)(&ftrace_graph_caller); unsigned long stub = (unsigned long)(&ftrace_graph_stub); - struct ppc_inst old, new; + ppc_inst_t old, new; old = ftrace_call_replace(ip, addr, 0); new = ftrace_call_replace(ip, stub, 0); diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index ae632569446f..fd9432875ebc 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -261,7 +261,7 @@ static unsigned int rfin(unsigned int x) int emulate_altivec(struct pt_regs *regs) { - struct ppc_inst instr; + ppc_inst_t instr; unsigned int i, word; unsigned int va, vb, vc, vd; vector128 *vrs; diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c5ed98823835..312324a26df3 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -18,7 +18,7 @@ #include #include -static int __patch_instruction(u32 *exec_addr, struct ppc_inst instr, u32 *patch_addr) +static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr) { if (!ppc_inst_prefixed(instr)) { u32 val = ppc_inst_val(instr); @@ -39,7 +39,7 @@ failed: return -EFAULT; } -int raw_patch_instruction(u32 *addr, struct ppc_inst instr) +int raw_patch_instruction(u32 *addr, ppc_inst_t instr) { return __patch_instruction(addr, instr, addr); } @@ -141,7 +141,7 @@ static inline int unmap_patch_area(unsigned long addr) return 0; } -static int do_patch_instruction(u32 *addr, struct ppc_inst instr) +static int do_patch_instruction(u32 *addr, ppc_inst_t instr) { int err; u32 *patch_addr = NULL; @@ -180,14 +180,14 @@ out: } #else /* !CONFIG_STRICT_KERNEL_RWX */ -static int do_patch_instruction(u32 *addr, struct ppc_inst instr) +static int do_patch_instruction(u32 *addr, ppc_inst_t instr) { return raw_patch_instruction(addr, instr); } #endif /* CONFIG_STRICT_KERNEL_RWX */ -int patch_instruction(u32 *addr, struct ppc_inst instr) +int patch_instruction(u32 *addr, ppc_inst_t instr) { /* Make sure we aren't patching a freed init section */ if (init_mem_is_free && init_section_contains(addr, 4)) { @@ -200,7 +200,7 @@ NOKPROBE_SYMBOL(patch_instruction); int patch_branch(u32 *addr, unsigned long target, int flags) { - struct ppc_inst instr; + ppc_inst_t instr; create_branch(&instr, addr, target, flags); return patch_instruction(addr, instr); @@ -237,7 +237,7 @@ bool is_offset_in_cond_branch_range(long offset) * Helper to check if a given instruction is a conditional branch * Derived from the conditional checks in analyse_instr() */ -bool is_conditional_branch(struct ppc_inst instr) +bool is_conditional_branch(ppc_inst_t instr) { unsigned int opcode = ppc_inst_primary_opcode(instr); @@ -255,7 +255,7 @@ bool is_conditional_branch(struct ppc_inst instr) } NOKPROBE_SYMBOL(is_conditional_branch); -int create_branch(struct ppc_inst *instr, const u32 *addr, +int create_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int flags) { long offset; @@ -275,7 +275,7 @@ int create_branch(struct ppc_inst *instr, const u32 *addr, return 0; } -int create_cond_branch(struct ppc_inst *instr, const u32 *addr, +int create_cond_branch(ppc_inst_t *instr, const u32 *addr, unsigned long target, int flags) { long offset; @@ -294,22 +294,22 @@ int create_cond_branch(struct ppc_inst *instr, const u32 *addr, return 0; } -static unsigned int branch_opcode(struct ppc_inst instr) +static unsigned int branch_opcode(ppc_inst_t instr) { return 
ppc_inst_primary_opcode(instr) & 0x3F; } -static int instr_is_branch_iform(struct ppc_inst instr) +static int instr_is_branch_iform(ppc_inst_t instr) { return branch_opcode(instr) == 18; } -static int instr_is_branch_bform(struct ppc_inst instr) +static int instr_is_branch_bform(ppc_inst_t instr) { return branch_opcode(instr) == 16; } -int instr_is_relative_branch(struct ppc_inst instr) +int instr_is_relative_branch(ppc_inst_t instr) { if (ppc_inst_val(instr) & BRANCH_ABSOLUTE) return 0; @@ -317,7 +317,7 @@ int instr_is_relative_branch(struct ppc_inst instr) return instr_is_branch_iform(instr) || instr_is_branch_bform(instr); } -int instr_is_relative_link_branch(struct ppc_inst instr) +int instr_is_relative_link_branch(ppc_inst_t instr) { return instr_is_relative_branch(instr) && (ppc_inst_val(instr) & BRANCH_SET_LINK); } @@ -364,7 +364,7 @@ unsigned long branch_target(const u32 *instr) return 0; } -int translate_branch(struct ppc_inst *instr, const u32 *dest, const u32 *src) +int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src) { unsigned long target; target = branch_target(src); @@ -417,7 +417,7 @@ static void __init test_trampoline(void) static void __init test_branch_iform(void) { int err; - struct ppc_inst instr; + ppc_inst_t instr; u32 tmp[2]; u32 *iptr = tmp; unsigned long addr = (unsigned long)tmp; @@ -499,7 +499,7 @@ static void __init test_create_function_call(void) { u32 *iptr; unsigned long dest; - struct ppc_inst instr; + ppc_inst_t instr; /* Check we can create a function call */ iptr = (u32 *)ppc_function_entry(test_trampoline); @@ -513,7 +513,7 @@ static void __init test_branch_bform(void) { int err; unsigned long addr; - struct ppc_inst instr; + ppc_inst_t instr; u32 tmp[2]; u32 *iptr = tmp; unsigned int flags; @@ -591,7 +591,7 @@ static void __init test_translate_branch(void) { unsigned long addr; void *p, *q; - struct ppc_inst instr; + ppc_inst_t instr; void *buf; buf = vmalloc(PAGE_ALIGN(0x2000000 + 1)); diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index c3e06922468b..57c6bb802f6c 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -47,7 +47,7 @@ static u32 *calc_addr(struct fixup_entry *fcur, long offset) static int patch_alt_instruction(u32 *src, u32 *dest, u32 *alt_start, u32 *alt_end) { int err; - struct ppc_inst instr; + ppc_inst_t instr; instr = ppc_inst_read(src); @@ -624,7 +624,7 @@ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) static void do_final_fixups(void) { #if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) - struct ppc_inst inst; + ppc_inst_t inst; u32 *src, *dest, *end; if (PHYSICAL_START == 0) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 86f49e3e7cf5..a94b0cd0bdc5 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1354,7 +1354,7 @@ static nokprobe_inline int trap_compare(long v1, long v2) * otherwise. */ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, - struct ppc_inst instr) + ppc_inst_t instr) { #ifdef CONFIG_PPC64 unsigned int suffixopcode, prefixtype, prefix_r; @@ -3578,7 +3578,7 @@ NOKPROBE_SYMBOL(emulate_loadstore); * or -1 if the instruction is one that should not be stepped, * such as an rfid, or a mtmsrd that would clear MSR_RI. 
*/ -int emulate_step(struct pt_regs *regs, struct ppc_inst instr) +int emulate_step(struct pt_regs *regs, ppc_inst_t instr) { struct instruction_op op; int r, err, type; diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 8b4f6b3e96c4..4f141daafcff 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -792,7 +792,7 @@ static void __init test_lxvpx_stxvpx(void) #ifdef CONFIG_VSX static void __init test_plxvp_pstxvp(void) { - struct ppc_inst instr; + ppc_inst_t instr; struct pt_regs regs; union { vector128 a; @@ -906,7 +906,7 @@ struct compute_test { struct { char *descr; unsigned long flags; - struct ppc_inst instr; + ppc_inst_t instr; struct pt_regs regs; } subtests[MAX_SUBTESTS + 1]; }; @@ -1600,7 +1600,7 @@ static struct compute_test compute_tests[] = { }; static int __init emulate_compute_instr(struct pt_regs *regs, - struct ppc_inst instr, + ppc_inst_t instr, bool negative) { int analysed; @@ -1627,7 +1627,7 @@ static int __init emulate_compute_instr(struct pt_regs *regs, } static int __init execute_compute_instr(struct pt_regs *regs, - struct ppc_inst instr) + ppc_inst_t instr) { extern int exec_instr(struct pt_regs *regs); @@ -1658,7 +1658,7 @@ static void __init run_tests_compute(void) struct compute_test *test; struct pt_regs *regs, exp, got; unsigned int i, j, k; - struct ppc_inst instr; + ppc_inst_t instr; bool ignore_gpr, ignore_xer, ignore_ccr, passed, rc, negative; for (i = 0; i < ARRAY_SIZE(compute_tests); i++) { diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c index aad7c47e0030..5abae96b2b46 100644 --- a/arch/powerpc/mm/maccess.c +++ b/arch/powerpc/mm/maccess.c @@ -12,7 +12,7 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) return is_kernel_addr((unsigned long)unsafe_src); } -int copy_inst_from_kernel_nofault(struct ppc_inst *inst, u32 *src) +int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) { unsigned int val, suffix; int err; diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c index f970d1510d3d..4738c4dbf567 100644 --- a/arch/powerpc/perf/8xx-pmu.c +++ b/arch/powerpc/perf/8xx-pmu.c @@ -153,7 +153,7 @@ static void mpc8xx_pmu_read(struct perf_event *event) static void mpc8xx_pmu_del(struct perf_event *event, int flags) { - struct ppc_inst insn = ppc_inst(PPC_RAW_MFSPR(10, SPRN_SPRG_SCRATCH2)); + ppc_inst_t insn = ppc_inst(PPC_RAW_MFSPR(10, SPRN_SPRG_SCRATCH2)); mpc8xx_pmu_read(event); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 0c65dc01c325..f9ae0b398260 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -125,7 +125,7 @@ static unsigned bpinstr = 0x7fe00008; /* trap */ static int cmds(struct pt_regs *); static int mread(unsigned long, void *, int); static int mwrite(unsigned long, void *, int); -static int mread_instr(unsigned long, struct ppc_inst *); +static int mread_instr(unsigned long, ppc_inst_t *); static int handle_fault(struct pt_regs *); static void byterev(unsigned char *, int); static void memex(void); @@ -908,7 +908,7 @@ static struct bpt *new_breakpoint(unsigned long a) static void insert_bpts(void) { int i; - struct ppc_inst instr, instr2; + ppc_inst_t instr, instr2; struct bpt *bp, *bp2; bp = bpts; @@ -988,7 +988,7 @@ static void remove_bpts(void) { int i; struct bpt *bp; - struct ppc_inst instr; + ppc_inst_t instr; bp = bpts; for (i = 0; i < NBPTS; ++i, ++bp) { @@ -1204,7 +1204,7 @@ static int do_step(struct pt_regs *regs) */ static int 
do_step(struct pt_regs *regs) { - struct ppc_inst instr; + ppc_inst_t instr; int stepped; force_enable_xmon(); @@ -1459,7 +1459,7 @@ csum(void) */ static long check_bp_loc(unsigned long addr) { - struct ppc_inst instr; + ppc_inst_t instr; addr &= ~3; if (!is_kernel_addr(addr)) { @@ -2306,7 +2306,7 @@ mwrite(unsigned long adrs, void *buf, int size) } static int -mread_instr(unsigned long adrs, struct ppc_inst *instr) +mread_instr(unsigned long adrs, ppc_inst_t *instr) { volatile int n; @@ -3028,7 +3028,7 @@ generic_inst_dump(unsigned long adr, long count, int praddr, { int nr, dotted; unsigned long first_adr; - struct ppc_inst inst, last_inst = ppc_inst(0); + ppc_inst_t inst, last_inst = ppc_inst(0); dotted = 0; for (first_adr = adr; count > 0; --count, adr += ppc_inst_len(inst)) { diff --git a/arch/powerpc/xmon/xmon_bpts.h b/arch/powerpc/xmon/xmon_bpts.h index 57e6fb03de48..377068f52edb 100644 --- a/arch/powerpc/xmon/xmon_bpts.h +++ b/arch/powerpc/xmon/xmon_bpts.h @@ -5,8 +5,8 @@ #define NBPTS 256 #ifndef __ASSEMBLY__ #include -#define BPT_SIZE (sizeof(struct ppc_inst) * 2) -#define BPT_WORDS (BPT_SIZE / sizeof(struct ppc_inst)) +#define BPT_SIZE (sizeof(ppc_inst_t) * 2) +#define BPT_WORDS (BPT_SIZE / sizeof(ppc_inst_t)) extern unsigned int bpt_table[NBPTS * BPT_WORDS]; #endif /* __ASSEMBLY__ */ From 07b863aef5b682a482474b524f3df4957d2862ac Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 Nov 2021 18:49:39 +0100 Subject: [PATCH 165/240] powerpc/inst: Define ppc_inst_t as u32 on PPC32 Unlike PPC64 ABI, PPC32 uses the stack to pass a parameter defined as a struct, even when the struct has a single simple element. To avoid that, define ppc_inst_t as u32 on PPC32. Keep it as 'struct ppc_inst' when __CHECKER__ is defined so that sparse can perform type checking. Also revert commit 511eea5e2ccd ("powerpc/kprobes: Fix Oops by passing ppc_inst as a pointer to emulate_step() on ppc32") as now the instruction to be emulated is passed as a register to emulate_step(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c6d0c46f598f76ad0b0a88bc0d84773bd921b17c.1638208156.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/inst.h | 15 +++++++++++++-- arch/powerpc/kernel/optprobes.c | 8 ++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index b3502f21e0f4..7ef5fd3bb167 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -34,6 +34,7 @@ * Instruction data type for POWER */ +#if defined(CONFIG_PPC64) || defined(__CHECKER__) typedef struct { u32 val; #ifdef CONFIG_PPC64 @@ -46,13 +47,23 @@ static inline u32 ppc_inst_val(ppc_inst_t x) return x.val; } +#define ppc_inst(x) ((ppc_inst_t){ .val = (x) }) + +#else +typedef u32 ppc_inst_t; + +static inline u32 ppc_inst_val(ppc_inst_t x) +{ + return x; +} +#define ppc_inst(x) (x) +#endif + static inline int ppc_inst_primary_opcode(ppc_inst_t x) { return ppc_inst_val(x) >> 26; } -#define ppc_inst(x) ((ppc_inst_t){ .val = (x) }) - #ifdef CONFIG_PPC64 #define ppc_inst_prefix(x, y) ((ppc_inst_t){ .val = (x), .suffix = (y) }) diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 378db980ded3..3b1c2236cbee 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -228,12 +228,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 3. 
load instruction to be emulated into relevant register, and */ - if (IS_ENABLED(CONFIG_PPC64)) { - temp = ppc_inst_read(p->ainsn.insn); - patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); - } else { - patch_imm_load_insns((unsigned long)p->ainsn.insn, 4, buff + TMPL_INSN_IDX); - } + temp = ppc_inst_read(p->ainsn.insn); + patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); /* * 4. branch back from trampoline From 9b307576f37136d37d5e42b1d8713ec34a601a62 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 Nov 2021 18:49:40 +0100 Subject: [PATCH 166/240] powerpc/inst: Move ppc_inst_t definition in asm/reg.h Because of circular inclusion of asm/hw_breakpoint.h, we need to move the definition of ppc_inst_t out of inst.h and into asm/reg.h, so that asm/hw_breakpoint.h gets it without including asm/inst.h. Also remove asm/inst.h from asm/uprobes.h as it's not needed anymore. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4b79f1491118af96b1ac0735e74aeca02ea4c04e.1638208156.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_breakpoint.h | 1 - arch/powerpc/include/asm/inst.h | 10 +--------- arch/powerpc/include/asm/reg.h | 12 ++++++++++++ arch/powerpc/include/asm/uprobes.h | 1 - 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 88053d3c68e6..84d39fd42f71 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -10,7 +10,6 @@ #define _PPC_BOOK3S_64_HW_BREAKPOINT_H #include -#include #ifdef __KERNEL__ struct arch_hw_breakpoint { diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index 7ef5fd3bb167..53a40faf362a 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_INST_H #include +#include #define ___get_user_instr(gu_op, dest, ptr) \ ({ \ @@ -35,13 +36,6 @@ */ #if defined(CONFIG_PPC64) || defined(__CHECKER__) -typedef struct { - u32 val; -#ifdef CONFIG_PPC64 - u32 suffix; -#endif -} __packed ppc_inst_t; - static inline u32 ppc_inst_val(ppc_inst_t x) { return x.val; @@ -50,8 +44,6 @@ static inline u32 ppc_inst_val(ppc_inst_t x) #define ppc_inst(x) ((ppc_inst_t){ .val = (x) }) #else -typedef u32 ppc_inst_t; - static inline u32 ppc_inst_val(ppc_inst_t x) { return x; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 50478738c8f1..2835f6363228 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1366,6 +1366,18 @@ /* Macros for setting and retrieving special purpose registers */ #ifndef __ASSEMBLY__ + +#if defined(CONFIG_PPC64) || defined(__CHECKER__) +typedef struct { + u32 val; +#ifdef CONFIG_PPC64 + u32 suffix; +#endif +} __packed ppc_inst_t; +#else +typedef u32 ppc_inst_t; +#endif + #define mfmsr() ({unsigned long rval; \ asm volatile("mfmsr %0" : "=r" (rval) : \ : "memory"); rval;}) diff --git a/arch/powerpc/include/asm/uprobes.h b/arch/powerpc/include/asm/uprobes.h index fe683371336f..a7ae1860115a 100644 --- a/arch/powerpc/include/asm/uprobes.h +++ b/arch/powerpc/include/asm/uprobes.h @@ -11,7 +11,6 @@ #include #include -#include typedef ppc_opcode_t uprobe_opcode_t; From 0d76914a4c99ab5658f3fb07cdf3799d28e2eab3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 29 Nov 2021 18:49:41 +0100 Subject: [PATCH 167/240] powerpc/inst: Optimise copy_inst_from_kernel_nofault()
copy_inst_from_kernel_nofault() uses copy_from_kernel_nofault() to copy one or two 32-bit words. This means calling an out-of-line function which itself calls back copy_from_kernel_nofault_allowed() then performs a generic copy with loops. Rewrite copy_inst_from_kernel_nofault() to do everything in a single place and use __get_kernel_nofault() directly to perform single accesses without loops. Although the generic function uses pagefault_disable(), it is not required on powerpc because do_page_fault() bails earlier when a kernel mode fault happens on a kernel address. As the function has now become very small, inline it. With this change, on an 8xx the time spent in the loop in ftrace_replace_code() is reduced by 23% at function tracer activation and 27% at nop tracer activation. The overall time to activate function tracer (measured with shell command 'time') is 570ms before the patch and 470ms after the patch. Even vmlinux size is reduced (by 152 instructions). Before the patch: 00000018 : 18: 94 21 ff e0 stwu r1,-32(r1) 1c: 7c 08 02 a6 mflr r0 20: 38 a0 00 04 li r5,4 24: 93 e1 00 1c stw r31,28(r1) 28: 7c 7f 1b 78 mr r31,r3 2c: 38 61 00 08 addi r3,r1,8 30: 90 01 00 24 stw r0,36(r1) 34: 48 00 00 01 bl 34 34: R_PPC_REL24 copy_from_kernel_nofault 38: 2c 03 00 00 cmpwi r3,0 3c: 40 82 00 0c bne 48 40: 81 21 00 08 lwz r9,8(r1) 44: 91 3f 00 00 stw r9,0(r31) 48: 80 01 00 24 lwz r0,36(r1) 4c: 83 e1 00 1c lwz r31,28(r1) 50: 38 21 00 20 addi r1,r1,32 54: 7c 08 03 a6 mtlr r0 58: 4e 80 00 20 blr After the patch (before inlining): 00000018 : 18: 3d 20 b0 00 lis r9,-20480 1c: 7c 04 48 40 cmplw r4,r9 20: 7c 69 1b 78 mr r9,r3 24: 41 80 00 14 blt 38 28: 81 44 00 00 lwz r10,0(r4) 2c: 38 60 00 00 li r3,0 30: 91 49 00 00 stw r10,0(r9) 34: 4e 80 00 20 blr 38: 38 60 ff de li r3,-34 3c: 4e 80 00 20 blr 40: 38 60 ff f2 li r3,-14 44: 4e 80 00 20 blr Signed-off-by: Christophe Leroy [mpe: Add clang workaround, with version check as suggested by Nathan] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0d5b12183d5176dd702d29ad94c39c384e51c78f.1638208156.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/inst.h | 25 ++++++++++++++++++++++++- arch/powerpc/mm/maccess.c | 17 ----------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index 53a40faf362a..95e5243d2997 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -4,6 +4,8 @@ #include #include +#include +#include #define ___get_user_instr(gu_op, dest, ptr) \ ({ \ @@ -148,6 +150,27 @@ static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], ppc_inst_t x) __str; \ }) -int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src); +static inline int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) +{ + unsigned int val, suffix; + + if (unlikely(!is_kernel_addr((unsigned long)src))) + return -ERANGE; + +/* See https://github.com/ClangBuiltLinux/linux/issues/1521 */ +#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 140000 + val = suffix = 0; +#endif + __get_kernel_nofault(&val, src, u32, Efault); + if (IS_ENABLED(CONFIG_PPC64) && get_op(val) == OP_PREFIX) { + __get_kernel_nofault(&suffix, src + 1, u32, Efault); + *inst = ppc_inst_prefix(val, suffix); + } else { + *inst = ppc_inst(val); + } + return 0; +Efault: + return -EFAULT; +} #endif /* _ASM_POWERPC_INST_H */ diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c index 5abae96b2b46..ea821d0ffe16 100644 --- a/arch/powerpc/mm/maccess.c
+++ b/arch/powerpc/mm/maccess.c @@ -11,20 +11,3 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return is_kernel_addr((unsigned long)unsafe_src); } - -int copy_inst_from_kernel_nofault(ppc_inst_t *inst, u32 *src) -{ - unsigned int val, suffix; - int err; - - err = copy_from_kernel_nofault(&val, src, sizeof(val)); - if (err) - return err; - if (IS_ENABLED(CONFIG_PPC64) && get_op(val) == OP_PREFIX) { - err = copy_from_kernel_nofault(&suffix, src + 1, sizeof(suffix)); - *inst = ppc_inst_prefix(val, suffix); - } else { - *inst = ppc_inst(val); - } - return err; -} From 8cffe0b0b6b3342d75e5469f07496173feace6bc Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Sun, 5 Dec 2021 21:09:25 +0800 Subject: [PATCH 168/240] macintosh: Add const to of_device_id struct of_device_id should normally be const. Signed-off-by: Xiang wangx Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211205130925.28389-1-wangxiang@cdjrlc.com --- drivers/macintosh/mediabay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c index eab7e83c11c4..b17660c022eb 100644 --- a/drivers/macintosh/mediabay.c +++ b/drivers/macintosh/mediabay.c @@ -703,7 +703,7 @@ static const struct mb_ops keylargo_mb_ops = { * Therefore we do it all by polling the media bay once each tick. */ -static struct of_device_id media_bay_match[] = +static const struct of_device_id media_bay_match[] = { { .name = "media-bay", From e89257e28e844f5d1d39081bb901d9f1183a7705 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 7 Dec 2021 12:02:28 +0100 Subject: [PATCH 169/240] powerpc/cell: Fix clang -Wimplicit-fallthrough warning Clang warns: arch/powerpc/platforms/cell/pervasive.c:81:2: error: unannotated fall-through between switch labels case SRR1_WAKEEE: ^ arch/powerpc/platforms/cell/pervasive.c:81:2: note: insert 'break;' to avoid fall-through case SRR1_WAKEEE: ^ break; 1 error generated. Clang is more pedantic than GCC, which does not warn when falling through to a case that is just break or return. Clang's version is more in line with the kernel's own stance in deprecated.rst. Add the missing break to silence the warning. Fixes: 6e83985b0f6e ("powerpc/cbe: Do not process external or decremeter interrupts from sreset") Reported-by: Naresh Kamboju Signed-off-by: Anders Roxell Reviewed-by: Nathan Chancellor Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211207110228.698956-1-anders.roxell@linaro.org --- arch/powerpc/platforms/cell/pervasive.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c index 5b9a7e9f144b..dff8d5e7ab82 100644 --- a/arch/powerpc/platforms/cell/pervasive.c +++ b/arch/powerpc/platforms/cell/pervasive.c @@ -78,6 +78,7 @@ static int cbe_system_reset_exception(struct pt_regs *regs) switch (regs->msr & SRR1_WAKEMASK) { case SRR1_WAKEDEC: set_dec(1); + break; case SRR1_WAKEEE: /* * Handle these when interrupts get re-enabled and we take From 3c42e9542050d49610077e083c7c3f5fd5e26820 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 7 Dec 2021 10:05:57 -0300 Subject: [PATCH 170/240] selftests/powerpc/spectre_v2: Return skip code when miss_percent is high A mismatch between reported and actual mitigation is not restricted to the Vulnerable case.
The guest might also report the mitigation as "Software count cache flush" and the host will still mitigate with branch cache disabled. So, instead of skipping depending on the detected mitigation, simply skip whenever the detected miss_percent is the expected one for a fully mitigated system, that is, above 95%. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211207130557.40566-1-cascardo@canonical.com --- tools/testing/selftests/powerpc/security/spectre_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/security/spectre_v2.c b/tools/testing/selftests/powerpc/security/spectre_v2.c index adc2b7294e5f..83647b8277e7 100644 --- a/tools/testing/selftests/powerpc/security/spectre_v2.c +++ b/tools/testing/selftests/powerpc/security/spectre_v2.c @@ -193,7 +193,7 @@ int spectre_v2_test(void) * We are not vulnerable and reporting otherwise, so * missing such a mismatch is safe. */ - if (state == VULNERABLE) + if (miss_percent > 95) return 4; return 1; From 219572d2fc4135b5ce65c735d881787d48b10e71 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 7 Dec 2021 16:07:18 +0530 Subject: [PATCH 171/240] powerpc: handle kdump appropriately with crash_kexec_post_notifiers option Kdump can be triggered after panic notifiers since commit f06e5153f4ae2 ("kernel/panic.c: add "crash_kexec_post_notifiers" option for kdump after panic_notifers") introduced the crash_kexec_post_notifiers option. But using this option would mean smp_send_stop(), which marks all other CPUs as offline, gets called before kdump is triggered. As a result, kdump routines fail to save other CPUs' registers. To fix this, a kdump-friendly crash_smp_send_stop() function was introduced with kernel commit 0ee59413c967 ("x86/panic: replace smp_send_stop() with kdump friendly version in panic path"). Override this kdump-friendly weak function to handle the crash_kexec_post_notifiers option appropriately on powerpc. Reported-by: kernel test robot Signed-off-by: Hari Bathini [Fixed signature of crash_stop_this_cpu() - reported by lkp@intel.com] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211207103719.91117-1-hbathini@linux.ibm.com --- arch/powerpc/kernel/smp.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index aee3a7119f97..7201fdcf02f1 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -620,6 +620,36 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) } #endif +#ifdef CONFIG_NMI_IPI +static void crash_stop_this_cpu(struct pt_regs *regs) +#else +static void crash_stop_this_cpu(void *dummy) +#endif +{ + /* + * Just busy wait here and avoid marking CPU as offline to ensure + * register data is captured appropriately.
+ */ + while (1) + cpu_relax(); +} + +void crash_smp_send_stop(void) +{ + static bool stopped = false; + + if (stopped) + return; + + stopped = true; + +#ifdef CONFIG_NMI_IPI + smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_stop_this_cpu, 1000000); +#else + smp_call_function(crash_stop_this_cpu, NULL, 0); +#endif /* CONFIG_NMI_IPI */ +} + #ifdef CONFIG_NMI_IPI static void nmi_stop_this_cpu(struct pt_regs *regs) { From 06e629c25daa519be620a8c17359ae8fc7a2e903 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 7 Dec 2021 16:07:19 +0530 Subject: [PATCH 172/240] powerpc/fadump: Fix inaccurate CPU state info in vmcore generated with panic In the panic path, fadump is triggered via a panic notifier function. Before calling panic notifier functions, smp_send_stop() gets called, which stops all CPUs except the panic'ing CPU. Commit 8389b37dffdc ("powerpc: stop_this_cpu: remove the cpu from the online map.") and again commit bab26238bbd4 ("powerpc: Offline CPU in stop_this_cpu()") started marking CPUs as offline while stopping them. So, if a kernel has either of the above commits, a vmcore captured with fadump via the panic path would not process register data for any CPU except the panic'ing CPU. Sample output of crash-utility with such vmcore: # crash vmlinux vmcore ... KERNEL: vmlinux DUMPFILE: vmcore [PARTIAL DUMP] CPUS: 1 DATE: Wed Nov 10 09:56:34 EST 2021 UPTIME: 00:00:42 LOAD AVERAGE: 2.27, 0.69, 0.24 TASKS: 183 NODENAME: XXXXXXXXX RELEASE: 5.15.0+ VERSION: #974 SMP Wed Nov 10 04:18:19 CST 2021 MACHINE: ppc64le (2500 Mhz) MEMORY: 8 GB PANIC: "Kernel panic - not syncing: sysrq triggered crash" PID: 3394 COMMAND: "bash" TASK: c0000000150a5f80 [THREAD_INFO: c0000000150a5f80] CPU: 1 STATE: TASK_RUNNING (PANIC) crash> p -x __cpu_online_mask __cpu_online_mask = $1 = { bits = {0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} } crash> crash> crash> p -x __cpu_active_mask __cpu_active_mask = $2 = { bits = {0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0} } crash> While this has been the case since fadump was introduced, the issue was not identified for two probable reasons: - In general, the bulk of the vmcores analyzed were from crash due to exception. - The above did change since commit 8341f2f222d7 ("sysrq: Use panic() to force a crash") started using panic() instead of dereferencing a NULL pointer to force a kernel crash. But then commit de6e5d38417e ("powerpc: smp_send_stop do not offline stopped CPUs") stopped marking CPUs as offline till kernel commit bab26238bbd4 ("powerpc: Offline CPU in stop_this_cpu()") reverted that change. To ensure post-processing of register data for all other CPUs happens as intended, let the panic() function take the crash-friendly path (read crash_smp_send_stop()) with the help of the crash_kexec_post_notifiers option. Also, as register data for all CPUs is captured by f/w, skip IPI callbacks here for fadump, to avoid any complications in finding the right backtraces.
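
For reference, the ordering in the generic panic() path that this change relies on is roughly the following (simplified sketch of kernel/panic.c, not the exact code):

	void panic(const char *fmt, ...)
	{
		...
		if (!_crash_kexec_post_notifiers) {
			__crash_kexec(NULL);	/* kdump first, then stop other CPUs */
			smp_send_stop();	/* marks other CPUs offline */
		} else {
			/* kdump-friendly stop: CPUs spin without going offline */
			crash_smp_send_stop();
		}

		/* fadump is triggered from this notifier chain (ppc_panic_event) */
		atomic_notifier_call_chain(&panic_notifier_list, 0, buf);

		if (_crash_kexec_post_notifiers)
			__crash_kexec(NULL);
		...
	}

So with crash_kexec_post_notifiers set, the secondary CPUs are stopped via the crash-friendly path before the fadump notifier runs, and their f/w-captured register data can be post-processed.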
Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211207103719.91117-2-hbathini@linux.ibm.com --- arch/powerpc/kernel/fadump.c | 8 ++++++++ arch/powerpc/kernel/smp.c | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b7ceb041743c..60f5fc14aa23 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1641,6 +1641,14 @@ int __init setup_fadump(void) else if (fw_dump.reserve_dump_area_size) fw_dump.ops->fadump_init_mem_struct(&fw_dump); + /* + * In case of panic, fadump is triggered via ppc_panic_event() + * panic notifier. Setting crash_kexec_post_notifiers to 'true' + * lets panic() function take crash friendly path before panic + * notifiers are invoked. + */ + crash_kexec_post_notifiers = true; + return 1; } subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 7201fdcf02f1..c338f9d8ab37 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -61,6 +61,7 @@ #include #include #include +#include #ifdef DEBUG #include @@ -638,6 +639,15 @@ void crash_smp_send_stop(void) { static bool stopped = false; + /* + * In case of fadump, register data for all CPUs is captured by f/w + * on ibm,os-term rtas call. Skip IPI callbacks to other CPUs before + * this rtas call to avoid tricky post processing of those CPUs' + * backtraces. + */ + if (should_fadump_crash()) + return; + if (stopped) return; From b149d5d45ac9171ed699a256f026c8ebef901112 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 8 Dec 2021 17:36:52 +0000 Subject: [PATCH 173/240] powerpc/powermac: Add additional missing lockdep_register_key() Commit df1f679d19ed ("powerpc/powermac: Add missing lockdep_register_key()") fixed a problem that was causing a WARNING. There are two other places in the same file with the same problem originating from commit 9e607f72748d ("i2c_powermac: shut up lockdep warning"). 
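
The pattern being fixed is a lock_class_key embedded in a dynamically allocated structure: lockdep requires such keys to be registered before they are used with lockdep_set_class(). Roughly (illustrative sketch, not the exact driver code):

	bus = kzalloc(sizeof(struct pmac_i2c_bus) + sizeof(*host), GFP_KERNEL);
	...
	mutex_init(&bus->mutex);
	lockdep_register_key(&bus->lock_key);	/* the call that was missing */
	lockdep_set_class(&bus->mutex, &bus->lock_key);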
Add the missing lockdep_register_key() calls. Fixes: 9e607f72748d ("i2c_powermac: shut up lockdep warning") Reported-by: Erhard Furtner Signed-off-by: Christophe Leroy Depends-on: df1f679d19ed ("powerpc/powermac: Add missing lockdep_register_key()") Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=200055 Link: https://lore.kernel.org/r/2c7e421874e21b2fb87813d768cf662f630c2ad4.1638984999.git.christophe.leroy@csgroup.eu --- arch/powerpc/platforms/powermac/low_i2c.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index de34fa34c42d..df89d916236d 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -811,6 +811,7 @@ static void __init pmu_i2c_probe(void) bus->hostdata = bus + 1; bus->xfer = pmu_i2c_xfer; mutex_init(&bus->mutex); + lockdep_register_key(&bus->lock_key); lockdep_set_class(&bus->mutex, &bus->lock_key); bus->flags = pmac_i2c_multibus; list_add(&bus->link, &pmac_i2c_busses); @@ -934,6 +935,7 @@ static void __init smu_i2c_probe(void) bus->hostdata = bus + 1; bus->xfer = smu_i2c_xfer; mutex_init(&bus->mutex); + lockdep_register_key(&bus->lock_key); lockdep_set_class(&bus->mutex, &bus->lock_key); bus->flags = 0; list_add(&bus->link, &pmac_i2c_busses); From 63fa47ba886b86cbd58f03b3b01b04bd57a1f233 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Dec 2021 17:45:56 +0000 Subject: [PATCH 174/240] KVM: PPC: Book3S HV P9: Use kvm_arch_vcpu_get_wait() to get rcuwait object Use kvm_arch_vcpu_get_wait() to get a vCPU's rcuwait object instead of using vcpu->wait directly in kvmhv_run_single_vcpu(). Functionally, this is a nop as vcpu->arch.waitp is guaranteed to point at vcpu->wait. But that is not obvious at first glance, and a future change coming in via the KVM tree, commit 510958e99721 ("KVM: Force PPC to define its own rcuwait object"), will hide vcpu->wait from architectures that define __KVM_HAVE_ARCH_WQP to prevent generic KVM from attempting to wake a vCPU with the wrong rcuwait object.
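
For reference, the generic accessor resolves to the architecture-provided wait pointer when __KVM_HAVE_ARCH_WQP is defined, roughly (sketch of the include/linux/kvm_host.h helper):

	static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu)
	{
	#ifdef __KVM_HAVE_ARCH_WQP
		return vcpu->arch.waitp;	/* on Book3S HV this points at vcpu->wait */
	#else
		return &vcpu->wait;
	#endif
	}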
Reported-by: Sachin Sant Signed-off-by: Sean Christopherson Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211213174556.3871157-1-seanjc@google.com --- arch/powerpc/kvm/book3s_hv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7986911b873c..f64e45d6c0f4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4442,6 +4442,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu) int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) { + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); struct kvm_run *run = vcpu->run; int trap, r, pcpu; int srcu_idx; @@ -4588,7 +4589,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) { kvmppc_set_timer(vcpu); - prepare_to_rcuwait(&vcpu->wait); + prepare_to_rcuwait(wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) { @@ -4605,7 +4606,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, schedule(); trace_kvmppc_vcore_blocked(vc, 1); } - finish_rcuwait(&vcpu->wait); + finish_rcuwait(wait); } vcpu->arch.ceded = 0; From cb1c4aba055f928ffae0c868e8dfe08eeab302e7 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 6 Dec 2021 14:47:46 +0530 Subject: [PATCH 175/240] perf: Add new macros for mem_hops field Add new macros for the mem_hops field which can be used to represent remote-node, socket and board level details. Currently the code has a macro for HOPS_0, which corresponds to data coming from another core but same node. Add new macros for HOPS_1 to HOPS_3 to represent remote-node, socket and board level data. For example, encodings for mem_hops fields with L2 cache:

L2 - local L2
L2 | REMOTE | HOPS_0 - remote core, same node L2
L2 | REMOTE | HOPS_1 - remote node, same socket L2
L2 | REMOTE | HOPS_2 - remote socket, same board L2
L2 | REMOTE | HOPS_3 - remote board L2

Signed-off-by: Kajol Jain Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211206091749.87585-2-kjain@linux.ibm.com --- include/uapi/linux/perf_event.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index bd8860eeb291..1b65042ab1db 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1332,7 +1332,10 @@ union perf_mem_data_src { /* hop level */ #define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -/* 2-7 available */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ +/* 5-7 available */ #define PERF_MEM_HOPS_SHIFT 43 #define PERF_MEM_S(a, s) \ From 4a20ee106154ac1765dea97932faad29f0ba57fc Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 6 Dec 2021 14:47:48 +0530 Subject: [PATCH 176/240] powerpc/perf: Add encodings to represent data based on newer composite PERF_MEM_LVLNUM* fields The code represents data coming from L1/L2/L3 cache hits based on the PERF_MEM_LVL_* namespace, which is in the process of deprecation in favour of the newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_,HOPS_} fields.
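
For readers of the hunks below: the helper macros at the top of isa207-common.c compose fields of union perf_mem_data_src, approximately as follows (sketch assuming the existing definitions):

	#define P(a, b)		PERF_MEM_S(a, b)
	#define PH(a, b)	(P(LVL, HIT) | P(a, b))
	#define PM(a, b)	(P(LVL, MISS) | P(a, b))
	#define LEVEL(x)	P(LVLNUM, x)
	#define REM		P(REMOTE, REMOTE)

so an encoding such as PH(LVL, L1) | LEVEL(L1) | P(SNOOP, HIT) sets both the legacy mem_lvl bits and the newer mem_lvl_num/mem_snoop fields.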
Add data source encodings to represent L1/L2/L3 cache hits based on the newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_,HOPS_} fields for power10 and older platforms. Result on a power9 system without patch changes: localhost:# ./perf mem report --sort="mem,sym,dso" --stdio # Overhead Samples Memory access Symbol Shared Object # ........ ............ ........................ ................................. ................ # 29.51% 1 L2 hit [k] perf_event_exec [kernel.vmlinux] 27.05% 1 L1 hit [k] perf_ctx_unlock [kernel.vmlinux] 13.93% 1 L1 hit [k] vtime_delta [kernel.vmlinux] 13.11% 1 L1 hit [k] prepend_path.isra.11 [kernel.vmlinux] 8.20% 1 L1 hit [.] 00000038.plt_call.__GI_strlen libc-2.28.so 8.20% 1 L1 hit [k] perf_event_interrupt [kernel.vmlinux] Result on a power9 system with patch changes: localhost:# ./perf mem report --sort="mem,sym,dso" --stdio # Overhead Samples Memory access Symbol Shared Object # ........ ............ ........................ .......................... ................ # 36.63% 1 L2 or L2 hit [k] perf_event_exec [kernel.vmlinux] 25.50% 1 L1 or L1 hit [k] vtime_delta [kernel.vmlinux] 13.12% 1 L1 or L1 hit [k] unmap_region [kernel.vmlinux] 12.62% 1 L1 or L1 hit [k] perf_sample_event_took [kernel.vmlinux] 6.93% 1 L1 or L1 hit [k] perf_ctx_unlock [kernel.vmlinux] 5.20% 1 L1 or L1 hit [.] __memcpy_power7 libc-2.28.so Signed-off-by: Kajol Jain Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211206091749.87585-4-kjain@linux.ibm.com --- arch/powerpc/perf/isa207-common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 7ea873ab2e6f..6c6bc8b7d887 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -220,13 +220,13 @@ static inline u64 isa207_find_source(u64 idx, u32 sub_idx) /* Nothing to do */ break; case 1: - ret = PH(LVL, L1); + ret = PH(LVL, L1) | LEVEL(L1) | P(SNOOP, HIT); break; case 2: - ret = PH(LVL, L2); + ret = PH(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT); break; case 3: - ret = PH(LVL, L3); + ret = PH(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); break; case 4: if (sub_idx <= 1) From 6ed05a8efda56e5be11081954929421de19cce88 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 6 Dec 2021 14:47:49 +0530 Subject: [PATCH 177/240] powerpc/perf: Add data source encodings for power10 platform The code represents memory/cache level data based on the PERF_MEM_LVL_* namespace, which is in the process of deprecation in favour of the newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_,HOPS_} fields. Add data source encodings to represent cache/memory data based on the newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_,HOPS_} fields. Add data source encodings to represent data coming from local memory/remote memory/distant memory and remote/distant cache hits. In order to represent data coming from OpenCAPI cache/memory, we use the LVLNUM "PMEM" field, which is used to represent persistent memory accesses. Result on a power10 system with patch changes: localhost:# ./perf mem report --sort="mem,sym,dso" --stdio # Overhead Samples Memory access Symbol Shared Object # ........ ............ ........................ .......................... ................ # 29.46% 2331 L1 or L1 hit [.] __random libc-2.28.so 23.11% 2121 L1 or L1 hit [.] producer_populate_cache producer_consumer 18.56% 1758 L1 or L1 hit [.] __random_r libc-2.28.so 15.64% 1559 L2 or L2 hit [.] __random libc-2.28.so .....
0.09% 5 Remote socket, same board Any cache hit [.] __random libc-2.28.so 0.07% 4 Remote socket, same board Any cache hit [.] __random libc-2.28.so ..... Signed-off-by: Kajol Jain Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211206091749.87585-5-kjain@linux.ibm.com --- arch/powerpc/perf/isa207-common.c | 54 ++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 6c6bc8b7d887..4037ea652522 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -229,13 +229,28 @@ static inline u64 isa207_find_source(u64 idx, u32 sub_idx) ret = PH(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); break; case 4: - if (sub_idx <= 1) - ret = PH(LVL, LOC_RAM); - else if (sub_idx > 1 && sub_idx <= 2) - ret = PH(LVL, REM_RAM1); - else - ret = PH(LVL, REM_RAM2); - ret |= P(SNOOP, HIT); + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + ret = P(SNOOP, HIT); + + if (sub_idx == 1) + ret |= PH(LVL, LOC_RAM) | LEVEL(RAM); + else if (sub_idx == 2 || sub_idx == 3) + ret |= P(LVL, HIT) | LEVEL(PMEM); + else if (sub_idx == 4) + ret |= PH(LVL, REM_RAM1) | REM | LEVEL(RAM) | P(HOPS, 2); + else if (sub_idx == 5 || sub_idx == 7) + ret |= P(LVL, HIT) | LEVEL(PMEM) | REM; + else if (sub_idx == 6) + ret |= PH(LVL, REM_RAM2) | REM | LEVEL(RAM) | P(HOPS, 3); + } else { + if (sub_idx <= 1) + ret = PH(LVL, LOC_RAM); + else if (sub_idx > 1 && sub_idx <= 2) + ret = PH(LVL, REM_RAM1); + else + ret = PH(LVL, REM_RAM2); + ret |= P(SNOOP, HIT); + } break; case 5: if (cpu_has_feature(CPU_FTR_ARCH_31)) { @@ -261,11 +276,26 @@ static inline u64 isa207_find_source(u64 idx, u32 sub_idx) } break; case 6: - ret = PH(LVL, REM_CCE2); - if ((sub_idx == 0) || (sub_idx == 2)) - ret |= P(SNOOP, HIT); - else if ((sub_idx == 1) || (sub_idx == 3)) - ret |= P(SNOOP, HITM); + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + if (sub_idx == 0) + ret = PH(LVL, REM_CCE1) | LEVEL(ANY_CACHE) | REM | + P(SNOOP, HIT) | P(HOPS, 2); + else if (sub_idx == 1) + ret = PH(LVL, REM_CCE1) | LEVEL(ANY_CACHE) | REM | + P(SNOOP, HITM) | P(HOPS, 2); + else if (sub_idx == 2) + ret = PH(LVL, REM_CCE2) | LEVEL(ANY_CACHE) | REM | + P(SNOOP, HIT) | P(HOPS, 3); + else if (sub_idx == 3) + ret = PH(LVL, REM_CCE2) | LEVEL(ANY_CACHE) | REM | + P(SNOOP, HITM) | P(HOPS, 3); + } else { + ret = PH(LVL, REM_CCE2); + if (sub_idx == 0 || sub_idx == 2) + ret |= P(SNOOP, HIT); + else if (sub_idx == 1 || sub_idx == 3) + ret |= P(SNOOP, HITM); + } break; case 7: ret = PM(LVL, L1); From 0a006ace634dcaf1bbf9125fb8089a4a50bf33d6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 26 Nov 2021 15:21:33 +1000 Subject: [PATCH 178/240] powerpc/pseries/vas: Don't print an error when VAS is unavailable KVM does not support VAS so guests always print a useless error on boot vas: HCALL(398) error -2, query_type 0, result buffer 0x57f2000 Change this to only print the message if the error is not H_FUNCTION. 
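
A caller can then treat the -EIO return as plain unavailability without logging anything itself; a hypothetical probe sequence (buffer and variable names invented for illustration):

	struct hv_vas_caps caps;
	u64 result = __pa(&caps);

	if (h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, 0, result))
		return -ENODEV;	/* VAS absent, e.g. running under KVM */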
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211126052133.1664375-1-npiggin@gmail.com --- arch/powerpc/platforms/pseries/vas.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index b043e3936d21..734523e2272f 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -151,8 +151,15 @@ int h_query_vas_capabilities(const u64 hcall, u8 query_type, u64 result) if (rc == H_SUCCESS) return 0; - pr_err("HCALL(%llx) error %ld, query_type %u, result buffer 0x%llx\n", - hcall, rc, query_type, result); + /* H_FUNCTION means HV does not support VAS so don't print an error */ + if (rc != H_FUNCTION) { + pr_err("%s error %ld, query_type %u, result buffer 0x%llx\n", + (hcall == H_QUERY_VAS_CAPABILITIES) ? + "H_QUERY_VAS_CAPABILITIES" : + "H_QUERY_NX_CAPABILITIES", + rc, query_type, result); + } + return -EIO; } EXPORT_SYMBOL_GPL(h_query_vas_capabilities); From 4423eb5ae32ec613af3fceee2fe84234e417ee55 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 23 Sep 2021 00:54:47 +1000 Subject: [PATCH 179/240] powerpc/64/interrupt: make normal synchronous interrupts enable MSR[EE] if possible Make synchronous interrupt handler entry wrappers enable MSR[EE] if MSR[EE] was enabled in the interrupted context. IRQs are soft-disabled at this point so there is no change to high level code, but it means a masked interrupt could fire. This is a performance disadvantage for interrupts which do not later call interrupt_cond_local_irq_enable(), because an additional mtmsrd or wrtee instruction is executed. However, the important synchronous interrupts (e.g., page fault) do enable interrupts, so the performance disadvantage is mostly avoided. In the next patch, MSR[RI] enabling can be combined with MSR[EE] enabling, which mitigates the performance drop for the former and gives a performance advantage for the latter interrupts, on 64s machines. 64e is coming along for the ride for now to avoid divergences with 64s in this tricky code. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210922145452.352571-2-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 6d414ddc8e24..2ab7e31c823f 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -151,7 +151,20 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup #ifdef CONFIG_PPC64 if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED) trace_hardirqs_off(); - local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If the interrupt was taken with HARD_DIS clear, then enable MSR[EE]. + * Asynchronous interrupts get here with HARD_DIS set (see below), so + * this enables MSR[EE] for synchronous interrupts. IRQs remain + * soft-masked. The interrupt handler may later call + * interrupt_cond_local_irq_enable() to achieve a regular process + * context.
+ */ + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) { + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + BUG_ON(!(regs->msr & MSR_EE)); + __hard_irq_enable(); + } if (user_mode(regs)) { kuap_lock(); @@ -203,6 +216,10 @@ static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state) { +#ifdef CONFIG_PPC64 + /* Ensure interrupt_enter_prepare does not enable MSR[EE] */ + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; +#endif #ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_local_flags(_TLF_RUNLATCH)) From ff0b0d6e1a7bc202241a9b1e28d1da4b744e0312 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 23 Sep 2021 00:54:48 +1000 Subject: [PATCH 180/240] powerpc/64s/interrupt: handle MSR EE and RI in interrupt entry wrapper The mtmsrd to enable MSR[RI] can be combined with the mtmsrd to enable MSR[EE] in interrupt entry code, for those interrupts which enable EE. This helps performance of important synchronous interrupts (e.g., page faults). This is similar to what commit dd152f70bdc1 ("powerpc/64s: system call avoid setting MSR[RI] until we set MSR[EE]") does for system calls. Do this by enabling EE and RI together at the beginning of the entry wrapper if PACA_IRQ_HARD_DIS is clear, and only enabling RI if it is set. Asynchronous interrupts set PACA_IRQ_HARD_DIS, but synchronous ones leave it unchanged, so by default they always get EE=1 unless they have interrupted a caller that is hard disabled. When the sync interrupt later calls interrupt_cond_local_irq_enable(), it will not require another mtmsrd because MSR[EE] was already enabled here. This avoids one mtmsrd L=1 for synchronous interrupts on 64s, which saves about 20 cycles on POWER9. And for kernel-mode interrupts, both synchronous and asynchronous, this saves an additional 40 cycles due to the mtmsrd being moved ahead of mfspr SPRN_AMR, which prevents a SPR scoreboard stall. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210922145452.352571-3-npiggin@gmail.com --- arch/powerpc/include/asm/interrupt.h | 27 +++++++++++++++++--- arch/powerpc/kernel/exceptions-64s.S | 38 +++------------------------- arch/powerpc/kernel/fpu.S | 5 ++++ arch/powerpc/kernel/vector.S | 10 ++++++++ 4 files changed, 42 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 2ab7e31c823f..aa65bb774cdb 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -149,8 +149,14 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup #endif #ifdef CONFIG_PPC64 - if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED) - trace_hardirqs_off(); + bool trace_enable = false; + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS)) { + if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED) + trace_enable = true; + } else { + irq_soft_mask_set(IRQS_ALL_DISABLED); + } /* * If the interrupt was taken with HARD_DIS clear, then enable MSR[EE]. 
@@ -164,8 +170,14 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) BUG_ON(!(regs->msr & MSR_EE)); __hard_irq_enable(); + } else { + __hard_RI_enable(); } + /* Do this when RI=1 because it can cause SLB faults */ + if (trace_enable) + trace_hardirqs_off(); + if (user_mode(regs)) { kuap_lock(); CT_WARN_ON(ct_state() != CONTEXT_USER); @@ -220,13 +232,16 @@ static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct in /* Ensure interrupt_enter_prepare does not enable MSR[EE] */ local_paca->irq_happened |= PACA_IRQ_HARD_DIS; #endif + interrupt_enter_prepare(regs, state); #ifdef CONFIG_PPC_BOOK3S_64 + /* + * RI=1 is set by interrupt_enter_prepare, so this thread flags access + * has to come afterward (it can cause SLB faults). + */ if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_local_flags(_TLF_RUNLATCH)) __ppc64_runlatch_on(); #endif - - interrupt_enter_prepare(regs, state); irq_enter(); } @@ -296,6 +311,8 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte regs->softe = IRQS_ALL_DISABLED; } + __hard_RI_enable(); + /* Don't do any per-CPU operations until interrupt state is fixed */ if (nmi_disables_ftrace(regs)) { @@ -393,6 +410,8 @@ interrupt_handler long func(struct pt_regs *regs) \ { \ long ret; \ \ + __hard_RI_enable(); \ + \ ret = ____##func (regs); \ \ return ret; \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a30f563bc7a8..4545b7a28aad 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -113,7 +113,6 @@ name: #define IISIDE .L_IISIDE_\name\() /* Uses SRR0/1 not DAR/DSISR */ #define IDAR .L_IDAR_\name\() /* Uses DAR (or SRR0) */ #define IDSISR .L_IDSISR_\name\() /* Uses DSISR (or SRR1) */ -#define ISET_RI .L_ISET_RI_\name\() /* Run common code w/ MSR[RI]=1 */ #define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */ #define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() /* Common runs in realmode */ #define IMASK .L_IMASK_\name\() /* IRQ soft-mask bit */ @@ -157,9 +156,6 @@ do_define_int n .ifndef IDSISR IDSISR=0 .endif - .ifndef ISET_RI - ISET_RI=1 - .endif .ifndef IBRANCH_TO_COMMON IBRANCH_TO_COMMON=1 .endif @@ -512,11 +508,6 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) stb r10,PACASRR_VALID(r13) .endif - .if ISET_RI - li r10,MSR_RI - mtmsrd r10,1 /* Set MSR_RI */ - .endif - .if ISTACK .if IKUAP kuap_save_amr_and_lock r9, r10, cr1, cr0 @@ -900,11 +891,6 @@ INT_DEFINE_BEGIN(system_reset) IVEC=0x100 IAREA=PACA_EXNMI IVIRT=0 /* no virt entry point */ - /* - * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is - * being used, so a nested NMI exception would corrupt it. - */ - ISET_RI=0 ISTACK=0 IKVM_REAL=1 INT_DEFINE_END(system_reset) @@ -977,16 +963,14 @@ TRAMP_REAL_BEGIN(system_reset_fwnmi) EXC_COMMON_BEGIN(system_reset_common) __GEN_COMMON_ENTRY system_reset /* - * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able - * to recover, but nested NMI will notice in_nmi and not recover - * because of the use of the NMI stack. in_nmi reentrancy is tested in - * system_reset_exception. + * Increment paca->in_nmi. When the interrupt entry wrapper later + * enable MSR_RI, then SLB or MCE will be able to recover, but a nested + * NMI will notice in_nmi and not recover because of the use of the NMI + * stack. in_nmi reentrancy is tested in system_reset_exception. 
	 */
 	lhz	r10,PACA_IN_NMI(r13)
 	addi	r10,r10,1
 	sth	r10,PACA_IN_NMI(r13)
-	li	r10,MSR_RI
-	mtmsrd 	r10,1
 
 	mr	r10,r1
 	ld	r1,PACA_NMI_EMERG_SP(r13)
@@ -1060,12 +1044,6 @@ INT_DEFINE_BEGIN(machine_check_early)
 	IAREA=PACA_EXMC
 	IVIRT=0 /* no virt entry point */
 	IREALMODE_COMMON=1
-	/*
-	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
-	 * nested machine check corrupts it. machine_check_common enables
-	 * MSR_RI.
-	 */
-	ISET_RI=0
 	ISTACK=0
 	IDAR=1
 	IDSISR=1
@@ -1076,7 +1054,6 @@ INT_DEFINE_BEGIN(machine_check)
 	IVEC=0x200
 	IAREA=PACA_EXMC
 	IVIRT=0 /* no virt entry point */
-	ISET_RI=0
 	IDAR=1
 	IDSISR=1
 	IKVM_REAL=1
@@ -1146,9 +1123,6 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 BEGIN_FTR_SECTION
 	bl	enable_machine_check
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
-	li	r10,MSR_RI
-	mtmsrd	r10,1
-
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
@@ -1236,10 +1210,6 @@ EXC_COMMON_BEGIN(machine_check_common)
 	 * save area: PACA_EXMC instead of PACA_EXGEN.
 	 */
 	GEN_COMMON machine_check
-
-	/* Enable MSR_RI when finished with PACA_EXMC */
-	li	r10,MSR_RI
-	mtmsrd	r10,1
 
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_exception_async
 	b	interrupt_return_srr
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index ba4afe3b5a9c..f71f2bbd4de6 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -81,7 +81,12 @@ EXPORT_SYMBOL(store_fp_state)
  */
 _GLOBAL(load_up_fpu)
 	mfmsr	r5
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* interrupt doesn't set MSR[RI] and HPT can fault on current access */
+	ori	r5,r5,MSR_FP|MSR_RI
+#else
 	ori	r5,r5,MSR_FP
+#endif
 #ifdef CONFIG_VSX
 BEGIN_FTR_SECTION
 	oris	r5,r5,MSR_VSX@h
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index ba03eedfdcd8..5cc24d8cce94 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -47,6 +47,10 @@ EXPORT_SYMBOL(store_vr_state)
  */
 _GLOBAL(load_up_altivec)
 	mfmsr	r5			/* grab the current MSR */
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* interrupt doesn't set MSR[RI] and HPT can fault on current access */
+	ori	r5,r5,MSR_RI
+#endif
 	oris	r5,r5,MSR_VEC@h
 	MTMSRD(r5)			/* enable use of AltiVec now */
 	isync
@@ -126,6 +130,12 @@ _GLOBAL(load_up_vsx)
 	andis.	r5,r12,MSR_VEC@h
 	beql+	load_up_altivec		/* skip if already loaded */
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* interrupt doesn't set MSR[RI] and HPT can fault on current access */
+	li	r5,MSR_RI
+	mtmsrd	r5,1
+#endif
+
 	ld	r4,PACACURRENT(r13)
 	addi	r4,r4,THREAD		/* Get THREAD */
 	li	r6,1

From 5a7745b96f43c69f9b4875bcf516a0341acbc3fb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Thu, 23 Sep 2021 00:54:48 +1000
Subject: [PATCH 181/240] powerpc/64s/perf: add power_pmu_wants_prompt_pmi to say whether perf wants PMIs to be soft-NMI

Interrupt code enables MSR[EE] in some irq handlers while keeping local
irqs disabled via soft-mask, allowing PMI interrupts to be taken as
soft-NMI to improve profiling of irq handlers.

When perf is not enabled, there is no point in doing this; it is just
additional overhead. So provide a function that can say whether PMIs
should be taken promptly if possible.
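As an aside, the refinement hinted at in the new function's own comment in the diff below (returning false for counting-only use) might look roughly like the following sketch. It is not part of this patch, and power_pmu_wants_prompt_pmi_strict() is a hypothetical name:

/*
 * Hypothetical stricter variant (illustration only, not in this patch):
 * only ask for prompt PMIs when some active event is actually sampling,
 * since pure counting has no need for an accurate NIP or stack chain.
 */
static bool power_pmu_wants_prompt_pmi_strict(void)
{
	struct cpu_hw_events *cpuhw;
	int i;

	if (!ppmu)
		return false;

	cpuhw = this_cpu_ptr(&cpu_hw_events);
	for (i = 0; i < cpuhw->n_events; i++) {
		/* is_sampling_event() checks attr.sample_period != 0 */
		if (is_sampling_event(cpuhw->event[i]))
			return true;
	}
	return false;
}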
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20210922145452.352571-4-npiggin@gmail.com
---
 arch/powerpc/include/asm/hw_irq.h |  2 ++
 arch/powerpc/perf/core-book3s.c   | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 7a2690e97b0e..8d6f80101eda 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -342,6 +342,8 @@ static inline bool lazy_irq_pending_nocheck(void)
 	return __lazy_irq_pending(local_paca->irq_happened);
 }
 
+bool power_pmu_wants_prompt_pmi(void);
+
 /*
  * This is called by asynchronous interrupts to conditionally
  * re-enable hard interrupts after having cleared the source
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 1f1ded29a06e..07fd61a8d59d 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #ifdef CONFIG_PPC64
@@ -2437,6 +2438,36 @@ static void perf_event_interrupt(struct pt_regs *regs)
 	perf_sample_event_took(sched_clock() - start_clock);
 }
 
+/*
+ * If the perf subsystem wants performance monitor interrupts as soon as
+ * possible (e.g., to sample the instruction address and stack chain),
+ * this should return true. The IRQ masking code can then enable MSR[EE]
+ * in some places (e.g., interrupt handlers) that allows PMI interrupts
+ * through to improve accuracy of profiles, at the cost of some performance.
+ *
+ * The PMU counters can be enabled by other means (e.g., sysfs raw SPR
+ * access), but in that case there is no need for prompt PMI handling.
+ *
+ * This currently returns true if any perf counter is being used. It
+ * could possibly return false if only events are being counted rather than
+ * samples being taken, but for now this is good enough.
+ */
+bool power_pmu_wants_prompt_pmi(void)
+{
+	struct cpu_hw_events *cpuhw;
+
+	/*
+	 * This could simply test local_paca->pmcregs_in_use if that were not
+	 * under ifdef KVM.
+	 */
+
+	if (!ppmu)
+		return false;
+
+	cpuhw = this_cpu_ptr(&cpu_hw_events);
+	return cpuhw->n_events;
+}
+
 static int power_pmu_prepare_cpu(unsigned int cpu)
 {
 	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);

From 0faf20a1ad1647c0fc0f5a367c71e5e84deaf899 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Thu, 23 Sep 2021 00:54:50 +1000
Subject: [PATCH 182/240] powerpc/64s/interrupt: Don't enable MSR[EE] in irq handlers unless perf is in use

Enabling MSR[EE] in interrupt handlers while interrupts are still soft
masked allows PMIs to profile interrupt handlers to some degree, beyond
what SIAR latching allows. When perf is not being used, this is almost
useless work. It requires an extra mtmsrd in the irq handler, and it
also opens the door to masked interrupts hitting and requiring replay,
which is more expensive than just taking them directly. This effect can
be noticeable in high IRQ workloads.

Avoid enabling MSR[EE] unless perf is currently in use. This saves about
60 cycles (or 8%) on a simple decrementer interrupt microbenchmark.
Replayed interrupts drop from 1.4% of all interrupts taken, to 0.003%.

This does prevent the soft-nmi interrupt from being taken in these
handlers, but that's not too reliable anyway. The SMP watchdog will
continue to be the reliable way to catch lockups.
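The shape of the change at the call sites, condensed into a sketch from the hunks that follow (the real handlers are doorbell_exception, __do_irq and timer_interrupt):

	/* Old: one combined check-and-enable. */
	may_hard_irq_enable();

	/* New: the test and the enable are split, so a handler can do
	 * work in between. The timer interrupt bumps the decrementer
	 * first so hard-enabling doesn't immediately retake the
	 * exception.
	 */
	if (should_hard_irq_enable()) {
		set_dec(decrementer_max);
		do_hard_irq_enable();
	}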
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210922145452.352571-5-npiggin@gmail.com --- arch/powerpc/include/asm/hw_irq.h | 57 +++++++++++++++++++++++++------ arch/powerpc/kernel/dbell.c | 3 +- arch/powerpc/kernel/irq.c | 3 +- arch/powerpc/kernel/time.c | 31 +++++++++-------- 4 files changed, 67 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 8d6f80101eda..a58fb4aa6c81 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -345,17 +345,54 @@ static inline bool lazy_irq_pending_nocheck(void) bool power_pmu_wants_prompt_pmi(void); /* - * This is called by asynchronous interrupts to conditionally - * re-enable hard interrupts after having cleared the source - * of the interrupt. They are kept disabled if there is a different - * soft-masked interrupt pending that requires hard masking. + * This is called by asynchronous interrupts to check whether to + * conditionally re-enable hard interrupts after having cleared + * the source of the interrupt. They are kept disabled if there + * is a different soft-masked interrupt pending that requires hard + * masking. */ -static inline void may_hard_irq_enable(void) +static inline bool should_hard_irq_enable(void) { - if (!(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK)) { - get_paca()->irq_happened &= ~PACA_IRQ_HARD_DIS; - __hard_irq_enable(); - } +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(mfmsr() & MSR_EE); +#endif +#ifdef CONFIG_PERF_EVENTS + /* + * If the PMU is not running, there is not much reason to enable + * MSR[EE] in irq handlers because any interrupts would just be + * soft-masked. + * + * TODO: Add test for 64e + */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !power_pmu_wants_prompt_pmi()) + return false; + + if (get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK) + return false; + + return true; +#else + return false; +#endif +} + +/* + * Do the hard enabling, only call this if should_hard_irq_enable is true. + */ +static inline void do_hard_irq_enable(void) +{ +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK); + WARN_ON(mfmsr() & MSR_EE); +#endif + /* + * This allows PMI interrupts (and watchdog soft-NMIs) through. + * There is no other reason to enable this way. 
+ */ + get_paca()->irq_happened &= ~PACA_IRQ_HARD_DIS; + __hard_irq_enable(); } static inline bool arch_irq_disabled_regs(struct pt_regs *regs) @@ -436,7 +473,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) return !(regs->msr & MSR_EE); } -static inline bool may_hard_irq_enable(void) +static inline bool should_hard_irq_enable(void) { return false; } diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 5545c9cd17c1..f55c6fb34a3a 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -27,7 +27,8 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception) ppc_msgsync(); - may_hard_irq_enable(); + if (should_hard_irq_enable()) + do_hard_irq_enable(); kvmppc_clear_host_ipi(smp_processor_id()); __this_cpu_inc(irq_stat.doorbell_irqs); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8207f97d51e8..2cf31a97126c 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -745,7 +745,8 @@ void __do_irq(struct pt_regs *regs) irq = ppc_md.get_irq(); /* We can hard enable interrupts now to allow perf interrupts */ - may_hard_irq_enable(); + if (should_hard_irq_enable()) + do_hard_irq_enable(); /* And finally process it */ if (unlikely(!irq)) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 42df9dd7fb41..62361cc7281c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -609,22 +609,23 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) return; } - /* Ensure a positive value is written to the decrementer, or else - * some CPUs will continue to take decrementer exceptions. When the - * PPC_WATCHDOG (decrementer based) is configured, keep this at most - * 31 bits, which is about 4 seconds on most systems, which gives - * the watchdog a chance of catching timer interrupt hard lockups. - */ - if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) - set_dec(0x7fffffff); - else - set_dec(decrementer_max); - - /* Conditionally hard-enable interrupts now that the DEC has been - * bumped to its maximum value - */ - may_hard_irq_enable(); + /* Conditionally hard-enable interrupts. */ + if (should_hard_irq_enable()) { + /* + * Ensure a positive value is written to the decrementer, or + * else some CPUs will continue to take decrementer exceptions. + * When the PPC_WATCHDOG (decrementer based) is configured, + * keep this at most 31 bits, which is about 4 seconds on most + * systems, which gives the watchdog a chance of catching timer + * interrupt hard lockups. + */ + if (IS_ENABLED(CONFIG_PPC_WATCHDOG)) + set_dec(0x7fffffff); + else + set_dec(decrementer_max); + do_hard_irq_enable(); + } #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) if (atomic_read(&ppc_n_lost_interrupts) != 0) From ecb1057c0f9a0f3f052294de6cc2eb43ecf7547b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 23 Sep 2021 00:54:51 +1000 Subject: [PATCH 183/240] powerpc/64/interrupt: reduce expensive debug tests Move the assertions requiring restart table searches under CONFIG_PPC_IRQ_SOFT_MASK_DEBUG. 
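The guard idiom used here, as a minimal sketch: IS_ENABLED() expands to a compile-time 0 or 1, so on non-debug kernels the compiler discards both the test and the restart-table walk behind it, instead of evaluating the walk unconditionally as before:

	/* Costs nothing unless CONFIG_PPC_IRQ_SOFT_MASK_DEBUG is set */
	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) &&
	    arch_irq_disabled_regs(regs))
		BUG_ON(search_kernel_restart_table(regs->nip));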
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20210922145452.352571-6-npiggin@gmail.com
---
 arch/powerpc/include/asm/interrupt.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index aa65bb774cdb..fc28f46d2f9d 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -97,6 +97,11 @@ static inline void srr_regs_clobbered(void)
 	local_paca->hsrr_valid = 0;
 }
 #else
+static inline unsigned long search_kernel_restart_table(unsigned long addr)
+{
+	return 0;
+}
+
 static inline bool is_implicit_soft_masked(struct pt_regs *regs)
 {
 	return false;
@@ -193,13 +198,14 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
 		 */
 		if (TRAP(regs) != INTERRUPT_PROGRAM) {
 			CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-			BUG_ON(is_implicit_soft_masked(regs));
+			if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+				BUG_ON(is_implicit_soft_masked(regs));
 		}
-#ifdef CONFIG_PPC_BOOK3S
+
 		/* Move this under a debugging check */
-		if (arch_irq_disabled_regs(regs))
+		if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) &&
+		    arch_irq_disabled_regs(regs))
 			BUG_ON(search_kernel_restart_table(regs->nip));
-#endif
 	}
 	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
 		BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));

From af47d79b041deccc31e0dddc6310a654c13d04b6 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Thu, 23 Sep 2021 00:54:52 +1000
Subject: [PATCH 184/240] powerpc/64s/interrupt: avoid saving CFAR in some asynchronous interrupts

Reading the CFAR register is quite costly (~20 cycles on POWER9). It is
a good idea to have it for most synchronous interrupts, but for async
ones it is much less important.

Doorbell, external, and decrementer interrupts are the important
asynchronous ones. HV interrupts can't skip CFAR if KVM HV is possible,
because it might be a guest exit that requires CFAR to be preserved.
But the important pseries interrupts can avoid loading CFAR.
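In rough C-like pseudo-code, the per-interrupt decision the new flags encode (a sketch of the logic only; handler->icfar and handler->icfar_if_hvmode are illustrative names, and the real implementation is feature-section patched assembly via the ICFAR/ICFAR_IF_HVMODE macros in the diff below):

	/* Sketch: assembled per handler, not branched at runtime. */
	if (handler->icfar ||
	    (handler->icfar_if_hvmode && (mfmsr() & MSR_HV)))
		save_area->cfar = mfspr(SPRN_CFAR); /* ~20 cycles on POWER9 */
	else
		save_area->cfar = 0; /* async: CFAR is rarely interesting */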
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210922145452.352571-7-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 63 ++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4545b7a28aad..6fe7d7926370 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -111,6 +111,8 @@ name: #define IAREA .L_IAREA_\name\() /* PACA save area */ #define IVIRT .L_IVIRT_\name\() /* Has virt mode entry point */ #define IISIDE .L_IISIDE_\name\() /* Uses SRR0/1 not DAR/DSISR */ +#define ICFAR .L_ICFAR_\name\() /* Uses CFAR */ +#define ICFAR_IF_HVMODE .L_ICFAR_IF_HVMODE_\name\() /* Uses CFAR if HV */ #define IDAR .L_IDAR_\name\() /* Uses DAR (or SRR0) */ #define IDSISR .L_IDSISR_\name\() /* Uses DSISR (or SRR1) */ #define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */ @@ -150,6 +152,12 @@ do_define_int n .ifndef IISIDE IISIDE=0 .endif + .ifndef ICFAR + ICFAR=1 + .endif + .ifndef ICFAR_IF_HVMODE + ICFAR_IF_HVMODE=0 + .endif .ifndef IDAR IDAR=0 .endif @@ -287,9 +295,21 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) HMT_MEDIUM std r10,IAREA+EX_R10(r13) /* save r10 - r12 */ + .if ICFAR BEGIN_FTR_SECTION mfspr r10,SPRN_CFAR END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + .elseif ICFAR_IF_HVMODE +BEGIN_FTR_SECTION + BEGIN_FTR_SECTION_NESTED(69) + mfspr r10,SPRN_CFAR + END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 69) +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(69) + li r10,0 + END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 69) +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .endif .if \ool .if !\virt b tramp_real_\name @@ -305,9 +325,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) BEGIN_FTR_SECTION std r9,IAREA+EX_PPR(r13) END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + .if ICFAR || ICFAR_IF_HVMODE BEGIN_FTR_SECTION std r10,IAREA+EX_CFAR(r13) END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + .endif INTERRUPT_TO_KERNEL mfctr r10 std r10,IAREA+EX_CTR(r13) @@ -559,7 +581,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) .endif BEGIN_FTR_SECTION + .if ICFAR || ICFAR_IF_HVMODE ld r10,IAREA+EX_CFAR(r13) + .else + li r10,0 + .endif std r10,ORIG_GPR3(r1) END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r10,IAREA+EX_CTR(r13) @@ -1520,6 +1546,12 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) * * If soft masked, the masked handler will note the pending interrupt for * replay, and clear MSR[EE] in the interrupted context. + * + * CFAR is not required because this is an asynchronous interrupt that in + * general won't have much bearing on the state of the CPU, with the possible + * exception of crash/debug IPIs, but those are generally moving to use SRESET + * IPIs. Unless this is an HV interrupt and KVM HV is possible, in which case + * it may be exiting the guest and need CFAR to be saved. */ INT_DEFINE_BEGIN(hardware_interrupt) IVEC=0x500 @@ -1527,6 +1559,10 @@ INT_DEFINE_BEGIN(hardware_interrupt) IMASK=IRQS_DISABLED IKVM_REAL=1 IKVM_VIRT=1 + ICFAR=0 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + ICFAR_IF_HVMODE=1 +#endif INT_DEFINE_END(hardware_interrupt) EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) @@ -1748,6 +1784,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) * If PPC_WATCHDOG is configured, the soft masked handler will actually set * things back up to run soft_nmi_interrupt as a regular interrupt handler * on the emergency stack. 
+ *
+ * CFAR is not required because this is asynchronous (see hardware_interrupt).
+ * A watchdog interrupt may like to have CFAR, but usually the interesting
+ * branch is long gone by that point (e.g., infinite loop).
  */
 INT_DEFINE_BEGIN(decrementer)
 	IVEC=0x900
 	IMASK=IRQS_DISABLED
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	IKVM_REAL=1
 #endif
+	ICFAR=0
 INT_DEFINE_END(decrementer)
 
 EXC_REAL_BEGIN(decrementer, 0x900, 0x80)
@@ -1830,6 +1871,8 @@ EXC_COMMON_BEGIN(hdecrementer_common)
  * If soft masked, the masked handler will note the pending interrupt for
  * replay, leaving MSR[EE] enabled in the interrupted context because the
  * doorbells are edge triggered.
+ *
+ * CFAR is not required, similarly to hardware_interrupt.
  */
 INT_DEFINE_BEGIN(doorbell_super)
 	IVEC=0xa00
 	IMASK=IRQS_DISABLED
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	IKVM_REAL=1
 #endif
+	ICFAR=0
 INT_DEFINE_END(doorbell_super)
 
 EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100)
@@ -1888,6 +1932,7 @@ INT_DEFINE_BEGIN(system_call)
 	IVEC=0xc00
 	IKVM_REAL=1
 	IKVM_VIRT=1
+	ICFAR=0
 INT_DEFINE_END(system_call)
 
 .macro SYSTEM_CALL virt
@@ -2186,6 +2231,11 @@ EXC_COMMON_BEGIN(hmi_exception_common)
  * Interrupt 0xe80 - Directed Hypervisor Doorbell Interrupt.
  * This is an asynchronous interrupt in response to a msgsnd doorbell.
  * Similar to the 0xa00 doorbell but for host rather than guest.
+ *
+ * CFAR is not required (similar to doorbell_interrupt), unless KVM HV
+ * is enabled, in which case it may be a guest exit. Most PowerNV kernels
+ * include KVM support so it would be nice if this could be dynamically
+ * patched out if KVM was not currently running any guests.
  */
 INT_DEFINE_BEGIN(h_doorbell)
 	IVEC=0xe80
 	IMASK=IRQS_DISABLED
 	IKVM_REAL=1
 	IKVM_VIRT=1
+#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	ICFAR=0
+#endif
 INT_DEFINE_END(h_doorbell)
 
 EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20)
@@ -2216,6 +2269,9 @@ EXC_COMMON_BEGIN(h_doorbell_common)
  * Interrupt 0xea0 - Hypervisor Virtualization Interrupt.
 * This is an asynchronous interrupt in response to an "external exception".
 * Similar to 0x500 but for host only.
+ *
+ * Like h_doorbell, CFAR is only required for KVM HV because this can be
+ * a guest exit.
  */
 INT_DEFINE_BEGIN(h_virt_irq)
 	IVEC=0xea0
 	IMASK=IRQS_DISABLED
 	IKVM_REAL=1
 	IKVM_VIRT=1
+#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	ICFAR=0
+#endif
 INT_DEFINE_END(h_virt_irq)
 
 EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20)
@@ -2259,6 +2318,8 @@ EXC_VIRT_NONE(0x4ee0, 0x20)
  *
 * If soft masked, the masked handler will note the pending interrupt for
 * replay, and clear MSR[EE] in the interrupted context.
+ *
+ * CFAR is not used by perf interrupts so not required.
  */
 INT_DEFINE_BEGIN(performance_monitor)
 	IVEC=0xf00
 	IMASK=IRQS_DISABLED
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	IKVM_REAL=1
 #endif
+	ICFAR=0
 INT_DEFINE_END(performance_monitor)
 
 EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20)
@@ -2690,6 +2752,7 @@ EXC_VIRT_NONE(0x5800, 0x100)
 INT_DEFINE_BEGIN(soft_nmi)
 	IVEC=0x900
 	ISTACK=0
+	ICFAR=0
 INT_DEFINE_END(soft_nmi)
 
 /*

From 3b54c71537d7beaaca8be9c57a81045e2b641655 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Fri, 5 Nov 2021 23:29:23 +1000
Subject: [PATCH 185/240] powerpc/pseries: use slab context cpumask allocation in CPU hotplug init

Slab is up at this point, so using the bootmem allocator triggers a
warning. Switch to using the regular cpumask allocator.
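For context, the two allocators involved, sketched: the bootmem variant exists for the early-boot window before the slab allocator is up and cannot fail, while the regular variant is slab-backed, NUMA-node aware and must have its failure handled (both are no-ops that always succeed when CONFIG_CPUMASK_OFFSTACK is disabled):

	/* Early boot only, before slab is available: */
	alloc_bootmem_cpumask_var(&mask);

	/* Normal allocation, as in this initcall, which can fail: */
	if (!alloc_cpumask_var_node(&mask, GFP_KERNEL, node))
		return -ENOMEM;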
Signed-off-by: Nicholas Piggin
Tested-by: Sachin Sant
Reviewed-by: Nathan Lynch
Reviewed-by: Laurent Dufour
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211105132923.1582514-1-npiggin@gmail.com
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 5ab44600c8d3..b81fc846d99c 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -864,12 +864,13 @@ static int __init pseries_cpu_hotplug_init(void)
 	/* Processors can be added/removed only on LPAR */
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
 		for_each_node(node) {
-			alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]);
+			if (!alloc_cpumask_var_node(&node_recorded_ids_map[node],
+						    GFP_KERNEL, node))
+				return -ENOMEM;
 
 			/* Record ids of CPU added at boot time */
-			cpumask_or(node_recorded_ids_map[node],
-				   node_recorded_ids_map[node],
-				   cpumask_of_node(node));
+			cpumask_copy(node_recorded_ids_map[node],
+				     cpumask_of_node(node));
 		}
 
 		of_reconfig_notifier_register(&pseries_smp_nb);

From 18678591846d668649fbd4f87b4a4c470818d386 Mon Sep 17 00:00:00 2001
From: Sachin Sant
Date: Mon, 13 Dec 2021 22:12:23 +0530
Subject: [PATCH 186/240] selftests/powerpc: skip tests for unavailable mitigations.

The mitigation patching test iterates over a set of mitigations
irrespective of whether a certain mitigation is supported/available in
the kernel. This causes the following messages on a kernel where some
mitigations are unavailable:

    Spawned threads enabling/disabling mitigations ...
    cat: entry_flush: No such file or directory
    cat: uaccess_flush: No such file or directory
    Waiting for timeout ...
    OK

This patch adds a check for available mitigations in the kernel.

Reported-by: Nageswara R Sastry
Signed-off-by: Sachin Sant
Tested-by: Nageswara R Sastry
Reviewed-by: Russell Currey
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/163941374362.36967.18016981579099073379.sendpatchset@1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa
---
 .../selftests/powerpc/security/mitigation-patching.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/security/mitigation-patching.sh b/tools/testing/selftests/powerpc/security/mitigation-patching.sh
index b0b20e0b4e30..f43aa4b77fba 100755
--- a/tools/testing/selftests/powerpc/security/mitigation-patching.sh
+++ b/tools/testing/selftests/powerpc/security/mitigation-patching.sh
@@ -44,7 +44,10 @@ mitigations="barrier_nospec stf_barrier count_cache_flush rfi_flush entry_flush
 
 for m in $mitigations
 do
-    do_one "$m" &
+    if [[ -f /sys/kernel/debug/powerpc/$m ]]
+    then
+        do_one "$m" &
+    fi
 done
 
 echo "Spawned threads enabling/disabling mitigations ..."

From 2fe4ca6ad7f6a0b98f97c498320051e5066e4b95 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET
Date: Fri, 17 Dec 2021 22:54:12 +0100
Subject: [PATCH 187/240] powerpc/mpic: Use bitmap_zalloc() when applicable

'mpic->protected' is a bitmap. So use 'bitmap_zalloc()' to simplify the
code and improve the semantics, instead of hand-writing it.
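For reference, the equivalence being relied on: bitmap_zalloc() allocates and zeroes BITS_TO_LONGS(nbits) longs, so the two forms below request the same zeroed storage, with the bitmap API documenting intent and pairing naturally with bitmap_free():

	/* Open-coded form being removed: */
	mpic->protected = kcalloc(BITS_TO_LONGS(intvec_top + 1),
				  sizeof(long), GFP_KERNEL);

	/* bitmap API form: same size and zeroing, clearer intent: */
	mpic->protected = bitmap_zalloc(intvec_top + 1, GFP_KERNEL);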
Signed-off-by: Christophe JAILLET
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/aa145f674e08044c98f13f1a985faa9cc29c3708.1639777976.git.christophe.jaillet@wanadoo.fr
---
 arch/powerpc/sysdev/mpic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 995fb2ada507..626ba4a9f64f 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -1323,8 +1323,7 @@ struct mpic * __init mpic_alloc(struct device_node *node,
 	psrc = of_get_property(mpic->node, "protected-sources", &psize);
 	if (psrc) {
 		/* Allocate a bitmap with one bit per interrupt */
-		unsigned int mapsize = BITS_TO_LONGS(intvec_top + 1);
-		mpic->protected = kcalloc(mapsize, sizeof(long), GFP_KERNEL);
+		mpic->protected = bitmap_zalloc(intvec_top + 1, GFP_KERNEL);
 		BUG_ON(mpic->protected == NULL);
 		for (i = 0; i < psize/sizeof(u32); i++) {
 			if (psrc[i] > intvec_top)

From a605b39e8ef703828b9e26750ea1925a6a5ef848 Mon Sep 17 00:00:00 2001
From: Yang Guang
Date: Sat, 18 Dec 2021 09:59:17 +0800
Subject: [PATCH 188/240] powerpc: use swap() to make code cleaner

Use the macro 'swap()' defined in 'include/linux/minmax.h' to avoid
open-coding it.

Reported-by: Zeal Robot
Signed-off-by: David Yang
Signed-off-by: Yang Guang
[mpe: Add include of linux/minmax.h]
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/71a702c2189b16c152affd8a8cda1d84ce32741c.1639792543.git.yang.guang5@zte.com.cn
---
 arch/powerpc/platforms/powermac/pic.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 4921bccf0376..bb0566633af5 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -311,11 +312,8 @@ static void __init pmac_pic_probe_oldstyle(void)
 
 	/* Check ordering of master & slave */
 	if (of_device_is_compatible(master, "gatwick")) {
-		struct device_node *tmp;
 		BUG_ON(slave == NULL);
-		tmp = master;
-		master = slave;
-		slave = tmp;
+		swap(master, slave);
 	}
 
 	/* We found a slave */

From 467ba14e1660b52a2f9338b484704c461bd23019 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Thu, 16 Dec 2021 20:33:42 +1000
Subject: [PATCH 189/240] powerpc/64s/radix: Fix huge vmap false positive

pmd_huge() is defined as false when HUGETLB_PAGE is not configured, but
the vmap code still installs huge PMDs. This leads to false bad PMD
errors when vunmapping, because the entry is not seen as a huge PTE and
the bad PMD check catches it. The end result may not be much more
serious than some bad pmd warning messages, because
pmd_none_or_clear_bad() does what we want and clears the huge PTE
anyway.

Fix this by checking pmd_is_leaf(), which checks for a PTE regardless
of config options. The whole huge/large/leaf stuff is a tangled mess,
but that's kernel-wide and not something we can improve much in
arch/powerpc code.

pmd_page(), pud_page(), etc., called by vmalloc_to_page() on huge vmaps
can similarly trigger a false VM_BUG_ON when CONFIG_HUGETLB_PAGE=n, so
those checks are adjusted. The checks were added by commit d6eacedd1f0e
("powerpc/book3s: Use config independent helpers for page table walk"),
while implementing a similar fix for other page table walking
functions.
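The config dependence behind the bug, sketched (the stub is roughly what generic code provides when CONFIG_HUGETLB_PAGE is off, while pmd_is_leaf() tests the PTE bit directly and so works in any configuration; the radix book3s/64 definition is shown purely as an illustration):

#ifndef CONFIG_HUGETLB_PAGE
/* Roughly the generic stub: every PMD reports "not huge", including a
 * huge vmap PMD, which is what produced the false bad-PMD warnings. */
#define pmd_huge(x)	0
#endif

/* Config-independent leaf test on book3s/64: */
static inline bool pmd_is_leaf(pmd_t pmd)
{
	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
}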
Fixes: d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
Cc: stable@vger.kernel.org # v5.3+
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211216103342.609192-1-npiggin@gmail.com
---
 arch/powerpc/mm/book3s64/radix_pgtable.c |  4 ++--
 arch/powerpc/mm/pgtable_64.c             | 14 +++++++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 3c4f0ebe5df8..ca23f5d1883a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1076,7 +1076,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 
 int pud_clear_huge(pud_t *pud)
 {
-	if (pud_huge(*pud)) {
+	if (pud_is_leaf(*pud)) {
 		pud_clear(pud);
 		return 1;
 	}
@@ -1123,7 +1123,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 
 int pmd_clear_huge(pmd_t *pmd)
 {
-	if (pmd_huge(*pmd)) {
+	if (pmd_is_leaf(*pmd)) {
 		pmd_clear(pmd);
 		return 1;
 	}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 78c8cf01db5f..175aabf101e8 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -102,7 +102,8 @@ EXPORT_SYMBOL(__pte_frag_size_shift);
 struct page *p4d_page(p4d_t p4d)
 {
 	if (p4d_is_leaf(p4d)) {
-		VM_WARN_ON(!p4d_huge(p4d));
+		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+			VM_WARN_ON(!p4d_huge(p4d));
 		return pte_page(p4d_pte(p4d));
 	}
 	return virt_to_page(p4d_pgtable(p4d));
@@ -112,7 +113,8 @@ struct page *p4d_page(p4d_t p4d)
 struct page *pud_page(pud_t pud)
 {
 	if (pud_is_leaf(pud)) {
-		VM_WARN_ON(!pud_huge(pud));
+		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+			VM_WARN_ON(!pud_huge(pud));
 		return pte_page(pud_pte(pud));
 	}
 	return virt_to_page(pud_pgtable(pud));
@@ -125,7 +127,13 @@ struct page *pud_page(pud_t pud)
 struct page *pmd_page(pmd_t pmd)
 {
 	if (pmd_is_leaf(pmd)) {
-		VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
+		/*
+		 * vmalloc_to_page may be called on any vmap address (not only
+		 * vmalloc), and it uses pmd_page() etc., when huge vmap is
+		 * enabled so these checks can't be used.
+		 */
+		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+			VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
 		return pte_page(pmd_pte(pmd));
 	}
 	return virt_to_page(pmd_page_vaddr(pmd));

From 30e120e6a9d247cec7effd55fd6783d5c619ed4c Mon Sep 17 00:00:00 2001
From: Minghao Chi
Date: Wed, 15 Dec 2021 06:04:38 +0000
Subject: [PATCH 190/240] ocxl: remove redundant rc variable

Return the value from ocxl_context_attach() directly instead of storing
it in a redundant local variable.
Reported-by: Zeal Robot Signed-off-by: Minghao Chi Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211215060438.441918-1-chi.minghao@zte.com.cn --- drivers/misc/ocxl/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index e70525eedaae..d881f5e40ad9 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -74,7 +74,6 @@ static long afu_ioctl_attach(struct ocxl_context *ctx, { struct ocxl_ioctl_attach arg; u64 amr = 0; - int rc; pr_debug("%s for context %d\n", __func__, ctx->pasid); @@ -86,8 +85,7 @@ static long afu_ioctl_attach(struct ocxl_context *ctx, return -EINVAL; amr = arg.amr & mfspr(SPRN_UAMOR); - rc = ocxl_context_attach(ctx, amr, current->mm); - return rc; + return ocxl_context_attach(ctx, amr, current->mm); } static long afu_ioctl_get_metadata(struct ocxl_context *ctx, From 9cbbe6bae938dd335a5092b0ce41f88cb39ba40c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 17 Dec 2021 16:14:00 -0600 Subject: [PATCH 191/240] powerpc/dts: Remove "spidev" nodes "spidev" is not a real device, but a Linux implementation detail. It has never been documented either. The kernel has WARNed on the use of it for over 6 years. Time to remove its usage from the tree. Signed-off-by: Rob Herring Reviewed-by: Mark Brown Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211217221400.3667133-1-robh@kernel.org --- arch/powerpc/boot/dts/digsy_mtc.dts | 8 -------- arch/powerpc/boot/dts/o2d.dtsi | 6 ------ 2 files changed, 14 deletions(-) diff --git a/arch/powerpc/boot/dts/digsy_mtc.dts b/arch/powerpc/boot/dts/digsy_mtc.dts index 57024a4c1e7d..dfaf974c0ce6 100644 --- a/arch/powerpc/boot/dts/digsy_mtc.dts +++ b/arch/powerpc/boot/dts/digsy_mtc.dts @@ -25,14 +25,6 @@ status = "disabled"; }; - spi@f00 { - msp430@0 { - compatible = "spidev"; - spi-max-frequency = <32000>; - reg = <0>; - }; - }; - psc@2000 { // PSC1 status = "disabled"; }; diff --git a/arch/powerpc/boot/dts/o2d.dtsi b/arch/powerpc/boot/dts/o2d.dtsi index b55a9e5bd828..7e52509fa506 100644 --- a/arch/powerpc/boot/dts/o2d.dtsi +++ b/arch/powerpc/boot/dts/o2d.dtsi @@ -34,12 +34,6 @@ #address-cells = <1>; #size-cells = <0>; cell-index = <0>; - - spidev@0 { - compatible = "spidev"; - spi-max-frequency = <250000>; - reg = <0>; - }; }; psc@2200 { // PSC2 From a8968521cfdc3e339fe69473d6632e0aa8d7202a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 9 Dec 2021 22:59:44 +1100 Subject: [PATCH 192/240] selftests/powerpc: Add a test of sigreturning to the kernel We have a general signal fuzzer, sigfuz, which can modify the MSR & NIP before sigreturn. But the chance of it hitting a kernel address and also clearing MSR_PR is fairly slim. So add a specific test of sigreturn to a kernel address, both with and without attempting to clear MSR_PR (which the kernel must block). 
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211209115944.4062384-1-mpe@ellerman.id.au
---
 .../selftests/powerpc/signal/.gitignore       |   1 +
 .../testing/selftests/powerpc/signal/Makefile |   1 +
 .../powerpc/signal/sigreturn_kernel.c         | 132 ++++++++++++++++++
 3 files changed, 134 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/signal/sigreturn_kernel.c

diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore
index ce3375cd8e73..8f6c816099a4 100644
--- a/tools/testing/selftests/powerpc/signal/.gitignore
+++ b/tools/testing/selftests/powerpc/signal/.gitignore
@@ -4,3 +4,4 @@ signal_tm
 sigfuz
 sigreturn_vdso
 sig_sc_double_restart
+sigreturn_kernel
diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile
index d6ae54663aed..84e201572466 100644
--- a/tools/testing/selftests/powerpc/signal/Makefile
+++ b/tools/testing/selftests/powerpc/signal/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart
+TEST_GEN_PROGS += sigreturn_kernel
 
 CFLAGS += -maltivec
 $(OUTPUT)/signal_tm: CFLAGS += -mhtm
diff --git a/tools/testing/selftests/powerpc/signal/sigreturn_kernel.c b/tools/testing/selftests/powerpc/signal/sigreturn_kernel.c
new file mode 100644
index 000000000000..0a1b6e591eee
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/sigreturn_kernel.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that we can't sigreturn to kernel addresses, or to kernel mode.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+#define MSR_PR (1ul << 14)
+
+static volatile unsigned long long sigreturn_addr;
+static volatile unsigned long long sigreturn_msr_mask;
+
+static void sigusr1_handler(int signo, siginfo_t *si, void *uc_ptr)
+{
+	ucontext_t *uc = (ucontext_t *)uc_ptr;
+
+	if (sigreturn_addr)
+		UCONTEXT_NIA(uc) = sigreturn_addr;
+
+	if (sigreturn_msr_mask)
+		UCONTEXT_MSR(uc) &= sigreturn_msr_mask;
+}
+
+static pid_t fork_child(void)
+{
+	pid_t pid;
+
+	pid = fork();
+	if (pid == 0) {
+		raise(SIGUSR1);
+		exit(0);
+	}
+
+	return pid;
+}
+
+static int expect_segv(pid_t pid)
+{
+	int child_ret;
+
+	waitpid(pid, &child_ret, 0);
+	FAIL_IF(WIFEXITED(child_ret));
+	FAIL_IF(!WIFSIGNALED(child_ret));
+	FAIL_IF(WTERMSIG(child_ret) != 11);
+
+	return 0;
+}
+
+int test_sigreturn_kernel(void)
+{
+	struct sigaction act;
+	int child_ret, i;
+	pid_t pid;
+
+	act.sa_sigaction = sigusr1_handler;
+	act.sa_flags = SA_SIGINFO;
+	sigemptyset(&act.sa_mask);
+
+	FAIL_IF(sigaction(SIGUSR1, &act, NULL));
+
+	for (i = 0; i < 2; i++) {
+		// Return to kernel
+		sigreturn_addr = 0xcull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to kernel virtual
+		sigreturn_addr = 0xc008ull << 48;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return out of range
+		sigreturn_addr = 0xc010ull << 48;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to no-man's land, just below PAGE_OFFSET
+		sigreturn_addr = (0xcull << 60) - (64 * 1024);
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to no-man's land, above TASK_SIZE_4PB
+		sigreturn_addr = 0x1ull << 52;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to 0xd space
+		sigreturn_addr = 0xdull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to 0xe space
+		sigreturn_addr = 0xeull << 60;
+		pid = fork_child();
+		expect_segv(pid);
+
+		// Return to 0xf space
sigreturn_addr = 0xfull << 60; + pid = fork_child(); + expect_segv(pid); + + // Attempt to set PR=0 for 2nd loop (should be blocked by kernel) + sigreturn_msr_mask = ~MSR_PR; + } + + printf("All children killed as expected\n"); + + // Don't change address, just MSR, should return to user as normal + sigreturn_addr = 0; + sigreturn_msr_mask = ~MSR_PR; + pid = fork_child(); + waitpid(pid, &child_ret, 0); + FAIL_IF(!WIFEXITED(child_ret)); + FAIL_IF(WIFSIGNALED(child_ret)); + FAIL_IF(WEXITSTATUS(child_ret) != 0); + + return 0; +} + +int main(void) +{ + return test_harness(test_sigreturn_kernel, "sigreturn_kernel"); +} From d276960d9296b6a9074795fe60a513abf8474e35 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:16 -0500 Subject: [PATCH 193/240] powerpc/kernel: Add __init attribute to eligible functions Some functions defined in `arch/powerpc/kernel` (and one in `arch/powerpc/ kexec`) are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-2-nick.child@ibm.com --- arch/powerpc/include/asm/btext.h | 10 +++++----- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/include/asm/fadump-internal.h | 6 +++--- arch/powerpc/include/asm/kexec.h | 2 +- arch/powerpc/include/asm/kvm_guest.h | 2 +- arch/powerpc/include/asm/pci.h | 2 +- arch/powerpc/include/asm/setup.h | 4 ++-- arch/powerpc/include/asm/udbg.h | 8 ++++---- arch/powerpc/kernel/btext.c | 12 ++++++------ arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +- arch/powerpc/kernel/eeh_cache.c | 2 +- arch/powerpc/kernel/fadump.c | 18 +++++++++--------- arch/powerpc/kernel/nvram_64.c | 6 +++--- arch/powerpc/kernel/pci-common.c | 2 +- arch/powerpc/kernel/pci_32.c | 4 ++-- arch/powerpc/kernel/prom.c | 4 ++-- arch/powerpc/kernel/prom_init.c | 12 ++++++------ arch/powerpc/kernel/rtasd.c | 6 +++--- arch/powerpc/kernel/security.c | 4 ++-- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kernel/smp.c | 5 +++-- arch/powerpc/kernel/sysfs.c | 10 +++++----- arch/powerpc/kernel/udbg_16550.c | 10 +++++----- arch/powerpc/kexec/core.c | 2 +- 24 files changed, 69 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/include/asm/btext.h b/arch/powerpc/include/asm/btext.h index 461b0f193864..860f8868f11e 100644 --- a/arch/powerpc/include/asm/btext.h +++ b/arch/powerpc/include/asm/btext.h @@ -23,12 +23,12 @@ extern void btext_unmap(void); extern void btext_drawchar(char c); extern void btext_drawstring(const char *str); -extern void btext_drawhex(unsigned long v); -extern void btext_drawtext(const char *c, unsigned int len); +void __init btext_drawhex(unsigned long v); +void __init btext_drawtext(const char *c, unsigned int len); -extern void btext_clearscreen(void); -extern void btext_flushscreen(void); -extern void btext_flushline(void); +void __init btext_clearscreen(void); +void __init btext_flushscreen(void); +void __init btext_flushline(void); #endif /* __KERNEL__ */ #endif /* __PPC_BTEXT_H */ diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index b1a5bba2e0b9..bd513fd49be9 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -460,7 +460,7 @@ static inline void eeh_readsl(const volatile void __iomem *addr, void * buf, } -void eeh_cache_debugfs_init(void); +void __init eeh_cache_debugfs_init(void); #endif /* 
CONFIG_PPC64 */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 8d61c8f3fec4..52189928ec08 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -137,10 +137,10 @@ struct fadump_ops { }; /* Helper functions */ -s32 fadump_setup_cpu_notes_buf(u32 num_cpus); +s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus); void fadump_free_cpu_notes_buf(void); -u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs); -void fadump_update_elfcore_header(char *bufp); +u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs); +void __init fadump_update_elfcore_header(char *bufp); bool is_fadump_boot_mem_contiguous(void); bool is_fadump_reserved_mem_contiguous(void); diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index c6f250eca3fb..8ebdd23d987c 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -84,7 +84,7 @@ extern int crash_shutdown_register(crash_shutdown_t handler); extern int crash_shutdown_unregister(crash_shutdown_t handler); extern void crash_kexec_secondary(struct pt_regs *regs); -extern int overlaps_crashkernel(unsigned long start, unsigned long size); +int __init overlaps_crashkernel(unsigned long start, unsigned long size); extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); diff --git a/arch/powerpc/include/asm/kvm_guest.h b/arch/powerpc/include/asm/kvm_guest.h index c63105d2c9e7..68e499abdb24 100644 --- a/arch/powerpc/include/asm/kvm_guest.h +++ b/arch/powerpc/include/asm/kvm_guest.h @@ -16,7 +16,7 @@ static inline bool is_kvm_guest(void) return static_branch_unlikely(&kvm_guest); } -int check_kvm_guest(void); +int __init check_kvm_guest(void); #else static inline bool is_kvm_guest(void) { return false; } static inline int check_kvm_guest(void) { return 0; } diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index d1f53260725c..915d6ee4b40a 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -48,7 +48,7 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) } #ifdef CONFIG_PCI -extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops); +void __init set_pci_dma_ops(const struct dma_map_ops *dma_ops); #else /* CONFIG_PCI */ #define set_pci_dma_ops(d) #endif diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 6c1a7d217d1a..cff58db6130f 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -55,7 +55,7 @@ void setup_entry_flush(bool enable); void setup_uaccess_flush(bool enable); void do_rfi_flush_fixups(enum l1d_flush_type types); #ifdef CONFIG_PPC_BARRIER_NOSPEC -void setup_barrier_nospec(void); +void __init setup_barrier_nospec(void); #else static inline void setup_barrier_nospec(void) { } #endif @@ -71,7 +71,7 @@ static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void #endif #ifdef CONFIG_PPC_FSL_BOOK3E -void setup_spectre_v2(void); +void __init setup_spectre_v2(void); #else static inline void setup_spectre_v2(void) {} #endif diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 0ea9e70ed78b..5aec53f2dae0 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -23,11 +23,11 @@ extern void udbg_printf(const char *fmt, ...) 
__attribute__ ((format (printf, 1, 2))); extern void udbg_progress(char *s, unsigned short hex); -extern void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride); -extern void udbg_uart_init_pio(unsigned long port, unsigned int stride); +void __init udbg_uart_init_mmio(void __iomem *addr, unsigned int stride); +void __init udbg_uart_init_pio(unsigned long port, unsigned int stride); -extern void udbg_uart_setup(unsigned int speed, unsigned int clock); -extern unsigned int udbg_probe_uart_speed(unsigned int clock); +void __init udbg_uart_setup(unsigned int speed, unsigned int clock); +unsigned int __init udbg_probe_uart_speed(unsigned int clock); struct device_node; extern void udbg_scc_init(int force_scc); diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 1cffb5e7c38d..9d9d56b574cc 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -161,7 +161,7 @@ void btext_map(void) boot_text_mapped = 1; } -static int btext_initialize(struct device_node *np) +static int __init btext_initialize(struct device_node *np) { unsigned int width, height, depth, pitch; unsigned long address = 0; @@ -292,7 +292,7 @@ void btext_update_display(unsigned long phys, int width, int height, } EXPORT_SYMBOL(btext_update_display); -void btext_clearscreen(void) +void __init btext_clearscreen(void) { unsigned int *base = (unsigned int *)calc_base(0, 0); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * @@ -310,7 +310,7 @@ void btext_clearscreen(void) rmci_maybe_off(); } -void btext_flushscreen(void) +void __init btext_flushscreen(void) { unsigned int *base = (unsigned int *)calc_base(0, 0); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * @@ -329,7 +329,7 @@ void btext_flushscreen(void) __asm__ __volatile__ ("sync" ::: "memory"); } -void btext_flushline(void) +void __init btext_flushline(void) { unsigned int *base = (unsigned int *)calc_base(0, g_loc_Y << 4); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * @@ -544,7 +544,7 @@ void btext_drawstring(const char *c) btext_drawchar(*c++); } -void btext_drawtext(const char *c, unsigned int len) +void __init btext_drawtext(const char *c, unsigned int len) { if (!boot_text_mapped) return; @@ -552,7 +552,7 @@ void btext_drawtext(const char *c, unsigned int len) btext_drawchar(*c++); } -void btext_drawhex(unsigned long v) +void __init btext_drawhex(unsigned long v) { if (!boot_text_mapped) return; diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1ac8d7357195..7d1b2c4a4891 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -344,7 +344,7 @@ static int __init feat_enable_dscr(struct dt_cpu_feature *f) return 1; } -static void hfscr_pmu_enable(void) +static void __init hfscr_pmu_enable(void) { u64 hfscr = mfspr(SPRN_HFSCR); hfscr |= PPC_BIT(60); diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 9bdaaf7fddc9..2f9dbf8ad2ee 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -280,7 +280,7 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) } DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); -void eeh_cache_debugfs_init(void) +void __init eeh_cache_debugfs_init(void) { debugfs_create_file_unsafe("eeh_address_cache", 0400, arch_debugfs_dir, NULL, diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 60f5fc14aa23..d03e488cfe9c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -251,7 
+251,7 @@ bool is_fadump_reserved_mem_contiguous(void) } /* Print firmware assisted dump configurations for debugging purpose. */ -static void fadump_show_config(void) +static void __init fadump_show_config(void) { int i; @@ -353,7 +353,7 @@ static __init u64 fadump_calculate_reserve_size(void) * Calculate the total memory size required to be reserved for * firmware-assisted dump registration. */ -static unsigned long get_fadump_area_size(void) +static unsigned long __init get_fadump_area_size(void) { unsigned long size = 0; @@ -462,7 +462,7 @@ static int __init fadump_get_boot_mem_regions(void) * with the given memory range. * False, otherwise. */ -static bool overlaps_reserved_ranges(u64 base, u64 end, int *idx) +static bool __init overlaps_reserved_ranges(u64 base, u64 end, int *idx) { bool ret = false; int i; @@ -737,7 +737,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) fw_dump.ops->fadump_trigger(fdh, str); } -u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -752,7 +752,7 @@ u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) return buf; } -void fadump_update_elfcore_header(char *bufp) +void __init fadump_update_elfcore_header(char *bufp) { struct elf_phdr *phdr; @@ -770,7 +770,7 @@ void fadump_update_elfcore_header(char *bufp) return; } -static void *fadump_alloc_buffer(unsigned long size) +static void *__init fadump_alloc_buffer(unsigned long size) { unsigned long count, i; struct page *page; @@ -792,7 +792,7 @@ static void fadump_free_buffer(unsigned long vaddr, unsigned long size) free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL); } -s32 fadump_setup_cpu_notes_buf(u32 num_cpus) +s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus) { /* Allocate buffer to hold cpu crash notes. */ fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); @@ -1447,7 +1447,7 @@ static ssize_t release_mem_store(struct kobject *kobj, } /* Release the reserved memory and disable the FADump */ -static void unregister_fadump(void) +static void __init unregister_fadump(void) { fadump_cleanup(); fadump_release_memory(fw_dump.reserve_dump_area_start, @@ -1547,7 +1547,7 @@ ATTRIBUTE_GROUPS(fadump); DEFINE_SHOW_ATTRIBUTE(fadump_region); -static void fadump_init_files(void) +static void __init fadump_init_files(void) { int rc = 0; diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 3c8d9bbb51cf..0d9f9cd41e13 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -540,7 +540,7 @@ static struct pstore_info nvram_pstore_info = { .write = nvram_pstore_write, }; -static int nvram_pstore_init(void) +static int __init nvram_pstore_init(void) { int rc = 0; @@ -562,7 +562,7 @@ static int nvram_pstore_init(void) return rc; } #else -static int nvram_pstore_init(void) +static int __init nvram_pstore_init(void) { return -1; } @@ -755,7 +755,7 @@ static unsigned char __init nvram_checksum(struct nvram_header *p) * Per the criteria passed via nvram_remove_partition(), should this * partition be removed? 
1=remove, 0=keep */ -static int nvram_can_remove_partition(struct nvram_partition *part, +static int __init nvram_can_remove_partition(struct nvram_partition *part, const char *name, int sig, const char *exceptions[]) { if (part->header.signature != sig) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 6749905932f4..8bc9cf62cd93 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL(isa_mem_base); static const struct dma_map_ops *pci_dma_ops; -void set_pci_dma_ops(const struct dma_map_ops *dma_ops) +void __init set_pci_dma_ops(const struct dma_map_ops *dma_ops) { pci_dma_ops = dma_ops; } diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index b49e1060a3bf..48537964fba1 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -37,7 +37,7 @@ int pcibios_assign_bus_offset = 1; EXPORT_SYMBOL(isa_io_base); EXPORT_SYMBOL(pci_dram_offset); -void pcibios_make_OF_bus_map(void); +void __init pcibios_make_OF_bus_map(void); static void fixup_cpc710_pci64(struct pci_dev* dev); static u8* pci_to_OF_bus_map; @@ -109,7 +109,7 @@ make_one_node_map(struct device_node* node, u8 pci_bus) } } -void +void __init pcibios_make_OF_bus_map(void) { int i; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 75678ff04dd7..4c4a047f691c 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -447,7 +447,7 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node, */ #ifdef CONFIG_SPARSEMEM -static bool validate_mem_limit(u64 base, u64 *size) +static bool __init validate_mem_limit(u64 base, u64 *size) { u64 max_mem = 1UL << (MAX_PHYSMEM_BITS); @@ -458,7 +458,7 @@ static bool validate_mem_limit(u64 base, u64 *size) return true; } #else -static bool validate_mem_limit(u64 base, u64 *size) +static bool __init validate_mem_limit(u64 base, u64 *size) { return true; } diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f845065c860e..0ac5faacc909 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -672,7 +672,7 @@ static inline int __init prom_getproplen(phandle node, const char *pname) return call_prom("getproplen", 2, 1, node, ADDR(pname)); } -static void add_string(char **str, const char *q) +static void __init add_string(char **str, const char *q) { char *p = *str; @@ -682,7 +682,7 @@ static void add_string(char **str, const char *q) *str = p; } -static char *tohex(unsigned int x) +static char *__init tohex(unsigned int x) { static const char digits[] __initconst = "0123456789abcdef"; static char result[9] __prombss; @@ -728,7 +728,7 @@ static int __init prom_setprop(phandle node, const char *nodename, #define prom_islower(c) ('a' <= (c) && (c) <= 'z') #define prom_toupper(c) (prom_islower(c) ? 
((c) - 'a' + 'A') : (c)) -static unsigned long prom_strtoul(const char *cp, const char **endp) +static unsigned long __init prom_strtoul(const char *cp, const char **endp) { unsigned long result = 0, base = 10, value; @@ -753,7 +753,7 @@ static unsigned long prom_strtoul(const char *cp, const char **endp) return result; } -static unsigned long prom_memparse(const char *ptr, const char **retptr) +static unsigned long __init prom_memparse(const char *ptr, const char **retptr) { unsigned long ret = prom_strtoul(ptr, retptr); int shift = 0; @@ -1786,7 +1786,7 @@ static void __init prom_close_stdin(void) } #ifdef CONFIG_PPC_SVM -static int prom_rtas_hcall(uint64_t args) +static int __init prom_rtas_hcall(uint64_t args) { register uint64_t arg1 asm("r3") = H_RTAS; register uint64_t arg2 asm("r4") = args; @@ -3248,7 +3248,7 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) /* * Perform the Enter Secure Mode ultracall. */ -static int enter_secure_mode(unsigned long kbase, unsigned long fdt) +static int __init enter_secure_mode(unsigned long kbase, unsigned long fdt) { register unsigned long r3 asm("r3") = UV_ESM; register unsigned long r4 asm("r4") = kbase; diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 32ee17753eb4..cf0f42909ddf 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -455,7 +455,7 @@ static void rtas_event_scan(struct work_struct *w) } #ifdef CONFIG_PPC64 -static void retrieve_nvram_error_log(void) +static void __init retrieve_nvram_error_log(void) { unsigned int err_type ; int rc ; @@ -473,12 +473,12 @@ static void retrieve_nvram_error_log(void) } } #else /* CONFIG_PPC64 */ -static void retrieve_nvram_error_log(void) +static void __init retrieve_nvram_error_log(void) { } #endif /* CONFIG_PPC64 */ -static void start_event_scan(void) +static void __init start_event_scan(void) { printk(KERN_DEBUG "RTAS daemon started\n"); pr_debug("rtasd: will sleep for %d milliseconds\n", diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 15fb5ea1b9ea..e159d4093d98 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -44,7 +44,7 @@ static void enable_barrier_nospec(bool enable) do_barrier_nospec_fixups(enable); } -void setup_barrier_nospec(void) +void __init setup_barrier_nospec(void) { bool enable; @@ -132,7 +132,7 @@ early_param("nospectre_v2", handle_nospectre_v2); #endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_FSL_BOOK3E -void setup_spectre_v2(void) +void __init setup_spectre_v2(void) { if (no_spectrev2 || cpu_mitigations_off()) do_btb_flush_fixups(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 703a2e6ab08d..d87f7c1103ce 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -499,7 +499,7 @@ void smp_release_cpus(void) * routines and/or provided to userland */ -static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize, +static void __init init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize, u32 bsize, u32 sets) { info->size = size; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index c338f9d8ab37..b7fd6a72aa76 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -936,7 +936,8 @@ out: return tg; } -static int update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, int cpu, int cpu_group_start) +static int __init update_mask_from_threadgroup(cpumask_var_t *mask, struct 
thread_groups *tg, + int cpu, int cpu_group_start) { int first_thread = cpu_first_thread_sibling(cpu); int i; @@ -1682,7 +1683,7 @@ int setup_profiling_timer(unsigned int multiplier) } #endif -static void fixup_topology(void) +static void __init fixup_topology(void) { int i; diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 08d8072d6e7a..d45a415d5374 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -214,7 +214,7 @@ static ssize_t __used store_dscr_default(struct device *dev, static DEVICE_ATTR(dscr_default, 0600, show_dscr_default, store_dscr_default); -static void sysfs_create_dscr_default(void) +static void __init sysfs_create_dscr_default(void) { if (cpu_has_feature(CPU_FTR_DSCR)) { int cpu; @@ -744,12 +744,12 @@ static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char } static DEVICE_ATTR(svm, 0444, show_svm, NULL); -static void create_svm_file(void) +static void __init create_svm_file(void) { device_create_file(cpu_subsys.dev_root, &dev_attr_svm); } #else -static void create_svm_file(void) +static void __init create_svm_file(void) { } #endif /* CONFIG_PPC_SVM */ @@ -1110,7 +1110,7 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group); /* NUMA stuff */ #ifdef CONFIG_NUMA -static void register_nodes(void) +static void __init register_nodes(void) { int i; @@ -1134,7 +1134,7 @@ void sysfs_remove_device_from_node(struct device *dev, int nid) EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); #else -static void register_nodes(void) +static void __init register_nodes(void) { return; } diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index 8513aa49614e..d3942de254c6 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -84,7 +84,7 @@ static int udbg_uart_getc(void) return udbg_uart_in(UART_RBR); } -static void udbg_use_uart(void) +static void __init udbg_use_uart(void) { udbg_putc = udbg_uart_putc; udbg_flush = udbg_uart_flush; @@ -92,7 +92,7 @@ static void udbg_use_uart(void) udbg_getc_poll = udbg_uart_getc_poll; } -void udbg_uart_setup(unsigned int speed, unsigned int clock) +void __init udbg_uart_setup(unsigned int speed, unsigned int clock) { unsigned int dll, base_bauds; @@ -121,7 +121,7 @@ void udbg_uart_setup(unsigned int speed, unsigned int clock) udbg_uart_out(UART_FCR, 0x7); } -unsigned int udbg_probe_uart_speed(unsigned int clock) +unsigned int __init udbg_probe_uart_speed(unsigned int clock) { unsigned int dll, dlm, divisor, prescaler, speed; u8 old_lcr; @@ -172,7 +172,7 @@ static void udbg_uart_out_pio(unsigned int reg, u8 data) outb(data, udbg_uart.pio_base + (reg * udbg_uart_stride)); } -void udbg_uart_init_pio(unsigned long port, unsigned int stride) +void __init udbg_uart_init_pio(unsigned long port, unsigned int stride) { if (!port) return; @@ -194,7 +194,7 @@ static void udbg_uart_out_mmio(unsigned int reg, u8 data) } -void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) +void __init udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) { if (!addr) return; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index a2242017e55f..8b68d9f91a03 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -185,7 +185,7 @@ void __init reserve_crashkernel(void) } } -int overlaps_crashkernel(unsigned long start, unsigned long size) +int __init overlaps_crashkernel(unsigned long start, unsigned long size) { return (start + size) > crashk_res.start && start <= crashk_res.end; } From 
ce0c6be9c69883df38e7631d1d7364b52f6db135 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:17 -0500 Subject: [PATCH 194/240] powerpc/lib: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/lib' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-3-nick.child@ibm.com --- arch/powerpc/include/asm/setup.h | 2 +- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/lib/feature-fixups.c | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index cff58db6130f..607e42b8cbf0 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -75,7 +75,7 @@ void __init setup_spectre_v2(void); #else static inline void setup_spectre_v2(void) {} #endif -void do_btb_flush_fixups(void); +void __init do_btb_flush_fixups(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 312324a26df3..ee54cb447f80 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -397,7 +397,7 @@ void __patch_exception(int exc, unsigned long addr) #ifdef CONFIG_CODE_PATCHING_SELFTEST -static int instr_is_branch_to_addr(const u32 *instr, unsigned long addr) +static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr) { if (instr_is_branch_iform(ppc_inst_read(instr)) || instr_is_branch_bform(ppc_inst_read(instr))) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 57c6bb802f6c..343a78826035 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -580,7 +580,7 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_ printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); } -static void patch_btb_flush_section(long *curr) +static void __init patch_btb_flush_section(long *curr) { unsigned int *start, *end; @@ -592,7 +592,7 @@ static void patch_btb_flush_section(long *curr) } } -void do_btb_flush_fixups(void) +void __init do_btb_flush_fixups(void) { long *start, *end; @@ -621,7 +621,7 @@ void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } -static void do_final_fixups(void) +static void __init do_final_fixups(void) { #if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) ppc_inst_t inst; @@ -715,12 +715,12 @@ late_initcall(check_features); /* This must be after the text it fixes up, vmlinux.lds.S enforces that atm */ static struct fixup_entry fixup; -static long calc_offset(struct fixup_entry *entry, unsigned int *p) +static long __init calc_offset(struct fixup_entry *entry, unsigned int *p) { return (unsigned long)p - (unsigned long)entry; } -static void test_basic_patching(void) +static void __init test_basic_patching(void) { extern unsigned int ftr_fixup_test1[]; extern unsigned int end_ftr_fixup_test1[]; @@ -751,7 +751,7 @@ static void test_basic_patching(void) check(memcmp(ftr_fixup_test1, ftr_fixup_test1_expected, size) == 0); } -static void test_alternative_patching(void) +static void __init test_alternative_patching(void) { extern unsigned int ftr_fixup_test2[]; extern unsigned int end_ftr_fixup_test2[]; @@ 
-784,7 +784,7 @@ static void test_alternative_patching(void) check(memcmp(ftr_fixup_test2, ftr_fixup_test2_expected, size) == 0); } -static void test_alternative_case_too_big(void) +static void __init test_alternative_case_too_big(void) { extern unsigned int ftr_fixup_test3[]; extern unsigned int end_ftr_fixup_test3[]; @@ -810,7 +810,7 @@ static void test_alternative_case_too_big(void) check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); } -static void test_alternative_case_too_small(void) +static void __init test_alternative_case_too_small(void) { extern unsigned int ftr_fixup_test4[]; extern unsigned int end_ftr_fixup_test4[]; @@ -856,7 +856,7 @@ static void test_alternative_case_with_branch(void) check(memcmp(ftr_fixup_test5, ftr_fixup_test5_expected, size) == 0); } -static void test_alternative_case_with_external_branch(void) +static void __init test_alternative_case_with_external_branch(void) { extern unsigned int ftr_fixup_test6[]; extern unsigned int end_ftr_fixup_test6[]; @@ -866,7 +866,7 @@ static void test_alternative_case_with_external_branch(void) check(memcmp(ftr_fixup_test6, ftr_fixup_test6_expected, size) == 0); } -static void test_alternative_case_with_branch_to_end(void) +static void __init test_alternative_case_with_branch_to_end(void) { extern unsigned int ftr_fixup_test7[]; extern unsigned int end_ftr_fixup_test7[]; @@ -876,7 +876,7 @@ static void test_alternative_case_with_branch_to_end(void) check(memcmp(ftr_fixup_test7, ftr_fixup_test7_expected, size) == 0); } -static void test_cpu_macros(void) +static void __init test_cpu_macros(void) { extern u8 ftr_fixup_test_FTR_macros[]; extern u8 ftr_fixup_test_FTR_macros_expected[]; @@ -888,7 +888,7 @@ static void test_cpu_macros(void) ftr_fixup_test_FTR_macros_expected, size) == 0); } -static void test_fw_macros(void) +static void __init test_fw_macros(void) { #ifdef CONFIG_PPC64 extern u8 ftr_fixup_test_FW_FTR_macros[]; @@ -902,7 +902,7 @@ static void test_fw_macros(void) #endif } -static void test_lwsync_macros(void) +static void __init test_lwsync_macros(void) { extern u8 lwsync_fixup_test[]; extern u8 end_lwsync_fixup_test[]; From c13f2b2bb5afd90f152c389c1c9245a0d43bce80 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:18 -0500 Subject: [PATCH 195/240] powerpc/mm: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/mm' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
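To make the pattern concrete, here is a minimal sketch (hypothetical names, not code from this patch) of why such helpers qualify: a function reachable only from boot-time code can carry __init so its text lands in .init.text and is discarded once boot completes.

#include <linux/init.h>

/* Helper used only while parsing boot-time state; safe to discard later. */
static int __init example_parse_boot_info(void)
{
	return 0;
}

/* Boot-time caller: an __init function may freely call other __init code. */
static int __init example_subsys_setup(void)
{
	return example_parse_boot_info();
}
early_initcall(example_subsys_setup);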
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-4-nick.child@ibm.com --- arch/powerpc/include/asm/hugetlb.h | 2 +- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/mm/book3s32/mmu.c | 2 +- arch/powerpc/mm/book3s64/hash_utils.c | 6 +++--- arch/powerpc/mm/book3s64/hugetlbpage.c | 2 +- arch/powerpc/mm/book3s64/mmu_context.c | 2 +- arch/powerpc/mm/book3s64/pkeys.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++-- arch/powerpc/mm/nohash/44x.c | 4 ++-- arch/powerpc/mm/nohash/fsl_book3e.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 ++-- arch/powerpc/mm/numa.c | 6 +++--- arch/powerpc/mm/ptdump/ptdump.c | 2 +- 13 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index f18c543bc01d..962708fa1017 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -15,7 +15,7 @@ extern bool hugetlb_disabled; -void hugetlbpage_init_default(void); +void __init hugetlbpage_init_default(void); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len); diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index e46394d27785..fd277b15635c 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -71,7 +71,7 @@ static inline void switch_mmu_context(struct mm_struct *prev, } extern int hash__alloc_context_id(void); -extern void hash__reserve_context_id(int id); +void __init hash__reserve_context_id(int id); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 33ab63d56435..94045b265b6b 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -76,7 +76,7 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -static int find_free_bat(void) +static int __init find_free_bat(void) { int b; int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index eced266dc5e9..7abf82a698d3 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -662,7 +662,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, } #endif /* CONFIG_HUGETLB_PAGE */ -static void mmu_psize_set_default_penc(void) +static void __init mmu_psize_set_default_penc(void) { int bpsize, apsize; for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) @@ -672,7 +672,7 @@ static void mmu_psize_set_default_penc(void) #ifdef CONFIG_PPC_64K_PAGES -static bool might_have_hea(void) +static bool __init might_have_hea(void) { /* * The HEA ethernet adapter requires awareness of the @@ -743,7 +743,7 @@ static void __init htab_scan_page_sizes(void) * low-order N bits as the encoding for the 2^(12+N) byte page size * (if it exists). 
*/ -static void init_hpte_page_sizes(void) +static void __init init_hpte_page_sizes(void) { long int ap, bp; long int shift, penc; diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index 95b2a283fd6e..ea8f83afb0ae 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -150,7 +150,7 @@ void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } -void hugetlbpage_init_default(void) +void __init hugetlbpage_init_default(void) { /* Set default large page size. Currently, we pick 16M or 1M * depending on what is available diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 24aa953c9311..c766e4c26e42 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -32,7 +32,7 @@ static int alloc_context_id(int min_id, int max_id) } #ifdef CONFIG_PPC_64S_HASH_MMU -void hash__reserve_context_id(int id) +void __init hash__reserve_context_id(int id) { int result = ida_alloc_range(&mmu_context_ida, id, id, GFP_KERNEL); diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index a2d9ad138709..753e62ba67af 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -66,7 +66,7 @@ static int __init dt_scan_storage_keys(unsigned long node, return 1; } -static int scan_pkey_feature(void) +static int __init scan_pkey_feature(void) { int ret; int pkeys_total = 0; diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index ca23f5d1883a..def04631a74d 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -499,7 +499,7 @@ static int __init probe_memory_block_size(unsigned long node, const char *uname, return 1; } -static unsigned long radix_memory_block_size(void) +static unsigned long __init radix_memory_block_size(void) { unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE; @@ -517,7 +517,7 @@ static unsigned long radix_memory_block_size(void) #else /* CONFIG_MEMORY_HOTPLUG */ -static unsigned long radix_memory_block_size(void) +static unsigned long __init radix_memory_block_size(void) { return 1UL * 1024 * 1024 * 1024; } diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c index 796c824acc8c..1beae802bb1c 100644 --- a/arch/powerpc/mm/nohash/44x.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -38,7 +38,7 @@ int icache_44x_need_flush; unsigned long tlb_47x_boltmap[1024/8]; -static void ppc44x_update_tlb_hwater(void) +static void __init ppc44x_update_tlb_hwater(void) { /* The TLB miss handlers hard codes the watermark in a cmpli * instruction to improve performances rather than loading it @@ -122,7 +122,7 @@ static void __init ppc47x_update_boltmap(void) /* * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU */ -static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +static void __init ppc47x_pin_tlb(unsigned int virt, unsigned int phys) { unsigned int rA; int bolted; diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c index 7f71bc3bf85f..dfe715e0f70a 100644 --- a/arch/powerpc/mm/nohash/fsl_book3e.c +++ b/arch/powerpc/mm/nohash/fsl_book3e.c @@ -259,7 +259,7 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -static unsigned long tlbcam_sz(int idx) +static unsigned long __init tlbcam_sz(int idx) { return tlbcam_addrs[idx].limit - 
tlbcam_addrs[idx].start + 1; } diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 311281063d48..fd2c77af5c55 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -432,7 +432,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) } } -static void setup_page_sizes(void) +static void __init setup_page_sizes(void) { unsigned int tlb0cfg; unsigned int tlb0ps; @@ -570,7 +570,7 @@ out: } } -static void setup_mmu_htw(void) +static void __init setup_mmu_htw(void) { /* * If we want to use HW tablewalk, enable it by patching the TLB miss diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 59d3cfcd7887..9d5f710d2c20 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -134,7 +134,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, return 0; } -static void reset_numa_cpu_lookup_table(void) +static void __init reset_numa_cpu_lookup_table(void) { unsigned int cpu; @@ -372,7 +372,7 @@ void update_numa_distance(struct device_node *node) * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} */ -static void initialize_form2_numa_distance_lookup_table(void) +static void __init initialize_form2_numa_distance_lookup_table(void) { int i, j; struct device_node *root; @@ -581,7 +581,7 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } -static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) { struct assoc_arrays aa = { .arrays = NULL }; int default_nid = NUMA_NO_NODE; diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 031956d0ee84..473960e4b07a 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -315,7 +315,7 @@ static int ptdump_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump); -static void build_pgtable_complete_mask(void) +static void __init build_pgtable_complete_mask(void) { unsigned int i, j; From c49f5d88ff0166ffa4e48ee8ce84d63719f346be Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:19 -0500 Subject: [PATCH 196/240] powerpc/perf: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/perf' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
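As a hedged illustration of the header-side change (hypothetical names): the section placement is decided by the attribute on the definition, while repeating __init on the prototype documents the init-only contract for callers in other files, matching what internal.h does below.

/* foo_pmu.h -- prototype annotated so callers see the init-only contract. */
int __init foo_pmu_init(void);

/* foo_pmu.c -- the definition's attribute is what places the code in
 * .init.text; the function must not be reachable after boot. */
int __init foo_pmu_init(void)
{
	return 0;
}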
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-5-nick.child@ibm.com --- arch/powerpc/include/asm/perf_event_server.h | 2 +- arch/powerpc/perf/core-book3s.c | 2 +- arch/powerpc/perf/generic-compat-pmu.c | 2 +- arch/powerpc/perf/internal.h | 18 +++++++++--------- arch/powerpc/perf/power10-pmu.c | 2 +- arch/powerpc/perf/power5+-pmu.c | 2 +- arch/powerpc/perf/power5-pmu.c | 2 +- arch/powerpc/perf/power6-pmu.c | 2 +- arch/powerpc/perf/power7-pmu.c | 2 +- arch/powerpc/perf/power8-pmu.c | 2 +- arch/powerpc/perf/power9-pmu.c | 2 +- arch/powerpc/perf/ppc970-pmu.c | 2 +- 12 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index f4c3428e816b..e2221d29fdf9 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -98,7 +98,7 @@ struct power_pmu { #define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ #define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ -extern int register_power_pmu(struct power_pmu *); +int __init register_power_pmu(struct power_pmu *pmu); struct pt_regs; extern unsigned long perf_misc_flags(struct pt_regs *regs); diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 07fd61a8d59d..a684901b6965 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2479,7 +2479,7 @@ static int power_pmu_prepare_cpu(unsigned int cpu) return 0; } -int register_power_pmu(struct power_pmu *pmu) +int __init register_power_pmu(struct power_pmu *pmu) { if (ppmu) return -EBUSY; /* something's already registered */ diff --git a/arch/powerpc/perf/generic-compat-pmu.c b/arch/powerpc/perf/generic-compat-pmu.c index 695975227e60..b6e25f75109d 100644 --- a/arch/powerpc/perf/generic-compat-pmu.c +++ b/arch/powerpc/perf/generic-compat-pmu.c @@ -307,7 +307,7 @@ static struct power_pmu generic_compat_pmu = { .attr_groups = generic_compat_pmu_attr_groups, }; -int init_generic_compat_pmu(void) +int __init init_generic_compat_pmu(void) { int rc = 0; diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h index 80bbf72bfec2..4c18b5504326 100644 --- a/arch/powerpc/perf/internal.h +++ b/arch/powerpc/perf/internal.h @@ -2,12 +2,12 @@ // // Copyright 2019 Madhavan Srinivasan, IBM Corporation. 
-extern int init_ppc970_pmu(void); -extern int init_power5_pmu(void); -extern int init_power5p_pmu(void); -extern int init_power6_pmu(void); -extern int init_power7_pmu(void); -extern int init_power8_pmu(void); -extern int init_power9_pmu(void); -extern int init_power10_pmu(void); -extern int init_generic_compat_pmu(void); +int __init init_ppc970_pmu(void); +int __init init_power5_pmu(void); +int __init init_power5p_pmu(void); +int __init init_power6_pmu(void); +int __init init_power7_pmu(void); +int __init init_power8_pmu(void); +int __init init_power9_pmu(void); +int __init init_power10_pmu(void); +int __init init_generic_compat_pmu(void); diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c index 9dd75f385837..0975ad0b42c4 100644 --- a/arch/powerpc/perf/power10-pmu.c +++ b/arch/powerpc/perf/power10-pmu.c @@ -592,7 +592,7 @@ static struct power_pmu power10_pmu = { .check_attr_config = power10_check_attr_config, }; -int init_power10_pmu(void) +int __init init_power10_pmu(void) { unsigned int pvr; int rc; diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c index 18732267993a..753b4740ef64 100644 --- a/arch/powerpc/perf/power5+-pmu.c +++ b/arch/powerpc/perf/power5+-pmu.c @@ -677,7 +677,7 @@ static struct power_pmu power5p_pmu = { .cache_events = &power5p_cache_events, }; -int init_power5p_pmu(void) +int __init init_power5p_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+") diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c index cb611c1e7abe..1f83c4cba0aa 100644 --- a/arch/powerpc/perf/power5-pmu.c +++ b/arch/powerpc/perf/power5-pmu.c @@ -618,7 +618,7 @@ static struct power_pmu power5_pmu = { .flags = PPMU_HAS_SSLOT, }; -int init_power5_pmu(void) +int __init init_power5_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5")) diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c index 69ef38216418..aec746f86804 100644 --- a/arch/powerpc/perf/power6-pmu.c +++ b/arch/powerpc/perf/power6-pmu.c @@ -539,7 +539,7 @@ static struct power_pmu power6_pmu = { .cache_events = &power6_cache_events, }; -int init_power6_pmu(void) +int __init init_power6_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6")) diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 894c17f9a762..99b5ba314ea7 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -445,7 +445,7 @@ static struct power_pmu power7_pmu = { .cache_events = &power7_cache_events, }; -int init_power7_pmu(void) +int __init init_power7_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7")) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 5282e8415ddf..f21194b5604a 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -378,7 +378,7 @@ static struct power_pmu power8_pmu = { .bhrb_nr = 32, }; -int init_power8_pmu(void) +int __init init_power8_pmu(void) { int rc; diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index ff3382140d7e..4b7c17e36100 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -452,7 +452,7 @@ static struct power_pmu power9_pmu = { .check_attr_config = power9_check_attr_config, }; -int init_power9_pmu(void) +int __init init_power9_pmu(void) { int 
rc = 0; unsigned int pvr = mfspr(SPRN_PVR); diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c index 1f8263785286..09802482ba72 100644 --- a/arch/powerpc/perf/ppc970-pmu.c +++ b/arch/powerpc/perf/ppc970-pmu.c @@ -489,7 +489,7 @@ static struct power_pmu ppc970_pmu = { .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; -int init_ppc970_pmu(void) +int __init init_ppc970_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970") From 6c552983d0e65a8c923dfacc4f69b694205672c1 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:20 -0500 Subject: [PATCH 197/240] powerpc/sysdev: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/sysdev' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-6-nick.child@ibm.com --- arch/powerpc/include/asm/cpm2.h | 6 +++--- arch/powerpc/include/asm/i8259.h | 2 +- arch/powerpc/include/asm/ipic.h | 2 +- arch/powerpc/include/asm/mpic.h | 2 +- arch/powerpc/include/asm/xics.h | 4 ++-- arch/powerpc/sysdev/cpm2.c | 6 +++--- arch/powerpc/sysdev/dart_iommu.c | 2 +- arch/powerpc/sysdev/fsl_mpic_err.c | 4 ++-- arch/powerpc/sysdev/fsl_pci.c | 2 +- arch/powerpc/sysdev/fsl_pci.h | 2 +- arch/powerpc/sysdev/i8259.c | 2 +- arch/powerpc/sysdev/ipic.c | 2 +- arch/powerpc/sysdev/mpic.c | 2 +- arch/powerpc/sysdev/mpic.h | 8 ++++---- arch/powerpc/sysdev/mpic_msi.c | 6 +++--- arch/powerpc/sysdev/mpic_timer.c | 6 +++--- arch/powerpc/sysdev/mpic_u3msi.c | 2 +- arch/powerpc/sysdev/tsi108_pci.c | 2 +- arch/powerpc/sysdev/udbg_memcons.c | 2 +- arch/powerpc/sysdev/xics/icp-hv.c | 2 +- arch/powerpc/sysdev/xics/icp-opal.c | 2 +- arch/powerpc/sysdev/xics/xics-common.c | 2 +- arch/powerpc/sysdev/xive/native.c | 4 ++-- arch/powerpc/sysdev/xive/spapr.c | 6 +++--- 24 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/cpm2.h b/arch/powerpc/include/asm/cpm2.h index bda45788cfcc..9ee192a6c5d7 100644 --- a/arch/powerpc/include/asm/cpm2.h +++ b/arch/powerpc/include/asm/cpm2.h @@ -1133,8 +1133,8 @@ enum cpm_clk { CPM_CLK_DUMMY }; -extern int cpm2_clk_setup(enum cpm_clk_target target, int clock, int mode); -extern int cpm2_smc_clk_setup(enum cpm_clk_target target, int clock); +int __init cpm2_clk_setup(enum cpm_clk_target target, int clock, int mode); +int __init cpm2_smc_clk_setup(enum cpm_clk_target target, int clock); #define CPM_PIN_INPUT 0 #define CPM_PIN_OUTPUT 1 @@ -1143,7 +1143,7 @@ extern int cpm2_smc_clk_setup(enum cpm_clk_target target, int clock); #define CPM_PIN_GPIO 4 #define CPM_PIN_OPENDRAIN 8 -void cpm2_set_pin(int port, int pin, int flags); +void __init cpm2_set_pin(int port, int pin, int flags); #endif /* __CPM2__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/i8259.h b/arch/powerpc/include/asm/i8259.h index d7f08ae49e12..75481d363cd8 100644 --- a/arch/powerpc/include/asm/i8259.h +++ b/arch/powerpc/include/asm/i8259.h @@ -7,7 +7,7 @@ extern void i8259_init(struct device_node *node, unsigned long intack_addr); extern unsigned int i8259_irq(void); -extern struct irq_domain *i8259_get_host(void); +struct irq_domain *__init i8259_get_host(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_I8259_H */ diff --git
a/arch/powerpc/include/asm/ipic.h b/arch/powerpc/include/asm/ipic.h index 0524df31a7e6..b47ca7dc7199 100644 --- a/arch/powerpc/include/asm/ipic.h +++ b/arch/powerpc/include/asm/ipic.h @@ -65,7 +65,7 @@ enum ipic_mcp_irq { IPIC_MCP_MU = 7, }; -extern void ipic_set_default_priority(void); +void __init ipic_set_default_priority(void); extern u32 ipic_get_mcp_status(void); extern void ipic_clear_mcp_status(u32 mask); diff --git a/arch/powerpc/include/asm/mpic.h b/arch/powerpc/include/asm/mpic.h index 0abf2e7fd222..58353c5bd3fb 100644 --- a/arch/powerpc/include/asm/mpic.h +++ b/arch/powerpc/include/asm/mpic.h @@ -472,7 +472,7 @@ extern int mpic_cpu_get_priority(void); extern void mpic_cpu_set_priority(int prio); /* Request IPIs on primary mpic */ -extern void mpic_request_ipis(void); +void __init mpic_request_ipis(void); /* Send a message (IPI) to a given target (cpu number or MSG_*) */ void smp_mpic_message_pass(int target, int msg); diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index 0ac9bfddf704..e2e704eca5f6 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -38,13 +38,13 @@ static inline int icp_native_init(void) { return -ENODEV; } /* PAPR ICP */ #ifdef CONFIG_PPC_ICP_HV -extern int icp_hv_init(void); +int __init icp_hv_init(void); #else static inline int icp_hv_init(void) { return -ENODEV; } #endif #ifdef CONFIG_PPC_POWERNV -extern int icp_opal_init(void); +int __init icp_opal_init(void); extern void icp_opal_flush_interrupt(void); #else static inline int icp_opal_init(void) { return -ENODEV; } diff --git a/arch/powerpc/sysdev/cpm2.c b/arch/powerpc/sysdev/cpm2.c index 68538b8329f7..3f130312b6e9 100644 --- a/arch/powerpc/sysdev/cpm2.c +++ b/arch/powerpc/sysdev/cpm2.c @@ -135,7 +135,7 @@ void __cpm2_setbrg(uint brg, uint rate, uint clk, int div16, int src) } EXPORT_SYMBOL(__cpm2_setbrg); -int cpm2_clk_setup(enum cpm_clk_target target, int clock, int mode) +int __init cpm2_clk_setup(enum cpm_clk_target target, int clock, int mode) { int ret = 0; int shift; @@ -265,7 +265,7 @@ int cpm2_clk_setup(enum cpm_clk_target target, int clock, int mode) return ret; } -int cpm2_smc_clk_setup(enum cpm_clk_target target, int clock) +int __init cpm2_smc_clk_setup(enum cpm_clk_target target, int clock) { int ret = 0; int shift; @@ -326,7 +326,7 @@ struct cpm2_ioports { u32 res[3]; }; -void cpm2_set_pin(int port, int pin, int flags) +void __init cpm2_set_pin(int port, int pin, int flags) { struct cpm2_ioports __iomem *iop = (struct cpm2_ioports __iomem *)&cpm2_immr->im_ioport; diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 1d33b7a5ea83..be6b99b1b352 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -226,7 +226,7 @@ static void dart_free(struct iommu_table *tbl, long index, long npages) dart_cache_sync(orig_dp, orig_npages); } -static void allocate_dart(void) +static void __init allocate_dart(void) { unsigned long tmp; diff --git a/arch/powerpc/sysdev/fsl_mpic_err.c b/arch/powerpc/sysdev/fsl_mpic_err.c index 9a98bb212922..df06bb6b838f 100644 --- a/arch/powerpc/sysdev/fsl_mpic_err.c +++ b/arch/powerpc/sysdev/fsl_mpic_err.c @@ -58,7 +58,7 @@ static struct irq_chip fsl_mpic_err_chip = { .irq_unmask = fsl_mpic_unmask_err, }; -int mpic_setup_error_int(struct mpic *mpic, int intvec) +int __init mpic_setup_error_int(struct mpic *mpic, int intvec) { int i; @@ -121,7 +121,7 @@ static irqreturn_t fsl_error_int_handler(int irq, void *data) return IRQ_HANDLED; } -void 
mpic_err_int_init(struct mpic *mpic, irq_hw_number_t irqnum) +void __init mpic_err_int_init(struct mpic *mpic, irq_hw_number_t irqnum) { unsigned int virq; int ret; diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index b8f76f3fd994..674f047b7820 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1106,7 +1106,7 @@ static const struct of_device_id pci_ids[] = { struct device_node *fsl_pci_primary; -void fsl_pci_assign_primary(void) +void __init fsl_pci_assign_primary(void) { struct device_node *np; diff --git a/arch/powerpc/sysdev/fsl_pci.h b/arch/powerpc/sysdev/fsl_pci.h index 1d7a41205695..cdbde2e0c96e 100644 --- a/arch/powerpc/sysdev/fsl_pci.h +++ b/arch/powerpc/sysdev/fsl_pci.h @@ -120,7 +120,7 @@ u64 fsl_pci_immrbar_base(struct pci_controller *hose); extern struct device_node *fsl_pci_primary; #ifdef CONFIG_PCI -void fsl_pci_assign_primary(void); +void __init fsl_pci_assign_primary(void); #else static inline void fsl_pci_assign_primary(void) {} #endif diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index dc1a151c63d7..3b1ae98e3ce9 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -208,7 +208,7 @@ static const struct irq_domain_ops i8259_host_ops = { .xlate = i8259_host_xlate, }; -struct irq_domain *i8259_get_host(void) +struct irq_domain *__init i8259_get_host(void) { return i8259_host; } diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 7638a50a7c38..3f10c9fc3b68 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -767,7 +767,7 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) return ipic; } -void ipic_set_default_priority(void) +void __init ipic_set_default_priority(void) { ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT); ipic_write(primary_ipic->regs, IPIC_SIPRR_B, IPIC_PRIORITY_DEFAULT); diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 626ba4a9f64f..d5cb48b61bbd 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1839,7 +1839,7 @@ unsigned int mpic_get_mcirq(void) } #ifdef CONFIG_SMP -void mpic_request_ipis(void) +void __init mpic_request_ipis(void) { struct mpic *mpic = mpic_primary; int i; diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h index 73a31a429d46..cbcc3fee9fca 100644 --- a/arch/powerpc/sysdev/mpic.h +++ b/arch/powerpc/sysdev/mpic.h @@ -8,8 +8,8 @@ #ifdef CONFIG_PCI_MSI extern void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq); -extern int mpic_msi_init_allocator(struct mpic *mpic); -extern int mpic_u3msi_init(struct mpic *mpic); +int __init mpic_msi_init_allocator(struct mpic *mpic); +int __init mpic_u3msi_init(struct mpic *mpic); #else static inline void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq) @@ -37,8 +37,8 @@ extern void mpic_reset_core(int cpu); #ifdef CONFIG_FSL_SOC extern int mpic_map_error_int(struct mpic *mpic, unsigned int virq, irq_hw_number_t hw); -extern void mpic_err_int_init(struct mpic *mpic, irq_hw_number_t irqnum); -extern int mpic_setup_error_int(struct mpic *mpic, int intvec); +void __init mpic_err_int_init(struct mpic *mpic, irq_hw_number_t irqnum); +int __init mpic_setup_error_int(struct mpic *mpic, int intvec); #else static inline int mpic_map_error_int(struct mpic *mpic, unsigned int virq, irq_hw_number_t hw) { diff --git a/arch/powerpc/sysdev/mpic_msi.c b/arch/powerpc/sysdev/mpic_msi.c index 4695c04320ae..f412d6ad0b66 100644 --- 
a/arch/powerpc/sysdev/mpic_msi.c +++ b/arch/powerpc/sysdev/mpic_msi.c @@ -24,7 +24,7 @@ void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq) } #ifdef CONFIG_MPIC_U3_HT_IRQS -static int mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) +static int __init mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) { irq_hw_number_t hwirq; const struct irq_domain_ops *ops = mpic->irqhost->ops; @@ -68,13 +68,13 @@ static int mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) return 0; } #else -static int mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) +static int __init mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) { return -1; } #endif -int mpic_msi_init_allocator(struct mpic *mpic) +int __init mpic_msi_init_allocator(struct mpic *mpic) { int rc; diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c index a42a20280035..444e9ce42d0a 100644 --- a/arch/powerpc/sysdev/mpic_timer.c +++ b/arch/powerpc/sysdev/mpic_timer.c @@ -384,7 +384,7 @@ struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev, } EXPORT_SYMBOL(mpic_request_timer); -static int timer_group_get_freq(struct device_node *np, +static int __init timer_group_get_freq(struct device_node *np, struct timer_group_priv *priv) { u32 div; @@ -411,7 +411,7 @@ static int timer_group_get_freq(struct device_node *np, return 0; } -static int timer_group_get_irq(struct device_node *np, +static int __init timer_group_get_irq(struct device_node *np, struct timer_group_priv *priv) { const u32 all_timer[] = { 0, TIMERS_PER_GROUP }; @@ -459,7 +459,7 @@ static int timer_group_get_irq(struct device_node *np, return 0; } -static void timer_group_init(struct device_node *np) +static void __init timer_group_init(struct device_node *np) { struct timer_group_priv *priv; unsigned int i = 0; diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c index 3861023d378a..b1219eaa80cf 100644 --- a/arch/powerpc/sysdev/mpic_u3msi.c +++ b/arch/powerpc/sysdev/mpic_u3msi.c @@ -174,7 +174,7 @@ static int u3msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return 0; } -int mpic_u3msi_init(struct mpic *mpic) +int __init mpic_u3msi_init(struct mpic *mpic) { int rc; struct pci_controller *phb; diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 042bb38fa5c2..1070220f15d5 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -257,7 +257,7 @@ static void tsi108_pci_int_unmask(u_int irq) mb(); } -static void init_pci_source(void) +static void __init init_pci_source(void) { tsi108_write_reg(TSI108_PCI_OFFSET + TSI108_PCI_IRP_CFG_CTL, 0x0000ff00); diff --git a/arch/powerpc/sysdev/udbg_memcons.c b/arch/powerpc/sysdev/udbg_memcons.c index d38bbeed219b..5020044400dc 100644 --- a/arch/powerpc/sysdev/udbg_memcons.c +++ b/arch/powerpc/sysdev/udbg_memcons.c @@ -92,7 +92,7 @@ int memcons_getc(void) return c; } -void udbg_init_memcons(void) +void __init udbg_init_memcons(void) { udbg_putc = memcons_putc; udbg_getc = memcons_getc; diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c index 6765d9e264a3..cf8db19a4f7d 100644 --- a/arch/powerpc/sysdev/xics/icp-hv.c +++ b/arch/powerpc/sysdev/xics/icp-hv.c @@ -162,7 +162,7 @@ static const struct icp_ops icp_hv_ops = { #endif }; -int icp_hv_init(void) +int __init icp_hv_init(void) { struct device_node *np; diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c index 675d708863d5..bda4c32582d9 100644 --- a/arch/powerpc/sysdev/xics/icp-opal.c +++ 
b/arch/powerpc/sysdev/xics/icp-opal.c @@ -184,7 +184,7 @@ static const struct icp_ops icp_opal_ops = { #endif }; -int icp_opal_init(void) +int __init icp_opal_init(void) { struct device_node *np; diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 244a727c6ba4..f3fb2a12124c 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -121,7 +121,7 @@ void xics_mask_unknown_vec(unsigned int vec) #ifdef CONFIG_SMP -static void xics_request_ipi(void) +static void __init xics_request_ipi(void) { unsigned int ipi; diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index d4243dab230e..f940428ad13f 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -492,7 +492,7 @@ static const struct xive_ops xive_native_ops = { .name = "native", }; -static bool xive_parse_provisioning(struct device_node *np) +static bool __init xive_parse_provisioning(struct device_node *np) { int rc; @@ -532,7 +532,7 @@ static bool xive_parse_provisioning(struct device_node *np) return true; } -static void xive_native_setup_pools(void) +static void __init xive_native_setup_pools(void) { /* Allocate a pool big enough */ pr_debug("XIVE: Allocating VP block for pool size %u\n", nr_cpu_ids); diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index 77943dc70860..dfc4634335cc 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -44,7 +44,7 @@ struct xive_irq_bitmap { static LIST_HEAD(xive_irq_bitmaps); -static int xive_irq_bitmap_add(int base, int count) +static int __init xive_irq_bitmap_add(int base, int count) { struct xive_irq_bitmap *xibm; @@ -687,7 +687,7 @@ static const struct xive_ops xive_spapr_ops = { /* * get max priority from "/ibm,plat-res-int-priorities" */ -static bool xive_get_max_prio(u8 *max_prio) +static bool __init xive_get_max_prio(u8 *max_prio) { struct device_node *rootdn; const __be32 *reg; @@ -741,7 +741,7 @@ static bool xive_get_max_prio(u8 *max_prio) return true; } -static const u8 *get_vec5_feature(unsigned int index) +static const u8 *__init get_vec5_feature(unsigned int index) { unsigned long root, chosen; int size; From 456e8eb324a47573b377f7041f4c038fac403f86 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:21 -0500 Subject: [PATCH 198/240] powerpc/xmon: Add __init attribute to eligible functions `xmon_register_spus` defined in 'arch/powerpc/xmon' is deserving of an `__init` macro attribute. This function is only called by other initialization functions and therefore should inherit the attribute. Also, change the function declaration in the header file to include `__init`.
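For context, a hedged sketch of the failure mode these annotations help catch (hypothetical names; exact tooling output varies by kernel version): when regular .text code references an __init function, modpost emits a section-mismatch warning, since calling the target after boot would jump into freed memory.

static void __init boot_only_setup(void)
{
}

/* WRONG: lives in .text but calls into .init.text; modpost flags the
 * mismatch, and a post-boot call would execute freed memory. */
void runtime_path(void)
{
	boot_only_setup();
}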
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-7-nick.child@ibm.com --- arch/powerpc/include/asm/xmon.h | 2 +- arch/powerpc/xmon/xmon.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h index 68bfb2361f03..f2d44b44f46c 100644 --- a/arch/powerpc/include/asm/xmon.h +++ b/arch/powerpc/include/asm/xmon.h @@ -12,7 +12,7 @@ #ifdef CONFIG_XMON extern void xmon_setup(void); -extern void xmon_register_spus(struct list_head *list); +void __init xmon_register_spus(struct list_head *list); struct pt_regs; extern int xmon(struct pt_regs *excp); extern irqreturn_t xmon_irq(int, void *); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index f9ae0b398260..f51d7404a6ea 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -4136,7 +4136,7 @@ struct spu_info { static struct spu_info spu_info[XMON_NUM_SPUS]; -void xmon_register_spus(struct list_head *list) +void __init xmon_register_spus(struct list_head *list) { struct spu *spu; From 7c1ab16b2d035c6bc3b6b6980ab7e72f547edc45 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:22 -0500 Subject: [PATCH 199/240] powerpc/cell: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/cell' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-8-nick.child@ibm.com --- arch/powerpc/platforms/cell/cbe_regs.c | 2 +- arch/powerpc/platforms/cell/iommu.c | 14 +++++++------- arch/powerpc/platforms/cell/spu_base.c | 6 +++--- arch/powerpc/platforms/cell/spu_manage.c | 16 ++++++++-------- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c index c2a0678d85db..1c4c53bec66c 100644 --- a/arch/powerpc/platforms/cell/cbe_regs.c +++ b/arch/powerpc/platforms/cell/cbe_regs.c @@ -165,7 +165,7 @@ u32 cbe_node_to_cpu(int node) } EXPORT_SYMBOL_GPL(cbe_node_to_cpu); -static struct device_node *cbe_get_be_node(int cpu_id) +static struct device_node *__init cbe_get_be_node(int cpu_id) { struct device_node *np; diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index d32f24de8479..25e726bf0172 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -253,7 +253,7 @@ static irqreturn_t ioc_interrupt(int irq, void *data) return IRQ_HANDLED; } -static int cell_iommu_find_ioc(int nid, unsigned long *base) +static int __init cell_iommu_find_ioc(int nid, unsigned long *base) { struct device_node *np; struct resource r; @@ -293,7 +293,7 @@ static int cell_iommu_find_ioc(int nid, unsigned long *base) return -ENODEV; } -static void cell_iommu_setup_stab(struct cbe_iommu *iommu, +static void __init cell_iommu_setup_stab(struct cbe_iommu *iommu, unsigned long dbase, unsigned long dsize, unsigned long fbase, unsigned long fsize) { @@ -313,7 +313,7 @@ static void cell_iommu_setup_stab(struct cbe_iommu *iommu, memset(iommu->stab, 0, stab_size); } -static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, +static unsigned long *__init cell_iommu_alloc_ptab(struct cbe_iommu *iommu, unsigned long base, unsigned long size, unsigned long 
gap_base, unsigned long gap_size, unsigned long page_shift) { @@ -373,7 +373,7 @@ static unsigned long *cell_iommu_alloc_ptab(struct cbe_iommu *iommu, return ptab; } -static void cell_iommu_enable_hardware(struct cbe_iommu *iommu) +static void __init cell_iommu_enable_hardware(struct cbe_iommu *iommu) { int ret; unsigned long reg, xlate_base; @@ -413,7 +413,7 @@ static void cell_iommu_enable_hardware(struct cbe_iommu *iommu) out_be64(iommu->cmd_regs + IOC_IOCmd_Cfg, reg); } -static void cell_iommu_setup_hardware(struct cbe_iommu *iommu, +static void __init cell_iommu_setup_hardware(struct cbe_iommu *iommu, unsigned long base, unsigned long size) { cell_iommu_setup_stab(iommu, base, size, 0, 0); @@ -858,7 +858,7 @@ static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask) cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR; } -static void insert_16M_pte(unsigned long addr, unsigned long *ptab, +static void __init insert_16M_pte(unsigned long addr, unsigned long *ptab, unsigned long base_pte) { unsigned long segment, offset; @@ -873,7 +873,7 @@ static void insert_16M_pte(unsigned long addr, unsigned long *ptab, ptab[offset] = base_pte | (__pa(addr) & CBE_IOPTE_RPN_Mask); } -static void cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, +static void __init cell_iommu_setup_fixed_ptab(struct cbe_iommu *iommu, struct device_node *np, unsigned long dbase, unsigned long dsize, unsigned long fbase, unsigned long fsize) { diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index bc48234443b6..83cea9e7ee72 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -387,7 +387,7 @@ spu_irq_class_2(int irq, void *data) return stat ? IRQ_HANDLED : IRQ_NONE; } -static int spu_request_irqs(struct spu *spu) +static int __init spu_request_irqs(struct spu *spu) { int ret = 0; @@ -540,7 +540,7 @@ void spu_remove_dev_attr_group(struct attribute_group *attrs) } EXPORT_SYMBOL_GPL(spu_remove_dev_attr_group); -static int spu_create_dev(struct spu *spu) +static int __init spu_create_dev(struct spu *spu) { int ret; @@ -711,7 +711,7 @@ static void crash_kexec_stop_spus(void) } } -static void crash_register_spus(struct list_head *list) +static void __init crash_register_spus(struct list_head *list) { struct spu *spu; int ret; diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c index 8e9ef65240c3..ddf8742f09a3 100644 --- a/arch/powerpc/platforms/cell/spu_manage.c +++ b/arch/powerpc/platforms/cell/spu_manage.c @@ -186,7 +186,7 @@ err: return -EINVAL; } -static int spu_map_resource(struct spu *spu, int nr, +static int __init spu_map_resource(struct spu *spu, int nr, void __iomem** virt, unsigned long *phys) { struct device_node *np = spu->devnode; @@ -361,7 +361,7 @@ static void disable_spu_by_master_run(struct spu_context *ctx) static int qs20_reg_idxs[QS20_SPES_PER_BE] = { 0, 2, 4, 6, 7, 5, 3, 1 }; static int qs20_reg_memory[QS20_SPES_PER_BE] = { 1, 1, 0, 0, 0, 0, 0, 0 }; -static struct spu *spu_lookup_reg(int node, u32 reg) +static struct spu *__init spu_lookup_reg(int node, u32 reg) { struct spu *spu; const u32 *spu_reg; @@ -374,7 +374,7 @@ static struct spu *spu_lookup_reg(int node, u32 reg) return NULL; } -static void init_affinity_qs20_harcoded(void) +static void __init init_affinity_qs20_harcoded(void) { int node, i; struct spu *last_spu, *spu; @@ -396,7 +396,7 @@ static void init_affinity_qs20_harcoded(void) } } -static int of_has_vicinity(void) +static int 
__init of_has_vicinity(void) { struct device_node *dn; @@ -409,7 +409,7 @@ static int of_has_vicinity(void) return 0; } -static struct spu *devnode_spu(int cbe, struct device_node *dn) +static struct spu *__init devnode_spu(int cbe, struct device_node *dn) { struct spu *spu; @@ -419,7 +419,7 @@ static struct spu *devnode_spu(int cbe, struct device_node *dn) return NULL; } -static struct spu * +static struct spu * __init neighbour_spu(int cbe, struct device_node *target, struct device_node *avoid) { struct spu *spu; @@ -440,7 +440,7 @@ neighbour_spu(int cbe, struct device_node *target, struct device_node *avoid) return NULL; } -static void init_affinity_node(int cbe) +static void __init init_affinity_node(int cbe) { struct spu *spu, *last_spu; struct device_node *vic_dn, *last_spu_dn; @@ -494,7 +494,7 @@ static void init_affinity_node(int cbe) } } -static void init_affinity_fw(void) +static void __init init_affinity_fw(void) { int cbe; diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index cb25acccd746..4c702192412f 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -648,7 +648,7 @@ static void spufs_exit_isolated_loader(void) get_order(isolated_loader_size)); } -static void +static void __init spufs_init_isolated_loader(void) { struct device_node *dn; From d3aa3c5edf0cb7ac0b0b5b0d144bba60b0ee77da Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:23 -0500 Subject: [PATCH 200/240] powerpc/chrp: Add __init attribute to eligible functions The function `Enable_SRAM` defined in 'arch/powerpc/platforms/chrp' is deserving of an `__init` macro attribute. This function is only called by other initialization functions and therefore should inherit the attribute. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-9-nick.child@ibm.com --- arch/powerpc/platforms/chrp/pegasos_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/chrp/pegasos_eth.c b/arch/powerpc/platforms/chrp/pegasos_eth.c index 485cf5ef73d4..5c4f1a9ca154 100644 --- a/arch/powerpc/platforms/chrp/pegasos_eth.c +++ b/arch/powerpc/platforms/chrp/pegasos_eth.c @@ -113,7 +113,7 @@ static struct platform_device *mv643xx_eth_pd_devs[] __initdata = { static void __iomem *mv643xx_reg_base; -static int Enable_SRAM(void) +static int __init Enable_SRAM(void) { u32 ALong; From e37e06af9b0d6b7828159455d33f8ef45c456460 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:24 -0500 Subject: [PATCH 201/240] powerpc/pasemi: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/pasemi' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
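One shape worth noting, sketched with hypothetical names (the mpic.h hunk below shows the real instance): when a header pairs the declaration with a static inline stub for disabled configs, only the real declaration takes __init; the stub stays plain so it can be inlined anywhere.

#ifdef CONFIG_FOO_MSI
int __init foo_msi_init(struct foo *f);	/* real version, boot-time only */
#else
static inline int foo_msi_init(struct foo *f) { return -1; }	/* stub */
#endif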
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-10-nick.child@ibm.com --- arch/powerpc/platforms/pasemi/msi.c | 2 +- arch/powerpc/platforms/pasemi/pasemi.h | 2 +- arch/powerpc/platforms/pasemi/pci.c | 2 +- arch/powerpc/platforms/pasemi/setup.c | 2 +- arch/powerpc/sysdev/mpic.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c index d38944a1e258..11ab9c13457c 100644 --- a/arch/powerpc/platforms/pasemi/msi.c +++ b/arch/powerpc/platforms/pasemi/msi.c @@ -135,7 +135,7 @@ static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return 0; } -int mpic_pasemi_msi_init(struct mpic *mpic) +int __init mpic_pasemi_msi_init(struct mpic *mpic) { int rc; struct pci_controller *phb; diff --git a/arch/powerpc/platforms/pasemi/pasemi.h b/arch/powerpc/platforms/pasemi/pasemi.h index 70b56048ed1b..3f277a200fd8 100644 --- a/arch/powerpc/platforms/pasemi/pasemi.h +++ b/arch/powerpc/platforms/pasemi/pasemi.h @@ -7,7 +7,7 @@ extern void pas_pci_init(void); extern void pas_pci_irq_fixup(struct pci_dev *dev); extern void pas_pci_dma_dev_setup(struct pci_dev *dev); -extern void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset); +void __iomem *__init pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset); extern void __init pasemi_map_registers(void); diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index 8779b107d872..d4b922759d6e 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -287,7 +287,7 @@ void __init pas_pci_init(void) } } -void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset) +void __iomem *__init pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c index 376797eb7894..f974bfe7fde1 100644 --- a/arch/powerpc/platforms/pasemi/setup.c +++ b/arch/powerpc/platforms/pasemi/setup.c @@ -212,7 +212,7 @@ static void sb600_8259_cascade(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -static void nemo_init_IRQ(struct mpic *mpic) +static void __init nemo_init_IRQ(struct mpic *mpic) { struct device_node *np; int gpio_virq; diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h index cbcc3fee9fca..bb460ff57a06 100644 --- a/arch/powerpc/sysdev/mpic.h +++ b/arch/powerpc/sysdev/mpic.h @@ -24,7 +24,7 @@ static inline int mpic_u3msi_init(struct mpic *mpic) #endif #if defined(CONFIG_PCI_MSI) && defined(CONFIG_PPC_PASEMI) -int mpic_pasemi_msi_init(struct mpic *mpic); +int __init mpic_pasemi_msi_init(struct mpic *mpic); #else static inline int mpic_pasemi_msi_init(struct mpic *mpic) { return -1; } #endif From b346f57100e9417f23ee9051f0efe621a492be96 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:25 -0500 Subject: [PATCH 202/240] powerpc/powermac: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/powermac' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`.
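For reference, a simplified sketch of what the annotation expands to (condensed from include/linux/init.h; the exact attribute list varies across kernel versions):

/* Simplified -- real kernels add further attributes such as __cold. */
#define __init __section(".init.text")

/* At the end of boot, free_initmem() releases the .init sections and the
 * log reports the amount reclaimed (exact wording varies by version). */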
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-11-nick.child@ibm.com --- arch/powerpc/include/asm/smu.h | 2 +- arch/powerpc/include/asm/udbg.h | 2 +- arch/powerpc/platforms/powermac/feature.c | 2 +- arch/powerpc/platforms/powermac/nvram.c | 2 +- arch/powerpc/platforms/powermac/pfunc_base.c | 6 +++--- arch/powerpc/platforms/powermac/setup.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 4 ++-- arch/powerpc/platforms/powermac/udbg_scc.c | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/smu.h b/arch/powerpc/include/asm/smu.h index 4b30a0205c93..2ac6ab903023 100644 --- a/arch/powerpc/include/asm/smu.h +++ b/arch/powerpc/include/asm/smu.h @@ -456,7 +456,7 @@ extern void smu_poll(void); /* * Init routine, presence check.... */ -extern int smu_init(void); +int __init smu_init(void); extern int smu_present(void); struct platform_device; extern struct platform_device *smu_get_ofdev(void); diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 5aec53f2dae0..b4aa0d88ce2c 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -30,7 +30,7 @@ void __init udbg_uart_setup(unsigned int speed, unsigned int clock); unsigned int __init udbg_probe_uart_speed(unsigned int clock); struct device_node; -extern void udbg_scc_init(int force_scc); +void __init udbg_scc_init(int force_scc); extern int udbg_adb_init(int force_btext); extern void udbg_adb_init_early(void); diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index 5c77b9a24c0e..e67c624f35a2 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -1530,7 +1530,7 @@ static long g5_reset_cpu(struct device_node *node, long param, long value) * This takes the second CPU off the bus on dual CPU machines * running UP */ -void g5_phy_disable_cpu1(void) +void __init g5_phy_disable_cpu1(void) { if (uninorth_maj == 3) UN_OUT(U3_API_PHY_CONFIG_1, 0); diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index 853ccc4480e2..de8fcb607290 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -258,7 +258,7 @@ static u32 core99_calc_adler(u8 *buffer) return (high << 16) | low; } -static u32 core99_check(u8* datas) +static u32 __init core99_check(u8 *datas) { struct core99_header* hdr99 = (struct core99_header*)datas; diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c index f5422506d4b0..9c2947a3edd5 100644 --- a/arch/powerpc/platforms/powermac/pfunc_base.c +++ b/arch/powerpc/platforms/powermac/pfunc_base.c @@ -93,7 +93,7 @@ static struct pmf_handlers macio_gpio_handlers = { .delay = macio_do_delay, }; -static void macio_gpio_init_one(struct macio_chip *macio) +static void __init macio_gpio_init_one(struct macio_chip *macio) { struct device_node *gparent, *gp; @@ -265,7 +265,7 @@ static struct pmf_handlers macio_mmio_handlers = { .delay = macio_do_delay, }; -static void macio_mmio_init_one(struct macio_chip *macio) +static void __init macio_mmio_init_one(struct macio_chip *macio) { DBG("Installing MMIO functions for macio %pOF\n", macio->of_node); @@ -294,7 +294,7 @@ static struct pmf_handlers unin_mmio_handlers = { .delay = macio_do_delay, }; -static void uninorth_install_pfunc(void) +static void __init uninorth_install_pfunc(void) { struct device_node *np; diff 
--git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 13e8a8a9841c..f7661b81db18 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -194,7 +194,7 @@ int find_via_pmu(void) #endif #ifndef CONFIG_PMAC_SMU -int smu_init(void) +int __init smu_init(void) { /* should check and warn if SMU is present */ return 0; diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 3256a316e884..da1efdc30d6c 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -186,7 +186,7 @@ static const struct irq_domain_ops psurge_host_ops = { .map = psurge_host_map, }; -static int psurge_secondary_ipi_init(void) +static int __init psurge_secondary_ipi_init(void) { int rc = -ENOMEM; @@ -875,7 +875,7 @@ static int smp_core99_cpu_online(unsigned int cpu) static void __init smp_core99_bringup_done(void) { - extern void g5_phy_disable_cpu1(void); + extern void __init g5_phy_disable_cpu1(void); /* Close i2c bus if it was used for tb sync */ if (pmac_tb_clock_chip_host) diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c index f286bdfe8346..965827ac2e9c 100644 --- a/arch/powerpc/platforms/powermac/udbg_scc.c +++ b/arch/powerpc/platforms/powermac/udbg_scc.c @@ -62,7 +62,7 @@ static unsigned char scc_inittab[] = { 3, 0xc1, /* rx enable, 8 bits */ }; -void udbg_scc_init(int force_scc) +void __init udbg_scc_init(int force_scc) { const u32 *reg; unsigned long addr; From e5913db1ef22817e128f0a794752f7393205e00b Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:26 -0500 Subject: [PATCH 203/240] powerpc/powernv: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/powernv' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
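A small syntactic point, sketched with hypothetical names (the memcons hunks below show the real pattern): with a pointer return type the attribute conventionally follows the '*', while for plain return types it sits between the type and the name.

struct foo *__init foo_probe(struct device_node *np);	/* pointer return */
int __init foo_setup(void);				/* plain return */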
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-12-nick.child@ibm.com --- arch/powerpc/include/asm/cpuidle.h | 2 +- arch/powerpc/include/asm/opal.h | 2 +- arch/powerpc/platforms/powernv/idle.c | 6 +++--- arch/powerpc/platforms/powernv/opal-core.c | 6 +++--- arch/powerpc/platforms/powernv/opal-fadump.c | 2 +- arch/powerpc/platforms/powernv/opal-msglog.c | 4 ++-- arch/powerpc/platforms/powernv/opal-power.c | 2 +- arch/powerpc/platforms/powernv/opal-powercap.c | 2 +- arch/powerpc/platforms/powernv/opal-rtc.c | 2 +- arch/powerpc/platforms/powernv/opal-sensor-groups.c | 4 ++-- arch/powerpc/platforms/powernv/opal.c | 8 ++++---- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- arch/powerpc/platforms/powernv/powernv.h | 4 ++-- arch/powerpc/platforms/powernv/rng.c | 2 +- arch/powerpc/platforms/powernv/setup.c | 6 +++--- 15 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 9844b3ded187..0cce5dc7fb1c 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -85,7 +85,7 @@ extern struct pnv_idle_states_t *pnv_idle_states; extern int nr_pnv_idle_states; unsigned long pnv_cpu_offline(unsigned int cpu); -int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); +int __init validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); static inline void report_invalid_psscr_val(u64 psscr_val, int err) { switch (err) { diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 6ea9001de9a9..bfd3142cd0ba 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -314,7 +314,7 @@ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); extern int early_init_dt_scan_recoverable_ranges(unsigned long node, const char *uname, int depth, void *data); -extern void opal_configure_cores(void); +void __init opal_configure_cores(void); extern int opal_get_chars(uint32_t vtermno, char *buf, int count); extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 885ef229aba1..9942289f379b 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -62,7 +62,7 @@ static bool deepest_stop_found; static unsigned long power7_offline_type; -static int pnv_save_sprs_for_deep_states(void) +static int __init pnv_save_sprs_for_deep_states(void) { int cpu; int rc; @@ -1123,7 +1123,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) * stop instruction */ -int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) +int __init validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) { int err = 0; @@ -1317,7 +1317,7 @@ static void __init pnv_probe_idle_states(void) * which is the number of cpuidle states discovered through device-tree. */ -static int pnv_parse_cpuidle_dt(void) +static int __init pnv_parse_cpuidle_dt(void) { struct device_node *np; int nr_idle_states, i; diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 5b9736bbc2aa..0331f1973f0e 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -89,7 +89,7 @@ static inline int is_opalcore_usable(void) return (oc_conf && oc_conf->opalcorebuf != NULL) ? 
1 : 0; } -static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name, +static Elf64_Word *__init append_elf64_note(Elf64_Word *buf, char *name, u32 type, void *data, size_t data_len) { @@ -108,7 +108,7 @@ static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name, return buf; } -static void fill_prstatus(struct elf_prstatus *prstatus, int pir, +static void __init fill_prstatus(struct elf_prstatus *prstatus, int pir, struct pt_regs *regs) { memset(prstatus, 0, sizeof(struct elf_prstatus)); @@ -134,7 +134,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir, } } -static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf, +static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf, u64 opal_boot_entry) { Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 9a360ced663b..c8ad057c7221 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -112,7 +112,7 @@ static void opal_fadump_update_config(struct fw_dump *fadump_conf, * This function is called in the capture kernel to get configuration details * from metadata setup by the first kernel. */ -static void opal_fadump_get_config(struct fw_dump *fadump_conf, +static void __init opal_fadump_get_config(struct fw_dump *fadump_conf, const struct opal_fadump_mem_struct *fdm) { unsigned long base, size, last_end, hole_size; diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index d3b6e135c18b..22d6efe17b0d 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -105,7 +105,7 @@ static struct bin_attribute opal_msglog_attr = { .read = opal_msglog_read }; -struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name) +struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name) { u64 mcaddr; struct memcons *mc; @@ -133,7 +133,7 @@ out_err: return NULL; } -u32 memcons_get_size(struct memcons *mc) +u32 __init memcons_get_size(struct memcons *mc) { return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size); } diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c index 2a3717fc24ea..db99ffcb7b82 100644 --- a/arch/powerpc/platforms/powernv/opal-power.c +++ b/arch/powerpc/platforms/powernv/opal-power.c @@ -53,7 +53,7 @@ static bool detect_epow(void) } /* Check for existing EPOW, DPO events */ -static bool poweroff_pending(void) +static bool __init poweroff_pending(void) { int rc; __be64 opal_dpo_timeout; diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c index c16d44f6f1d1..64506b46e77b 100644 --- a/arch/powerpc/platforms/powernv/opal-powercap.c +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -129,7 +129,7 @@ out_token: return ret; } -static void powercap_add_attr(int handle, const char *name, +static void __init powercap_add_attr(int handle, const char *name, struct powercap_attr *attr) { attr->handle = handle; diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c index 44d7dacb33a2..a9bcf9217e64 100644 --- a/arch/powerpc/platforms/powernv/opal-rtc.c +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -18,7 +18,7 @@ #include #include -static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) +static void __init opal_to_tm(u32 y_m_d, u64 h_m_s_ms, 
struct rtc_time *tm) { tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) + bcd2bin((y_m_d >> 16) & 0xff)) - 1900; diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c index f8ae1fb0c102..8fba7d25ae56 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -126,7 +126,7 @@ static void add_attr(int handle, struct sg_attr *attr, int index) attr->attr.store = ops_info[index].store; } -static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, +static int __init add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, u32 handle) { int i, j; @@ -144,7 +144,7 @@ static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, return sysfs_create_group(sg_kobj, &sg->sg); } -static int get_nr_attrs(const __be32 *ops, int len) +static int __init get_nr_attrs(const __be32 *ops, int len) { int i, j; int nr_attrs = 0; diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e9d18519e650..55a8fbfdb5b2 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -73,7 +73,7 @@ static struct task_struct *kopald_tsk; static struct opal_msg *opal_msg; static u32 opal_msg_size __ro_after_init; -void opal_configure_cores(void) +void __init opal_configure_cores(void) { u64 reinit_flags = 0; @@ -779,7 +779,7 @@ out: return !!recover_addr; } -static int opal_sysfs_init(void) +static int __init opal_sysfs_init(void) { opal_kobj = kobject_create_and_add("opal", firmware_kobj); if (!opal_kobj) { @@ -937,7 +937,7 @@ static void __init opal_dump_region_init(void) "rc = %d\n", rc); } -static void opal_pdev_init(const char *compatible) +static void __init opal_pdev_init(const char *compatible) { struct device_node *np; @@ -981,7 +981,7 @@ void opal_wake_poller(void) wake_up_process(kopald_tsk); } -static void opal_init_heartbeat(void) +static void __init opal_init_heartbeat(void) { /* Old firwmware, we assume the HVC heartbeat is sufficient */ if (of_property_read_u32(opal_node, "ibm,heartbeat-ms", diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 004cd6a96c8a..acd763593ab4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2265,7 +2265,7 @@ static const struct irq_domain_ops pnv_irq_domain_ops = { .free = pnv_irq_domain_free, }; -static int pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) +static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) { struct pnv_phb *phb = hose->private_data; struct irq_domain *parent = irq_get_default_host(); @@ -2298,7 +2298,7 @@ static int pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int co return 0; } -static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +static void __init pnv_pci_init_ioda_msis(struct pnv_phb *phb) { unsigned int count; const __be32 *prop = of_get_property(phb->hose->dn, diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 11df4e16a1cc..e297bf4abfcb 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -39,7 +39,7 @@ bool cpu_core_split_required(void); struct memcons; ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count); -u32 memcons_get_size(struct memcons *mc); -struct memcons *memcons_init(struct 
device_node *node, const char *mc_prop_name); +u32 __init memcons_get_size(struct memcons *mc); +struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name); #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 72c25295c1c2..b4386714494a 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -80,7 +80,7 @@ static int powernv_get_random_darn(unsigned long *v) return 1; } -static int initialise_darn(void) +static int __init initialise_darn(void) { unsigned long val; int i; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index f37d6524a24d..105d889abd51 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -40,7 +40,7 @@ #include "powernv.h" -static bool fw_feature_is(const char *state, const char *name, +static bool __init fw_feature_is(const char *state, const char *name, struct device_node *fw_features) { struct device_node *np; @@ -55,7 +55,7 @@ static bool fw_feature_is(const char *state, const char *name, return rc; } -static void init_fw_feat_flags(struct device_node *np) +static void __init init_fw_feat_flags(struct device_node *np) { if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np)) security_ftr_set(SEC_FTR_SPEC_BAR_ORI31); @@ -98,7 +98,7 @@ static void init_fw_feat_flags(struct device_node *np) security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR); } -static void pnv_setup_security_mitigations(void) +static void __init pnv_setup_security_mitigations(void) { struct device_node *np, *fw_features; enum l1d_flush_type type; From e14ff96d08f0ade9dd33081d909ad65a02a858c1 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:27 -0500 Subject: [PATCH 204/240] powerpc/pseries: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/pseries' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
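These annotations are also checked at build time: a reference from resident code into `.init.text` makes modpost report a section mismatch. A sketch of the bug class this catches, with hypothetical names that are not from this patch:

  #include <linux/init.h>

  static void __init one_time_setup(void)
  {
  }

  /* Not __init: reachable at any time, e.g. via a sysfs store. */
  static void runtime_path(void)
  {
          /* Wrong: a .text -> .init.text call. modpost warns about
           * the section mismatch, and if this ran after boot the
           * target's memory would already have been freed. */
          one_time_setup();
  }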
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-13-nick.child@ibm.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/include/asm/iommu.h | 2 +- arch/powerpc/include/asm/setup.h | 2 +- arch/powerpc/platforms/pseries/event_sources.c | 2 +- arch/powerpc/platforms/pseries/iommu.c | 2 +- arch/powerpc/platforms/pseries/lpar.c | 6 +++--- arch/powerpc/platforms/pseries/pseries.h | 2 +- arch/powerpc/platforms/pseries/rtas-fadump.c | 6 +++--- arch/powerpc/platforms/pseries/setup.c | 4 ++-- arch/powerpc/platforms/pseries/vas.c | 2 +- arch/powerpc/platforms/pseries/vio.c | 6 +++--- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 7fee46e50377..ba5b1becf518 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -258,7 +258,7 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, } #ifdef CONFIG_PPC_PSERIES -extern void radix_init_pseries(void); +void __init radix_init_pseries(void); #else static inline void radix_init_pseries(void) { } #endif diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index c361212ac160..d7912b66c874 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -275,7 +275,7 @@ extern void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction, unsigned long attrs); -extern void iommu_init_early_pSeries(void); +void __init iommu_init_early_pSeries(void); extern void iommu_init_early_dart(struct pci_controller_ops *controller_ops); extern void iommu_init_early_pasemi(void); diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 607e42b8cbf0..71658504dadd 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -32,7 +32,7 @@ void setup_panic(void); extern bool pseries_enable_reloc_on_exc(void); extern void pseries_disable_reloc_on_exc(void); extern void pseries_big_endian_exceptions(void); -extern void pseries_little_endian_exceptions(void); +void __init pseries_little_endian_exceptions(void); #else static inline bool pseries_enable_reloc_on_exc(void) { return false; } static inline void pseries_disable_reloc_on_exc(void) {} diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c index be661e919c76..623dfe0d8e1c 100644 --- a/arch/powerpc/platforms/pseries/event_sources.c +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -8,7 +8,7 @@ #include "pseries.h" -void request_event_sources_irqs(struct device_node *np, +void __init request_event_sources_irqs(struct device_node *np, irq_handler_t handler, const char *name) { diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 8f998e55735b..4d991cf840d9 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1654,7 +1654,7 @@ static struct notifier_block iommu_reconfig_nb = { }; /* These are called very early. 
*/ -void iommu_init_early_pSeries(void) +void __init iommu_init_early_pSeries(void) { if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) return; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index fac5d86777db..f8899d506ea4 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -714,7 +714,7 @@ void vpa_init(int cpu) #ifdef CONFIG_PPC_BOOK3S_64 -static int pseries_lpar_register_process_table(unsigned long base, +static int __init pseries_lpar_register_process_table(unsigned long base, unsigned long page_size, unsigned long table_size) { long rc; @@ -1737,7 +1737,7 @@ void __init hpte_init_pseries(void) #endif /* CONFIG_PPC_64S_HASH_MMU */ #ifdef CONFIG_PPC_RADIX_MMU -void radix_init_pseries(void) +void __init radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); @@ -1938,7 +1938,7 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) } #ifdef CONFIG_PPC_64S_HASH_MMU -static unsigned long vsid_unscramble(unsigned long vsid, int ssize) +static unsigned long __init vsid_unscramble(unsigned long vsid, int ssize) { unsigned long protovsid; unsigned long va_bits = VA_BITS; diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index b4c63c481f33..56c9ef9052e9 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -11,7 +11,7 @@ struct device_node; -extern void request_event_sources_irqs(struct device_node *np, +void __init request_event_sources_irqs(struct device_node *np, irq_handler_t handler, const char *name); #include diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index f8f73b47b107..35f9cb602c30 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -39,7 +39,7 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf, * This function is called in the capture kernel to get configuration details * setup in the first kernel and passed to the f/w. */ -static void rtas_fadump_get_config(struct fw_dump *fadump_conf, +static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { fadump_conf->boot_mem_addr[0] = @@ -247,7 +247,7 @@ static inline int rtas_fadump_gpr_index(u64 id) return i; } -static void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +static void __init rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) { int i; @@ -272,7 +272,7 @@ static void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val regs->dsisr = (unsigned long)reg_val; } -static struct rtas_fadump_reg_entry* +static struct rtas_fadump_reg_entry* __init rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry, struct pt_regs *regs) { diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 7f69237d4fa4..83a04d967a59 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -447,7 +447,7 @@ void pseries_big_endian_exceptions(void) panic("Could not enable big endian exceptions"); } -void pseries_little_endian_exceptions(void) +void __init pseries_little_endian_exceptions(void) { long rc; @@ -907,7 +907,7 @@ void pSeries_coalesce_init(void) * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, * handle that here. 
(Stolen from parse_system_parameter_string) */ -static void pSeries_cmo_feature_init(void) +static void __init pSeries_cmo_feature_init(void) { char *ptr, *key, *value, *end; int call_status; diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 734523e2272f..d243ddc58827 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -489,7 +489,7 @@ EXPORT_SYMBOL_GPL(vas_unregister_api_pseries); * Get the specific capabilities based on the feature type. * Right now supports GZIP default and GZIP QoS capabilities. */ -static int get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, +static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, struct hv_vas_cop_feat_caps *hv_caps) { struct vas_cop_feat_caps *caps; diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index feafcb582e1b..c9f9be4ea26a 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1061,7 +1061,7 @@ static struct attribute *vio_bus_attrs[] = { }; ATTRIBUTE_GROUPS(vio_bus); -static void vio_cmo_sysfs_init(void) +static void __init vio_cmo_sysfs_init(void) { vio_bus_type.dev_groups = vio_cmo_dev_groups; vio_bus_type.bus_groups = vio_bus_groups; @@ -1073,7 +1073,7 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; } static void vio_cmo_bus_remove(struct vio_dev *viodev) {} static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {} static void vio_cmo_bus_init(void) {} -static void vio_cmo_sysfs_init(void) { } +static void __init vio_cmo_sysfs_init(void) { } #endif /* CONFIG_PPC_SMLPAR */ EXPORT_SYMBOL(vio_cmo_entitlement_update); EXPORT_SYMBOL(vio_cmo_set_dev_desired); @@ -1479,7 +1479,7 @@ EXPORT_SYMBOL(vio_register_device_node); * Starting from the root node provide, register the device node for * each child beneath the root. */ -static void vio_bus_scan_register_devices(char *root_name) +static void __init vio_bus_scan_register_devices(char *root_name) { struct device_node *node_root, *node_child; From f1ba9b9474a9e32b9c173c91e71f713bfa7b2463 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:28 -0500 Subject: [PATCH 205/240] powerpc/ps3: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/ps3' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
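The rule of thumb behind "only called by other initialization functions": once the outermost caller is an initcall, every helper on that boot-time call path can move to `.init.text` with it. A minimal sketch with hypothetical names, assuming the standard initcall machinery:

  #include <linux/init.h>

  static void __init helper_setup(void)
  {
          /* one-time probing/configuration */
  }

  static int __init my_platform_init(void)
  {
          helper_setup();         /* fine: the caller is __init too */
          return 0;
  }
  arch_initcall(my_platform_init);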
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-14-nick.child@ibm.com --- arch/powerpc/platforms/ps3/gelic_udbg.c | 2 +- arch/powerpc/platforms/ps3/mm.c | 4 ++-- arch/powerpc/platforms/ps3/os-area.c | 4 ++-- arch/powerpc/platforms/ps3/platform.h | 14 +++++++------- arch/powerpc/platforms/ps3/repository.c | 20 ++++++++++---------- arch/powerpc/platforms/ps3/smp.c | 2 +- arch/powerpc/platforms/ps3/spu.c | 2 +- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/platforms/ps3/gelic_udbg.c b/arch/powerpc/platforms/ps3/gelic_udbg.c index cba4f8f5b8d7..6b298010fd84 100644 --- a/arch/powerpc/platforms/ps3/gelic_udbg.c +++ b/arch/powerpc/platforms/ps3/gelic_udbg.c @@ -113,7 +113,7 @@ static int unmap_dma_mem(int bus_id, int dev_id, u64 bus_addr, size_t len) return lv1_free_device_dma_region(bus_id, dev_id, real_bus_addr); } -static void gelic_debug_init(void) +static void __init gelic_debug_init(void) { s64 result; u64 v2; diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index 9c44f335c0b9..5ce924611b94 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -41,7 +41,7 @@ enum { PAGE_SHIFT_16M = 24U, }; -static unsigned long make_page_sizes(unsigned long a, unsigned long b) +static unsigned long __init make_page_sizes(unsigned long a, unsigned long b) { return (a << 56) | (b << 48); } @@ -215,7 +215,7 @@ notrace void ps3_mm_vas_destroy(void) } } -static int ps3_mm_get_repository_highmem(struct mem_region *r) +static int __init ps3_mm_get_repository_highmem(struct mem_region *r) { int result; diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c index e8530371aed6..cb844e0add2b 100644 --- a/arch/powerpc/platforms/ps3/os-area.c +++ b/arch/powerpc/platforms/ps3/os-area.c @@ -501,7 +501,7 @@ static int db_set_64(struct os_area_db *db, const struct os_area_db_id *id, return -1; } -static int db_get_64(const struct os_area_db *db, +static int __init db_get_64(const struct os_area_db *db, const struct os_area_db_id *id, uint64_t *value) { struct db_iterator i; @@ -517,7 +517,7 @@ static int db_get_64(const struct os_area_db *db, return -1; } -static int db_get_rtc_diff(const struct os_area_db *db, int64_t *rtc_diff) +static int __init db_get_rtc_diff(const struct os_area_db *db, int64_t *rtc_diff) { return db_get_64(db, &os_area_db_id_rtc_diff, (uint64_t*)rtc_diff); } diff --git a/arch/powerpc/platforms/ps3/platform.h b/arch/powerpc/platforms/ps3/platform.h index 07bd39ef71ff..6beecdb0d51f 100644 --- a/arch/powerpc/platforms/ps3/platform.h +++ b/arch/powerpc/platforms/ps3/platform.h @@ -35,7 +35,7 @@ void __init ps3_register_ipi_irq(unsigned int cpu, unsigned int virq); /* smp */ -void smp_init_ps3(void); +void __init smp_init_ps3(void); #ifdef CONFIG_SMP void ps3_smp_cleanup_cpu(int cpu); #else @@ -134,9 +134,9 @@ struct ps3_repository_device { int ps3_repository_find_device(struct ps3_repository_device *repo); int ps3_repository_find_device_by_id(struct ps3_repository_device *repo, u64 bus_id, u64 dev_id); -int ps3_repository_find_devices(enum ps3_bus_type bus_type, +int __init ps3_repository_find_devices(enum ps3_bus_type bus_type, int (*callback)(const struct ps3_repository_device *repo)); -int ps3_repository_find_bus(enum ps3_bus_type bus_type, unsigned int from, +int __init ps3_repository_find_bus(enum ps3_bus_type bus_type, unsigned int from, unsigned int *bus_index); int ps3_repository_find_interrupt(const struct 
ps3_repository_device *repo, enum ps3_interrupt_type intr_type, unsigned int *interrupt_id); @@ -211,8 +211,8 @@ static inline int ps3_repository_delete_highmem_info(unsigned int region_index) int ps3_repository_read_num_be(unsigned int *num_be); int ps3_repository_read_be_node_id(unsigned int be_index, u64 *node_id); int ps3_repository_read_be_id(u64 node_id, u64 *be_id); -int ps3_repository_read_tb_freq(u64 node_id, u64 *tb_freq); -int ps3_repository_read_be_tb_freq(unsigned int be_index, u64 *tb_freq); +int __init ps3_repository_read_tb_freq(u64 node_id, u64 *tb_freq); +int __init ps3_repository_read_be_tb_freq(unsigned int be_index, u64 *tb_freq); /* repository performance monitor info */ @@ -247,7 +247,7 @@ int ps3_repository_read_spu_resource_id(unsigned int res_index, /* repository vuart info */ -int ps3_repository_read_vuart_av_port(unsigned int *port); -int ps3_repository_read_vuart_sysmgr_port(unsigned int *port); +int __init ps3_repository_read_vuart_av_port(unsigned int *port); +int __init ps3_repository_read_vuart_sysmgr_port(unsigned int *port); #endif diff --git a/arch/powerpc/platforms/ps3/repository.c b/arch/powerpc/platforms/ps3/repository.c index 21712964e76f..205763061a2d 100644 --- a/arch/powerpc/platforms/ps3/repository.c +++ b/arch/powerpc/platforms/ps3/repository.c @@ -413,7 +413,7 @@ found_dev: return 0; } -int ps3_repository_find_devices(enum ps3_bus_type bus_type, +int __init ps3_repository_find_devices(enum ps3_bus_type bus_type, int (*callback)(const struct ps3_repository_device *repo)) { int result = 0; @@ -455,7 +455,7 @@ int ps3_repository_find_devices(enum ps3_bus_type bus_type, return result; } -int ps3_repository_find_bus(enum ps3_bus_type bus_type, unsigned int from, +int __init ps3_repository_find_bus(enum ps3_bus_type bus_type, unsigned int from, unsigned int *bus_index) { unsigned int i; @@ -908,7 +908,7 @@ int ps3_repository_read_boot_dat_size(unsigned int *size) return result; } -int ps3_repository_read_vuart_av_port(unsigned int *port) +int __init ps3_repository_read_vuart_av_port(unsigned int *port) { int result; u64 v1 = 0; @@ -923,7 +923,7 @@ int ps3_repository_read_vuart_av_port(unsigned int *port) return result; } -int ps3_repository_read_vuart_sysmgr_port(unsigned int *port) +int __init ps3_repository_read_vuart_sysmgr_port(unsigned int *port) { int result; u64 v1 = 0; @@ -1005,7 +1005,7 @@ int ps3_repository_read_be_id(u64 node_id, u64 *be_id) be_id, NULL); } -int ps3_repository_read_tb_freq(u64 node_id, u64 *tb_freq) +int __init ps3_repository_read_tb_freq(u64 node_id, u64 *tb_freq) { return read_node(PS3_LPAR_ID_PME, make_first_field("be", 0), @@ -1015,7 +1015,7 @@ int ps3_repository_read_tb_freq(u64 node_id, u64 *tb_freq) tb_freq, NULL); } -int ps3_repository_read_be_tb_freq(unsigned int be_index, u64 *tb_freq) +int __init ps3_repository_read_be_tb_freq(unsigned int be_index, u64 *tb_freq) { int result; u64 node_id; @@ -1178,7 +1178,7 @@ int ps3_repository_delete_highmem_info(unsigned int region_index) #if defined(DEBUG) -int ps3_repository_dump_resource_info(const struct ps3_repository_device *repo) +int __init ps3_repository_dump_resource_info(const struct ps3_repository_device *repo) { int result = 0; unsigned int res_index; @@ -1231,7 +1231,7 @@ int ps3_repository_dump_resource_info(const struct ps3_repository_device *repo) return result; } -static int dump_stor_dev_info(struct ps3_repository_device *repo) +static int __init dump_stor_dev_info(struct ps3_repository_device *repo) { int result = 0; unsigned int num_regions, 
region_index; @@ -1279,7 +1279,7 @@ out: return result; } -static int dump_device_info(struct ps3_repository_device *repo, +static int __init dump_device_info(struct ps3_repository_device *repo, unsigned int num_dev) { int result = 0; @@ -1323,7 +1323,7 @@ static int dump_device_info(struct ps3_repository_device *repo, return result; } -int ps3_repository_dump_bus_info(void) +int __init ps3_repository_dump_bus_info(void) { int result = 0; struct ps3_repository_device repo; diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index 93b1e73b3529..85295756005a 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -112,7 +112,7 @@ static struct smp_ops_t ps3_smp_ops = { .kick_cpu = smp_generic_kick_cpu, }; -void smp_init_ps3(void) +void __init smp_init_ps3(void) { DBG(" -> %s\n", __func__); smp_ops = &ps3_smp_ops; diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index 0c252478e556..4a2520ec6d7f 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -137,7 +137,7 @@ u64 ps3_get_spe_id(void *arg) } EXPORT_SYMBOL_GPL(ps3_get_spe_id); -static unsigned long get_vas_id(void) +static unsigned long __init get_vas_id(void) { u64 id; From 1e3d992d213928851f7ddec6f150fb54fe759b64 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:29 -0500 Subject: [PATCH 206/240] powerpc/4xx: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/4xx' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-15-nick.child@ibm.com --- arch/powerpc/platforms/4xx/cpm.c | 4 ++-- arch/powerpc/platforms/4xx/pci.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/4xx/cpm.c b/arch/powerpc/platforms/4xx/cpm.c index ae8b812c9202..2571841625a2 100644 --- a/arch/powerpc/platforms/4xx/cpm.c +++ b/arch/powerpc/platforms/4xx/cpm.c @@ -163,7 +163,7 @@ static ssize_t cpm_idle_store(struct kobject *kobj, static struct kobj_attribute cpm_idle_attr = __ATTR(idle, 0644, cpm_idle_show, cpm_idle_store); -static void cpm_idle_config_sysfs(void) +static void __init cpm_idle_config_sysfs(void) { struct device *dev; unsigned long ret; @@ -231,7 +231,7 @@ static const struct platform_suspend_ops cpm_suspend_ops = { .enter = cpm_suspend_enter, }; -static int cpm_get_uint_property(struct device_node *np, +static int __init cpm_get_uint_property(struct device_node *np, const char *name) { int len; diff --git a/arch/powerpc/platforms/4xx/pci.c b/arch/powerpc/platforms/4xx/pci.c index c13d64c3b019..24f41e178cbc 100644 --- a/arch/powerpc/platforms/4xx/pci.c +++ b/arch/powerpc/platforms/4xx/pci.c @@ -1273,7 +1273,7 @@ static int __init ppc405ex_pciex_core_init(struct device_node *np) return 2; } -static void ppc405ex_pcie_phy_reset(struct ppc4xx_pciex_port *port) +static void __init ppc405ex_pcie_phy_reset(struct ppc4xx_pciex_port *port) { /* Assert the PE0_PHY reset */ mtdcri(SDR0, port->sdr_base + PESDRn_RCSSET, 0x01010000); From 1ee969be25ed21a1192ca569ad827013eb7fac04 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:30 -0500 Subject: [PATCH 207/240] powerpc/44x: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/44x/' are deserving of an `__init` macro 
attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-16-nick.child@ibm.com --- arch/powerpc/platforms/44x/fsp2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c index 823397c802de..af13a59d2f60 100644 --- a/arch/powerpc/platforms/44x/fsp2.c +++ b/arch/powerpc/platforms/44x/fsp2.c @@ -197,7 +197,7 @@ static irqreturn_t rst_wrn_handler(int irq, void *data) { } } -static void node_irq_request(const char *compat, irq_handler_t errirq_handler) +static void __init node_irq_request(const char *compat, irq_handler_t errirq_handler) { struct device_node *np; unsigned int irq; @@ -222,7 +222,7 @@ static void node_irq_request(const char *compat, irq_handler_t errirq_handler) } } -static void critical_irq_setup(void) +static void __init critical_irq_setup(void) { node_irq_request(FSP2_CMU_ERR, cmu_err_handler); node_irq_request(FSP2_BUS_ERR, bus_err_handler); From c0dc225ae7dd9f01d46ea779f7f169d49aa59b78 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:31 -0500 Subject: [PATCH 208/240] powerpc/embedded6xx: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/embedded6xx' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-17-nick.child@ibm.com --- arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 4 ++-- arch/powerpc/platforms/embedded6xx/hlwd-pic.h | 2 +- arch/powerpc/platforms/embedded6xx/holly.c | 2 +- arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c | 4 ++-- arch/powerpc/platforms/embedded6xx/wii.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index a4b020e4b6af..380b4285cce4 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -153,7 +153,7 @@ static void __hlwd_quiesce(void __iomem *io_base) out_be32(io_base + HW_BROADWAY_ICR, 0xffffffff); } -static struct irq_domain *hlwd_pic_init(struct device_node *np) +static struct irq_domain *__init hlwd_pic_init(struct device_node *np) { struct irq_domain *irq_domain; struct resource res; @@ -197,7 +197,7 @@ unsigned int hlwd_pic_get_irq(void) * */ -void hlwd_pic_probe(void) +void __init hlwd_pic_probe(void) { struct irq_domain *host; struct device_node *np; diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.h b/arch/powerpc/platforms/embedded6xx/hlwd-pic.h index f18eeeef0815..c2fa42e191dc 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.h +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.h @@ -11,7 +11,7 @@ #define __HLWD_PIC_H extern unsigned int hlwd_pic_get_irq(void); -extern void hlwd_pic_probe(void); +void __init hlwd_pic_probe(void); extern void hlwd_quiesce(void); #endif diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 7a85b117f7a4..07e71ba3e846 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -50,7 +50,7 @@ static int 
holly_exclude_device(struct pci_controller *hose, u_char bus, return PCIBIOS_SUCCESSFUL; } -static void holly_remap_bridge(void) +static void __init holly_remap_bridge(void) { u32 lut_val, lut_addr; int i; diff --git a/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c b/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c index ed45db70a781..5aea46566233 100644 --- a/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c +++ b/arch/powerpc/platforms/embedded6xx/usbgecko_udbg.c @@ -194,7 +194,7 @@ static int ug_udbg_getc_poll(void) /* * Retrieves and prepares the virtual address needed to access the hardware. */ -static void __iomem *ug_udbg_setup_exi_io_base(struct device_node *np) +static void __iomem *__init ug_udbg_setup_exi_io_base(struct device_node *np) { void __iomem *exi_io_base = NULL; phys_addr_t paddr; @@ -212,7 +212,7 @@ static void __iomem *ug_udbg_setup_exi_io_base(struct device_node *np) /* * Checks if a USB Gecko adapter is inserted in any memory card slot. */ -static void __iomem *ug_udbg_probe(void __iomem *exi_io_base) +static void __iomem *__init ug_udbg_probe(void __iomem *exi_io_base) { int i; diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index a802ef957d63..f60ade584bb2 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -69,7 +69,7 @@ static void __noreturn wii_spin(void) cpu_relax(); } -static void __iomem *wii_ioremap_hw_regs(char *name, char *compatible) +static void __iomem *__init wii_ioremap_hw_regs(char *name, char *compatible) { void __iomem *hw_regs = NULL; struct device_node *np; From f4a88b0ef5c5f7ce218aced7d811a31dd311a0b0 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:32 -0500 Subject: [PATCH 209/240] powerpc/83xx: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/83xx' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. 
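The converse is why only "eligible" functions are converted: anything reachable from a runtime path (CPU hotplug, sysfs, an interrupt handler) must stay out of `.init.text`, even if it also runs during boot. A sketch with hypothetical names:

  /* No __init: also reachable after boot, so its text must persist. */
  static void phy_reconfig(void)
  {
  }

  static int hotplug_callback(unsigned int cpu)
  {
          phy_reconfig();         /* may run long after init memory is freed */
          return 0;
  }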
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-18-nick.child@ibm.com --- arch/powerpc/platforms/83xx/km83xx.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc837x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc837x_rdb.c | 2 +- arch/powerpc/platforms/83xx/mpc83xx.h | 6 +++--- arch/powerpc/platforms/83xx/usb.c | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c index 108e1e4d2683..d9eed0decb28 100644 --- a/arch/powerpc/platforms/83xx/km83xx.c +++ b/arch/powerpc/platforms/83xx/km83xx.c @@ -39,7 +39,7 @@ #define SVR_REV(svr) (((svr) >> 0) & 0xFFFF) /* Revision field */ -static void quirk_mpc8360e_qe_enet10(void) +static void __init quirk_mpc8360e_qe_enet10(void) { /* * handle mpc8360E Erratum QE_ENET10: diff --git a/arch/powerpc/platforms/83xx/mpc834x_mds.c b/arch/powerpc/platforms/83xx/mpc834x_mds.c index 6d91bdce0a18..0713deffb40c 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc834x_mds.c @@ -35,7 +35,7 @@ #include "mpc83xx.h" #define BCSR5_INT_USB 0x02 -static int mpc834xemds_usb_cfg(void) +static int __init mpc834xemds_usb_cfg(void) { struct device_node *np; void __iomem *bcsr_regs = NULL; diff --git a/arch/powerpc/platforms/83xx/mpc837x_mds.c b/arch/powerpc/platforms/83xx/mpc837x_mds.c index f28d166ea7db..fc88ab97f6e3 100644 --- a/arch/powerpc/platforms/83xx/mpc837x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc837x_mds.c @@ -23,7 +23,7 @@ #define BCSR12_USB_SER_PIN 0x80 #define BCSR12_USB_SER_DEVICE 0x02 -static int mpc837xmds_usb_cfg(void) +static int __init mpc837xmds_usb_cfg(void) { struct device_node *np; const void *phy_type, *mode; diff --git a/arch/powerpc/platforms/83xx/mpc837x_rdb.c b/arch/powerpc/platforms/83xx/mpc837x_rdb.c index 7fb7684c256b..5d48c6842098 100644 --- a/arch/powerpc/platforms/83xx/mpc837x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc837x_rdb.c @@ -18,7 +18,7 @@ #include "mpc83xx.h" -static void mpc837x_rdb_sd_cfg(void) +static void __init mpc837x_rdb_sd_cfg(void) { void __iomem *im; diff --git a/arch/powerpc/platforms/83xx/mpc83xx.h b/arch/powerpc/platforms/83xx/mpc83xx.h index a30d30588cf6..aea803ba3a15 100644 --- a/arch/powerpc/platforms/83xx/mpc83xx.h +++ b/arch/powerpc/platforms/83xx/mpc83xx.h @@ -68,9 +68,9 @@ extern void __noreturn mpc83xx_restart(char *cmd); extern long mpc83xx_time_init(void); -extern int mpc837x_usb_cfg(void); -extern int mpc834x_usb_cfg(void); -extern int mpc831x_usb_cfg(void); +int __init mpc837x_usb_cfg(void); +int __init mpc834x_usb_cfg(void); +int __init mpc831x_usb_cfg(void); extern void mpc83xx_ipic_init_IRQ(void); #ifdef CONFIG_PCI diff --git a/arch/powerpc/platforms/83xx/usb.c b/arch/powerpc/platforms/83xx/usb.c index 3d247d726ed5..b0bda20aaccf 100644 --- a/arch/powerpc/platforms/83xx/usb.c +++ b/arch/powerpc/platforms/83xx/usb.c @@ -20,7 +20,7 @@ #ifdef CONFIG_PPC_MPC834x -int mpc834x_usb_cfg(void) +int __init mpc834x_usb_cfg(void) { unsigned long sccr, sicrl, sicrh; void __iomem *immap; @@ -96,7 +96,7 @@ int mpc834x_usb_cfg(void) #endif /* CONFIG_PPC_MPC834x */ #ifdef CONFIG_PPC_MPC831x -int mpc831x_usb_cfg(void) +int __init mpc831x_usb_cfg(void) { u32 temp; void __iomem *immap, *usb_regs; @@ -209,7 +209,7 @@ out: #endif /* CONFIG_PPC_MPC831x */ #ifdef CONFIG_PPC_MPC837x -int mpc837x_usb_cfg(void) +int __init mpc837x_usb_cfg(void) { void __iomem *immap; struct device_node *np = NULL; From 
407454cafd3f1878dae6bb839d8bac2db264300f Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:33 -0500 Subject: [PATCH 210/240] powerpc/85xx: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/85xx' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-19-nick.child@ibm.com --- arch/powerpc/platforms/85xx/ge_imp3a.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_cds.c | 2 +- arch/powerpc/platforms/85xx/socrates_fpga_pic.c | 2 +- arch/powerpc/platforms/85xx/socrates_fpga_pic.h | 2 +- arch/powerpc/platforms/85xx/xes_mpc85xx.c | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c index 83a0f7a1f0de..743c65e4d8e4 100644 --- a/arch/powerpc/platforms/85xx/ge_imp3a.c +++ b/arch/powerpc/platforms/85xx/ge_imp3a.c @@ -78,7 +78,7 @@ void __init ge_imp3a_pic_init(void) of_node_put(cascade_node); } -static void ge_imp3a_pci_assign_primary(void) +static void __init ge_imp3a_pci_assign_primary(void) { #ifdef CONFIG_PCI struct device_node *np; diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index 172d2b7cfeb7..5bd487030256 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -282,7 +282,7 @@ machine_device_initcall(mpc85xx_cds, mpc85xx_cds_8259_attach); #endif /* CONFIG_PPC_I8259 */ -static void mpc85xx_cds_pci_assign_primary(void) +static void __init mpc85xx_cds_pci_assign_primary(void) { #ifdef CONFIG_PCI struct device_node *np; diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c index 199a137c0ddb..3768c86b9629 100644 --- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c +++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c @@ -271,7 +271,7 @@ static const struct irq_domain_ops socrates_fpga_pic_host_ops = { .xlate = socrates_fpga_pic_host_xlate, }; -void socrates_fpga_pic_init(struct device_node *pic) +void __init socrates_fpga_pic_init(struct device_node *pic) { unsigned long flags; int i; diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.h b/arch/powerpc/platforms/85xx/socrates_fpga_pic.h index c592b8bc94dd..c50b23794a06 100644 --- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.h +++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.h @@ -6,6 +6,6 @@ #ifndef SOCRATES_FPGA_PIC_H #define SOCRATES_FPGA_PIC_H -void socrates_fpga_pic_init(struct device_node *pic); +void __init socrates_fpga_pic_init(struct device_node *pic); #endif diff --git a/arch/powerpc/platforms/85xx/xes_mpc85xx.c b/arch/powerpc/platforms/85xx/xes_mpc85xx.c index d54e1ae56997..397e158c1edb 100644 --- a/arch/powerpc/platforms/85xx/xes_mpc85xx.c +++ b/arch/powerpc/platforms/85xx/xes_mpc85xx.c @@ -45,7 +45,7 @@ void __init xes_mpc85xx_pic_init(void) mpic_init(mpic); } -static void xes_mpc85xx_configure_l2(void __iomem *l2_base) +static void __init xes_mpc85xx_configure_l2(void __iomem *l2_base) { volatile uint32_t ctl, tmp; @@ -72,7 +72,7 @@ static void xes_mpc85xx_configure_l2(void __iomem *l2_base) asm volatile("msync; isync"); } -static void xes_mpc85xx_fixups(void) +static void __init xes_mpc85xx_fixups(void) { struct device_node 
*np; int err; From 2493a24271dab3d5c1235a13cf6ee2d12773c9a1 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:34 -0500 Subject: [PATCH 211/240] powerpc/512x: Add __init attribute to eligible functions Some functions defined in 'arch/powerpc/platforms/512x' are deserving of an `__init` macro attribute. These functions are only called by other initialization functions and therefore should inherit the attribute. Also, change function declarations in header files to include `__init`. Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-20-nick.child@ibm.com --- arch/powerpc/platforms/512x/clock-commonclk.c | 52 +++++++++---------- arch/powerpc/platforms/512x/mpc512x.h | 4 +- arch/powerpc/platforms/512x/mpc512x_shared.c | 4 +- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c index 30342b60aa63..0b03d812baae 100644 --- a/arch/powerpc/platforms/512x/clock-commonclk.c +++ b/arch/powerpc/platforms/512x/clock-commonclk.c @@ -97,7 +97,7 @@ static enum soc_type { MPC512x_SOC_MPC5125, } soc; -static void mpc512x_clk_determine_soc(void) +static void __init mpc512x_clk_determine_soc(void) { if (of_machine_is_compatible("fsl,mpc5121")) { soc = MPC512x_SOC_MPC5121; @@ -113,98 +113,98 @@ static void mpc512x_clk_determine_soc(void) } } -static bool soc_has_mbx(void) +static bool __init soc_has_mbx(void) { if (soc == MPC512x_SOC_MPC5121) return true; return false; } -static bool soc_has_axe(void) +static bool __init soc_has_axe(void) { if (soc == MPC512x_SOC_MPC5125) return false; return true; } -static bool soc_has_viu(void) +static bool __init soc_has_viu(void) { if (soc == MPC512x_SOC_MPC5125) return false; return true; } -static bool soc_has_spdif(void) +static bool __init soc_has_spdif(void) { if (soc == MPC512x_SOC_MPC5125) return false; return true; } -static bool soc_has_pata(void) +static bool __init soc_has_pata(void) { if (soc == MPC512x_SOC_MPC5125) return false; return true; } -static bool soc_has_sata(void) +static bool __init soc_has_sata(void) { if (soc == MPC512x_SOC_MPC5125) return false; return true; } -static bool soc_has_pci(void) +static bool __init soc_has_pci(void) { if (soc == MPC512x_SOC_MPC5125) return false; return true; } -static bool soc_has_fec2(void) +static bool __init soc_has_fec2(void) { if (soc == MPC512x_SOC_MPC5125) return true; return false; } -static int soc_max_pscnum(void) +static int __init soc_max_pscnum(void) { if (soc == MPC512x_SOC_MPC5125) return 10; return 12; } -static bool soc_has_sdhc2(void) +static bool __init soc_has_sdhc2(void) { if (soc == MPC512x_SOC_MPC5125) return true; return false; } -static bool soc_has_nfc_5125(void) +static bool __init soc_has_nfc_5125(void) { if (soc == MPC512x_SOC_MPC5125) return true; return false; } -static bool soc_has_outclk(void) +static bool __init soc_has_outclk(void) { if (soc == MPC512x_SOC_MPC5125) return true; return false; } -static bool soc_has_cpmf_0_bypass(void) +static bool __init soc_has_cpmf_0_bypass(void) { if (soc == MPC512x_SOC_MPC5125) return true; return false; } -static bool soc_has_mclk_mux0_canin(void) +static bool __init soc_has_mclk_mux0_canin(void) { if (soc == MPC512x_SOC_MPC5125) return true; @@ -294,7 +294,7 @@ static inline int get_bit_field(uint32_t __iomem *reg, uint8_t pos, uint8_t len) } /* get the SPMF and translate it into the "sys pll" multiplier */ -static int get_spmf_mult(void) +static int __init 
get_spmf_mult(void) { static int spmf_to_mult[] = { 68, 1, 12, 16, 20, 24, 28, 32, @@ -312,7 +312,7 @@ static int get_spmf_mult(void) * values returned from here are a multiple of the real factor since the * divide ratio is fractional */ -static int get_sys_div_x2(void) +static int __init get_sys_div_x2(void) { static int sysdiv_code_to_x2[] = { 4, 5, 6, 7, 8, 9, 10, 14, @@ -333,7 +333,7 @@ static int get_sys_div_x2(void) * values returned from here are a multiple of the real factor since the * multiplier ratio is fractional */ -static int get_cpmf_mult_x2(void) +static int __init get_cpmf_mult_x2(void) { static int cpmf_to_mult_x36[] = { /* 0b000 is "times 36" */ @@ -379,7 +379,7 @@ static const struct clk_div_table divtab_1234[] = { { .div = 0, }, }; -static int get_freq_from_dt(char *propname) +static int __init get_freq_from_dt(char *propname) { struct device_node *np; const unsigned int *prop; @@ -396,7 +396,7 @@ static int get_freq_from_dt(char *propname) return val; } -static void mpc512x_clk_preset_data(void) +static void __init mpc512x_clk_preset_data(void) { size_t i; @@ -418,7 +418,7 @@ static void mpc512x_clk_preset_data(void) * SYS -> CSB -> IPS) from the REF clock rate and the returned mul/div * values */ -static void mpc512x_clk_setup_ref_clock(struct device_node *np, int bus_freq, +static void __init mpc512x_clk_setup_ref_clock(struct device_node *np, int bus_freq, int *sys_mul, int *sys_div, int *ips_div) { @@ -592,7 +592,7 @@ static struct mclk_setup_data mclk_outclk_data[] = { }; /* setup the MCLK clock subtree of an individual PSC/MSCAN/SPDIF */ -static void mpc512x_clk_setup_mclk(struct mclk_setup_data *entry, size_t idx) +static void __init mpc512x_clk_setup_mclk(struct mclk_setup_data *entry, size_t idx) { size_t clks_idx_pub, clks_idx_int; u32 __iomem *mccr_reg; /* MCLK control register (mux, en, div) */ @@ -701,7 +701,7 @@ static void mpc512x_clk_setup_mclk(struct mclk_setup_data *entry, size_t idx) /* }}} MCLK helpers */ -static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq) +static void __init mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq) { int sys_mul, sys_div, ips_div; int mul, div; @@ -937,7 +937,7 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq) * registers the set of public clocks (those listed in the dt-bindings/ * header file) for OF lookups, keeps the intermediates private to us */ -static void mpc5121_clk_register_of_provider(struct device_node *np) +static void __init mpc5121_clk_register_of_provider(struct device_node *np) { clk_data.clks = clks; clk_data.clk_num = MPC512x_CLK_LAST_PUBLIC + 1; /* _not_ ARRAY_SIZE() */ @@ -948,7 +948,7 @@ static void mpc5121_clk_register_of_provider(struct device_node *np) * temporary support for the period of time between introduction of CCF * support and the adjustment of peripheral drivers to OF based lookups */ -static void mpc5121_clk_provide_migration_support(void) +static void __init mpc5121_clk_provide_migration_support(void) { /* @@ -1009,7 +1009,7 @@ static void mpc5121_clk_provide_migration_support(void) * case of not yet adjusted device tree data, where clock related specs * are missing) */ -static void mpc5121_clk_provide_backwards_compat(void) +static void __init mpc5121_clk_provide_backwards_compat(void) { enum did_reg_flags { DID_REG_PSC = BIT(0), diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h index fff225901e2f..2f3c60e373e1 100644 --- a/arch/powerpc/platforms/512x/mpc512x.h +++ 
b/arch/powerpc/platforms/512x/mpc512x.h @@ -12,8 +12,8 @@ extern void __init mpc512x_init_early(void); extern void __init mpc512x_init(void); extern void __init mpc512x_setup_arch(void); extern int __init mpc5121_clk_init(void); -extern const char *mpc512x_select_psc_compat(void); -extern const char *mpc512x_select_reset_compat(void); +const char *__init mpc512x_select_psc_compat(void); +const char *__init mpc512x_select_reset_compat(void); extern void __noreturn mpc512x_restart(char *cmd); #endif /* __MPC512X_H__ */ diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 7a9ae9591d60..e3411663edad 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -352,7 +352,7 @@ static void __init mpc512x_declare_of_platform_devices(void) #define DEFAULT_FIFO_SIZE 16 -const char *mpc512x_select_psc_compat(void) +const char *__init mpc512x_select_psc_compat(void) { if (of_machine_is_compatible("fsl,mpc5121")) return "fsl,mpc5121-psc"; @@ -363,7 +363,7 @@ const char *mpc512x_select_psc_compat(void) return NULL; } -const char *mpc512x_select_reset_compat(void) +const char *__init mpc512x_select_reset_compat(void) { if (of_machine_is_compatible("fsl,mpc5121")) return "fsl,mpc5121-reset"; From 7da1d1ddd1f02e5de7497a0c849256912652fb6c Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 16 Dec 2021 17:00:35 -0500 Subject: [PATCH 212/240] cuda/pmu: Make find_via_cuda/pmu init functions Make `find_via_cuda` and `find_via_pmu` initialization functions. Previously, their definitions in `drivers/macintosh/via-cuda.h` included the `__init` attribute, but their alternative definitions in `arch/powerpc/platforms/powermac/setup.c` and prototypes in `include/linux/cuda.h` and `include/linux/pmu.h` did not use the `__init` macro. Since only initialization functions call `find_via_cuda` and `find_via_pmu`, it is safe to label these functions with `__init`.
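Worth noting: the attribute on the definition is what actually selects the section; repeating `__init` on the prototype documents the constraint for callers and keeps header and definition in agreement. A sketch with a hypothetical name:

  #include <linux/init.h>

  /* header: tells callers this function only exists during boot */
  extern int __init find_example_dev(void);

  /* definition: this attribute is what places the code in .init.text */
  int __init find_example_dev(void)
  {
          return 0;
  }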
Signed-off-by: Nick Child Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211216220035.605465-21-nick.child@ibm.com --- arch/powerpc/platforms/powermac/setup.c | 4 ++-- include/linux/cuda.h | 2 +- include/linux/pmu.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index f7661b81db18..974d4b49867b 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -166,7 +166,7 @@ static void pmac_show_cpuinfo(struct seq_file *m) } #ifndef CONFIG_ADB_CUDA -int find_via_cuda(void) +int __init find_via_cuda(void) { struct device_node *dn = of_find_node_by_name(NULL, "via-cuda"); @@ -180,7 +180,7 @@ int find_via_cuda(void) #endif #ifndef CONFIG_ADB_PMU -int find_via_pmu(void) +int __init find_via_pmu(void) { struct device_node *dn = of_find_node_by_name(NULL, "via-pmu"); diff --git a/include/linux/cuda.h b/include/linux/cuda.h index 45bfe9d61271..daf3e6f98444 100644 --- a/include/linux/cuda.h +++ b/include/linux/cuda.h @@ -12,7 +12,7 @@ #include -extern int find_via_cuda(void); +extern int __init find_via_cuda(void); extern int cuda_request(struct adb_request *req, void (*done)(struct adb_request *), int nbytes, ...); extern void cuda_poll(void); diff --git a/include/linux/pmu.h b/include/linux/pmu.h index 52453a24a24f..c677442d007c 100644 --- a/include/linux/pmu.h +++ b/include/linux/pmu.h @@ -13,7 +13,7 @@ #include -extern int find_via_pmu(void); +extern int __init find_via_pmu(void); extern int pmu_request(struct adb_request *req, void (*done)(struct adb_request *), int nbytes, ...); From a3ad84da076009c94969fa97f604257667e2980f Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Tue, 21 Dec 2021 16:58:59 +1100 Subject: [PATCH 213/240] powerpc/toc: Future proof kernel toc This patch future-proofs the kernel against linker changes that might put the toc pointer at some location other than .got+0x8000, by replacing __toc_start+0x8000 with .TOC. throughout. If the kernel's idea of the toc pointer doesn't agree with the linker, bad things happen. prom_init.c code relocating its toc is also changed so that a symbolic __prom_init_toc_start toc-pointer relative address is calculated rather than assuming that it is always at toc-pointer - 0x8000. The length calculations loading values from the toc are also avoided. It's a little incestuous to do that with unreloc_toc picking up adjusted values (which is fine in practice, they both adjust by the same amount if all goes well). I've also changed the way .got is aligned in vmlinux.lds and zImage.lds, mostly so that dumping out section info by objdump or readelf plainly shows the alignment is 256. This linker script feature was added 2005-09-27, available in FSF binutils releases from 2.17 onwards. Should be safe to use in the kernel, I think. Finally, put *(.got) before the prom_init.o entry which only needs *(.toc), so that the GOT header goes in the correct place. I don't believe this makes any difference for the kernel as it would for dynamic objects being loaded by ld.so. That change is just to stop lusers who blindly copy kernel scripts being led astray. Of course, this change needs the prom_init.c changes. Some notes on .toc and .got. .toc is a compiler generated section of addresses. .got is a linker generated section of addresses, generally built when the linker sees R_*_*GOT* relocations. In the case of powerpc64 ld.bfd, there are multiple generated .got sections, one per input object file. 
So you can somewhat reasonably write in a linker script an input section statement like *prom_init.o(.got .toc) to mean "the .got and .toc section for files matching *prom_init.o". On other architectures that doesn't make sense, because the linker generally has just one .got section. Even on powerpc64, note well that the GOT entries for prom_init.o may be merged with GOT entries from other objects. That means that if prom_init.o references, say, _end via some GOT relocation, and some other object also references _end via a GOT relocation, the GOT entry for _end may be in the range __prom_init_toc_start to __prom_init_toc_end and if the kernel does something special to GOT/TOC entries in that range then the value of _end as seen by objects other than prom_init.o will be affected. On the other hand the GOT entry for _end may not be in the range __prom_init_toc_start to __prom_init_toc_end. Which way it turns out is deterministic but a detail of linker operation that should not be relied on. A feature of ld.bfd is that input .toc (and .got) sections matching one linker input section statement may be sorted, to put entries used by small-model code first, near the toc base. This is why scripts for powerpc64 normally use *(.got .toc) rather than *(.got) *(.toc), since the first form allows more freedom to sort. Another feature of ld.bfd is that indirect addressing sequences using the GOT/TOC may be edited by the linker to relative addressing. In many cases relative addressing would be emitted by gcc for -mcmodel=medium if you appropriately decorate variable declarations with non-default visibility. The original patch is here: https://lore.kernel.org/linuxppc-dev/20210310034813.GM6042@bubble.grove.modra.org/ Signed-off-by: Alan Modra [aik: removed non-relocatable which is gone in 24d33ac5b8ffb] [aik: added <=2.24 check] [aik: because of llvm-as, kernel_toc_addr() uses "mr" instead of global register variable] Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211221055904.555763-2-aik@ozlabs.ru --- arch/powerpc/Makefile | 5 +++-- arch/powerpc/boot/crt0.S | 2 +- arch/powerpc/boot/zImage.lds.S | 7 ++----- arch/powerpc/include/asm/sections.h | 14 +++++++------- arch/powerpc/kernel/head_64.S | 2 +- arch/powerpc/kernel/vmlinux.lds.S | 8 +++----- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index e02568f17334..e9aa4e8b07dd 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -445,10 +445,11 @@ PHONY += checkbin # Check toolchain versions: # - gcc-4.6 is the minimum kernel-wide version so nothing required. checkbin: - @if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \ - && $(LD) --version | head -1 | grep ' 2\.24$$' >/dev/null ; then \ + @if test "x${CONFIG_LD_IS_LLD}" != "xy" -a \ + "x$(call ld-ifversion, -le, 22400, y)" = "xy" ; then \ echo -n '*** binutils 2.24 miscompiles weak symbols ' ; \ echo 'in some circumstances.' ; \ + echo '*** binutils 2.23 do not define the TOC symbol ' ; \ echo -n '*** Please use a different binutils version.' ; \ false ; \ fi diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S index e8f10a599659..feadee18e271 100644 --- a/arch/powerpc/boot/crt0.S +++ b/arch/powerpc/boot/crt0.S @@ -28,7 +28,7 @@ p_etext: .8byte _etext p_bss_start: .8byte __bss_start p_end: .8byte _end -p_toc: .8byte __toc_start + 0x8000 - p_base +p_toc: .8byte .TOC. 
- p_base p_dyn: .8byte __dynamic_start - p_base p_rela: .8byte __rela_dyn_start - p_base p_prom: .8byte 0 diff --git a/arch/powerpc/boot/zImage.lds.S b/arch/powerpc/boot/zImage.lds.S index d6f072865627..d65cd55a6f38 100644 --- a/arch/powerpc/boot/zImage.lds.S +++ b/arch/powerpc/boot/zImage.lds.S @@ -36,12 +36,9 @@ SECTIONS } #ifdef CONFIG_PPC64_BOOT_WRAPPER - . = ALIGN(256); - .got : + .got : ALIGN(256) { - __toc_start = .; - *(.got) - *(.toc) + *(.got .toc) } #endif diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 79cb7a25a5fb..38f79e42bf3c 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -25,16 +25,16 @@ extern char start_virt_trampolines[]; extern char end_virt_trampolines[]; #endif +/* + * This assumes the kernel is never compiled -mcmodel=small or + * the total .toc is always less than 64k. + */ static inline unsigned long kernel_toc_addr(void) { - /* Defined by the linker, see vmlinux.lds.S */ - extern unsigned long __toc_start; + unsigned long toc_ptr; - /* - * The TOC register (r2) points 32kB into the TOC, so that 64kB of - * the TOC can be addressed using a single machine instruction. - */ - return (unsigned long)(&__toc_start) + 0x8000UL; + asm volatile("mr %0, 2" : "=r" (toc_ptr)); + return toc_ptr; } static inline int overlaps_interrupt_vector_text(unsigned long start, diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index f17ae2083733..a08c050ff645 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -904,7 +904,7 @@ _GLOBAL(relative_toc) blr .balign 8 -p_toc: .8byte __toc_start + 0x8000 - 0b +p_toc: .8byte .TOC. - 0b /* * This is where the main kernel code starts. diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index dfc3f39d365f..2bcca818136a 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -337,15 +337,13 @@ SECTIONS *(.branch_lt) } - . = ALIGN(256); - .got : AT(ADDR(.got) - LOAD_OFFSET) { - __toc_start = .; + .got : AT(ADDR(.got) - LOAD_OFFSET) ALIGN(256) { + *(.got) #ifndef CONFIG_RELOCATABLE __prom_init_toc_start = .; - arch/powerpc/kernel/prom_init.o*(.toc .got) + arch/powerpc/kernel/prom_init.o*(.toc) __prom_init_toc_end = .; #endif - *(.got) *(.toc) } #endif From f5140cab448e4819ca6f158cb4130352f73c92e4 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 21 Dec 2021 16:59:00 +1100 Subject: [PATCH 214/240] powerpc: check for support for -Wa,-m{power4,any} LLVM's integrated assembler does not like either -Wa,-mpower4 or -Wa,-many. So just don't pass them if they're not supported. Signed-off-by: Daniel Axtens Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211221055904.555763-3-aik@ozlabs.ru --- arch/powerpc/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index e9aa4e8b07dd..5f16ac1583c5 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -245,7 +245,9 @@ cpu-as-$(CONFIG_E500) += -Wa,-me500 # When using '-many -mpower4' gas will first try and find a matching power4 # mnemonic and failing that it will allow any valid mnemonic that GAS knows # about. GCC will pass -many to GAS when assembling, clang does not. 
-cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4 -Wa,-many +# LLVM IAS doesn't understand either flag: https://github.com/ClangBuiltLinux/linux/issues/675 +# but LLVM IAS only supports ISA >= 2.06 for Book3S 64 anyway... +cpu-as-$(CONFIG_PPC_BOOK3S_64) += $(call as-option,-Wa$(comma)-mpower4) $(call as-option,-Wa$(comma)-many) cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc) KBUILD_AFLAGS += $(cpu-as-y) From fd983957971632088908c646116383402f04084b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 21 Dec 2021 16:59:01 +1100 Subject: [PATCH 215/240] powerpc/64/asm: Inline BRANCH_TO_C000 It is used just once and does not really help with readability, remove it. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211221055904.555763-4-aik@ozlabs.ru --- arch/powerpc/kernel/exceptions-64s.S | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6fe7d7926370..6f29fb789c9a 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -89,19 +89,6 @@ name: ori reg,reg,(ABS_ADDR(label))@l; \ addis reg,reg,(ABS_ADDR(label))@h -/* - * Branch to label using its 0xC000 address. This results in instruction - * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned - * on using mtmsr rather than rfid. - * - * This could set the 0xc bits for !RELOCATABLE as an immediate, rather than - * load KBASE for a slight optimisation. - */ -#define BRANCH_TO_C000(reg, label) \ - __LOAD_FAR_HANDLER(reg, label); \ - mtctr reg; \ - bctr - /* * Interrupt code generation macros */ @@ -974,7 +961,9 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) /* We are waking up from idle, so may clobber any volatile register */ cmpwi cr1,r5,2 bltlr cr1 /* no state loss, return to idle caller with r3=SRR1 */ - BRANCH_TO_C000(r12, DOTSYM(idle_return_gpr_loss)) + __LOAD_FAR_HANDLER(r12, DOTSYM(idle_return_gpr_loss)) + mtctr r12 + bctr #endif #ifdef CONFIG_PPC_PSERIES From d72c4a36d7ab560127885473a310ece28988b604 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 21 Dec 2021 16:59:02 +1100 Subject: [PATCH 216/240] powerpc/64/asm: Do not reassign labels The LLVM integrated assembler really does not like us reassigning things to the same label: :7:9: error: invalid reassignment of non-absolute variable 'fs_label' This happens across a bunch of platforms: https://github.com/ClangBuiltLinux/linux/issues/1043 https://github.com/ClangBuiltLinux/linux/issues/1008 https://github.com/ClangBuiltLinux/linux/issues/920 https://github.com/ClangBuiltLinux/linux/issues/1050 There is no hope of getting this fixed in LLVM (see https://github.com/ClangBuiltLinux/linux/issues/1043#issuecomment-641571200 and https://bugs.llvm.org/show_bug.cgi?id=47798#c1 ) so if we want to build with LLVM_IAS, we need to hack around it ourselves. For us the big problem comes from this: \#define USE_FIXED_SECTION(sname) \ fs_label = start_##sname; \ fs_start = sname##_start; \ use_ftsec sname; \#define USE_TEXT_SECTION() fs_label = start_text; \ fs_start = text_start; \ .text and in particular fs_label. This works around it by not setting those 'variables' and requiring that users of the variables instead track for themselves what section they are in. This isn't amazing, by any stretch, but it gets us further in the compilation. 
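The shape of the workaround is easiest to see on ABS_ADDR alone; a
condensed before/after sketch (the full head-64.h hunk follows below):

	/* Before: relies on the last USE_*_SECTION() having set fs_label/fs_start. */
	#define ABS_ADDR(label) (label - fs_label + fs_start)

	/* After: the caller names the section, so nothing is ever reassigned. */
	#define ABS_ADDR(label, sname) (label - start_ ## sname + sname ## _start)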
Note that even though users have to keep track of the section, using a
wrong one produces an error with both binutils and llvm, which prevents
the wrong section from being used at compile time:

llvm error example:

AS arch/powerpc/kernel/head_64.o
:0: error: Cannot represent a difference across sections
make[3]: *** [/home/aik/p/kernels-llvm/llvm/scripts/Makefile.build:388: arch/powerpc/kernel/head_64.o] Error 1

binutils error example:

/home/aik/p/kernels-llvm/llvm/arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
/home/aik/p/kernels-llvm/llvm/arch/powerpc/kernel/exceptions-64s.S:1974: Error: can't resolve `system_call_common' {.text section} - `start_real_vectors' {.head.text.real_vectors section}
make[3]: *** [/home/aik/p/kernels-llvm/llvm/scripts/Makefile.build:388: arch/powerpc/kernel/head_64.o] Error 1

Signed-off-by: Daniel Axtens
Signed-off-by: Alexey Kardashevskiy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211221055904.555763-5-aik@ozlabs.ru
---
arch/powerpc/include/asm/head-64.h | 12 +++++------
arch/powerpc/kernel/exceptions-64s.S | 32 ++++++++++++++--------------
arch/powerpc/kernel/head_64.S | 18 ++++++++--------
arch/powerpc/kernel/interrupt_64.S | 2 +-
4 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index 242204e12993..d73153b0275d 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -98,13 +98,9 @@ start_text: . = sname##_len;
#define USE_FIXED_SECTION(sname) \
- fs_label = start_##sname; \
- fs_start = sname##_start; \
use_ftsec sname;

#define USE_TEXT_SECTION() \
- fs_label = start_text; \
- fs_start = text_start; \
.text

#define CLOSE_FIXED_SECTION(sname) \
@@ -161,13 +157,15 @@ name:
* - ABS_ADDR is used to find the absolute address of any symbol, from within
* a fixed section.
*/
-#define DEFINE_FIXED_SYMBOL(label) \
- label##_absolute = (label - fs_label + fs_start)
+// define label as being _in_ sname
+#define DEFINE_FIXED_SYMBOL(label, sname) \
+ label##_absolute = (label - start_ ## sname + sname ## _start)

#define FIXED_SYMBOL_ABS_ADDR(label) \
(label##_absolute)

-#define ABS_ADDR(label) (label - fs_label + fs_start)
+// find label from _within_ sname
+#define ABS_ADDR(label, sname) (label - start_ ## sname + sname ## _start)

#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6f29fb789c9a..55caeee37c08 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -48,7 +48,7 @@
.balign IFETCH_ALIGN_BYTES; \
.global name; \
_ASM_NOKPROBE_SYMBOL(name); \
- DEFINE_FIXED_SYMBOL(name); \
+ DEFINE_FIXED_SYMBOL(name, text); \
name:

#define TRAMP_REAL_BEGIN(name) \
@@ -76,18 +76,18 @@ name:
ld reg,PACAKBASE(r13); /* get high part of &label */ \
ori reg,reg,FIXED_SYMBOL_ABS_ADDR(label)

-#define __LOAD_HANDLER(reg, label) \
+#define __LOAD_HANDLER(reg, label, section) \
ld reg,PACAKBASE(r13); \
- ori reg,reg,(ABS_ADDR(label))@l
+ ori reg,reg,(ABS_ADDR(label, section))@l

/*
* Branches from unrelocated code (e.g., interrupts) to labels outside
* head-y require >64K offsets.
*/ -#define __LOAD_FAR_HANDLER(reg, label) \ +#define __LOAD_FAR_HANDLER(reg, label, section) \ ld reg,PACAKBASE(r13); \ - ori reg,reg,(ABS_ADDR(label))@l; \ - addis reg,reg,(ABS_ADDR(label))@h + ori reg,reg,(ABS_ADDR(label, section))@l; \ + addis reg,reg,(ABS_ADDR(label, section))@h /* * Interrupt code generation macros @@ -381,7 +381,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) * This switches to virtual mode and sets MSR[RI]. */ .macro __GEN_COMMON_ENTRY name -DEFINE_FIXED_SYMBOL(\name\()_common_real) +DEFINE_FIXED_SYMBOL(\name\()_common_real, text) \name\()_common_real: .if IKVM_REAL KVMTEST \name kvm_interrupt @@ -404,7 +404,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real) .endif .balign IFETCH_ALIGN_BYTES -DEFINE_FIXED_SYMBOL(\name\()_common_virt) +DEFINE_FIXED_SYMBOL(\name\()_common_virt, text) \name\()_common_virt: .if IKVM_VIRT KVMTEST \name kvm_interrupt @@ -418,7 +418,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt) * want to run in real mode. */ .macro __GEN_REALMODE_COMMON_ENTRY name -DEFINE_FIXED_SYMBOL(\name\()_common_real) +DEFINE_FIXED_SYMBOL(\name\()_common_real, text) \name\()_common_real: .if IKVM_REAL KVMTEST \name kvm_interrupt @@ -852,12 +852,12 @@ SOFT_MASK_TABLE(0xc000000000003000, 0xc000000000004000) #ifdef CONFIG_RELOCATABLE TRAMP_VIRT_BEGIN(system_call_vectored_tramp) - __LOAD_HANDLER(r10, system_call_vectored_common) + __LOAD_HANDLER(r10, system_call_vectored_common, virt_trampolines) mtctr r10 bctr TRAMP_VIRT_BEGIN(system_call_vectored_sigill_tramp) - __LOAD_HANDLER(r10, system_call_vectored_sigill) + __LOAD_HANDLER(r10, system_call_vectored_sigill, virt_trampolines) mtctr r10 bctr #endif @@ -961,7 +961,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) /* We are waking up from idle, so may clobber any volatile register */ cmpwi cr1,r5,2 bltlr cr1 /* no state loss, return to idle caller with r3=SRR1 */ - __LOAD_FAR_HANDLER(r12, DOTSYM(idle_return_gpr_loss)) + __LOAD_FAR_HANDLER(r12, DOTSYM(idle_return_gpr_loss), real_trampolines) mtctr r12 bctr #endif @@ -1960,12 +1960,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) HMT_MEDIUM .if ! \virt - __LOAD_HANDLER(r10, system_call_common_real) + __LOAD_HANDLER(r10, system_call_common_real, real_vectors) mtctr r10 bctr .else #ifdef CONFIG_RELOCATABLE - __LOAD_HANDLER(r10, system_call_common) + __LOAD_HANDLER(r10, system_call_common, virt_vectors) mtctr r10 bctr #else @@ -2019,7 +2019,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * Requires __LOAD_FAR_HANDLER beause kvmppc_hcall lives * outside the head section. */ - __LOAD_FAR_HANDLER(r10, kvmppc_hcall) + __LOAD_FAR_HANDLER(r10, kvmppc_hcall, real_trampolines) mtctr r10 bctr #else @@ -3061,7 +3061,7 @@ USE_FIXED_SECTION(virt_trampolines) .align 7 .globl __end_interrupts __end_interrupts: -DEFINE_FIXED_SYMBOL(__end_interrupts) +DEFINE_FIXED_SYMBOL(__end_interrupts, virt_trampolines) CLOSE_FIXED_SECTION(real_vectors); CLOSE_FIXED_SECTION(real_trampolines); diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index a08c050ff645..5c5181e8d5f1 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -126,7 +126,7 @@ __secondary_hold_acknowledge: . = 0x5c .globl __run_at_load __run_at_load: -DEFINE_FIXED_SYMBOL(__run_at_load) +DEFINE_FIXED_SYMBOL(__run_at_load, first_256B) .long RUN_AT_LOAD_DEFAULT #endif @@ -156,7 +156,7 @@ __secondary_hold: /* Tell the master cpu we're here */ /* Relocation is off & we are located at an address less */ /* than 0x100, so only need to grab low order offset. 
*/
- std r24,(ABS_ADDR(__secondary_hold_acknowledge))(0)
+ std r24,(ABS_ADDR(__secondary_hold_acknowledge, first_256B))(0)
sync
li r26,0
@@ -164,7 +164,7 @@ __secondary_hold:
tovirt(r26,r26)
#endif
/* All secondary cpus wait here until told to start. */
-100: ld r12,(ABS_ADDR(__secondary_hold_spinloop))(r26)
+100: ld r12,(ABS_ADDR(__secondary_hold_spinloop, first_256B))(r26)
cmpdi 0,r12,0
beq 100b
@@ -649,15 +649,15 @@ __after_prom_start:
3:
#endif
/* # bytes of memory to copy */
- lis r5,(ABS_ADDR(copy_to_here))@ha
- addi r5,r5,(ABS_ADDR(copy_to_here))@l
+ lis r5,(ABS_ADDR(copy_to_here, text))@ha
+ addi r5,r5,(ABS_ADDR(copy_to_here, text))@l

bl copy_and_flush /* copy the first n bytes */
/* this includes the code being */
/* executed here. */
/* Jump to the copy of this code that we just made */
- addis r8,r3,(ABS_ADDR(4f))@ha
- addi r12,r8,(ABS_ADDR(4f))@l
+ addis r8,r3,(ABS_ADDR(4f, text))@ha
+ addi r12,r8,(ABS_ADDR(4f, text))@l
mtctr r12
bctr

@@ -669,8 +669,8 @@ p_end: .8byte _end - copy_to_here
* Now copy the rest of the kernel up to _end, add
* _end - copy_to_here to the copy limit and run again.
*/
- addis r8,r26,(ABS_ADDR(p_end))@ha
- ld r8,(ABS_ADDR(p_end))@l(r8)
+ addis r8,r26,(ABS_ADDR(p_end, text))@ha
+ ld r8,(ABS_ADDR(p_end, text))@l(r8)
add r5,r5,r8
5: bl copy_and_flush /* copy the rest */
diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S
index 2ad223597ca2..d3180139e35a 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -695,7 +695,7 @@ interrupt_return_macro hsrr

.globl __end_soft_masked
__end_soft_masked:
-DEFINE_FIXED_SYMBOL(__end_soft_masked)
+DEFINE_FIXED_SYMBOL(__end_soft_masked, text)
#endif /* CONFIG_PPC_BOOK3S */

#ifdef CONFIG_PPC_BOOK3S

From d51f86cfd8e378d4907958db77da3074f6dce3ba Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Tue, 21 Dec 2021 16:59:03 +1100
Subject: [PATCH 217/240] powerpc/mm: Switch obsolete dssall to .long

The dssall ("Data Stream Stop All") instruction has been obsolete, along
with the other Data Cache Instructions, since ISA 2.03 (year 2006).

LLVM IAS does not support it, but PPC970 code still seems to use it.

This switches dssall to .long, as there is not much point in fixing LLVM.
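The replacement pattern relies on the instruction having a fixed
encoding: the opcode word is emitted directly, so the assembler never
needs to know the mnemonic. A rough sketch using the PPC_INST_DSSALL
value added below (the wrapper function is hypothetical):

	/* dssall always encodes to 0x7e00066c, so emit the word directly;
	 * no assembler support for the mnemonic is required. */
	static inline void stop_data_streams(void)
	{
		asm volatile(".long 0x7e00066c ; sync" ::: "memory");
	}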
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211221055904.555763-6-aik@ozlabs.ru --- arch/powerpc/include/asm/ppc-opcode.h | 2 ++ arch/powerpc/kernel/idle.c | 2 +- arch/powerpc/kernel/idle_6xx.S | 2 +- arch/powerpc/kernel/l2cr_6xx.S | 6 +++--- arch/powerpc/kernel/swsusp_32.S | 2 +- arch/powerpc/kernel/swsusp_asm64.S | 2 +- arch/powerpc/mm/mmu_context.c | 2 +- arch/powerpc/platforms/powermac/cache.S | 4 ++-- 8 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index f50213e2a3e0..9fe3223e7820 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -249,6 +249,7 @@ #define PPC_INST_COPY 0x7c20060c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe +#define PPC_INST_DSSALL 0x7e00066c #define PPC_INST_ISEL 0x7c00001e #define PPC_INST_ISEL_MASK 0xfc00003e #define PPC_INST_LSWI 0x7c0004aa @@ -577,6 +578,7 @@ #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_RAW_DCBZL(a, b)) #define PPC_DIVDE(t, a, b) stringify_in_c(.long PPC_RAW_DIVDE(t, a, b)) #define PPC_DIVDEU(t, a, b) stringify_in_c(.long PPC_RAW_DIVDEU(t, a, b)) +#define PPC_DSSALL stringify_in_c(.long PPC_INST_DSSALL) #define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh)) #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_RAW_STQCX(t, a, b)) #define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHD(t, a, b, c)) diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 1f835539fda4..4ad79eb638c6 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -82,7 +82,7 @@ void power4_idle(void) return; if (cpu_has_feature(CPU_FTR_ALTIVEC)) - asm volatile("DSSALL ; sync" ::: "memory"); + asm volatile(PPC_DSSALL " ; sync" ::: "memory"); power4_idle_nap(); diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index 13cad9297d82..3c097356366b 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -129,7 +129,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFCLR(CPU_FTR_NO_DPM) mtspr SPRN_HID0,r4 BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) lwz r8,TI_LOCAL_FLAGS(r2) /* set napping bit */ diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 225511d73bef..f2e03ed423d0 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -96,7 +96,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L2CR) /* Stop DST streams */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) @@ -292,7 +292,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) isync /* Stop DST streams */ - DSSALL + PPC_DSSALL sync /* Get the current enable bit of the L3CR into r4 */ @@ -401,7 +401,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_L3CR) _GLOBAL(__flush_disable_L1) /* Stop pending alitvec streams and memory accesses */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) sync diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S index f73f4d72fea4..e0cbd63007f2 100644 --- a/arch/powerpc/kernel/swsusp_32.S +++ b/arch/powerpc/kernel/swsusp_32.S @@ -181,7 +181,7 @@ _GLOBAL(swsusp_arch_resume) #ifdef CONFIG_ALTIVEC /* Stop pending alitvec streams and memory accesses */ BEGIN_FTR_SECTION - DSSALL + PPC_DSSALL END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif sync diff --git a/arch/powerpc/kernel/swsusp_asm64.S 
b/arch/powerpc/kernel/swsusp_asm64.S
index 96bb20715aa9..9f1903c7f540 100644
--- a/arch/powerpc/kernel/swsusp_asm64.S
+++ b/arch/powerpc/kernel/swsusp_asm64.S
@@ -141,7 +141,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR)
_GLOBAL(swsusp_arch_resume)
/* Stop pending alitvec streams and memory accesses */
BEGIN_FTR_SECTION
- DSSALL
+ PPC_DSSALL
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
sync
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 735c36f26388..1fb9c99f8679 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -90,7 +90,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* context
*/
if (cpu_has_feature(CPU_FTR_ALTIVEC))
- asm volatile ("dssall");
+ asm volatile (PPC_DSSALL);

if (!new_on_cpu)
membarrier_arch_switch_mm(prev, next, tsk);
diff --git a/arch/powerpc/platforms/powermac/cache.S b/arch/powerpc/platforms/powermac/cache.S
index ced225415486..b8ae56e9f414 100644
--- a/arch/powerpc/platforms/powermac/cache.S
+++ b/arch/powerpc/platforms/powermac/cache.S
@@ -48,7 +48,7 @@ flush_disable_75x:
/* Stop DST streams */
BEGIN_FTR_SECTION
- DSSALL
+ PPC_DSSALL
sync
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
@@ -197,7 +197,7 @@ flush_disable_745x:
isync

/* Stop prefetch streams */
- DSSALL
+ PPC_DSSALL
sync

/* Disable L2 prefetching */

From 62479e6e26ef18f00e2e540c0e30156254533a43 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Tue, 21 Dec 2021 16:59:04 +1100
Subject: [PATCH 218/240] powerpc/mm/book3s64/hash: Switch pre 2.06 tlbiel to .long

The llvm integrated assembler does not recognise the ISA 2.05 tlbiel
version. Work around it by switching to .long when an old arch level is
detected.

Signed-off-by: Daniel Axtens
[aik: did "Eventually do this more smartly"]
Signed-off-by: Alexey Kardashevskiy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20211221055904.555763-7-aik@ozlabs.ru
---
arch/powerpc/include/asm/ppc-opcode.h | 2 ++
arch/powerpc/mm/book3s64/hash_native.c | 4 ++--
2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 9fe3223e7820..efad07081cc0 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -394,6 +394,7 @@
(0x7c000264 | ___PPC_RB(rb) | ___PPC_RS(rs) | ___PPC_RIC(ric) | ___PPC_PRS(prs) | ___PPC_R(r))
#define PPC_RAW_TLBIEL(rb, rs, ric, prs, r) \
(0x7c000224 | ___PPC_RB(rb) | ___PPC_RS(rs) | ___PPC_RIC(ric) | ___PPC_PRS(prs) | ___PPC_R(r))
+#define PPC_RAW_TLBIEL_v205(rb, l) (0x7c000224 | ___PPC_RB(rb) | (l << 21))
#define PPC_RAW_TLBSRX_DOT(a, b) (0x7c0006a5 | __PPC_RA0(a) | __PPC_RB(b))
#define PPC_RAW_TLBIVAX(a, b) (0x7c000624 | __PPC_RA0(a) | __PPC_RB(b))
#define PPC_RAW_ERATWE(s, a, w) (0x7c0001a6 | __PPC_RS(s) | __PPC_RA(a) | __PPC_WS(w))
@@ -606,6 +607,7 @@
stringify_in_c(.long PPC_RAW_TLBIE_5(rb, rs, ric, prs, r))
#define PPC_TLBIEL(rb,rs,ric,prs,r) \
stringify_in_c(.long PPC_RAW_TLBIEL(rb, rs, ric, prs, r))
+#define PPC_TLBIEL_v205(rb, l) stringify_in_c(.long PPC_RAW_TLBIEL_v205(rb, l))
#define PPC_TLBSRX_DOT(a, b) stringify_in_c(.long PPC_RAW_TLBSRX_DOT(a, b))
#define PPC_TLBIVAX(a, b) stringify_in_c(.long PPC_RAW_TLBIVAX(a, b))

diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index d2a320828c0b..623a7b7ab38b 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -163,7 +163,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int
apsize, int ssize) va |= ssize << 8; sllp = get_sllp_encoding(apsize); va |= sllp << 5; - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,0", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 0), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; @@ -182,7 +182,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) */ va |= (vpn & 0xfe); va |= 1; /* L */ - asm volatile(ASM_FTR_IFSET("tlbiel %0", "tlbiel %0,1", %1) + asm volatile(ASM_FTR_IFSET("tlbiel %0", PPC_TLBIEL_v205(%0, 1), %1) : : "r" (va), "i" (CPU_FTR_ARCH_206) : "memory"); break; From edecd2d6d6f4a122dd62bce654b4f63301e8ad9a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Dec 2021 13:00:17 +0100 Subject: [PATCH 219/240] powerpc/code-patching: Remove pr_debug()/pr_devel() messages and fix check() code-patching has been working for years now, time has come to remove debugging messages. Change useful message to KERN_INFO and remove other ones. Also add KERN_ERR to check() macro and change it into a do/while to make checkpatch happy. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3ff9823c0a812a8a145d979a9600a6d4591b80ee.1638446239.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index ee54cb447f80..7334ea99efd0 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -95,7 +95,6 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr) err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL); - pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err); if (err) return -1; @@ -130,8 +129,6 @@ static inline int unmap_patch_area(unsigned long addr) if (unlikely(!ptep)) return -EINVAL; - pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr); - /* * In hash, pte_clear flushes the tlb, in radix, we have to */ @@ -190,10 +187,9 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) int patch_instruction(u32 *addr, ppc_inst_t instr) { /* Make sure we aren't patching a freed init section */ - if (init_mem_is_free && init_section_contains(addr, 4)) { - pr_debug("Skipping init section patching addr: 0x%px\n", addr); + if (init_mem_is_free && init_section_contains(addr, 4)) return 0; - } + return do_patch_instruction(addr, instr); } NOKPROBE_SYMBOL(patch_instruction); @@ -411,8 +407,10 @@ static void __init test_trampoline(void) asm ("nop;\n"); } -#define check(x) \ - if (!(x)) printk("code-patching: test failed at line %d\n", __LINE__); +#define check(x) do { \ + if (!(x)) \ + pr_err("code-patching: test failed at line %d\n", __LINE__); \ +} while (0) static void __init test_branch_iform(void) { @@ -737,7 +735,7 @@ static inline void test_prefixed_patching(void) {} static int __init test_code_patching(void) { - printk(KERN_DEBUG "Running code patching self-tests ...\n"); + pr_info("Running code patching self-tests ...\n"); test_branch_iform(); test_branch_bform(); From af5304a7506588221d8317ef3f76585eb4483506 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Dec 2021 13:00:18 +0100 Subject: [PATCH 220/240] powerpc/code-patching: Remove init_mem_is_free A new state has been added by commit d2635f2012a4 ("mm: create a new system state and fix core_kernel_text()"). 
That state tells when initmem is about to be released and is redundant
with init_mem_is_free.

Remove init_mem_is_free.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/ad8c3ccb39c8edaa89fd3eda1cc7218baea1cde5.1638446239.git.christophe.leroy@csgroup.eu
---
arch/powerpc/include/asm/setup.h | 1 -
arch/powerpc/lib/code-patching.c | 3 +--
arch/powerpc/mm/mem.c | 2 --
3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index 71658504dadd..d0d3dd531c7f 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -9,7 +9,6 @@ extern void ppc_printk_progress(char *s, unsigned short hex);

extern unsigned int rtas_data;
extern unsigned long long memory_limit;
-extern bool init_mem_is_free;
extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);

struct device_node;
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 7334ea99efd0..1c05fc725af8 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -15,7 +15,6 @@
#include
#include
#include
-#include
#include

static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr)
@@ -187,7 +186,7 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr)
int patch_instruction(u32 *addr, ppc_inst_t instr)
{
	/* Make sure we aren't patching a freed init section */
-	if (init_mem_is_free && init_section_contains(addr, 4))
+	if (system_state >= SYSTEM_FREEING_INITMEM && init_section_contains(addr, 4))
		return 0;

	return do_patch_instruction(addr, instr);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index bd5d91a31183..8e301cd8925b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -26,7 +26,6 @@
#include

unsigned long long memory_limit;
-bool init_mem_is_free;
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);
@@ -312,7 +311,6 @@ void free_initmem(void)
{
	ppc_md.progress = ppc_printk_progress;
	mark_initmem_nx();
-	init_mem_is_free = true;
	free_initmem_default(POISON_FREE_INITMEM);
}

From 285672f99327d5b8febdf83cadba61a68abe5d69 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Dec 2021 13:00:19 +0100
Subject: [PATCH 221/240] powerpc/code-patching: Fix error handling in do_patch_instruction()

Use real errors instead of -1 as the error value, so that errors returned
by callees can be propagated to callers.
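The idea is generic: flattening every failure to -1 discards the cause,
while forwarding the callee's errno-style code preserves it for the
caller. A schematic sketch (do_step() is a stand-in for the real callee):

	/* Before: the real cause of the failure is lost. */
	static int old_style(u32 *addr)
	{
		if (do_step(addr))
			return -1;
		return 0;
	}

	/* After: -ENOMEM, -EINVAL, ... travel up unchanged. */
	static int new_style(u32 *addr)
	{
		int err = do_step(addr);

		if (err)
			return err;
		return 0;
	}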
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/85259d894069e47f915ea580b169e1adbeec7a61.1638446239.git.christophe.leroy@csgroup.eu
---
arch/powerpc/lib/code-patching.c | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 1c05fc725af8..380c55d2e41a 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -85,19 +85,13 @@ void __init poking_init(void)
static int map_patch_area(void *addr, unsigned long text_poke_addr)
{
	unsigned long pfn;
-	int err;

	if (is_vmalloc_or_module_addr(addr))
		pfn = vmalloc_to_pfn(addr);
	else
		pfn = __pa_symbol(addr) >> PAGE_SHIFT;

-	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
-
-	if (err)
-		return -1;
-
-	return 0;
+	return map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
}

static inline int unmap_patch_area(unsigned long addr)
@@ -156,10 +150,9 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr)
	local_irq_save(flags);

	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
-	if (map_patch_area(addr, text_poke_addr)) {
-		err = -1;
+	err = map_patch_area(addr, text_poke_addr);
+	if (err)
		goto out;
-	}

	patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));

From a3483c3dd18c136785a31406fe27210649fc4fba Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Dec 2021 13:00:20 +0100
Subject: [PATCH 222/240] powerpc/code-patching: Fix unmap_patch_area() error handling

pXd_offset() doesn't return NULL. When the base is NULL, it still adds
the offset.

Use pXd_none() to check validity instead. It also improves performance
by folding out non-existent levels, as pXd_none() always returns 0 in
that case.

Such an error is unexpected, so report it with WARN_ON() rather than
making the caller worry about it, and drop the returned value.

And now that unmap_patch_area() doesn't return an error, we can take
into account the error returned by __patch_instruction().

While at it, remove the 'inline' property, which is useless.
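To spell out why the old checks could never trigger: pXd_offset()
performs base-plus-index arithmetic, so its result is never NULL;
emptiness is a property of the entry, which only pXd_none() can test.
Condensed from the hunk below, the corrected walk looks the same at each
level:

	pgd_t *pgdp = pgd_offset_k(addr);

	/* The pointer is always valid; the *entry* may be empty. */
	if (WARN_ON(pgd_none(*pgdp)))
		return;
	/* ...and likewise for p4d, pud, pmd and pte. */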
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/299804b117fae35c786c827536c91f25352e279b.1638446239.git.christophe.leroy@csgroup.eu
---
arch/powerpc/lib/code-patching.c | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 380c55d2e41a..740ba0dc5da0 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -94,7 +94,7 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr)
	return map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
}

-static inline int unmap_patch_area(unsigned long addr)
+static void unmap_patch_area(unsigned long addr)
{
	pte_t *ptep;
	pmd_t *pmdp;
@@ -103,32 +103,30 @@ static inline int unmap_patch_area(unsigned long addr)
	pgd_t *pgdp;

	pgdp = pgd_offset_k(addr);
-	if (unlikely(!pgdp))
-		return -EINVAL;
+	if (WARN_ON(pgd_none(*pgdp)))
+		return;

	p4dp = p4d_offset(pgdp, addr);
-	if (unlikely(!p4dp))
-		return -EINVAL;
+	if (WARN_ON(p4d_none(*p4dp)))
+		return;

	pudp = pud_offset(p4dp, addr);
-	if (unlikely(!pudp))
-		return -EINVAL;
+	if (WARN_ON(pud_none(*pudp)))
+		return;

	pmdp = pmd_offset(pudp, addr);
-	if (unlikely(!pmdp))
-		return -EINVAL;
+	if (WARN_ON(pmd_none(*pmdp)))
+		return;

	ptep = pte_offset_kernel(pmdp, addr);
-	if (unlikely(!ptep))
-		return -EINVAL;
+	if (WARN_ON(pte_none(*ptep)))
+		return;

	/*
	 * In hash, pte_clear flushes the tlb, in radix, we have to
	 */
	pte_clear(&init_mm, addr, ptep);
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-
-	return 0;
}

static int do_patch_instruction(u32 *addr, ppc_inst_t instr)
@@ -156,11 +154,9 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr)

	patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));

-	__patch_instruction(addr, instr, patch_addr);
+	err = __patch_instruction(addr, instr, patch_addr);

-	err = unmap_patch_area(text_poke_addr);
-	if (err)
-		pr_warn("failed to unmap %lx\n", text_poke_addr);
+	unmap_patch_area(text_poke_addr);

out:
	local_irq_restore(flags);

From 6b21af74495b556f9d496d97d74e7a3d0ab16d7c Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Dec 2021 13:00:21 +0100
Subject: [PATCH 223/240] powerpc/code-patching: Reorganise do_patch_instruction() to ease error handling

Split do_patch_instruction() into two functions, the caller doing the
locking and the callee doing everything else. Also remove a few
unnecessary initialisations and intermediate variables.

This allows the callee to return from anywhere in the function.
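In miniature, the benefit of the split is that the outer function owns
the critical section while the inner one can use early returns freely. A
generic sketch of the shape (prepare() and finish() are hypothetical
stand-ins):

	static int __do_work(u32 *addr)
	{
		int err = prepare(addr);

		if (err)
			return err;	/* early return, no cleanup owed */
		return finish(addr);
	}

	static int do_work(u32 *addr)
	{
		unsigned long flags;
		int err;

		local_irq_save(flags);
		err = __do_work(addr);
		local_irq_restore(flags);	/* restored on every path */

		return err;
	}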
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/dbc85980a0d2a935731b272e8907e8bb1d8fc8c5.1638446239.git.christophe.leroy@csgroup.eu
---
arch/powerpc/lib/code-patching.c | 37 ++++++++++++++++++--------------
1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 740ba0dc5da0..3da7224fbd6e 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -129,13 +129,30 @@ static void unmap_patch_area(unsigned long addr)
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
}

+static int __do_patch_instruction(u32 *addr, ppc_inst_t instr)
+{
+	int err;
+	u32 *patch_addr;
+	unsigned long text_poke_addr;
+
+	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
+	patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
+
+	err = map_patch_area(addr, text_poke_addr);
+	if (err)
+		return err;
+
+	err = __patch_instruction(addr, instr, patch_addr);
+
+	unmap_patch_area(text_poke_addr);
+
+	return err;
+}
+
static int do_patch_instruction(u32 *addr, ppc_inst_t instr)
{
	int err;
-	u32 *patch_addr = NULL;
	unsigned long flags;
-	unsigned long text_poke_addr;
-	unsigned long kaddr = (unsigned long)addr;

	/*
	 * During early early boot patch_instruction is called
@@ -146,19 +163,7 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr)
		return raw_patch_instruction(addr, instr);

	local_irq_save(flags);
-
-	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
-	err = map_patch_area(addr, text_poke_addr);
-	if (err)
-		goto out;
-
-	patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));
-
-	err = __patch_instruction(addr, instr, patch_addr);
-
-	unmap_patch_area(text_poke_addr);
-
-out:
+	err = __do_patch_instruction(addr, instr);
	local_irq_restore(flags);

	return err;

From d5937db114e4b6446c62809484729955f1aeb108 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Dec 2021 13:00:22 +0100
Subject: [PATCH 224/240] powerpc/code-patching: Fix patch_branch() return on out-of-range failure

Do not silently ignore a failure of create_branch() in patch_branch().
Return -ERANGE.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/8540cb64b1f06710eaf41e3835c7ba3e21fa2b05.1638446239.git.christophe.leroy@csgroup.eu
---
arch/powerpc/lib/code-patching.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 3da7224fbd6e..998aeb9e1aac 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -191,7 +191,9 @@ int patch_branch(u32 *addr, unsigned long target, int flags)
{
	ppc_inst_t instr;

-	create_branch(&instr, addr, target, flags);
+	if (create_branch(&instr, addr, target, flags))
+		return -ERANGE;
+
	return patch_instruction(addr, instr);
}

From ff14a9c09fe91a70bfc6381809877e5a19e38cdb Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Dec 2021 13:00:23 +0100
Subject: [PATCH 225/240] powerpc/code-patching: Use test_trampoline for prefixed patch test

Use the dedicated test_trampoline function for testing prefixed patching,
like the other tests, and remove the hand-coded assembly stuff.
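For context, an ISA 3.1 prefixed instruction occupies eight bytes: a
prefix word whose primary opcode is OP_PREFIX (1), then a suffix word.
The reworked test patches the simplest such pair over the two nops in
test_trampoline; the essential construction is:

	/* Prefix word: primary opcode 1 in the top 6 bits (0x04000000),
	 * followed by an all-zero suffix word. */
	ppc_inst_t inst = ppc_inst_prefix(OP_PREFIX << 26, 0);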
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/a450ef3f8653f75e1bd9aaf7a3889d379752f33b.1638446239.git.christophe.leroy@csgroup.eu
---
arch/powerpc/lib/Makefile | 2 +-
arch/powerpc/lib/code-patching.c | 24 +++++++++---------------
arch/powerpc/lib/test_code-patching.S | 20 --------------------
3 files changed, 10 insertions(+), 36 deletions(-)
delete mode 100644 arch/powerpc/lib/test_code-patching.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 9e5d0f413b71..c2654894b468 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -19,7 +19,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
endif

-obj-y += alloc.o code-patching.o feature-fixups.o pmem.o test_code-patching.o
+obj-y += alloc.o code-patching.o feature-fixups.o pmem.o

ifndef CONFIG_KASAN
obj-y += string.o memcmp_$(BITS).o
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 998aeb9e1aac..441a71c22dfa 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -399,7 +399,7 @@ static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr)

static void __init test_trampoline(void)
{
-	asm ("nop;\n");
+	asm ("nop;nop;\n");
}

#define check(x) do { \
@@ -708,25 +708,19 @@ static void __init test_translate_branch(void)
	vfree(buf);
}

-#ifdef CONFIG_PPC64
static void __init test_prefixed_patching(void)
{
-	extern unsigned int code_patching_test1[];
-	extern unsigned int code_patching_test1_expected[];
-	extern unsigned int end_code_patching_test1[];
+	u32 *iptr = (u32 *)ppc_function_entry(test_trampoline);
+	u32 expected[2] = {OP_PREFIX << 26, 0};
+	ppc_inst_t inst = ppc_inst_prefix(OP_PREFIX << 26, 0);

-	__patch_instruction(code_patching_test1,
-			    ppc_inst_prefix(OP_PREFIX << 26, 0x00000000),
-			    code_patching_test1);
+	if (!IS_ENABLED(CONFIG_PPC64))
+		return;

-	check(!memcmp(code_patching_test1,
-		      code_patching_test1_expected,
-		      sizeof(unsigned int) *
-		      (end_code_patching_test1 - code_patching_test1)));
+	patch_instruction(iptr, inst);
+
+	check(!memcmp(iptr, expected, sizeof(expected)));
}
-#else
-static inline void test_prefixed_patching(void) {}
-#endif

static int __init test_code_patching(void)
{
diff --git a/arch/powerpc/lib/test_code-patching.S b/arch/powerpc/lib/test_code-patching.S
deleted file mode 100644
index a9be6107844e..000000000000
--- a/arch/powerpc/lib/test_code-patching.S
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2020 IBM Corporation
- */
-#include
-
-	.text
-
-#define globl(x) \
-	.globl x; \
-x:
-
-globl(code_patching_test1)
-	nop
-	nop
-globl(end_code_patching_test1)
-
-globl(code_patching_test1_expected)
-	.long OP_PREFIX << 26
-	.long 0x0000000

From 29562a9da29478834e57f81e3804e9ec7a6b350b Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Dec 2021 13:00:24 +0100
Subject: [PATCH 226/240] powerpc/code-patching: Move patch_exception() outside code-patching.c

patch_exception() is dedicated to book3e/64 and is nothing more than a
normal use of patch_branch(), so move it into a place dedicated to
book3e/64.
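The move is mechanical because the helper is nothing but pointer
arithmetic plus patch_branch(); its key line, annotated (as in the
function moved below):

	/* exc is a byte offset from the vector base; ibase is a u32 *, so
	 * exc / 4 is an instruction index and + 1 skips the leading nop to
	 * reach the branch that must be retargeted. */
	patch_branch(ibase + (exc / 4) + 1, addr, 0);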
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0968622b98b1fb51838c35b844c42ad6609de62e.1638446239.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 7 ------- arch/powerpc/include/asm/exception-64e.h | 4 ++++ arch/powerpc/include/asm/nohash/64/pgtable.h | 6 ++++++ arch/powerpc/lib/code-patching.c | 16 ---------------- arch/powerpc/mm/nohash/book3e_pgtable.c | 15 +++++++++++++++ 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 46e8c5a8ce51..275061c3c977 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -63,13 +63,6 @@ int instr_is_relative_link_branch(ppc_inst_t instr); unsigned long branch_target(const u32 *instr); int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src); bool is_conditional_branch(ppc_inst_t instr); -#ifdef CONFIG_PPC_BOOK3E_64 -void __patch_exception(int exc, unsigned long addr); -#define patch_exception(exc, name) do { \ - extern unsigned int name; \ - __patch_exception((exc), (unsigned long)&name); \ -} while (0) -#endif #define OP_RT_RA_MASK 0xffff0000UL #define LIS_R2 (PPC_RAW_LIS(_R2, 0)) diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index 40cdcb2fb057..b1ef1e92c34a 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -149,6 +149,10 @@ exc_##label##_book3e: addi r11,r13,PACA_EXTLB; \ TLB_MISS_RESTORE(r11) +#ifndef __ASSEMBLY__ +extern unsigned int interrupt_base_book3e; +#endif + #define SET_IVOR(vector_number, vector_offset) \ LOAD_REG_ADDR(r3,interrupt_base_book3e);\ ori r3,r3,vector_offset@l; \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 9d2905a47410..a3313e853e5e 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -313,6 +313,12 @@ extern int __meminit vmemmap_create_mapping(unsigned long start, unsigned long phys); extern void vmemmap_remove_mapping(unsigned long start, unsigned long page_size); +void __patch_exception(int exc, unsigned long addr); +#define patch_exception(exc, name) do { \ + extern unsigned int name; \ + __patch_exception((exc), (unsigned long)&name); \ +} while (0) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 441a71c22dfa..f4986a781d53 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -370,22 +370,6 @@ int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src) return 1; } -#ifdef CONFIG_PPC_BOOK3E_64 -void __patch_exception(int exc, unsigned long addr) -{ - extern unsigned int interrupt_base_book3e; - unsigned int *ibase = &interrupt_base_book3e; - - /* Our exceptions vectors start with a NOP and -then- a branch - * to deal with single stepping from userspace which stops on - * the second instruction. 
Thus we need to patch the second - * instruction of the exception, not the first one - */ - - patch_branch(ibase + (exc / 4) + 1, addr, 0); -} -#endif - #ifdef CONFIG_CODE_PATCHING_SELFTEST static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr) diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index 77884e24281d..7d4368d055a6 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -115,3 +116,17 @@ int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) smp_wmb(); return 0; } + +void __patch_exception(int exc, unsigned long addr) +{ + unsigned int *ibase = &interrupt_base_book3e; + + /* + * Our exceptions vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. Thus we need to patch the second + * instruction of the exception, not the first one. + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} From 31acc599564120fa41f9df2c567842d003728dab Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Dec 2021 13:00:25 +0100 Subject: [PATCH 227/240] powerpc/code-patching: Move instr_is_branch_{i/b}form() in code-patching.h To enable moving selftests in their own C file in following patch, move instr_is_branch_iform() and instr_is_branch_bform() to code-patching.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/fca0f3b191211b3681020885a611bf73eef20563.1638446239.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 15 +++++++++++++++ arch/powerpc/lib/code-patching.c | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 275061c3c977..e26080539c31 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -58,6 +58,21 @@ static inline int modify_instruction_site(s32 *site, unsigned int clr, unsigned return modify_instruction((unsigned int *)patch_site_addr(site), clr, set); } +static inline unsigned int branch_opcode(ppc_inst_t instr) +{ + return ppc_inst_primary_opcode(instr) & 0x3F; +} + +static inline int instr_is_branch_iform(ppc_inst_t instr) +{ + return branch_opcode(instr) == 18; +} + +static inline int instr_is_branch_bform(ppc_inst_t instr) +{ + return branch_opcode(instr) == 16; +} + int instr_is_relative_branch(ppc_inst_t instr); int instr_is_relative_link_branch(ppc_inst_t instr); unsigned long branch_target(const u32 *instr); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index f4986a781d53..e24a4b0ce877 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -285,21 +285,6 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr, return 0; } -static unsigned int branch_opcode(ppc_inst_t instr) -{ - return ppc_inst_primary_opcode(instr) & 0x3F; -} - -static int instr_is_branch_iform(ppc_inst_t instr) -{ - return branch_opcode(instr) == 18; -} - -static int instr_is_branch_bform(ppc_inst_t instr) -{ - return branch_opcode(instr) == 16; -} - int instr_is_relative_branch(ppc_inst_t instr) { if (ppc_inst_val(instr) & BRANCH_ABSOLUTE) From f30a578d7653f7dbb253a20daad4bcd9f881d6c9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Dec 2021 13:00:26 +0100 Subject: [PATCH 228/240] 
powerpc/code-patching: Move code patching selftests in its own file Code patching selftests are half of code-patching.c. As they are guarded by CONFIG_CODE_PATCHING_SELFTESTS, they'd be better in their own file. Also add a missing __init for instr_is_branch_to_addr() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c0c30504f04eb546a48ff77127a8bccd12a3d809.1638446239.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/Makefile | 2 + arch/powerpc/lib/code-patching.c | 355 ------------------------- arch/powerpc/lib/test-code-patching.c | 357 ++++++++++++++++++++++++++ 3 files changed, 359 insertions(+), 355 deletions(-) create mode 100644 arch/powerpc/lib/test-code-patching.c diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index c2654894b468..3e183f4b4bda 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -21,6 +21,8 @@ endif obj-y += alloc.o code-patching.o feature-fixups.o pmem.o +obj-$(CONFIG_CODE_PATCHING_SELFTEST) += test-code-patching.o + ifndef CONFIG_KASAN obj-y += string.o memcmp_$(BITS).o obj-$(CONFIG_PPC32) += strlen_32.o diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index e24a4b0ce877..906d43463366 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -3,13 +3,10 @@ * Copyright 2008 Michael Ellerman, IBM Corporation. */ -#include #include #include #include -#include #include -#include #include #include @@ -354,355 +351,3 @@ int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src) return 1; } - -#ifdef CONFIG_CODE_PATCHING_SELFTEST - -static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr) -{ - if (instr_is_branch_iform(ppc_inst_read(instr)) || - instr_is_branch_bform(ppc_inst_read(instr))) - return branch_target(instr) == addr; - - return 0; -} - -static void __init test_trampoline(void) -{ - asm ("nop;nop;\n"); -} - -#define check(x) do { \ - if (!(x)) \ - pr_err("code-patching: test failed at line %d\n", __LINE__); \ -} while (0) - -static void __init test_branch_iform(void) -{ - int err; - ppc_inst_t instr; - u32 tmp[2]; - u32 *iptr = tmp; - unsigned long addr = (unsigned long)tmp; - - /* The simplest case, branch to self, no flags */ - check(instr_is_branch_iform(ppc_inst(0x48000000))); - /* All bits of target set, and flags */ - check(instr_is_branch_iform(ppc_inst(0x4bffffff))); - /* High bit of opcode set, which is wrong */ - check(!instr_is_branch_iform(ppc_inst(0xcbffffff))); - /* Middle bits of opcode set, which is wrong */ - check(!instr_is_branch_iform(ppc_inst(0x7bffffff))); - - /* Simplest case, branch to self with link */ - check(instr_is_branch_iform(ppc_inst(0x48000001))); - /* All bits of targets set */ - check(instr_is_branch_iform(ppc_inst(0x4bfffffd))); - /* Some bits of targets set */ - check(instr_is_branch_iform(ppc_inst(0x4bff00fd))); - /* Must be a valid branch to start with */ - check(!instr_is_branch_iform(ppc_inst(0x7bfffffd))); - - /* Absolute branch to 0x100 */ - patch_instruction(iptr, ppc_inst(0x48000103)); - check(instr_is_branch_to_addr(iptr, 0x100)); - /* Absolute branch to 0x420fc */ - patch_instruction(iptr, ppc_inst(0x480420ff)); - check(instr_is_branch_to_addr(iptr, 0x420fc)); - /* Maximum positive relative branch, + 20MB - 4B */ - patch_instruction(iptr, ppc_inst(0x49fffffc)); - check(instr_is_branch_to_addr(iptr, addr + 0x1FFFFFC)); - /* Smallest negative relative branch, - 4B */ - patch_instruction(iptr, ppc_inst(0x4bfffffc)); - 
check(instr_is_branch_to_addr(iptr, addr - 4)); - /* Largest negative relative branch, - 32 MB */ - patch_instruction(iptr, ppc_inst(0x4a000000)); - check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); - - /* Branch to self, with link */ - err = create_branch(&instr, iptr, addr, BRANCH_SET_LINK); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr)); - - /* Branch to self - 0x100, with link */ - err = create_branch(&instr, iptr, addr - 0x100, BRANCH_SET_LINK); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr - 0x100)); - - /* Branch to self + 0x100, no link */ - err = create_branch(&instr, iptr, addr + 0x100, 0); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr + 0x100)); - - /* Maximum relative negative offset, - 32 MB */ - err = create_branch(&instr, iptr, addr - 0x2000000, BRANCH_SET_LINK); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); - - /* Out of range relative negative offset, - 32 MB + 4*/ - err = create_branch(&instr, iptr, addr - 0x2000004, BRANCH_SET_LINK); - check(err); - - /* Out of range relative positive offset, + 32 MB */ - err = create_branch(&instr, iptr, addr + 0x2000000, BRANCH_SET_LINK); - check(err); - - /* Unaligned target */ - err = create_branch(&instr, iptr, addr + 3, BRANCH_SET_LINK); - check(err); - - /* Check flags are masked correctly */ - err = create_branch(&instr, iptr, addr, 0xFFFFFFFC); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr)); - check(ppc_inst_equal(instr, ppc_inst(0x48000000))); -} - -static void __init test_create_function_call(void) -{ - u32 *iptr; - unsigned long dest; - ppc_inst_t instr; - - /* Check we can create a function call */ - iptr = (u32 *)ppc_function_entry(test_trampoline); - dest = ppc_function_entry(test_create_function_call); - create_branch(&instr, iptr, dest, BRANCH_SET_LINK); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, dest)); -} - -static void __init test_branch_bform(void) -{ - int err; - unsigned long addr; - ppc_inst_t instr; - u32 tmp[2]; - u32 *iptr = tmp; - unsigned int flags; - - addr = (unsigned long)iptr; - - /* The simplest case, branch to self, no flags */ - check(instr_is_branch_bform(ppc_inst(0x40000000))); - /* All bits of target set, and flags */ - check(instr_is_branch_bform(ppc_inst(0x43ffffff))); - /* High bit of opcode set, which is wrong */ - check(!instr_is_branch_bform(ppc_inst(0xc3ffffff))); - /* Middle bits of opcode set, which is wrong */ - check(!instr_is_branch_bform(ppc_inst(0x7bffffff))); - - /* Absolute conditional branch to 0x100 */ - patch_instruction(iptr, ppc_inst(0x43ff0103)); - check(instr_is_branch_to_addr(iptr, 0x100)); - /* Absolute conditional branch to 0x20fc */ - patch_instruction(iptr, ppc_inst(0x43ff20ff)); - check(instr_is_branch_to_addr(iptr, 0x20fc)); - /* Maximum positive relative conditional branch, + 32 KB - 4B */ - patch_instruction(iptr, ppc_inst(0x43ff7ffc)); - check(instr_is_branch_to_addr(iptr, addr + 0x7FFC)); - /* Smallest negative relative conditional branch, - 4B */ - patch_instruction(iptr, ppc_inst(0x43fffffc)); - check(instr_is_branch_to_addr(iptr, addr - 4)); - /* Largest negative relative conditional branch, - 32 KB */ - patch_instruction(iptr, ppc_inst(0x43ff8000)); - check(instr_is_branch_to_addr(iptr, addr - 0x8000)); - - /* All condition code bits set & link */ - flags = 0x3ff000 | BRANCH_SET_LINK; - - /* Branch to self */ - err = create_cond_branch(&instr, iptr, addr, 
flags); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr)); - - /* Branch to self - 0x100 */ - err = create_cond_branch(&instr, iptr, addr - 0x100, flags); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr - 0x100)); - - /* Branch to self + 0x100 */ - err = create_cond_branch(&instr, iptr, addr + 0x100, flags); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr + 0x100)); - - /* Maximum relative negative offset, - 32 KB */ - err = create_cond_branch(&instr, iptr, addr - 0x8000, flags); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr - 0x8000)); - - /* Out of range relative negative offset, - 32 KB + 4*/ - err = create_cond_branch(&instr, iptr, addr - 0x8004, flags); - check(err); - - /* Out of range relative positive offset, + 32 KB */ - err = create_cond_branch(&instr, iptr, addr + 0x8000, flags); - check(err); - - /* Unaligned target */ - err = create_cond_branch(&instr, iptr, addr + 3, flags); - check(err); - - /* Check flags are masked correctly */ - err = create_cond_branch(&instr, iptr, addr, 0xFFFFFFFC); - patch_instruction(iptr, instr); - check(instr_is_branch_to_addr(iptr, addr)); - check(ppc_inst_equal(instr, ppc_inst(0x43FF0000))); -} - -static void __init test_translate_branch(void) -{ - unsigned long addr; - void *p, *q; - ppc_inst_t instr; - void *buf; - - buf = vmalloc(PAGE_ALIGN(0x2000000 + 1)); - check(buf); - if (!buf) - return; - - /* Simple case, branch to self moved a little */ - p = buf; - addr = (unsigned long)p; - patch_branch(p, addr, 0); - check(instr_is_branch_to_addr(p, addr)); - q = p + 4; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(q, addr)); - - /* Maximum negative case, move b . to addr + 32 MB */ - p = buf; - addr = (unsigned long)p; - patch_branch(p, addr, 0); - q = buf + 0x2000000; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x4a000000))); - - /* Maximum positive case, move x to x - 32 MB + 4 */ - p = buf + 0x2000000; - addr = (unsigned long)p; - patch_branch(p, addr, 0); - q = buf + 4; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x49fffffc))); - - /* Jump to x + 16 MB moved to x + 20 MB */ - p = buf; - addr = 0x1000000 + (unsigned long)buf; - patch_branch(p, addr, BRANCH_SET_LINK); - q = buf + 0x1400000; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - - /* Jump to x + 16 MB moved to x - 16 MB + 4 */ - p = buf + 0x1000000; - addr = 0x2000000 + (unsigned long)buf; - patch_branch(p, addr, 0); - q = buf + 4; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - - - /* Conditional branch tests */ - - /* Simple case, branch to self moved a little */ - p = buf; - addr = (unsigned long)p; - create_cond_branch(&instr, p, addr, 0); - patch_instruction(p, instr); - check(instr_is_branch_to_addr(p, addr)); - q = buf + 4; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(q, addr)); - - /* Maximum negative case, move b . 
to addr + 32 KB */ - p = buf; - addr = (unsigned long)p; - create_cond_branch(&instr, p, addr, 0xFFFFFFFC); - patch_instruction(p, instr); - q = buf + 0x8000; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff8000))); - - /* Maximum positive case, move x to x - 32 KB + 4 */ - p = buf + 0x8000; - addr = (unsigned long)p; - create_cond_branch(&instr, p, addr, 0xFFFFFFFC); - patch_instruction(p, instr); - q = buf + 4; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff7ffc))); - - /* Jump to x + 12 KB moved to x + 20 KB */ - p = buf; - addr = 0x3000 + (unsigned long)buf; - create_cond_branch(&instr, p, addr, BRANCH_SET_LINK); - patch_instruction(p, instr); - q = buf + 0x5000; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - - /* Jump to x + 8 KB moved to x - 8 KB + 4 */ - p = buf + 0x2000; - addr = 0x4000 + (unsigned long)buf; - create_cond_branch(&instr, p, addr, 0); - patch_instruction(p, instr); - q = buf + 4; - translate_branch(&instr, q, p); - patch_instruction(q, instr); - check(instr_is_branch_to_addr(p, addr)); - check(instr_is_branch_to_addr(q, addr)); - - /* Free the buffer we were using */ - vfree(buf); -} - -static void __init test_prefixed_patching(void) -{ - u32 *iptr = (u32 *)ppc_function_entry(test_trampoline); - u32 expected[2] = {OP_PREFIX << 26, 0}; - ppc_inst_t inst = ppc_inst_prefix(OP_PREFIX << 26, 0); - - if (!IS_ENABLED(CONFIG_PPC64)) - return; - - patch_instruction(iptr, inst); - - check(!memcmp(iptr, expected, sizeof(expected))); -} - -static int __init test_code_patching(void) -{ - pr_info("Running code patching self-tests ...\n"); - - test_branch_iform(); - test_branch_bform(); - test_create_function_call(); - test_translate_branch(); - test_prefixed_patching(); - - return 0; -} -late_initcall(test_code_patching); - -#endif /* CONFIG_CODE_PATCHING_SELFTEST */ diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c new file mode 100644 index 000000000000..e358c9d8a03e --- /dev/null +++ b/arch/powerpc/lib/test-code-patching.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. 
+ */ + +#include +#include + +#include + +static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr) +{ + if (instr_is_branch_iform(ppc_inst_read(instr)) || + instr_is_branch_bform(ppc_inst_read(instr))) + return branch_target(instr) == addr; + + return 0; +} + +static void __init test_trampoline(void) +{ + asm ("nop;nop;\n"); +} + +#define check(x) do { \ + if (!(x)) \ + pr_err("code-patching: test failed at line %d\n", __LINE__); \ +} while (0) + +static void __init test_branch_iform(void) +{ + int err; + ppc_inst_t instr; + u32 tmp[2]; + u32 *iptr = tmp; + unsigned long addr = (unsigned long)tmp; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_iform(ppc_inst(0x48000000))); + /* All bits of target set, and flags */ + check(instr_is_branch_iform(ppc_inst(0x4bffffff))); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_iform(ppc_inst(0xcbffffff))); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_iform(ppc_inst(0x7bffffff))); + + /* Simplest case, branch to self with link */ + check(instr_is_branch_iform(ppc_inst(0x48000001))); + /* All bits of targets set */ + check(instr_is_branch_iform(ppc_inst(0x4bfffffd))); + /* Some bits of targets set */ + check(instr_is_branch_iform(ppc_inst(0x4bff00fd))); + /* Must be a valid branch to start with */ + check(!instr_is_branch_iform(ppc_inst(0x7bfffffd))); + + /* Absolute branch to 0x100 */ + patch_instruction(iptr, ppc_inst(0x48000103)); + check(instr_is_branch_to_addr(iptr, 0x100)); + /* Absolute branch to 0x420fc */ + patch_instruction(iptr, ppc_inst(0x480420ff)); + check(instr_is_branch_to_addr(iptr, 0x420fc)); + /* Maximum positive relative branch, + 20MB - 4B */ + patch_instruction(iptr, ppc_inst(0x49fffffc)); + check(instr_is_branch_to_addr(iptr, addr + 0x1FFFFFC)); + /* Smallest negative relative branch, - 4B */ + patch_instruction(iptr, ppc_inst(0x4bfffffc)); + check(instr_is_branch_to_addr(iptr, addr - 4)); + /* Largest negative relative branch, - 32 MB */ + patch_instruction(iptr, ppc_inst(0x4a000000)); + check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); + + /* Branch to self, with link */ + err = create_branch(&instr, iptr, addr, BRANCH_SET_LINK); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + + /* Branch to self - 0x100, with link */ + err = create_branch(&instr, iptr, addr - 0x100, BRANCH_SET_LINK); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x100)); + + /* Branch to self + 0x100, no link */ + err = create_branch(&instr, iptr, addr + 0x100, 0); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 MB */ + err = create_branch(&instr, iptr, addr - 0x2000000, BRANCH_SET_LINK); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); + + /* Out of range relative negative offset, - 32 MB + 4*/ + err = create_branch(&instr, iptr, addr - 0x2000004, BRANCH_SET_LINK); + check(err); + + /* Out of range relative positive offset, + 32 MB */ + err = create_branch(&instr, iptr, addr + 0x2000000, BRANCH_SET_LINK); + check(err); + + /* Unaligned target */ + err = create_branch(&instr, iptr, addr + 3, BRANCH_SET_LINK); + check(err); + + /* Check flags are masked correctly */ + err = create_branch(&instr, iptr, addr, 0xFFFFFFFC); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + check(ppc_inst_equal(instr, ppc_inst(0x48000000))); +} + 
+static void __init test_create_function_call(void) +{ + u32 *iptr; + unsigned long dest; + ppc_inst_t instr; + + /* Check we can create a function call */ + iptr = (u32 *)ppc_function_entry(test_trampoline); + dest = ppc_function_entry(test_create_function_call); + create_branch(&instr, iptr, dest, BRANCH_SET_LINK); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, dest)); +} + +static void __init test_branch_bform(void) +{ + int err; + unsigned long addr; + ppc_inst_t instr; + u32 tmp[2]; + u32 *iptr = tmp; + unsigned int flags; + + addr = (unsigned long)iptr; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_bform(ppc_inst(0x40000000))); + /* All bits of target set, and flags */ + check(instr_is_branch_bform(ppc_inst(0x43ffffff))); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_bform(ppc_inst(0xc3ffffff))); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_bform(ppc_inst(0x7bffffff))); + + /* Absolute conditional branch to 0x100 */ + patch_instruction(iptr, ppc_inst(0x43ff0103)); + check(instr_is_branch_to_addr(iptr, 0x100)); + /* Absolute conditional branch to 0x20fc */ + patch_instruction(iptr, ppc_inst(0x43ff20ff)); + check(instr_is_branch_to_addr(iptr, 0x20fc)); + /* Maximum positive relative conditional branch, + 32 KB - 4B */ + patch_instruction(iptr, ppc_inst(0x43ff7ffc)); + check(instr_is_branch_to_addr(iptr, addr + 0x7FFC)); + /* Smallest negative relative conditional branch, - 4B */ + patch_instruction(iptr, ppc_inst(0x43fffffc)); + check(instr_is_branch_to_addr(iptr, addr - 4)); + /* Largest negative relative conditional branch, - 32 KB */ + patch_instruction(iptr, ppc_inst(0x43ff8000)); + check(instr_is_branch_to_addr(iptr, addr - 0x8000)); + + /* All condition code bits set & link */ + flags = 0x3ff000 | BRANCH_SET_LINK; + + /* Branch to self */ + err = create_cond_branch(&instr, iptr, addr, flags); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + + /* Branch to self - 0x100 */ + err = create_cond_branch(&instr, iptr, addr - 0x100, flags); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x100)); + + /* Branch to self + 0x100 */ + err = create_cond_branch(&instr, iptr, addr + 0x100, flags); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 KB */ + err = create_cond_branch(&instr, iptr, addr - 0x8000, flags); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr - 0x8000)); + + /* Out of range relative negative offset, - 32 KB + 4*/ + err = create_cond_branch(&instr, iptr, addr - 0x8004, flags); + check(err); + + /* Out of range relative positive offset, + 32 KB */ + err = create_cond_branch(&instr, iptr, addr + 0x8000, flags); + check(err); + + /* Unaligned target */ + err = create_cond_branch(&instr, iptr, addr + 3, flags); + check(err); + + /* Check flags are masked correctly */ + err = create_cond_branch(&instr, iptr, addr, 0xFFFFFFFC); + patch_instruction(iptr, instr); + check(instr_is_branch_to_addr(iptr, addr)); + check(ppc_inst_equal(instr, ppc_inst(0x43FF0000))); +} + +static void __init test_translate_branch(void) +{ + unsigned long addr; + void *p, *q; + ppc_inst_t instr; + void *buf; + + buf = vmalloc(PAGE_ALIGN(0x2000000 + 1)); + check(buf); + if (!buf) + return; + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + 
check(instr_is_branch_to_addr(p, addr)); + q = p + 4; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 MB */ + p = buf; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + q = buf + 0x2000000; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x4a000000))); + + /* Maximum positive case, move x to x - 32 MB + 4 */ + p = buf + 0x2000000; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + q = buf + 4; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x49fffffc))); + + /* Jump to x + 16 MB moved to x + 20 MB */ + p = buf; + addr = 0x1000000 + (unsigned long)buf; + patch_branch(p, addr, BRANCH_SET_LINK); + q = buf + 0x1400000; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 16 MB moved to x - 16 MB + 4 */ + p = buf + 0x1000000; + addr = 0x2000000 + (unsigned long)buf; + patch_branch(p, addr, 0); + q = buf + 4; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + + /* Conditional branch tests */ + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + create_cond_branch(&instr, p, addr, 0); + patch_instruction(p, instr); + check(instr_is_branch_to_addr(p, addr)); + q = buf + 4; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . 
to addr + 32 KB */ + p = buf; + addr = (unsigned long)p; + create_cond_branch(&instr, p, addr, 0xFFFFFFFC); + patch_instruction(p, instr); + q = buf + 0x8000; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff8000))); + + /* Maximum positive case, move x to x - 32 KB + 4 */ + p = buf + 0x8000; + addr = (unsigned long)p; + create_cond_branch(&instr, p, addr, 0xFFFFFFFC); + patch_instruction(p, instr); + q = buf + 4; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff7ffc))); + + /* Jump to x + 12 KB moved to x + 20 KB */ + p = buf; + addr = 0x3000 + (unsigned long)buf; + create_cond_branch(&instr, p, addr, BRANCH_SET_LINK); + patch_instruction(p, instr); + q = buf + 0x5000; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 8 KB moved to x - 8 KB + 4 */ + p = buf + 0x2000; + addr = 0x4000 + (unsigned long)buf; + create_cond_branch(&instr, p, addr, 0); + patch_instruction(p, instr); + q = buf + 4; + translate_branch(&instr, q, p); + patch_instruction(q, instr); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Free the buffer we were using */ + vfree(buf); +} + +static void __init test_prefixed_patching(void) +{ + u32 *iptr = (u32 *)ppc_function_entry(test_trampoline); + u32 expected[2] = {OP_PREFIX << 26, 0}; + ppc_inst_t inst = ppc_inst_prefix(OP_PREFIX << 26, 0); + + if (!IS_ENABLED(CONFIG_PPC64)) + return; + + patch_instruction(iptr, inst); + + check(!memcmp(iptr, expected, sizeof(expected))); +} + +static int __init test_code_patching(void) +{ + pr_info("Running code patching self-tests ...\n"); + + test_branch_iform(); + test_branch_bform(); + test_create_function_call(); + test_translate_branch(); + test_prefixed_patching(); + + return 0; +} +late_initcall(test_code_patching); From 309a0a601864831510209531dd72da486225d8ae Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 2 Dec 2021 13:00:27 +0100 Subject: [PATCH 229/240] powerpc/code-patching: Replace patch_instruction() with ppc_inst_write() in selftests The purpose of the selftests is to check that instructions are properly formed, not that they properly run; for that, the tests can use normal memory rather than special test memory. In preparation for a future patch enforcing patch_instruction() to be used only on valid text areas, implement a ppc_inst_write() helper which is the complement of ppc_inst_read(). This new function writes the formatted instruction into valid kernel memory and doesn't bother with the icache.
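To illustrate the intended use, a minimal sketch only (not part of the diff below), reusing the create_branch()/ppc_inst_t helpers the selftests already rely on:

	u32 tmp[2];
	ppc_inst_t instr;

	/* Form a branch-and-link back to tmp itself... */
	if (!create_branch(&instr, tmp, (unsigned long)tmp, BRANCH_SET_LINK))
		/* ...and store it in plain data memory; no icache flush is
		 * needed, the word is only read back, never executed. */
		ppc_inst_write(tmp, instr);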
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7cf5335cc07ca9b6f8cdaa20ca9887fce4df3bea.1638446239.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/inst.h | 8 +++ arch/powerpc/lib/test-code-patching.c | 85 ++++++++++++++------------- 2 files changed, 53 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index 95e5243d2997..80b6d74146c6 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -131,6 +131,14 @@ static inline unsigned long ppc_inst_as_ulong(ppc_inst_t x) return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x); } +static inline void ppc_inst_write(u32 *ptr, ppc_inst_t x) +{ + if (!ppc_inst_prefixed(x)) + *ptr = ppc_inst_val(x); + else + *(u64 *)ptr = ppc_inst_as_ulong(x); +} + #define PPC_INST_STR_LEN sizeof("00000000 00000000") static inline char *__ppc_inst_as_str(char str[PPC_INST_STR_LEN], ppc_inst_t x) diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c index e358c9d8a03e..c44823292f73 100644 --- a/arch/powerpc/lib/test-code-patching.c +++ b/arch/powerpc/lib/test-code-patching.c @@ -54,39 +54,39 @@ static void __init test_branch_iform(void) check(!instr_is_branch_iform(ppc_inst(0x7bfffffd))); /* Absolute branch to 0x100 */ - patch_instruction(iptr, ppc_inst(0x48000103)); + ppc_inst_write(iptr, ppc_inst(0x48000103)); check(instr_is_branch_to_addr(iptr, 0x100)); /* Absolute branch to 0x420fc */ - patch_instruction(iptr, ppc_inst(0x480420ff)); + ppc_inst_write(iptr, ppc_inst(0x480420ff)); check(instr_is_branch_to_addr(iptr, 0x420fc)); /* Maximum positive relative branch, + 20MB - 4B */ - patch_instruction(iptr, ppc_inst(0x49fffffc)); + ppc_inst_write(iptr, ppc_inst(0x49fffffc)); check(instr_is_branch_to_addr(iptr, addr + 0x1FFFFFC)); /* Smallest negative relative branch, - 4B */ - patch_instruction(iptr, ppc_inst(0x4bfffffc)); + ppc_inst_write(iptr, ppc_inst(0x4bfffffc)); check(instr_is_branch_to_addr(iptr, addr - 4)); /* Largest negative relative branch, - 32 MB */ - patch_instruction(iptr, ppc_inst(0x4a000000)); + ppc_inst_write(iptr, ppc_inst(0x4a000000)); check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); /* Branch to self, with link */ err = create_branch(&instr, iptr, addr, BRANCH_SET_LINK); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr)); /* Branch to self - 0x100, with link */ err = create_branch(&instr, iptr, addr - 0x100, BRANCH_SET_LINK); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr - 0x100)); /* Branch to self + 0x100, no link */ err = create_branch(&instr, iptr, addr + 0x100, 0); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr + 0x100)); /* Maximum relative negative offset, - 32 MB */ err = create_branch(&instr, iptr, addr - 0x2000000, BRANCH_SET_LINK); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr - 0x2000000)); /* Out of range relative negative offset, - 32 MB + 4*/ @@ -103,7 +103,7 @@ static void __init test_branch_iform(void) /* Check flags are masked correctly */ err = create_branch(&instr, iptr, addr, 0xFFFFFFFC); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr)); check(ppc_inst_equal(instr, ppc_inst(0x48000000))); } @@ -143,19 +143,19 @@ static void __init 
test_branch_bform(void) check(!instr_is_branch_bform(ppc_inst(0x7bffffff))); /* Absolute conditional branch to 0x100 */ - patch_instruction(iptr, ppc_inst(0x43ff0103)); + ppc_inst_write(iptr, ppc_inst(0x43ff0103)); check(instr_is_branch_to_addr(iptr, 0x100)); /* Absolute conditional branch to 0x20fc */ - patch_instruction(iptr, ppc_inst(0x43ff20ff)); + ppc_inst_write(iptr, ppc_inst(0x43ff20ff)); check(instr_is_branch_to_addr(iptr, 0x20fc)); /* Maximum positive relative conditional branch, + 32 KB - 4B */ - patch_instruction(iptr, ppc_inst(0x43ff7ffc)); + ppc_inst_write(iptr, ppc_inst(0x43ff7ffc)); check(instr_is_branch_to_addr(iptr, addr + 0x7FFC)); /* Smallest negative relative conditional branch, - 4B */ - patch_instruction(iptr, ppc_inst(0x43fffffc)); + ppc_inst_write(iptr, ppc_inst(0x43fffffc)); check(instr_is_branch_to_addr(iptr, addr - 4)); /* Largest negative relative conditional branch, - 32 KB */ - patch_instruction(iptr, ppc_inst(0x43ff8000)); + ppc_inst_write(iptr, ppc_inst(0x43ff8000)); check(instr_is_branch_to_addr(iptr, addr - 0x8000)); /* All condition code bits set & link */ @@ -163,22 +163,22 @@ static void __init test_branch_bform(void) /* Branch to self */ err = create_cond_branch(&instr, iptr, addr, flags); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr)); /* Branch to self - 0x100 */ err = create_cond_branch(&instr, iptr, addr - 0x100, flags); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr - 0x100)); /* Branch to self + 0x100 */ err = create_cond_branch(&instr, iptr, addr + 0x100, flags); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr + 0x100)); /* Maximum relative negative offset, - 32 KB */ err = create_cond_branch(&instr, iptr, addr - 0x8000, flags); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr - 0x8000)); /* Out of range relative negative offset, - 32 KB + 4*/ @@ -195,7 +195,7 @@ static void __init test_branch_bform(void) /* Check flags are masked correctly */ err = create_cond_branch(&instr, iptr, addr, 0xFFFFFFFC); - patch_instruction(iptr, instr); + ppc_inst_write(iptr, instr); check(instr_is_branch_to_addr(iptr, addr)); check(ppc_inst_equal(instr, ppc_inst(0x43FF0000))); } @@ -215,20 +215,22 @@ static void __init test_translate_branch(void) /* Simple case, branch to self moved a little */ p = buf; addr = (unsigned long)p; - patch_branch(p, addr, 0); + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); check(instr_is_branch_to_addr(p, addr)); q = p + 4; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(q, addr)); /* Maximum negative case, move b . 
to addr + 32 MB */ p = buf; addr = (unsigned long)p; - patch_branch(p, addr, 0); + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); q = buf + 0x2000000; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x4a000000))); @@ -236,10 +238,11 @@ static void __init test_translate_branch(void) /* Maximum positive case, move x to x - 32 MB + 4 */ p = buf + 0x2000000; addr = (unsigned long)p; - patch_branch(p, addr, 0); + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); q = buf + 4; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x49fffffc))); @@ -247,20 +250,22 @@ static void __init test_translate_branch(void) /* Jump to x + 16 MB moved to x + 20 MB */ p = buf; addr = 0x1000000 + (unsigned long)buf; - patch_branch(p, addr, BRANCH_SET_LINK); + create_branch(&instr, p, addr, BRANCH_SET_LINK); + ppc_inst_write(p, instr); q = buf + 0x1400000; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); /* Jump to x + 16 MB moved to x - 16 MB + 4 */ p = buf + 0x1000000; addr = 0x2000000 + (unsigned long)buf; - patch_branch(p, addr, 0); + create_branch(&instr, p, addr, 0); + ppc_inst_write(p, instr); q = buf + 4; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); @@ -271,21 +276,21 @@ static void __init test_translate_branch(void) p = buf; addr = (unsigned long)p; create_cond_branch(&instr, p, addr, 0); - patch_instruction(p, instr); + ppc_inst_write(p, instr); check(instr_is_branch_to_addr(p, addr)); q = buf + 4; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(q, addr)); /* Maximum negative case, move b . 
to addr + 32 KB */ p = buf; addr = (unsigned long)p; create_cond_branch(&instr, p, addr, 0xFFFFFFFC); - patch_instruction(p, instr); + ppc_inst_write(p, instr); q = buf + 0x8000; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff8000))); @@ -294,10 +299,10 @@ static void __init test_translate_branch(void) p = buf + 0x8000; addr = (unsigned long)p; create_cond_branch(&instr, p, addr, 0xFFFFFFFC); - patch_instruction(p, instr); + ppc_inst_write(p, instr); q = buf + 4; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); check(ppc_inst_equal(ppc_inst_read(q), ppc_inst(0x43ff7ffc))); @@ -306,10 +311,10 @@ static void __init test_translate_branch(void) p = buf; addr = 0x3000 + (unsigned long)buf; create_cond_branch(&instr, p, addr, BRANCH_SET_LINK); - patch_instruction(p, instr); + ppc_inst_write(p, instr); q = buf + 0x5000; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); @@ -317,10 +322,10 @@ static void __init test_translate_branch(void) p = buf + 0x2000; addr = 0x4000 + (unsigned long)buf; create_cond_branch(&instr, p, addr, 0); - patch_instruction(p, instr); + ppc_inst_write(p, instr); q = buf + 4; translate_branch(&instr, q, p); - patch_instruction(q, instr); + ppc_inst_write(q, instr); check(instr_is_branch_to_addr(p, addr)); check(instr_is_branch_to_addr(q, addr)); From bba496656a73fc1d1330b49c7f82843836e9feb1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 22 Dec 2021 13:07:31 +0000 Subject: [PATCH 230/240] powerpc/32: Fix boot failure with GCC latent entropy plugin Boot fails with GCC latent entropy plugin enabled. This is due to early boot functions trying to access 'latent_entropy' global data while the kernel is not relocated at its final destination yet. 
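For context, early boot code that must touch a global before relocation normally redirects the access by hand through PTRRELOC(). A minimal sketch of that pattern, with a made-up variable name for illustration:

	/* Apply the relocation offset explicitly before dereferencing: */
	unsigned long *p = PTRRELOC(&some_early_global);

	*p = 1;	/* safe even while running below the linked address */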
As there is no way to tell GCC to use PTRRELOC() to access it, disable the latent entropy plugin for early_32.o, feature-fixups.o and code-patching.o. Fixes: 38addce8b600 ("gcc-plugins: Add latent_entropy plugin") Cc: stable@vger.kernel.org # v4.9+ Reported-by: Erhard Furtner Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=215217 Link: https://lore.kernel.org/r/2bac55483b8daf5b1caa163a45fa5f9cdbe18be4.1640178426.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/Makefile | 1 + arch/powerpc/lib/Makefile | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 5fa68c2ef1f8..36f3f5a8868d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -11,6 +11,7 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif +CFLAGS_early_32.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 3e183f4b4bda..5d1881d2e39a 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -19,6 +19,9 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_code-patching.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_feature-fixups.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + obj-y += alloc.o code-patching.o feature-fixups.o pmem.o obj-$(CONFIG_CODE_PATCHING_SELFTEST) += test-code-patching.o From 5b09250cca85ae6f91c9562cf1f5e5747de0a75d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Dec 2021 16:39:42 -0800 Subject: [PATCH 231/240] powerpc/perf: Fix spelling of "its" Use the possessive "its" instead of the contraction of "it is" (it's). Signed-off-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211223003942.22098-1-rdunlap@infradead.org --- arch/powerpc/perf/hv-24x7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 1816f560a465..1e8aa934e37e 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -756,7 +756,7 @@ static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event, } if (calc_ev_end > ev_end) { - pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", + pr_warn("event %zu exceeds its own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n", event_idx, event, ev_end, offset, calc_ev_end); return -1; } From 314f6c23dd8d417281eb9e8a516dd98036f2e7b3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 22 Dec 2021 00:50:59 +1100 Subject: [PATCH 232/240] powerpc/64s: Mask NIP before checking against SRR0 When CONFIG_PPC_RFI_SRR_DEBUG=y we check that NIP and SRR0 match when returning from interrupts. This can trigger falsely if NIP has either of its two low bits set via sigreturn or ptrace, while SRR0 has its low two bits masked in hardware. As a quick fix, make sure to mask the low bits before doing the check.
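In C terms, the corrected comparison amounts to the following sketch (illustrative only; the real check is the assembly in the diff below):

	/* SRR0 can never hold the low two bits of an address, so clear
	 * them from the saved NIP as well before comparing: */
	WARN_ON_ONCE((regs->nip & ~0x3ul) != mfspr(SPRN_SRR0));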
Fixes: 59dc5bfca0cb ("powerpc/64s: avoid reloading (H)SRR registers if they are still valid") Reported-by: Sachin Sant Signed-off-by: Michael Ellerman Tested-by: Sachin Sant Link: https://lore.kernel.org/r/20211221135101.2085547-1-mpe@ellerman.id.au --- arch/powerpc/kernel/interrupt_64.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index d3180139e35a..d729c2e7dae0 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -30,6 +30,7 @@ COMPAT_SYS_CALL_TABLE: .ifc \srr,srr mfspr r11,SPRN_SRR0 ld r12,_NIP(r1) + clrrdi r12,r12,2 100: tdne r11,r12 EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) mfspr r11,SPRN_SRR1 @@ -39,6 +40,7 @@ COMPAT_SYS_CALL_TABLE: .else mfspr r11,SPRN_HSRR0 ld r12,_NIP(r1) + clrrdi r12,r12,2 100: tdne r11,r12 EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) mfspr r11,SPRN_HSRR1 From fd1eaaaaa6864b5fb8f99880fcefb49760b8fe4e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 22 Dec 2021 00:51:00 +1100 Subject: [PATCH 233/240] powerpc/64s: Use EMIT_WARN_ENTRY for SRR debug warnings When CONFIG_PPC_RFI_SRR_DEBUG=y we check the SRR values before returning from interrupts. This is done in asm using EMIT_BUG_ENTRY, and passing BUGFLAG_WARNING. However that fails to create an exception table entry for the warning, and so do_program_check() fails the exception table search and proceeds to call _exception(), resulting in an oops like: Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 2 PID: 1204 Comm: sigreturn_unali Tainted: P 5.16.0-rc2-00194-g91ca3d4f77c5 #12 NIP: c00000000000c5b0 LR: 0000000000000000 CTR: 0000000000000000 ... NIP [c00000000000c5b0] system_call_common+0x150/0x268 LR [0000000000000000] 0x0 Call Trace: [c00000000db73e10] [c00000000000c558] system_call_common+0xf8/0x268 (unreliable) ... Instruction dump: 7cc803a6 888d0931 2c240000 4082001c 38800000 988d0931 e8810170 e8a10178 7c9a03a6 7cbb03a6 7d7a02a6 e9810170 <7f0b6088> 7d7b02a6 e9810178 7f0b6088 We should instead use EMIT_WARN_ENTRY, which creates an exception table entry for the warning, allowing the warning to be correctly recognised, and the code to resume after printing the warning. Note however that because this warning is buried deep in the interrupt return path, we are not able to recover from it (due to MSR_RI being clear), so we still end up in die() with an unrecoverable exception. 
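Conceptually, the recovery depends on that exception table entry. Roughly, in the program check handler (a simplified sketch, not the exact do_program_check() code):

	if (report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
		/* Extable entry found: print the warning, then step
		 * over the trapping tdne instruction and continue. */
		regs->nip += 4;
		return;
	}
	/* No entry: treated as a real trap, ending in _exception()/die(). */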
Fixes: 59dc5bfca0cb ("powerpc/64s: avoid reloading (H)SRR registers if they are still valid") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211221135101.2085547-2-mpe@ellerman.id.au --- arch/powerpc/kernel/interrupt_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index d729c2e7dae0..92088f848266 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -32,21 +32,21 @@ COMPAT_SYS_CALL_TABLE: ld r12,_NIP(r1) clrrdi r12,r12,2 100: tdne r11,r12 - EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) mfspr r11,SPRN_SRR1 ld r12,_MSR(r1) 100: tdne r11,r12 - EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) .else mfspr r11,SPRN_HSRR0 ld r12,_NIP(r1) clrrdi r12,r12,2 100: tdne r11,r12 - EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) mfspr r11,SPRN_HSRR1 ld r12,_MSR(r1) 100: tdne r11,r12 - EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) .endif #endif .endm From beeac538c366cd2828092adecd1edab28326c55b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 22 Dec 2021 00:51:01 +1100 Subject: [PATCH 234/240] selftests/powerpc: Add a test of sigreturning to an unaligned address Add a test of sigreturning to an unaligned address (low two bits set). This should have no effect because the hardware will mask those bits. However it previously falsely triggered a warning when CONFIG_PPC_RFI_SRR_DEBUG=y. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211221135101.2085547-3-mpe@ellerman.id.au --- .../selftests/powerpc/signal/.gitignore | 1 + .../testing/selftests/powerpc/signal/Makefile | 1 + .../powerpc/signal/sigreturn_unaligned.c | 43 +++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore index 8f6c816099a4..9d0915777fed 100644 --- a/tools/testing/selftests/powerpc/signal/.gitignore +++ b/tools/testing/selftests/powerpc/signal/.gitignore @@ -5,3 +5,4 @@ sigfuz sigreturn_vdso sig_sc_double_restart sigreturn_kernel +sigreturn_unaligned diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 84e201572466..f679d260afc8 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart TEST_GEN_PROGS += sigreturn_kernel +TEST_GEN_PROGS += sigreturn_unaligned CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm diff --git a/tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c b/tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c new file mode 100644 index 000000000000..6e58ee4f0fdf --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/sigreturn_unaligned.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test sigreturn to an unaligned address, ie. low 2 bits set. + * Nothing bad should happen. 
+ * This was able to trigger warnings with CONFIG_PPC_RFI_SRR_DEBUG=y. + */ + +#include +#include +#include +#include +#include +#include + +#include "utils.h" + + +static void sigusr1_handler(int signo, siginfo_t *info, void *ptr) +{ + ucontext_t *uc = ptr; + + UCONTEXT_NIA(uc) |= 3; +} + +static int test_sigreturn_unaligned(void) +{ + struct sigaction action; + + memset(&action, 0, sizeof(action)); + action.sa_sigaction = sigusr1_handler; + action.sa_flags = SA_SIGINFO; + + FAIL_IF(sigaction(SIGUSR1, &action, NULL) == -1); + + raise(SIGUSR1); + + return 0; +} + +int main(void) +{ + return test_harness(test_sigreturn_unaligned, "sigreturn_unaligned"); +} From e57c2fd6cdf8db581ac93b909b2664751e7cf30c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 2 Jan 2022 11:29:54 +0100 Subject: [PATCH 235/240] powerpc/floppy: Remove usage of the deprecated "pci-dma-compat.h" API In [1], Christoph Hellwig has proposed removing the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. A coccinelle script has been used to perform the needed transformation. Only relevant parts are given below. @@ @@ - PCI_DMA_TODEVICE + DMA_TO_DEVICE @@ @@ - PCI_DMA_FROMDEVICE + DMA_FROM_DEVICE @@ expression e1, e2, e3, e4; @@ - pci_map_single(e1, e2, e3, e4) + dma_map_single(&e1->dev, e2, e3, e4) @@ expression e1, e2, e3, e4; @@ - pci_unmap_single(e1, e2, e3, e4) + dma_unmap_single(&e1->dev, e2, e3, e4) [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Signed-off-by: Christophe JAILLET Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/9e24eedeab44cbb840598bb188561a48811de845.1641119338.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/include/asm/floppy.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/floppy.h b/arch/powerpc/include/asm/floppy.h index 7af9a68fd949..f8ce178b43b7 100644 --- a/arch/powerpc/include/asm/floppy.h +++ b/arch/powerpc/include/asm/floppy.h @@ -134,17 +134,19 @@ static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) int dir; doing_vdma = 0; - dir = (mode == DMA_MODE_READ) ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE; + dir = (mode == DMA_MODE_READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; if (bus_addr && (addr != prev_addr || size != prev_size || dir != prev_dir)) { /* different from last time -- unmap prev */ - pci_unmap_single(isa_bridge_pcidev, bus_addr, prev_size, prev_dir); + dma_unmap_single(&isa_bridge_pcidev->dev, bus_addr, prev_size, + prev_dir); bus_addr = 0; } if (!bus_addr) /* need to map it */ - bus_addr = pci_map_single(isa_bridge_pcidev, addr, size, dir); + bus_addr = dma_map_single(&isa_bridge_pcidev->dev, addr, size, + dir); /* remember this one as prev */ prev_addr = addr; From 18dbfcdedc802f9500b2c29794f22a31d27639c0 Mon Sep 17 00:00:00 2001 From: Ammar Faizi Date: Sun, 26 Dec 2021 20:54:02 +0700 Subject: [PATCH 236/240] powerpc/xive: Add missing null check after calling kmalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 930914b7d528fc ("powerpc/xive: Add a debugfs file to dump internal XIVE state") forgot to add a null check. Add it.
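The fix follows the usual shape of a debugfs show handler that allocates a scratch buffer. A generic sketch of that pattern (not the exact spapr.c code):

	static int example_debug_show(struct seq_file *m, void *private)
	{
		char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

		if (!buf)	/* bail out before any use of buf */
			return -ENOMEM;

		/* ... format into buf and seq_printf() it ... */

		kfree(buf);
		return 0;
	}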
Fixes: 930914b7d528fc6b0249bffc00564100bcf6ef75 ("powerpc/xive: Add a debugfs file to dump internal XIVE state") Signed-off-by: Ammar Faizi Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211226135314.251221-1-ammar.faizi@intel.com --- arch/powerpc/sysdev/xive/spapr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index dfc4634335cc..928f95004501 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -653,6 +653,9 @@ static int xive_spapr_debug_show(struct seq_file *m, void *private) struct xive_irq_bitmap *xibm; char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + list_for_each_entry(xibm, &xive_irq_bitmaps, list) { memset(buf, 0, PAGE_SIZE); bitmap_print_to_pagebuf(true, buf, xibm->bitmap, xibm->count); From 08035a67f35a8765cac39bb12e56c61ee880227a Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 28 Dec 2021 14:47:25 +0800 Subject: [PATCH 237/240] powerpc/sched: Remove unused TASK_SIZE_OF This macro is no longer used by Linux sched. Delete it from include/linux/sched.h and each arch's include/asm. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211228064730.2882351-5-guoren@kernel.org --- arch/powerpc/include/asm/task_size_64.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h index c993482237ed..38fdf8041d12 100644 --- a/arch/powerpc/include/asm/task_size_64.h +++ b/arch/powerpc/include/asm/task_size_64.h @@ -44,11 +44,7 @@ */ #define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE)) -#define TASK_SIZE_OF(tsk) \ - (test_tsk_thread_flag(tsk, TIF_32BIT) ? TASK_SIZE_USER32 : \ - TASK_SIZE_USER64) - -#define TASK_SIZE TASK_SIZE_OF(current) +#define TASK_SIZE (is_32bit_task() ? TASK_SIZE_USER32 : TASK_SIZE_USER64) #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) #define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) From 2bdf3f9e9df0a4ce7709fc916b9997ca2dc30d25 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jan 2022 16:54:50 +0100 Subject: [PATCH 238/240] powerpc/cacheinfo: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the powerpc cacheinfo sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Tyrel Datwyler Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220104155450.1291277-1-gregkh@linuxfoundation.org --- arch/powerpc/kernel/cacheinfo.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index cf1be75b7833..00b0992be3e7 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -710,7 +710,7 @@ static struct kobj_attribute cache_shared_cpu_list_attr = __ATTR(shared_cpu_list, 0444, shared_cpu_list_show, NULL); /* Attributes which should always be created -- the kobject/sysfs core * does this automatically via kobj_type->default_attrs.
This is the + * minimum data required to uniquely identify a cache. */ static struct attribute *cache_index_default_attrs[] = { @@ -720,6 +720,7 @@ static struct attribute *cache_index_default_attrs[] = { &cache_shared_cpu_list_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(cache_index_default); /* Attributes which should be created if the cache device node has the * right properties -- see cacheinfo_create_index_opt_attrs @@ -738,7 +739,7 @@ static const struct sysfs_ops cache_index_ops = { static struct kobj_type cache_index_type = { .release = cache_index_release, .sysfs_ops = &cache_index_ops, - .default_attrs = cache_index_default_attrs, + .default_groups = cache_index_default_groups, }; static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) From 32a1bda4b12a3d324bd585e1aa20dac824ab719c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jan 2022 17:13:18 +0100 Subject: [PATCH 239/240] powerpc/opal: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the powerpc opal dump and elog sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220104161318.1306023-1-gregkh@linuxfoundation.org --- arch/powerpc/platforms/powernv/opal-dump.c | 3 ++- arch/powerpc/platforms/powernv/opal-elog.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c index 717d1d30ade5..410ed5b9de29 100644 --- a/arch/powerpc/platforms/powernv/opal-dump.c +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -208,11 +208,12 @@ static struct attribute *dump_default_attrs[] = { &ack_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(dump_default); static struct kobj_type dump_ktype = { .sysfs_ops = &dump_sysfs_ops, .release = &dump_release, - .default_attrs = dump_default_attrs, + .default_groups = dump_default_groups, }; static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type) diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c index 5821b0fa8614..554fdd7f88b8 100644 --- a/arch/powerpc/platforms/powernv/opal-elog.c +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -144,11 +144,12 @@ static struct attribute *elog_default_attrs[] = { &ack_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(elog_default); static struct kobj_type elog_ktype = { .sysfs_ops = &elog_sysfs_ops, .release = &elog_release, - .default_attrs = elog_default_attrs, + .default_groups = elog_default_groups, }; /* Maximum size of a single log on FSP is 16KB */ From f1aa0e47c29268776205698f2453dc07fab49855 Mon Sep 17 00:00:00 2001 From: Sachin Sant Date: Wed, 5 Jan 2022 19:47:48 +0530 Subject: [PATCH 240/240] powerpc/xmon: Dump XIVE information for online-only processors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dxa command in the XMON debugger iterates through all possible processors. As a result, empty lines are printed even for processors which are not online.
CPU 47:pp=00 CPPR=ff IPI=0x0040002f PQ=-- EQ idx=699 T=0 00000000 00000000 CPU 48: CPU 49: Restrict XIVE information (dxa) to be displayed for online processors only. Signed-off-by: Sachin Sant Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/164139226833.12930.272224382183014664.sendpatchset@MacBook-Pro.local --- arch/powerpc/xmon/xmon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index f51d7404a6ea..fd72753e8ad5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2817,12 +2817,12 @@ static void dump_all_xives(void) { int cpu; - if (num_possible_cpus() == 0) { + if (num_online_cpus() == 0) { printf("No possible cpus, use 'dx #' to dump individual cpus\n"); return; } - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) dump_one_xive(cpu); }