From 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Mon, 6 Jan 2020 14:42:12 -0800 Subject: [PATCH 001/147] mwifiex: fix unbalanced locking in mwifiex_process_country_ie() We called rcu_read_lock(), so we need to call rcu_read_unlock() before we return. Fixes: 3d94a4a8373b ("mwifiex: fix possible heap overflow in mwifiex_process_country_ie()") Cc: stable@vger.kernel.org Cc: huangwen Cc: Ganapathi Bhat Signed-off-by: Brian Norris Acked-by: Ganapathi Bhat Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/sta_ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c index 6dd835f1efc2..fbfa0b15d0c8 100644 --- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c @@ -232,6 +232,7 @@ static int mwifiex_process_country_ie(struct mwifiex_private *priv, if (country_ie_len > (IEEE80211_COUNTRY_STRING_LEN + MWIFIEX_MAX_TRIPLET_802_11D)) { + rcu_read_unlock(); mwifiex_dbg(priv->adapter, ERROR, "11D: country_ie_len overflow!, deauth AP\n"); return -EINVAL; From 3a9b153c5591548612c3955c9600a98150c81875 Mon Sep 17 00:00:00 2001 From: Qing Xu Date: Thu, 2 Jan 2020 10:39:26 +0800 Subject: [PATCH 002/147] mwifiex: Fix possible buffer overflows in mwifiex_ret_wmm_get_status() mwifiex_ret_wmm_get_status() calls memcpy() without checking the destination size. Since the source is given from the remote AP and can contain illegal WMM elements, this may trigger a heap buffer overflow. Fix it by putting the length check before calling memcpy(). Signed-off-by: Qing Xu Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/wmm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/wmm.c b/drivers/net/wireless/marvell/mwifiex/wmm.c index 41f0231376c0..132f9e8ed68c 100644 --- a/drivers/net/wireless/marvell/mwifiex/wmm.c +++ b/drivers/net/wireless/marvell/mwifiex/wmm.c @@ -970,6 +970,10 @@ int mwifiex_ret_wmm_get_status(struct mwifiex_private *priv, "WMM Parameter Set Count: %d\n", wmm_param_ie->qos_info_bitmap & mask); + if (wmm_param_ie->vend_hdr.len + 2 > + sizeof(struct ieee_types_wmm_parameter)) + break; + memcpy((u8 *) &priv->curr_bss_params.bss_descriptor. wmm_ie, wmm_param_ie, wmm_param_ie->vend_hdr.len + 2); From b70261a288ea4d2f4ac7cd04be08a9f0f2de4f4d Mon Sep 17 00:00:00 2001 From: Qing Xu Date: Thu, 2 Jan 2020 10:39:27 +0800 Subject: [PATCH 003/147] mwifiex: Fix possible buffer overflows in mwifiex_cmd_append_vsie_tlv() mwifiex_cmd_append_vsie_tlv() calls memcpy() without checking the destination size, which may trigger a buffer overflow that a local user could use to cause a denial of service or execute arbitrary code. Fix it by putting the length check before calling memcpy().
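Both memcpy() fixes in this pair follow the same defensive pattern: validate the remotely supplied length against the fixed-size destination before copying. A minimal userspace sketch of that pattern; the element layout here (one type byte, one length byte, payload) is a hypothetical stand-in, not the driver's actual IE structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dest_buf {
	uint8_t data[32];
};

/*
 * Copy one length-prefixed element (1 type byte, 1 length byte, payload),
 * refusing anything larger than the destination; this mirrors the checks
 * the two patches add in front of memcpy().
 */
static int copy_element(struct dest_buf *dst, const uint8_t *elem)
{
	size_t len = elem[1] + 2u;		/* header bytes + payload */

	if (len > sizeof(dst->data))
		return -1;			/* reject before copying */

	memcpy(dst->data, elem, len);
	return 0;
}

int main(void)
{
	struct dest_buf d;
	uint8_t bogus[260] = { 0xdd, 0xff };	/* advertises a 255-byte payload */

	printf("copy_element: %d\n", copy_element(&d, bogus));	/* prints -1 */
	return 0;
}

The important property is that the check is expressed in terms of sizeof() on the destination, so it cannot drift out of sync with the buffer it protects.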
Signed-off-by: Qing Xu Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/mwifiex/scan.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c b/drivers/net/wireless/marvell/mwifiex/scan.c index 98f942b797f7..a7968a84aaf8 100644 --- a/drivers/net/wireless/marvell/mwifiex/scan.c +++ b/drivers/net/wireless/marvell/mwifiex/scan.c @@ -2884,6 +2884,13 @@ mwifiex_cmd_append_vsie_tlv(struct mwifiex_private *priv, vs_param_set->header.len = cpu_to_le16((((u16) priv->vs_ie[id].ie[1]) & 0x00FF) + 2); + if (le16_to_cpu(vs_param_set->header.len) > + MWIFIEX_MAX_VSIE_LEN) { + mwifiex_dbg(priv->adapter, ERROR, + "Invalid param length!\n"); + break; + } + memcpy(vs_param_set->ie, priv->vs_ie[id].ie, le16_to_cpu(vs_param_set->header.len)); *buffer += le16_to_cpu(vs_param_set->header.len) + From c7bf1fb7ddca331780b9a733ae308737b39f1ad4 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 14 Jan 2020 11:39:02 +0100 Subject: [PATCH 004/147] libertas: don't exit from lbs_ibss_join_existing() with RCU read lock held Commit e5e884b42639 ("libertas: Fix two buffer overflows at parsing bss descriptor") introduced a bounds check on the number of supplied rates to lbs_ibss_join_existing(). Unfortunately, it introduced a return path from within a RCU read side critical section without a corresponding rcu_read_unlock(). Fix this. Fixes: e5e884b42639 ("libertas: Fix two buffer overflows at parsing bss descriptor") Signed-off-by: Nicolai Stange Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/libertas/cfg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index c9401c121a14..68985d766349 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -1785,6 +1785,7 @@ static int lbs_ibss_join_existing(struct lbs_private *priv, rates_max = rates_eid[1]; if (rates_max > MAX_RATES) { lbs_deb_join("invalid rates"); + rcu_read_unlock(); goto out; } rates = cmd.bss.rates; From 1754c4f60aaf1e17d886afefee97e94d7f27b4cb Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 14 Jan 2020 11:39:03 +0100 Subject: [PATCH 005/147] libertas: make lbs_ibss_join_existing() return error code on rates overflow Commit e5e884b42639 ("libertas: Fix two buffer overflows at parsing bss descriptor") introduced a bounds check on the number of supplied rates to lbs_ibss_join_existing() and made it to return on overflow. However, the aforementioned commit doesn't set the return value accordingly and thus, lbs_ibss_join_existing() would return with zero even though it failed. Make lbs_ibss_join_existing return -EINVAL in case the bounds check on the number of supplied rates fails. 
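Taken together, the two libertas fixes restore one rule: every exit path inside an RCU read-side critical section must drop the lock, and a failed validation must also surface an error. A kernel-style sketch of that shape, meant to be read in kernel context; example_bss, EXAMPLE_MAX_RATES and the function itself are hypothetical, not the driver's code:

#include <linux/errno.h>
#include <linux/rcupdate.h>

struct example_bss {
	int rates_max;
};

#define EXAMPLE_MAX_RATES 14

static int example_join(struct example_bss __rcu *bss_ptr)
{
	struct example_bss *bss;
	int ret = 0;

	rcu_read_lock();
	bss = rcu_dereference(bss_ptr);
	if (bss->rates_max > EXAMPLE_MAX_RATES) {
		rcu_read_unlock();	/* keep lock/unlock balanced (first fix) */
		ret = -EINVAL;		/* and report the failure (second fix) */
		goto out;
	}
	/* ... use the RCU-protected data ... */
	rcu_read_unlock();
out:
	return ret;
}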
Fixes: e5e884b42639 ("libertas: Fix two buffer overflows at parsing bss descriptor") Signed-off-by: Nicolai Stange Signed-off-by: Kalle Valo --- drivers/net/wireless/marvell/libertas/cfg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index 68985d766349..4e3de684928b 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -1786,6 +1786,7 @@ static int lbs_ibss_join_existing(struct lbs_private *priv, if (rates_max > MAX_RATES) { lbs_deb_join("invalid rates"); rcu_read_unlock(); + ret = -EINVAL; goto out; } rates = cmd.bss.rates; From fcfd7345209210ab309f4422a308bad3e1fa6b8c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 27 Jan 2020 14:37:48 +0000 Subject: [PATCH 006/147] MAINTAINERS: Correct path to time namespace source file According to reviews, Time Namespace source was moved from kernel/time_namespace.c to kernel/time/namespace.c between patchset versions, while the path in MAINTERNERS file wasn't adjusted properly. Correct it, so get_maintainer.pl produces a correct emails list again. Fixes: 769071ac9f20 ("ns: Introduce Time Namespace") Reported-by: Dmitry Vyukov Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200127143748.268515-1-dima@arista.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 141b8d3e4ca2..3a4163be98da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13267,7 +13267,7 @@ S: Maintained F: fs/timerfd.c F: include/linux/timer* F: include/linux/time_namespace.h -F: kernel/time_namespace.c +F: kernel/time/namespace.c F: kernel/time/*timer* POWER MANAGEMENT CORE From 926b5dfa6b8dc666ff398044af6906b156e1d949 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 16 Dec 2019 11:24:57 +0000 Subject: [PATCH 007/147] irqchip/gic-v3: Only provision redistributors that are enabled in ACPI We currently allocate redistributor region structures for individual redistributors when ACPI doesn't present us with compact MMIO regions covering multiple redistributors. It turns out that we allocate these structures even when the redistributor is flagged as disabled by ACPI. It works fine until someone actually tries to tarse one of these structures, and access the corresponding MMIO region. Instead, track the number of enabled redistributors, and only allocate what is required. This makes sure that there is no invalid data to misuse. 
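The allocation change above is the classic two-pass idiom: count only the entries that will actually be used, then size the allocation from that count so no slot ever holds unusable data. A small userspace illustration with a made-up entry type:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	bool enabled;
	unsigned long base;
};

int main(void)
{
	struct entry table[] = {
		{ true, 0x1000 }, { false, 0 }, { true, 0x3000 },
	};
	size_t i, n = sizeof(table) / sizeof(table[0]);
	size_t enabled = 0, used = 0;
	struct entry *regions;

	/* Pass 1: count only what will really be used. */
	for (i = 0; i < n; i++)
		if (table[i].enabled)
			enabled++;

	regions = calloc(enabled, sizeof(*regions));
	if (!regions)
		return 1;

	/* Pass 2: fill exactly that many slots, so none is left invalid. */
	for (i = 0; i < n; i++)
		if (table[i].enabled)
			regions[used++] = table[i];

	printf("provisioned %zu of %zu entries\n", used, n);
	free(regions);
	return 0;
}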
Signed-off-by: Marc Zyngier Reported-by: Heyi Guo Tested-by: Heyi Guo Link: https://lore.kernel.org/r/20191216062745.63397-1-guoheyi@huawei.com --- drivers/irqchip/irq-gic-v3.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 286f98222878..c1f7af9d9ae7 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1839,6 +1839,7 @@ static struct struct redist_region *redist_regs; u32 nr_redist_regions; bool single_redist; + int enabled_rdists; u32 maint_irq; int maint_irq_mode; phys_addr_t vcpu_base; @@ -1933,8 +1934,10 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header, * If GICC is enabled and has valid gicr base address, then it means * GICR base is presented via GICC */ - if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) + if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) { + acpi_data.enabled_rdists++; return 0; + } /* * It's perfectly valid firmware can pass disabled GICC entry, driver @@ -1964,8 +1967,10 @@ static int __init gic_acpi_count_gicr_regions(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, gic_acpi_match_gicc, 0); - if (count > 0) + if (count > 0) { acpi_data.single_redist = true; + count = acpi_data.enabled_rdists; + } return count; } From 25a3a15417cf4311f812f5a2b18c5fc2809f66d7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 27 Jan 2020 09:39:15 +0100 Subject: [PATCH 008/147] smp: Remove superfluous cond_func check in smp_call_function_many_cond() It was requested to remove the cond_func check but the follow up patch was overlooked. Remove it now. Fixes: 67719ef25eeb ("smp: Add a smp_cond_func_t argument to smp_call_function_many()") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200127083915.434tdkztorkklpdu@linutronix.de --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index 3b7bedc97af3..d0ada39eb4d4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -435,7 +435,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, /* Fastpath: do that cpu by itself. */ if (next_cpu >= nr_cpu_ids) { - if (!cond_func || (cond_func && cond_func(cpu, info))) + if (!cond_func || cond_func(cpu, info)) smp_call_function_single(cpu, func, info, wait); return; } From 003461559ef7a9bd0239bae35a22ad8924d6e9ad Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jan 2020 10:11:46 -0800 Subject: [PATCH 009/147] perf/core: Fix mlock accounting in perf_mmap() Decreasing sysctl_perf_event_mlock between two consecutive perf_mmap()s of a perf ring buffer may lead to an integer underflow in locked memory accounting. This may lead to the undesired behaviors, such as failures in BPF map creation. Address this by adjusting the accounting logic to take into account the possibility that the amount of already locked memory may exceed the current limit. 
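The accounting change can be followed with concrete numbers. A tiny worked example of the clamp described above; the page counts are made up, only the arithmetic mirrors the fix:

#include <stdio.h>

int main(void)
{
	unsigned long user_lock_limit = 64;  /* pages, after the sysctl was lowered */
	unsigned long locked_vm       = 128; /* pages already accounted under the old limit */
	unsigned long user_extra      = 16;  /* pages requested by the new mmap */
	unsigned long user_locked, extra = 0;

	user_locked = locked_vm;
	if (user_locked > user_lock_limit)   /* the clamp added by the fix */
		user_locked = user_lock_limit;
	user_locked += user_extra;

	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;

	/*
	 * With the clamp: extra == 16, only this request's own overage.
	 * Without it, extra would be 80, charging the stale excess left over
	 * from the old limit against this mapping as well.
	 */
	printf("charged against locked rlimit: %lu pages\n", extra);
	return 0;
}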
Fixes: c4b75479741c ("perf/core: Make the mlock accounting simple again") Suggested-by: Alexander Shishkin Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Acked-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20200123181146.2238074-1-songliubraving@fb.com --- kernel/events/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2173c23c25b4..2d9aeba1f3e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5916,7 +5916,15 @@ accounting: */ user_lock_limit *= num_online_cpus(); - user_locked = atomic_long_read(&user->locked_vm) + user_extra; + user_locked = atomic_long_read(&user->locked_vm); + + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += user_extra; if (user_locked > user_lock_limit) { /* From 07c5972951f088094776038006a0592a46d14bbc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 22 Jan 2020 11:50:27 -0800 Subject: [PATCH 010/147] perf/cgroups: Install cgroup events to correct cpuctx cgroup events are always installed in the cpuctx. However, when it is not installed via IPI, list_update_cgroup_event() adds it to cpuctx of current CPU, which triggers list corruption: [] list_add double add: new=ffff888ff7cf0db0, prev=ffff888ff7ce82f0, next=ffff888ff7cf0db0. To reproduce this, we can simply run: # perf stat -e cs -a & # perf stat -e cs -G anycgroup Fix this by installing it to cpuctx that contains event->ctx, and the proper cgrp_cpuctx_list. Fixes: db0503e4f675 ("perf/core: Optimize perf_install_in_event()") Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Link: https://lkml.kernel.org/r/20200122195027.2112449-1-songliubraving@fb.com --- kernel/events/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d9aeba1f3e2..fdb7f7ef380c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -951,9 +951,9 @@ list_update_cgroup_event(struct perf_event *event, /* * Because cgroup events are always per-cpu events, - * this will always be called from the right CPU. + * @ctx == &cpuctx->ctx. */ - cpuctx = __get_cpu_context(ctx); + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); /* * Since setting cpuctx->cgrp is conditional on the current @cgrp @@ -979,7 +979,8 @@ list_update_cgroup_event(struct perf_event *event, cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; if (add) - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); + list_add(cpuctx_entry, + per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); else list_del(cpuctx_entry); } From f1ec3a517b4352e78dbef6b1e591f43202ecb3fe Mon Sep 17 00:00:00 2001 From: Benjamin Thiel Date: Thu, 9 Jan 2020 14:13:51 +0100 Subject: [PATCH 011/147] kernel/events: Add a missing prototype for arch_perf_update_userpage() ... in order to fix a -Wmissing-prototype warning. No functional changes. 
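The warning above comes from defining an external function with no declaration in scope. A minimal, hypothetical example of the shape of the fix (the real patch adds the declaration to include/linux/perf_event.h):

/*
 * Builds cleanly with: cc -Wmissing-prototypes -c widget.c
 * Without the declaration below (normally provided by a shared header),
 * gcc reports "no previous prototype for 'widget_update'".
 */
void widget_update(int value);	/* the added prototype */

void widget_update(int value)
{
	(void)value;
}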
Signed-off-by: Benjamin Thiel Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200109131351.9468-1-b.thiel@posteo.de --- include/linux/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6d4c22aee384..52928e089bc7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1544,4 +1544,8 @@ int perf_event_exit_cpu(unsigned int cpu); #define perf_event_exit_cpu NULL #endif +extern void __weak arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, + u64 now); + #endif /* _LINUX_PERF_EVENT_H */ From 979923871f69a4dc926658f9f9a1a4c1bde57552 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Jan 2020 12:54:53 +0100 Subject: [PATCH 012/147] x86/timer: Don't skip PIT setup when APIC is disabled or in legacy mode Tony reported a boot regression caused by the recent workaround for systems which have a disabled (clock gate off) PIT. On his machine the kernel fails to initialize the PIT because apic_needs_pit() does not take into account whether the local APIC interrupt delivery mode will actually allow to setup and use the local APIC timer. This should be easy to reproduce with acpi=off on the command line which also disables HPET. Due to the way the PIT/HPET and APIC setup ordering works (APIC setup can require working PIT/HPET) the information is not available at the point where apic_needs_pit() makes this decision. To address this, split out the interrupt mode selection from apic_intr_mode_init(), invoke the selection before making the decision whether PIT is required or not, and add the missing checks into apic_needs_pit(). Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") Reported-by: Anthony Buckley Tested-by: Anthony Buckley Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Daniel Drake Link: https://bugzilla.kernel.org/show_bug.cgi?id=206125 Link: https://lore.kernel.org/r/87sgk6tmk2.fsf@nanos.tec.linutronix.de --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/apic.c | 23 ++++++++++++++++++----- arch/x86/kernel/time.c | 14 +++++++++++--- arch/x86/kernel/x86_init.c | 1 + arch/x86/xen/enlighten_pv.c | 1 + 6 files changed, 35 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2ebc17d9c72c..be0b9cf941c4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -140,6 +140,7 @@ extern void apic_soft_disable(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); +extern void apic_intr_mode_select(void); extern void apic_intr_mode_init(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); @@ -188,6 +189,7 @@ static inline void disable_local_APIC(void) { } # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } static inline void init_bsp_APIC(void) { } +static inline void apic_intr_mode_select(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 19435858df5f..96d9cd208610 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -51,12 +51,14 @@ struct 
x86_init_resources { * are set up. * @intr_init: interrupt init code * @trap_init: platform specific trap setup + * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); void (*trap_init)(void); + void (*intr_mode_select)(void); void (*intr_mode_init)(void); }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 28446fa6bf18..4b0f9117e1cd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -830,8 +830,17 @@ bool __init apic_needs_pit(void) if (!tsc_khz || !cpu_khz) return true; - /* Is there an APIC at all? */ - if (!boot_cpu_has(X86_FEATURE_APIC)) + /* Is there an APIC at all or is it disabled? */ + if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) + return true; + + /* + * If interrupt delivery mode is legacy PIC or virtual wire without + * configuration, the local APIC timer wont be set up. Make sure + * that the PIT is initialized. + */ + if (apic_intr_mode == APIC_PIC || + apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG) return true; /* Virt guests may lack ARAT, but still have DEADLINE */ @@ -1322,7 +1331,7 @@ void __init sync_Arb_IDs(void) enum apic_intr_mode_id apic_intr_mode __ro_after_init; -static int __init apic_intr_mode_select(void) +static int __init __apic_intr_mode_select(void) { /* Check kernel option */ if (disable_apic) { @@ -1384,6 +1393,12 @@ static int __init apic_intr_mode_select(void) return APIC_SYMMETRIC_IO; } +/* Select the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_select(void) +{ + apic_intr_mode = __apic_intr_mode_select(); +} + /* * An initial setup of the virtual wire mode. */ @@ -1440,8 +1455,6 @@ void __init apic_intr_mode_init(void) { bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); - apic_intr_mode = apic_intr_mode_select(); - switch (apic_intr_mode) { case APIC_PIC: pr_info("APIC: Keep in PIC mode(8259)\n"); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 7ce29cee9f9e..d8673d8a779b 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -91,10 +91,18 @@ void __init hpet_time_init(void) static __init void x86_late_time_init(void) { - x86_init.timers.timer_init(); /* - * After PIT/HPET timers init, select and setup - * the final interrupt mode for delivering IRQs. + * Before PIT/HPET init, select the interrupt mode. This is required + * to make the decision whether PIT should be initialized correct. + */ + x86_init.irqs.intr_mode_select(); + + /* Setup the legacy timers */ + x86_init.timers.timer_init(); + + /* + * After PIT/HPET timers init, set up the final interrupt mode for + * delivering IRQs. 
*/ x86_init.irqs.intr_mode_init(); tsc_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ce89430a7f80..9a89261dcd2a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -80,6 +80,7 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .trap_init = x86_init_noop, + .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init }, diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ae4a41ca19f6..1f756ffffe8b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1205,6 +1205,7 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_platform.get_nmi_reason = xen_get_nmi_reason; x86_init.resources.memory_setup = xen_memory_setup; + x86_init.irqs.intr_mode_select = x86_init_noop; x86_init.irqs.intr_mode_init = x86_init_noop; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; From 656b42deddea57e55a570671db9403af0ed3c439 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 28 Jan 2020 18:25:14 -0800 Subject: [PATCH 013/147] irqchip: Some Kconfig cleanup for C-SKY Fixes to Kconfig help text: - spell out "hardware" - fix verb usage Signed-off-by: Randy Dunlap Signed-off-by: Marc Zyngier Acked-by: Guo Ren Link: https://lore.kernel.org/r/d44baeee-cceb-7c02-7249-e6b4817f0847@infradead.org --- drivers/irqchip/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 1006c694d9fb..6d397732138d 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -438,7 +438,7 @@ config CSKY_MPINTC help Say yes here to enable C-SKY SMP interrupt controller driver used for C-SKY SMP system. - In fact it's not mmio map in hw and it use ld/st to visit the + In fact it's not mmio map in hardware and it uses ld/st to visit the controller's register inside CPU. config CSKY_APB_INTC @@ -446,7 +446,7 @@ config CSKY_APB_INTC depends on CSKY help Say yes here to enable C-SKY APB interrupt controller driver used - by C-SKY single core SOC system. It use mmio map apb-bus to visit + by C-SKY single core SOC system. It uses mmio map apb-bus to visit the controller's register. config IMX_IRQSTEER From d3e42bb0a329fadff98fcb927714d0a486840e3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 27 Jan 2020 09:51:45 -0800 Subject: [PATCH 014/147] bpf: Reuse log from btf_prase_vmlinux() in btf_struct_ops_init() Instead of using a locally defined "struct bpf_verifier_log log = {}", btf_struct_ops_init() should reuse the "log" from its calling function "btf_parse_vmlinux()". It should also resolve the frame-size too large compiler warning in some ARCH. 
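The frame-size aspect of this fix is a general pattern: rather than declaring a large object on the callee's stack, borrow the instance the caller already owns. A userspace sketch with hypothetical types; big_log stands in for struct bpf_verifier_log:

#include <stdio.h>

struct big_log {
	char buf[4096];
	unsigned int len;
};

/* Borrows the caller's log instead of declaring its own 4KB on the stack. */
static void init_stuff(struct big_log *log)
{
	log->len = (unsigned int)snprintf(log->buf, sizeof(log->buf),
					  "initializing\n");
}

int main(void)
{
	static struct big_log log;	/* one instance, owned by the caller */

	init_stuff(&log);
	printf("%s(%u bytes logged)\n", log.buf, log.len);
	return 0;
}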
Fixes: 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200127175145.1154438-1-kafai@fb.com --- include/linux/bpf.h | 7 +++++-- kernel/bpf/bpf_struct_ops.c | 5 ++--- kernel/bpf/btf.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8e9ad3943cd9..49b1a70e12c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -728,7 +728,7 @@ struct bpf_struct_ops { #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); -void bpf_struct_ops_init(struct btf *btf); +void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, @@ -752,7 +752,10 @@ static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { return NULL; } -static inline void bpf_struct_ops_init(struct btf *btf) { } +static inline void bpf_struct_ops_init(struct btf *btf, + struct bpf_verifier_log *log) +{ +} static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ad1c9ea26b2..042f95534f86 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -96,12 +96,11 @@ const struct bpf_prog_ops bpf_struct_ops_prog_ops = { static const struct btf_type *module_type; -void bpf_struct_ops_init(struct btf *btf) +void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) { s32 type_id, value_id, module_id; const struct btf_member *member; struct bpf_struct_ops *st_ops; - struct bpf_verifier_log log = {}; const struct btf_type *t; char value_name[128]; const char *mname; @@ -172,7 +171,7 @@ void bpf_struct_ops_init(struct btf *btf) member->type, NULL); if (func_proto && - btf_distill_func_proto(&log, btf, + btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[j])) { pr_warn("Error in parsing func ptr %s in struct %s\n", diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b7c1660fb594..8c9d8f266bef 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3643,7 +3643,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; } - bpf_struct_ops_init(btf); + bpf_struct_ops_init(btf, log); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); From e884602b57c07fae54ff357e4b996b2053b47c1e Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 17 Jan 2020 13:52:50 +0800 Subject: [PATCH 015/147] perf parse: Refactor 'struct perf_evsel_config_term' The struct perf_evsel_config_term::val is a union which contains fields 'callgraph', 'drv_cfg' and 'branch' as string pointers. This leads to the complex code logic for handling every type's string separately, and it's hard to release string as a general way. This patch refactors the structure to add a common field 'str' in the 'val' union as string pointer and remove the other three fields 'callgraph', 'drv_cfg' and 'branch'. Without passing field name, the patch simplifies the string handling with macro ADD_CONFIG_TERM_STR() for string pointer assignment. This patch fixes multiple warnings of line over 80 characters detected by checkpatch tool. 
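The core of the refactor is that three union members which only ever held a string collapse into one, so a single macro and code path can handle every string-valued term. A reduced sketch of the before/after shape; these are stand-in types, not perf's real definitions:

/* Before: one union member per string-valued term type. */
union term_val_old {
	unsigned long long num;
	char *callgraph;
	char *drv_cfg;
	char *branch;
};

/* After: every string-valued term shares 'str', so assignment and cleanup
 * need only one code path (ADD_CONFIG_TERM_STR in the patch). */
union term_val_new {
	unsigned long long num;
	char *str;
};

With only 'str' left, duplicating and freeing string terms no longer needs to know which term type produced them, which is what the follow-up patch below relies on.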
Signed-off-by: Leo Yan Reviewed-by: Andi Kleen Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Cc: Mathieu Poirier Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20200117055251.24058-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm/util/cs-etm.c | 2 +- tools/perf/util/evsel.c | 6 +-- tools/perf/util/evsel_config.h | 4 +- tools/perf/util/parse-events.c | 62 ++++++++++++++++++++----------- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index ede040cf82ad..2898cfdf8fe1 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -226,7 +226,7 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu, if (term->type != PERF_EVSEL__CONFIG_TERM_DRV_CFG) continue; - sink = term->val.drv_cfg; + sink = term->val.str; snprintf(path, PATH_MAX, "sinks/%s", sink); ret = perf_pmu__scan_file(pmu, path, "%x", &hash); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a69e64236120..549abd43816f 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -808,12 +808,12 @@ static void apply_config_terms(struct evsel *evsel, perf_evsel__reset_sample_bit(evsel, TIME); break; case PERF_EVSEL__CONFIG_TERM_CALLGRAPH: - callgraph_buf = term->val.callgraph; + callgraph_buf = term->val.str; break; case PERF_EVSEL__CONFIG_TERM_BRANCH: - if (term->val.branch && strcmp(term->val.branch, "no")) { + if (term->val.str && strcmp(term->val.str, "no")) { perf_evsel__set_sample_bit(evsel, BRANCH_STACK); - parse_branch_str(term->val.branch, + parse_branch_str(term->val.str, &attr->branch_sample_type); } else perf_evsel__reset_sample_bit(evsel, BRANCH_STACK); diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index 1f8d2fe0b66e..b4a65201e4f7 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -36,18 +36,16 @@ struct perf_evsel_config_term { u64 period; u64 freq; bool time; - char *callgraph; - char *drv_cfg; u64 stack_user; int max_stack; bool inherit; bool overwrite; - char *branch; unsigned long max_events; bool percore; bool aux_output; u32 aux_sample_size; u64 cfg_chg; + char *str; } val; bool weak; }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index ed7c008b9c8b..f59f3c8da473 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1219,8 +1219,7 @@ static int config_attr(struct perf_event_attr *attr, static int get_config_terms(struct list_head *head_config, struct list_head *head_terms __maybe_unused) { -#define ADD_CONFIG_TERM(__type, __name, __val) \ -do { \ +#define ADD_CONFIG_TERM(__type) \ struct perf_evsel_config_term *__t; \ \ __t = zalloc(sizeof(*__t)); \ @@ -1229,9 +1228,19 @@ do { \ \ INIT_LIST_HEAD(&__t->list); \ __t->type = PERF_EVSEL__CONFIG_TERM_ ## __type; \ - __t->val.__name = __val; \ __t->weak = term->weak; \ - list_add_tail(&__t->list, head_terms); \ + list_add_tail(&__t->list, head_terms) + +#define ADD_CONFIG_TERM_VAL(__type, __name, __val) \ +do { \ + ADD_CONFIG_TERM(__type); \ + __t->val.__name = __val; \ +} while (0) + +#define ADD_CONFIG_TERM_STR(__type, __val) \ +do { \ + ADD_CONFIG_TERM(__type); \ + __t->val.str = __val; \ } while (0) struct parse_events_term *term; @@ -1239,53 +1248,62 @@ do { \ list_for_each_entry(term, head_config, list) { switch 
(term->type_term) { case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD: - ADD_CONFIG_TERM(PERIOD, period, term->val.num); + ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num); break; case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ: - ADD_CONFIG_TERM(FREQ, freq, term->val.num); + ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num); break; case PARSE_EVENTS__TERM_TYPE_TIME: - ADD_CONFIG_TERM(TIME, time, term->val.num); + ADD_CONFIG_TERM_VAL(TIME, time, term->val.num); break; case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: - ADD_CONFIG_TERM(CALLGRAPH, callgraph, term->val.str); + ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str); break; case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE: - ADD_CONFIG_TERM(BRANCH, branch, term->val.str); + ADD_CONFIG_TERM_STR(BRANCH, term->val.str); break; case PARSE_EVENTS__TERM_TYPE_STACKSIZE: - ADD_CONFIG_TERM(STACK_USER, stack_user, term->val.num); + ADD_CONFIG_TERM_VAL(STACK_USER, stack_user, + term->val.num); break; case PARSE_EVENTS__TERM_TYPE_INHERIT: - ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 1 : 0); + ADD_CONFIG_TERM_VAL(INHERIT, inherit, + term->val.num ? 1 : 0); break; case PARSE_EVENTS__TERM_TYPE_NOINHERIT: - ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1); + ADD_CONFIG_TERM_VAL(INHERIT, inherit, + term->val.num ? 0 : 1); break; case PARSE_EVENTS__TERM_TYPE_MAX_STACK: - ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num); + ADD_CONFIG_TERM_VAL(MAX_STACK, max_stack, + term->val.num); break; case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS: - ADD_CONFIG_TERM(MAX_EVENTS, max_events, term->val.num); + ADD_CONFIG_TERM_VAL(MAX_EVENTS, max_events, + term->val.num); break; case PARSE_EVENTS__TERM_TYPE_OVERWRITE: - ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0); + ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, + term->val.num ? 1 : 0); break; case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE: - ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 0 : 1); + ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite, + term->val.num ? 0 : 1); break; case PARSE_EVENTS__TERM_TYPE_DRV_CFG: - ADD_CONFIG_TERM(DRV_CFG, drv_cfg, term->val.str); + ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str); break; case PARSE_EVENTS__TERM_TYPE_PERCORE: - ADD_CONFIG_TERM(PERCORE, percore, - term->val.num ? true : false); + ADD_CONFIG_TERM_VAL(PERCORE, percore, + term->val.num ? true : false); break; case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT: - ADD_CONFIG_TERM(AUX_OUTPUT, aux_output, term->val.num ? 1 : 0); + ADD_CONFIG_TERM_VAL(AUX_OUTPUT, aux_output, + term->val.num ? 1 : 0); break; case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: - ADD_CONFIG_TERM(AUX_SAMPLE_SIZE, aux_sample_size, term->val.num); + ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size, + term->val.num); break; default: break; @@ -1322,7 +1340,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct list_head *head_config, } if (bits) - ADD_CONFIG_TERM(CFG_CHG, cfg_chg, bits); + ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits); #undef ADD_CONFIG_TERM return 0; From 3220fb8d5e59d7a9b59d02965d4209aef7691e9f Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 17 Jan 2020 13:52:51 +0800 Subject: [PATCH 016/147] perf parse: Copy string to perf_evsel_config_term perf with CoreSight fails to record trace data with command: perf record -e cs_etm/@tmc_etr0/u --per-thread ls failed to set sink "" on event cs_etm/@tmc_etr0/u with 21 (Is a directory)/perf/ This failure is root caused with the commit 1dc925568f01 ("perf parse: Add a deep delete for parse event terms"). 
The log shows, cs_etm fails to parse the sink attribution; cs_etm event relies on the event configuration to pass sink name, but the event specific configuration data cannot be passed properly with flow: get_config_terms() ADD_CONFIG_TERM(DRV_CFG, term->val.str); __t->val.str = term->val.str; `> __t->val.str is assigned to term->val.str; parse_events_terms__purge() parse_events_term__delete() zfree(&term->val.str); `> term->val.str is freed and assigned to NULL pointer; cs_etm_set_sink_attr() sink = __t->val.str; `> sink string has been freed. To fix this issue, in the function get_config_terms(), this patch changes to use strdup() for allocation a new duplicate string rather than directly assignment string pointer. This patch addes a new field 'free_str' in the data structure perf_evsel_config_term; 'free_str' is set to true when the union is used as a string pointer; thus it can tell perf_evsel__free_config_terms() to free the string. Fixes: 1dc925568f01 ("perf parse: Add a deep delete for parse event terms") Suggested-by: Jiri Olsa Signed-off-by: Leo Yan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Mark Rutland Cc: Mathieu Poirier Cc: Mike Leach Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Suzuki Poulouse Cc: linux-arm-kernel@lists.infradead.org Link: http://lore.kernel.org/lkml/20200117055251.24058-2-leo.yan@linaro.org [ Use zfree() in perf_evsel__free_config_terms ] Signed-off-by: Arnaldo Carvalho de Melo :# modified: tools/perf/util/evsel_config.h --- tools/perf/util/evsel.c | 2 ++ tools/perf/util/evsel_config.h | 1 + tools/perf/util/parse-events.c | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 549abd43816f..c8dc4450884c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1265,6 +1265,8 @@ static void perf_evsel__free_config_terms(struct evsel *evsel) list_for_each_entry_safe(term, h, &evsel->config_terms, list) { list_del_init(&term->list); + if (term->free_str) + zfree(&term->val.str); free(term); } } diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index b4a65201e4f7..e026ab67b008 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -32,6 +32,7 @@ enum evsel_term_type { struct perf_evsel_config_term { struct list_head list; enum evsel_term_type type; + bool free_str; union { u64 period; u64 freq; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f59f3c8da473..c01ba6f8fdad 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1240,7 +1240,12 @@ do { \ #define ADD_CONFIG_TERM_STR(__type, __val) \ do { \ ADD_CONFIG_TERM(__type); \ - __t->val.str = __val; \ + __t->val.str = strdup(__val); \ + if (!__t->val.str) { \ + zfree(&__t); \ + return -ENOMEM; \ + } \ + __t->free_str = true; \ } while (0) struct parse_events_term *term; From 0dd1979f7f9981dcc5c497e68390f208580032ae Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 20 Jan 2020 14:20:09 +0100 Subject: [PATCH 017/147] perf test: Fix test case Merge cpu map Commit a2408a70368a ("perf evlist: Maintain evlist->all_cpus") introduces a test case for cpumap merge operation, see functions perf_cpu_map__merge() and test__cpu_map_merge(). 
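The strdup()/free_str change in the previous patch amounts to an ownership rule: when a struct keeps a string beyond the life of its source, it must hold its own copy and remember to free it. A small userspace sketch; struct term here is an illustrative stand-in, not perf's perf_evsel_config_term:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct term {
	char *str;
	int free_str;	/* set when 'str' is owned by this struct */
};

static int term_set_str(struct term *t, const char *src)
{
	t->str = strdup(src);	/* keep a private copy */
	if (!t->str)
		return -1;
	t->free_str = 1;
	return 0;
}

static void term_destroy(struct term *t)
{
	if (t->free_str)
		free(t->str);
	t->str = NULL;
}

int main(void)
{
	char *src = strdup("tmc_etr0");
	struct term t = { 0 };

	if (!src || term_set_str(&t, src))
		return 1;
	free(src);			/* the source can go away safely now */
	printf("sink: %s\n", t.str);	/* the copy is still valid */
	term_destroy(&t);
	return 0;
}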
The test case fails on s390 with this error message: [root@m35lp76 perf]# ./perf test -Fvvvvv 52 52: Merge cpu map : --- start --- cpumask list: 1-2,4-5,7 perf: /root/linux/tools/include/linux/refcount.h:131:\ refcount_sub_and_test: Assertion `!(new > val)' failed. Aborted (core dumped) [root@m35lp76 perf]# The root cause is in the function test__cpu_map_merge(): It creates two cpu_maps named 'a' and 'b': struct perf_cpu_map *a = perf_cpu_map__new("4,2,1"); struct perf_cpu_map *b = perf_cpu_map__new("4,5,7"); and creates a third map named 'c' which is the result of the merge of maps a and b: struct perf_cpu_map *c = perf_cpu_map__merge(a, b); After some verification of the merged cpu_map, all three of them have their reference count reduced and are freed: perf_cpu_map__put(a); (1) perf_cpu_map__put(b); perf_cpu_map__put(c); The release of perf_cpu_map__put(a) is wrong. The map is already released and freed as part of the function perf_cpu_map__merge(struct perf_cpu_map *orig, | struct perf_cpu_map *other) +--> perf_cpu_map__put(orig); | +--> cpu_map__delete(orig) At the end perf_cpu_map__put() is called for map 'orig' alias 'a' and since the reference count is 1, the map is deleted, as can be seen by the following gdb trace: (gdb) where #0 tcache_put (tc_idx=0, chunk=0x156cc30) at malloc.c:2940 #1 _int_free (av=0x3fffd49ee80 , p=0x156cc30, have_lock=) at malloc.c:4222 #2 0x00000000012d5e78 in cpu_map__delete (map=0x156cc40) at cpumap.c:31 #3 0x00000000012d5f7a in perf_cpu_map__put (map=0x156cc40) at cpumap.c:45 #4 0x00000000012d723a in perf_cpu_map__merge (orig=0x156cc40, other=0x156cc60) at cpumap.c:343 #5 0x000000000110cdd0 in test__cpu_map_merge ( test=0x14ea6c8 , subtest=-1) at tests/cpumap.c:128 Thus the perf_cpu_map__put(a) (see (1) above) frees map 'a' a second time and causes the failure. Fix this by removing that function call. Output after: [root@m35lp76 perf]# ./perf test -Fvvvvv 52 52: Merge cpu map : --- start --- cpumask list: 1-2,4-5,7 ---- end ---- Merge cpu map: Ok [root@m35lp76 perf]# Signed-off-by: Thomas Richter Reviewed-by: Andi Kleen Cc: Heiko Carstens Cc: Vasily Gorbik Cc: sumanthk@linux.ibm.com Link: http://lore.kernel.org/lkml/20200120132011.64698-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/cpumap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 4ac56741ac5f..29c793ac7d10 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -131,7 +131,6 @@ int test__cpu_map_merge(struct test *test __maybe_unused, int subtest __maybe_un TEST_ASSERT_VAL("failed to merge map: bad nr", c->nr == 5); cpu_map__snprint(c, buf, sizeof(buf)); TEST_ASSERT_VAL("failed to merge map: bad result", !strcmp(buf, "1-2,4-5,7")); - perf_cpu_map__put(a); perf_cpu_map__put(b); perf_cpu_map__put(c); return 0; From 0ada120c883d4f1f6aafd01cf0fbb10d8bbba015 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 28 Jan 2020 23:29:38 +0800 Subject: [PATCH 018/147] perf: Make perf able to build with latest libbfd libbfd has changed the bfd_section_* macros to inline functions bfd_section_ since 2019-09-18. See the two commits below: o http://www.sourceware.org/ml/gdb-cvs/2019-09/msg00064.html o https://www.sourceware.org/ml/gdb-cvs/2019-09/msg00072.html This fix makes perf able to build with both old and new libbfd.
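The build fix relies on a property of the libbfd change itself: the old spellings were macros, so their names are visible to the preprocessor, while the new inline functions are not; #ifdef on the old name therefore selects the right call at compile time. A self-contained sketch of that trick; lib_flags_old/lib_flags_new are hypothetical stand-ins for bfd_get_section_flags()/bfd_section_flags():

#include <stdio.h>

/* Pretend header from the "old" library version: */
#define lib_flags_old(sec)	((sec)->flags)

struct section { unsigned flags; };

static unsigned section_flags(const struct section *sec)
{
#ifdef lib_flags_old
	return lib_flags_old(sec);	/* old, macro-based API */
#else
	return lib_flags_new(sec);	/* new, function-based API */
#endif
}

int main(void)
{
	struct section s = { .flags = 0x2 };

	printf("flags: %#x\n", section_flags(&s));
	return 0;
}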
Signed-off-by: Changbin Du Acked-by: Jiri Olsa Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200128152938.31413-1-changbin.du@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/srcline.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 6ccf6f6d09df..5b7d6c16d33f 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -193,16 +193,30 @@ static void find_address_in_section(bfd *abfd, asection *section, void *data) bfd_vma pc, vma; bfd_size_type size; struct a2l_data *a2l = data; + flagword flags; if (a2l->found) return; - if ((bfd_get_section_flags(abfd, section) & SEC_ALLOC) == 0) +#ifdef bfd_get_section_flags + flags = bfd_get_section_flags(abfd, section); +#else + flags = bfd_section_flags(section); +#endif + if ((flags & SEC_ALLOC) == 0) return; pc = a2l->addr; +#ifdef bfd_get_section_vma vma = bfd_get_section_vma(abfd, section); +#else + vma = bfd_section_vma(section); +#endif +#ifdef bfd_get_section_size size = bfd_get_section_size(section); +#else + size = bfd_section_size(section); +#endif if (pc < vma || pc >= vma + size) return; From 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 Mon Sep 17 00:00:00 2001 From: Arun Easi Date: Thu, 23 Jan 2020 20:50:14 -0800 Subject: [PATCH 019/147] scsi: qla2xxx: Fix unbound NVME response length On certain cases when response length is less than 32, NVME response data is supplied inline in IOCB. This is indicated by some combination of state flags. There was an instance when a high, and incorrect, response length was indicated causing driver to overrun buffers. Fix this by checking and limiting the response payload length. Fixes: 7401bc18d1ee3 ("scsi: qla2xxx: Add FC-NVMe command handling") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200124045014.23554-1-hmadhani@marvell.com Signed-off-by: Arun Easi Signed-off-by: Himanshu Madhani Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_dbg.c | 6 ------ drivers/scsi/qla2xxx/qla_dbg.h | 6 ++++++ drivers/scsi/qla2xxx/qla_isr.c | 12 ++++++++++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_dbg.c b/drivers/scsi/qla2xxx/qla_dbg.c index e5500bba06ca..88a56e8480f7 100644 --- a/drivers/scsi/qla2xxx/qla_dbg.c +++ b/drivers/scsi/qla2xxx/qla_dbg.c @@ -2519,12 +2519,6 @@ qla83xx_fw_dump_failed: /* Driver Debug Functions. */ /****************************************************************************/ -static inline int -ql_mask_match(uint level) -{ - return (level & ql2xextended_error_logging) == level; -} - /* * This function is for formatting and logging debug information. * It is to be used when vha is available. 
It formats the message diff --git a/drivers/scsi/qla2xxx/qla_dbg.h b/drivers/scsi/qla2xxx/qla_dbg.h index bb01b680ce9f..433e95502808 100644 --- a/drivers/scsi/qla2xxx/qla_dbg.h +++ b/drivers/scsi/qla2xxx/qla_dbg.h @@ -374,3 +374,9 @@ extern int qla24xx_dump_ram(struct qla_hw_data *, uint32_t, uint32_t *, extern void qla24xx_pause_risc(struct device_reg_24xx __iomem *, struct qla_hw_data *); extern int qla24xx_soft_reset(struct qla_hw_data *); + +static inline int +ql_mask_match(uint level) +{ + return (level & ql2xextended_error_logging) == level; +} diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index e7bad0bfffda..e40705d38cea 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -1939,6 +1939,18 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, inbuf = (uint32_t *)&sts->nvme_ersp_data; outbuf = (uint32_t *)fd->rspaddr; iocb->u.nvme.rsp_pyld_len = le16_to_cpu(sts->nvme_rsp_pyld_len); + if (unlikely(iocb->u.nvme.rsp_pyld_len > + sizeof(struct nvme_fc_ersp_iu))) { + if (ql_mask_match(ql_dbg_io)) { + WARN_ONCE(1, "Unexpected response payload length %u.\n", + iocb->u.nvme.rsp_pyld_len); + ql_log(ql_log_warn, fcport->vha, 0x5100, + "Unexpected response payload length %u.\n", + iocb->u.nvme.rsp_pyld_len); + } + iocb->u.nvme.rsp_pyld_len = + sizeof(struct nvme_fc_ersp_iu); + } iter = iocb->u.nvme.rsp_pyld_len >> 2; for (; iter; iter--) *outbuf++ = swab32(*inbuf++); From 92b4f9d150593a7a78d9872c2d5dc05ffae4521b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 13 Jan 2020 14:26:09 +0100 Subject: [PATCH 020/147] scsi: megaraid_sas: fixup MSIx interrupt setup during resume Streamline resume workflow by using the same functions for enabling MSIx interrupts as used during initialisation. Without it the driver might crash during resume with: WARNING: CPU: 2 PID: 4306 at ../drivers/pci/msi.c:1303 pci_irq_get_affinity+0x3b/0x90 Link: https://lore.kernel.org/r/20200113132609.69536-1-hare@suse.de Signed-off-by: Hannes Reinecke Acked-by: Sumit Saxena Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 43cbc749f66c..07a33fdf2316 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -7604,7 +7604,6 @@ megasas_resume(struct pci_dev *pdev) int rval; struct Scsi_Host *host; struct megasas_instance *instance; - int irq_flags = PCI_IRQ_LEGACY; u32 status_reg; instance = pci_get_drvdata(pdev); @@ -7673,16 +7672,15 @@ megasas_resume(struct pci_dev *pdev) atomic_set(&instance->ldio_outstanding, 0); /* Now re-enable MSI-X */ - if (instance->msix_vectors) { - irq_flags = PCI_IRQ_MSIX; - if (instance->smp_affinity_enable) - irq_flags |= PCI_IRQ_AFFINITY; + if (instance->msix_vectors) + megasas_alloc_irq_vectors(instance); + + if (!instance->msix_vectors) { + rval = pci_alloc_irq_vectors(instance->pdev, 1, 1, + PCI_IRQ_LEGACY); + if (rval < 0) + goto fail_reenable_msix; } - rval = pci_alloc_irq_vectors(instance->pdev, 1, - instance->msix_vectors ? 
- instance->msix_vectors : 1, irq_flags); - if (rval < 0) - goto fail_reenable_msix; megasas_setup_reply_map(instance); From 0171c1c10d2684a6bbaf4d0a1a64fd4b40d69644 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Wed, 22 Jan 2020 11:27:51 +0100 Subject: [PATCH 021/147] scsi: MAINTAINERS: ufs: remove pedrom.sousa@synopsys.com Pedro has left Synopsys and his email address doesn't work anymore. Everytime after sending email I will receive his undeliverable email. Remove his email address from MAINTAINERS. Signed-off-by: Bean Huo Signed-off-by: Martin K. Petersen --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e09bd92a1e44..8c8956ab248c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16957,7 +16957,6 @@ F: drivers/staging/unisys/ UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER R: Alim Akhtar R: Avri Altman -R: Pedro Sousa L: linux-scsi@vger.kernel.org S: Supported F: Documentation/scsi/ufs.txt From 20bc1ad2e4da2e59e1b8481f19f022d78b43429e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Jan 2020 00:57:06 +0000 Subject: [PATCH 022/147] scsi: pm80xx: fix spelling mistake "to" -> "too" There is a spelling mistake in a pm8001_printk message. Fix it. Link: https://lore.kernel.org/r/20200123005706.2834281-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Jack Wang Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm80xx_hwi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 98dcdbd146d5..d1d95f1a2c6a 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2377,7 +2377,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb) ts->buf_valid_size = sizeof(*resp); } else PM8001_IO_DBG(pm8001_ha, - pm8001_printk("response to large\n")); + pm8001_printk("response too large\n")); } if (pm8001_dev) pm8001_dev->running_req--; From e0a514259378718e0deea1def03b7025a0daaf42 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 22 Jan 2020 09:12:50 +0000 Subject: [PATCH 023/147] scsi: ufs: fix spelling mistake "initilized" -> "initialized" There is a spelling mistake in a pr_err message. Fix it. Link: https://lore.kernel.org/r/20200122091250.2777221-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Bean Huo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h index dde2eb02f76f..cfe380348bf0 100644 --- a/drivers/scsi/ufs/ufs.h +++ b/drivers/scsi/ufs/ufs.h @@ -546,7 +546,7 @@ static inline bool ufs_is_valid_unit_desc_lun(struct ufs_dev_info *dev_info, u8 lun) { if (!dev_info || !dev_info->max_lu_supported) { - pr_err("Max General LU supported by UFS isn't initilized\n"); + pr_err("Max General LU supported by UFS isn't initialized\n"); return false; } From 2577e373bbc07f18680287b3958d3b2fdebe4b20 Mon Sep 17 00:00:00 2001 From: Yulia Kartseva Date: Thu, 30 Jan 2020 15:13:10 -0800 Subject: [PATCH 024/147] runqslower: Fix Makefile Fix undefined reference linker errors when building runqslower with gcc 7.4.0 on Ubuntu 18.04. 
The issue is with misplaced -lelf, -lz options in Makefile: $(Q)$(CC) $(CFLAGS) -lelf -lz $^ -o $@ -lelf, -lz options should follow the list of target dependencies: $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ or after substitution cc -g -Wall runqslower.o libbpf.a -lelf -lz -o runqslower The current order of gcc params causes failure in libelf symbols resolution, e.g. undefined reference to `elf_memory' Fixes: 9c01546d26d2 ("tools/bpf: Add runqslower tool to tools/bpf") Signed-off-by: Julia Kartseva Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/908498f794661c44dca54da9e09dc0c382df6fcb.1580425879.git.hex@fb.com --- tools/bpf/runqslower/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 0c021352beed..87eae5be9bcd 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -41,7 +41,7 @@ clean: $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) $(call msg,BINARY,$@) - $(Q)$(CC) $(CFLAGS) -lelf -lz $^ -o $@ + $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o From 1873f1547dde65c687de143938581347a9312207 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 20 Jan 2020 14:20:10 +0100 Subject: [PATCH 025/147] perf probe: Add ustring support for perf probe command Kernel commit 88903c464321 ("tracing/probe: Add ustring type for user-space string") adds support for user-space strings when type 'ustring' is specified. Here is an example using sysfs command line interface for kprobes: Function to probe: struct filename * getname_flags(const char __user *filename, int flags, int *empty) Setup: # cd /sys/kernel/debug/tracing/ # echo 'p:tmr1 getname_flags +0(%r2):ustring' > kprobe_events # cat events/kprobes/tmr1/format | fgrep print print fmt: "(%lx) arg1=\"%s\"", REC->__probe_ip, REC->arg1 # echo 1 > events/kprobes/tmr1/enable # touch /tmp/111 # echo 0 > events/kprobes/tmr1/enable # cat trace|fgrep /tmp/111 touch-5846 [005] d..2 255520.717960: tmr1:\ (getname_flags+0x0/0x400) arg1="/tmp/111" Doing the same with the perf tool fails. Using type 'string' succeeds: # perf probe "vfs_getname=getname_flags:72 pathname=filename:string" Added new event: probe:vfs_getname (on getname_flags:72 with pathname=filename:string) .... # perf probe -d probe:vfs_getname Removed event: probe:vfs_getname However using type 'ustring' fails (output before): # perf probe "vfs_getname=getname_flags:72 pathname=filename:ustring" Failed to write event: Invalid argument Error: Failed to add events. # Fix this by adding type 'ustring' in function convert_variable_type(). Using ustring succeeds (output after): # ./perf probe "vfs_getname=getname_flags:72 pathname=filename:ustring" Added new event: probe:vfs_getname (on getname_flags:72 with pathname=filename:ustring) You can now use it in all perf tools, such as: perf record -e probe:vfs_getname -aR sleep 1 # Note: This issue also exists on x86, it is not s390 specific. 
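The one-line fix above simply adds "ustring" to the set of cast names that convert_variable_type() accepts as string types. A reduced model of that check, using a hypothetical helper rather than perf's actual code:

#include <stdio.h>
#include <string.h>

static int is_string_cast(const char *cast)
{
	return cast && (!strcmp(cast, "string") || !strcmp(cast, "ustring"));
}

int main(void)
{
	const char *casts[] = { "string", "ustring", "x" };

	for (int i = 0; i < 3; i++)
		printf("%-8s -> %s\n", casts[i],
		       is_string_cast(casts[i]) ? "string type" : "other");
	return 0;
}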
Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Vasily Gorbik Cc: sumanthk@linux.ibm.com Link: http://lore.kernel.org/lkml/20200120132011.64698-2-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index c470c49a804f..1c817add6ca4 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -303,7 +303,8 @@ static int convert_variable_type(Dwarf_Die *vr_die, char prefix; /* TODO: check all types */ - if (cast && strcmp(cast, "string") != 0 && strcmp(cast, "x") != 0 && + if (cast && strcmp(cast, "string") != 0 && strcmp(cast, "ustring") && + strcmp(cast, "x") != 0 && strcmp(cast, "s") != 0 && strcmp(cast, "u") != 0) { /* Non string type is OK */ /* and respect signedness/hexadecimal cast */ From 85fc95d75970ee7dd8e01904e7fb1197c275ba6b Mon Sep 17 00:00:00 2001 From: Cengiz Can Date: Mon, 20 Jan 2020 17:15:54 +0300 Subject: [PATCH 026/147] perf maps: Add missing unlock to maps__insert() error case `tools/perf/util/map.c` has a function named `maps__insert` that acquires a write lock if its in multithread context. Even though this lock is released when function successfully completes, there's a branch that is executed when `maps_by_name == NULL` that returns from this function without releasing the write lock. Added an `up_write` to release the lock when this happens. Fixes: a7c2b572e217 ("perf map_groups: Auto sort maps by name, if needed") Signed-off-by: Cengiz Can Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20200120141553.23934-1-cengiz@kernel.wtf Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index fdd5bddb3075..f67960bedebb 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -549,6 +549,7 @@ void maps__insert(struct maps *maps, struct map *map) if (maps_by_name == NULL) { __maps__free_maps_by_name(maps); + up_write(&maps->lock); return; } From 2b73ea3796242608b4ccf019ff217156c92e92fe Mon Sep 17 00:00:00 2001 From: Steven Clarkson Date: Thu, 30 Jan 2020 16:48:16 -0800 Subject: [PATCH 027/147] x86/boot: Handle malformed SRAT tables during early ACPI parsing Break an infinite loop when early parsing of the SRAT table is caused by a subtable with zero length. Known to affect the ASUS WS X299 SAGE motherboard with firmware version 1201 which has a large block of zeros in its SRAT table. The kernel could boot successfully on this board/firmware prior to the introduction of early parsing this table or after a BIOS update. [ bp: Fixup whitespace damage and commit message. Make it return 0 to denote that there are no immovable regions because who knows what else is broken in this BIOS. 
] Fixes: 02a3e3cdb7f1 ("x86/boot: Parse SRAT table and count immovable memory regions") Signed-off-by: Steven Clarkson Signed-off-by: Borislav Petkov Cc: linux-acpi@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=206343 Link: https://lkml.kernel.org/r/CAHKq8taGzj0u1E_i=poHUam60Bko5BpiJ9jn0fAupFUYexvdUQ@mail.gmail.com --- arch/x86/boot/compressed/acpi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 25019d42ae93..ef2ad7253cd5 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -393,7 +393,13 @@ int count_immovable_mem_regions(void) table = table_addr + sizeof(struct acpi_table_srat); while (table + sizeof(struct acpi_subtable_header) < table_end) { + sub_table = (struct acpi_subtable_header *)table; + if (!sub_table->length) { + debug_putstr("Invalid zero length SRAT subtable.\n"); + return 0; + } + if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) { struct acpi_srat_mem_affinity *ma; From 6f1a4891a5928a5969c87fa5a584844c983ec823 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 Jan 2020 15:26:52 +0100 Subject: [PATCH 028/147] x86/apic/msi: Plug non-maskable MSI affinity race Evan tracked down a subtle race between the update of the MSI message and the device raising an interrupt internally on PCI devices which do not support MSI masking. The update of the MSI message is non-atomic and consists of either 2 or 3 sequential 32bit wide writes to the PCI config space. - Write address low 32bits - Write address high 32bits (If supported by device) - Write data When an interrupt is migrated then both address and data might change, so the kernel attempts to mask the MSI interrupt first. But for MSI masking is optional, so there exist devices which do not provide it. That means that if the device raises an interrupt internally between the writes then a MSI message is sent built from half updated state. On x86 this can lead to spurious interrupts on the wrong interrupt vector when the affinity setting changes both address and data. As a consequence the device interrupt can be lost causing the device to become stuck or malfunctioning. Evan tried to handle that by disabling MSI accross an MSI message update. That's not feasible because disabling MSI has issues on its own: If MSI is disabled the PCI device is routing an interrupt to the legacy INTx mechanism. The INTx delivery can be disabled, but the disablement is not working on all devices. Some devices lose interrupts when both MSI and INTx delivery are disabled. Another way to solve this would be to enforce the allocation of the same vector on all CPUs in the system for this kind of screwed devices. That could be done, but it would bring back the vector space exhaustion problems which got solved a few years ago. Fortunately the high address (if supported by the device) is only relevant when X2APIC is enabled which implies interrupt remapping. In the interrupt remapping case the affinity setting is happening at the interrupt remapping unit and the PCI MSI message is programmed only once when the PCI device is initialized. That makes it possible to solve it with a two step update: 1) Target the MSI msg to the new vector on the current target CPU 2) Target the MSI msg to the new vector on the new target CPU In both cases writing the MSI message is only changing a single 32bit word which prevents the issue of inconsistency. 
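As an illustration of the two-step update just described (a schematic only, not the patch's code; the real msi_set_affinity() follows below), each step rewrites a single 32-bit word of the message, so the device can never latch a half-updated vector/destination pair:

struct msi_msg_sketch {
	unsigned int address_lo;	/* carries the destination APIC ID */
	unsigned int address_hi;
	unsigned int data;		/* carries the vector */
};

/* Hypothetical helper; the compose/write steps are omitted. */
static void two_step_retarget(struct msi_msg_sketch *msg,
			      unsigned int new_apicid, unsigned int new_vector)
{
	/* Step 1: switch to the new vector while still aimed at the current
	 * CPU, then write the message (only one 32-bit word changed). */
	msg->data = new_vector;

	/* Step 2: keep the vector and retarget the new CPU, then write the
	 * message again (again only one word changes). In reality the APIC ID
	 * is a field inside address_lo, not the whole word. */
	msg->address_lo = new_apicid;
}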
After writing the final destination it is necessary to check whether the device issued an interrupt while the intermediate state #1 (new vector, current CPU) was in effect. This is possible because the affinity change is always happening on the current target CPU. The code runs with interrupts disabled, so the interrupt can be detected by checking the IRR of the local APIC. If the vector is pending in the IRR then the interrupt is retriggered on the new target CPU by sending an IPI for the associated vector on the target CPU. This can cause spurious interrupts on both the local and the new target CPU.

1) If the new vector is not in use on the local CPU and the device affected by the affinity change raised an interrupt during the transitional state (step #1 above) then interrupt entry code will ignore that spurious interrupt. The vector is marked so that the 'No irq handler for vector' warning is suppressed once.

2) If the new vector is in use already on the local CPU then the IRR check might see a pending interrupt from the device which is using this vector. The IPI to the new target CPU will then invoke the handler of the device, which got the affinity change, even if that device did not issue an interrupt.

3) If the new vector is in use already on the local CPU and the device affected by the affinity change raised an interrupt during the transitional state (step #1 above) then the handler of the device which uses that vector on the local CPU will be invoked.

This can expose issues in device driver interrupt handlers which are not prepared to handle a spurious interrupt correctly. This is not a regression; it's just exposing something which was already broken, as spurious interrupts can happen for a lot of reasons and all driver handlers need to be able to deal with them.
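[ Note: the two-step update described above condenses to roughly the following sketch. It is illustrative only: vector locking, the non-maskable quirk check and the early-exit conditions are omitted, and the complete implementation is msi_set_affinity() in the diff below. ]

  /* Condensed sketch of msi_set_affinity(); simplified, not the full code. */
  static int msi_set_affinity_sketch(struct irq_data *irqd,
  				     const struct cpumask *mask, bool force)
  {
  	struct irq_cfg old_cfg = *irqd_cfg(irqd);	/* old vector/CPU */
  	struct irq_cfg *cfg;
  	int ret;

  	/* Allocate the new vector on the new target CPU */
  	ret = irqd->parent_data->chip->irq_set_affinity(irqd->parent_data,
  							mask, force);
  	cfg = irqd_cfg(irqd);

  	/* Step 1: MSI message targets the new vector on the current CPU */
  	old_cfg.vector = cfg->vector;
  	irq_msi_update_msg(irqd, &old_cfg);

  	/* Step 2: MSI message targets the new vector on the new CPU */
  	irq_msi_update_msg(irqd, cfg);

  	/* Device fired into the transitional state? Retrigger it. */
  	if (lapic_vector_set_in_irr(cfg->vector))
  		irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);

  	return ret;
  }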
Reported-by: Evan Green Debugged-by: Evan Green Signed-off-by: Thomas Gleixner Tested-by: Evan Green Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87imkr4s7n.fsf@nanos.tec.linutronix.de --- arch/x86/include/asm/apic.h | 8 +++ arch/x86/kernel/apic/msi.c | 128 +++++++++++++++++++++++++++++++++++- include/linux/irq.h | 18 +++++ include/linux/irqdomain.h | 7 ++ kernel/irq/debugfs.c | 1 + kernel/irq/msi.c | 5 +- 6 files changed, 163 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index be0b9cf941c4..19e94af9cc5d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -454,6 +454,14 @@ static inline void ack_APIC_irq(void) apic_eoi(); } + +static inline bool lapic_vector_set_in_irr(unsigned int vector) +{ + u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + + return !!(irr & (1U << (vector % 32))); +} + static inline unsigned default_get_apic_id(unsigned long x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7f7533462474..159bd0cb8548 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -23,10 +23,8 @@ static struct irq_domain *msi_default_domain; -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); - msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) @@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg); +} + +static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) +{ + struct msi_msg msg[2] = { [1] = { }, }; + + __irq_msi_compose_msg(cfg, msg); + irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); +} + +static int +msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) +{ + struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); + struct irq_data *parent = irqd->parent_data; + unsigned int cpu; + int ret; + + /* Save the current configuration */ + cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + old_cfg = *cfg; + + /* Allocate a new target vector */ + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * For non-maskable and non-remapped MSI interrupts the migration + * to a different destination CPU and a different vector has to be + * done careful to handle the possible stray interrupt which can be + * caused by the non-atomic update of the address/data pair. + * + * Direct update is possible when: + * - The MSI is maskable (remapped MSI does not use this code path)). + * The quirk bit is not set in this case. + * - The new vector is the same as the old vector + * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The new destination CPU is the same as the old destination CPU + */ + if (!irqd_msi_nomask_quirk(irqd) || + cfg->vector == old_cfg.vector || + old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + cfg->dest_apicid == old_cfg.dest_apicid) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Paranoia: Validate that the interrupt target is the local + * CPU. 
+ */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Redirect the interrupt to the new vector on the current CPU + * first. This might cause a spurious interrupt on this vector if + * the device raises an interrupt right between this update and the + * update to the final destination CPU. + * + * If the vector is in use then the installed device handler will + * denote it as spurious which is no harm as this is a rare event + * and interrupt handlers have to cope with spurious interrupts + * anyway. If the vector is unused, then it is marked so it won't + * trigger the 'No irq handler for vector' warning in do_IRQ(). + * + * This requires to hold vector lock to prevent concurrent updates to + * the affected vector. + */ + lock_vector_lock(); + + /* + * Mark the new target vector on the local CPU if it is currently + * unused. Reuse the VECTOR_RETRIGGERED state which is also used in + * the CPU hotplug path for a similar purpose. This cannot be + * undone here as the current CPU has interrupts disabled and + * cannot handle the interrupt before the whole set_affinity() + * section is done. In the CPU unplug case, the current CPU is + * about to vanish and will not handle any interrupts anymore. The + * vector is cleaned up when the CPU comes online again. + */ + if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) + this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); + + /* Redirect it to the new vector on the local CPU temporarily */ + old_cfg.vector = cfg->vector; + irq_msi_update_msg(irqd, &old_cfg); + + /* Now transition it to the target CPU */ + irq_msi_update_msg(irqd, cfg); + + /* + * All interrupts after this point are now targeted at the new + * vector/CPU. + * + * Drop vector lock before testing whether the temporary assignment + * to the local CPU was hit by an interrupt raised in the device, + * because the retrigger function acquires vector lock again. + */ + unlock_vector_lock(); + + /* + * Check whether the transition raced with a device interrupt and + * is pending in the local APICs IRR. It is safe to do this outside + * of vector lock as the irq_desc::lock of this interrupt is still + * held and interrupts are disabled: The check is not accessing the + * underlying vector store. It's just checking the local APIC's + * IRR. + */ + if (lapic_vector_set_in_irr(cfg->vector)) + irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); + + return ret; +} + /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. 
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_set_affinity = msi_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent) } if (!msi_default_domain) pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + else + msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } #ifdef CONFIG_IRQ_REMAP diff --git a/include/linux/irq.h b/include/linux/irq.h index 7853eb9301f2..3ed5a055b5f4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -209,6 +209,8 @@ struct irq_data { * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode + * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change + * required */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -231,6 +233,7 @@ enum { IRQD_SINGLE_TARGET = (1 << 24), IRQD_DEFAULT_TRIGGER_SET = (1 << 25), IRQD_CAN_RESERVE = (1 << 26), + IRQD_MSI_NOMASK_QUIRK = (1 << 27), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(struct irq_data *d) return __irqd_to_state(d) & IRQD_CAN_RESERVE; } +static inline void irqd_set_msi_nomask_quirk(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK; +} + +static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK; +} + +static inline bool irqd_msi_nomask_quirk(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 3c340dbc5a1f..4da8df57618a 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -206,6 +206,13 @@ enum { /* Irq domain implements MSI remapping */ IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), + /* + * Quirk to handle MSI implementations which do not provide + * masking. Currently known to affect x86, but partially + * handled in core code. 
+ */ + IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c1eccd4f6520..a949bd39e343 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), + BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ad26fbcfbfc8..eb95f6106a1e 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, continue; irq_data = irq_domain_get_irq_data(domain, desc->irq); - if (!can_reserve) + if (!can_reserve) { irqd_clr_can_reserve(irq_data); + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + irqd_set_msi_nomask_quirk(irq_data); + } ret = irq_domain_activate_irq(irq_data, can_reserve); if (ret) goto cleanup; From 05bd330a7fd8875c423fc07d8ddcad73c10e556e Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 6 Jan 2020 14:42:39 -0800 Subject: [PATCH 029/147] x86/hyperv: Suspend/resume the hypercall page for hibernation For hibernation the hypercall page must be disabled before the hibernation image is created so that subsequent hypercall operations fail safely. On resume the hypercall page has to be restored and reenabled to ensure proper operation of the resumed kernel. Implement the necessary suspend/resume callbacks. [ tglx: Decrypted changelog ] Signed-off-by: Dexuan Cui Signed-off-by: Thomas Gleixner Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1578350559-130275-1-git-send-email-decui@microsoft.com --- arch/x86/hyperv/hv_init.c | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index caaf4dce99bf..b0da5320bcff 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -21,11 +21,15 @@ #include #include #include +#include #include void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); +/* Storage to save the hypercall page temporarily for hibernation */ +static void *hv_hypercall_pg_saved; + u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); @@ -246,6 +250,48 @@ static int __init hv_pci_init(void) return 1; } +static int hv_suspend(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* + * Reset the hypercall page as it is going to be invalidated + * accross hibernation. Setting hv_hypercall_pg to NULL ensures + * that any subsequent hypercall operation fails safely instead of + * crashing due to an access of an invalid page. The hypercall page + * pointer is restored on resume. 
+ */ + hv_hypercall_pg_saved = hv_hypercall_pg; + hv_hypercall_pg = NULL; + + /* Disable the hypercall page in the hypervisor */ + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 0; + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + return 0; +} + +static void hv_resume(void) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + + /* Re-enable the hypercall page */ + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = + vmalloc_to_pfn(hv_hypercall_pg_saved); + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + hv_hypercall_pg = hv_hypercall_pg_saved; + hv_hypercall_pg_saved = NULL; +} + +static struct syscore_ops hv_syscore_ops = { + .suspend = hv_suspend, + .resume = hv_resume, +}; + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -330,6 +376,8 @@ void __init hyperv_init(void) x86_init.pci.arch_init = hv_pci_init; + register_syscore_ops(&hv_syscore_ops); + return; remove_cpuhp_state: @@ -349,6 +397,8 @@ void hyperv_cleanup(void) { union hv_x64_msr_hypercall_contents hypercall_msr; + unregister_syscore_ops(&hv_syscore_ops); + /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); From febac332a819f0e764aa4da62757ba21d18c182b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 31 Jan 2020 19:08:59 +0300 Subject: [PATCH 030/147] clocksource: Prevent double add_timer_on() for watchdog_timer Kernel crashes inside QEMU/KVM are observed: kernel BUG at kernel/time/timer.c:1154! BUG_ON(timer_pending(timer) || !timer->function) in add_timer_on(). At the same time another cpu got: general protection fault: 0000 [#1] SMP PTI of poinson pointer 0xdead000000000200 in: __hlist_del at include/linux/list.h:681 (inlined by) detach_timer at kernel/time/timer.c:818 (inlined by) expire_timers at kernel/time/timer.c:1355 (inlined by) __run_timers at kernel/time/timer.c:1686 (inlined by) run_timer_softirq at kernel/time/timer.c:1699 Unfortunately kernel logs are badly scrambled, stacktraces are lost. Printing the timer->function before the BUG_ON() pointed to clocksource_watchdog(). The execution of clocksource_watchdog() can race with a sequence of clocksource_stop_watchdog() .. clocksource_start_watchdog(): expire_timers() detach_timer(timer, true); timer->entry.pprev = NULL; raw_spin_unlock_irq(&base->lock); call_timer_fn clocksource_watchdog() clocksource_watchdog_kthread() or clocksource_unbind() spin_lock_irqsave(&watchdog_lock, flags); clocksource_stop_watchdog(); del_timer(&watchdog_timer); watchdog_running = 0; spin_unlock_irqrestore(&watchdog_lock, flags); spin_lock_irqsave(&watchdog_lock, flags); clocksource_start_watchdog(); add_timer_on(&watchdog_timer, ...); watchdog_running = 1; spin_unlock_irqrestore(&watchdog_lock, flags); spin_lock(&watchdog_lock); add_timer_on(&watchdog_timer, ...); BUG_ON(timer_pending(timer) || !timer->function); timer_pending() -> true BUG() I.e. inside clocksource_watchdog() watchdog_timer could be already armed. Check timer_pending() before calling add_timer_on(). This is sufficient as all operations are synchronized by watchdog_lock. 
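[ Note: a minimal sketch of the guarded re-arm described above; simplified, the actual change is the clocksource.c hunk below. add_timer_on() contains BUG_ON(timer_pending(timer)), so the timer_pending() check is what prevents the crash. ]

  /* Re-arm only if the watchdog timer is not already queued. The caller
   * holds watchdog_lock, which serializes this against the concurrent
   * clocksource_stop_watchdog()/clocksource_start_watchdog() pair. */
  if (!timer_pending(&watchdog_timer)) {
  	watchdog_timer.expires += WATCHDOG_INTERVAL;
  	add_timer_on(&watchdog_timer, next_cpu);
  }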
Fixes: 75c5158f70c0 ("timekeeping: Update clocksource with stop_machine") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/158048693917.4378.13823603769948933793.stgit@buzz --- kernel/time/clocksource.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fff5f64981c6..428beb69426a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -293,8 +293,15 @@ static void clocksource_watchdog(struct timer_list *unused) next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + + /* + * Arm timer if not already pending: could race with concurrent + * pair clocksource_stop_watchdog() clocksource_start_watchdog(). + */ + if (!timer_pending(&watchdog_timer)) { + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); + } out: spin_unlock(&watchdog_lock); } From 59365cadfbcd299b8cdbe0c165faf15767c5f166 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 2 Feb 2020 00:33:04 +0100 Subject: [PATCH 031/147] efi/x86: Fix boot regression on systems with invalid memmap entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In efi_clean_memmap(), we do a pass over the EFI memory map to remove bogus entries that may be returned on certain systems. This recent commit: 1db91035d01aa8bf ("efi: Add tracking for dynamically allocated memmaps") refactored this code to pass the input to efi_memmap_install() via a temporary struct on the stack, which is populated using an initializer which inadvertently defines the value of its size field in terms of its desc_size field, which value cannot be relied upon yet in the initializer itself. Fix this by using efi.memmap.desc_size instead, which is where we get the value for desc_size from in the first place. Reported-by: Jörg Otte Tested-by: Jörg Otte Tested-by: Dan Williams Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: linux-efi@vger.kernel.org Cc: jrg.otte@gmail.com Cc: torvalds@linux-foundation.org Cc: mingo@kernel.org Link: https://lore.kernel.org/r/20200201233304.18322-1-ardb@kernel.org --- arch/x86/platform/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 59f7f6d60cf6..ae923ee8e2b4 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -308,7 +308,7 @@ static void __init efi_clean_memmap(void) .phys_map = efi.memmap.phys_map, .desc_version = efi.memmap.desc_version, .desc_size = efi.memmap.desc_size, - .size = data.desc_size * (efi.memmap.nr_map - n_removal), + .size = efi.memmap.desc_size * (efi.memmap.nr_map - n_removal), .flags = 0, }; From 107945227ac5d4c37911c7841b27c64b489ce9a9 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Mon, 2 Dec 2019 15:10:21 +0800 Subject: [PATCH 032/147] irqchip/gic-v3-its: Reference to its_invall_cmd descriptor when building INVALL It looks like an obvious mistake to use its_mapc_cmd descriptor when building the INVALL command block. It so far worked by luck because both its_mapc_cmd.col and its_invall_cmd.col sit at the same offset of the ITS command descriptor, but we should not rely on it. 
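[ Note: a simplified illustration of why the wrong descriptor member "worked by luck". The types below are hypothetical stand-ins, not the actual irq-gic-v3-its command descriptor layout. ]

  #include <assert.h>
  #include <stdio.h>

  struct demo_collection { int col_id; };

  struct demo_mapc_cmd {
  	struct demo_collection *col;	/* same offset as below... */
  	int valid;
  };

  struct demo_invall_cmd {
  	struct demo_collection *col;	/* ...purely by field ordering */
  };

  union demo_cmd_desc {
  	struct demo_mapc_cmd its_mapc_cmd;
  	struct demo_invall_cmd its_invall_cmd;
  };

  int main(void)
  {
  	struct demo_collection c = { .col_id = 7 };
  	union demo_cmd_desc desc = { .its_invall_cmd.col = &c };

  	/*
  	 * Reading through the wrong union variant happens to yield the
  	 * right pointer because 'col' is the first member of both structs.
  	 * Nothing guarantees this stays true if either struct changes,
  	 * hence the fix to use the matching descriptor.
  	 */
  	assert(desc.its_mapc_cmd.col == desc.its_invall_cmd.col);
  	printf("col_id read via the wrong variant: %d\n",
  	       desc.its_mapc_cmd.col->col_id);
  	return 0;
  }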
Fixes: cc2d3216f53c ("irqchip: GICv3: ITS command queue") Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20191202071021.1251-1-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index f71758632f8d..e5a25d97f8db 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -661,7 +661,7 @@ static struct its_collection *its_build_invall_cmd(struct its_node *its, struct its_cmd_desc *desc) { its_encode_cmd(cmd, GITS_CMD_INVALL); - its_encode_collection(cmd, desc->its_mapc_cmd.col->col_id); + its_encode_collection(cmd, desc->its_invall_cmd.col->col_id); its_fixup_cmd(cmd); From c8fb7d7e48d11520ad24808cfce7afb7b9c9f798 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 1 Feb 2020 14:03:11 +0900 Subject: [PATCH 033/147] kconfig: fix broken dependency in randconfig-generated .config Running randconfig on arm64 using KCONFIG_SEED=0x40C5E904 (e.g. on v5.5) produces the .config with CONFIG_EFI=y and CONFIG_CPU_BIG_ENDIAN=y, which does not meet the !CONFIG_CPU_BIG_ENDIAN dependency. This is because the user choice for CONFIG_CPU_LITTLE_ENDIAN vs CONFIG_CPU_BIG_ENDIAN is set by randomize_choice_values() after the value of CONFIG_EFI is calculated. When this happens, the has_changed flag should be set. Currently, it takes the result from the last iteration. It should accumulate all the results of the loop. Fixes: 3b9a19e08960 ("kconfig: loop as long as we changed some symbols in randconfig") Reported-by: Vincenzo Frascino Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 11f6c72c2eee..63d307b0d1ac 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -1312,7 +1312,7 @@ bool conf_set_all_new_symbols(enum conf_def_mode mode) sym_calc_value(csym); if (mode == def_random) - has_changed = randomize_choice_values(csym); + has_changed |= randomize_choice_values(csym); else { set_all_choice_values(csym); has_changed = true; From faa7bdd7e9e1441ed82819b8db8bb43d3d3fd818 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2020 01:49:23 +0900 Subject: [PATCH 034/147] kbuild: fix the document to use extra-y for vmlinux.lds The difference between "always" and "extra-y" is that the targets listed in $(always) are always built, whereas the ones in $(extra-y) are built only when KBUILD_BUILTIN is set. So, "make modules" does not build the targets in $(extra-y). vmlinux.lds is only needed for linking vmlinux. So, adding it to extra-y is more correct. In fact, arch/x86/kernel/Makefile does this. Fix the example code. Signed-off-by: Masahiro Yamada --- Documentation/kbuild/makefiles.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index d7e6534a8505..b1733b877025 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -1269,12 +1269,12 @@ When kbuild executes, the following steps are followed (roughly): Example:: #arch/x86/kernel/Makefile - always := vmlinux.lds + extra-y := vmlinux.lds #Makefile export CPPFLAGS_vmlinux.lds += -P -C -U$(ARCH) - The assignment to $(always) is used to tell kbuild to build the + The assignment to extra-y is used to tell kbuild to build the target vmlinux.lds. 
The assignment to $(CPPFLAGS_vmlinux.lds) tells kbuild to use the specified options when building the target vmlinux.lds. From 5f2fb52fac15a8a8e10ce020dd532504a8abfc4e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2020 01:49:24 +0900 Subject: [PATCH 035/147] kbuild: rename hostprogs-y/always to hostprogs/always-y In old days, the "host-progs" syntax was used for specifying host programs. It was renamed to the current "hostprogs-y" in 2004. It is typically useful in scripts/Makefile because it allows Kbuild to selectively compile host programs based on the kernel configuration. This commit renames like follows: always -> always-y hostprogs-y -> hostprogs So, scripts/Makefile will look like this: always-$(CONFIG_BUILD_BIN2C) += ... always-$(CONFIG_KALLSYMS) += ... ... hostprogs := $(always-y) $(always-m) I think this makes more sense because a host program is always a host program, irrespective of the kernel configuration. We want to specify which ones to compile by CONFIG options, so always-y will be handier. The "always", "hostprogs-y", "hostprogs-m" will be kept for backward compatibility for a while. Signed-off-by: Masahiro Yamada --- Documentation/kbuild/makefiles.rst | 49 ++++-------- Kbuild | 8 +- arch/alpha/boot/Makefile | 2 +- arch/arm/vdso/Makefile | 2 +- arch/arm64/kernel/vdso32/Makefile | 4 +- arch/mips/boot/Makefile | 2 +- arch/mips/boot/compressed/Makefile | 4 +- arch/mips/boot/tools/Makefile | 2 +- arch/mips/tools/Makefile | 4 +- arch/mips/vdso/Makefile | 2 +- arch/powerpc/boot/Makefile | 4 +- arch/s390/tools/Makefile | 4 +- arch/sparc/boot/Makefile | 2 +- arch/sparc/vdso/Makefile | 2 +- arch/x86/boot/Makefile | 4 +- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/entry/vdso/Makefile | 2 +- arch/x86/realmode/rm/Makefile | 2 +- arch/x86/tools/Makefile | 4 +- drivers/gpu/drm/radeon/Makefile | 2 +- drivers/tty/vt/Makefile | 2 +- drivers/video/logo/Makefile | 2 +- drivers/zorro/Makefile | 2 +- fs/unicode/Makefile | 2 +- lib/Makefile | 4 +- lib/raid6/Makefile | 2 +- net/bpfilter/Makefile | 2 +- samples/bpf/Makefile | 118 ++++++++++++++-------------- samples/connector/Makefile | 8 +- samples/hidraw/Makefile | 6 +- samples/mei/Makefile | 4 +- samples/pidfd/Makefile | 4 +- samples/seccomp/Makefile | 4 +- samples/uhid/Makefile | 4 +- samples/vfs/Makefile | 5 +- scripts/Makefile | 22 +++--- scripts/Makefile.build | 8 +- scripts/Makefile.clean | 4 +- scripts/Makefile.host | 8 +- scripts/Makefile.lib | 6 +- scripts/basic/Makefile | 4 +- scripts/dtc/Makefile | 4 +- scripts/gcc-plugins/Makefile | 2 +- scripts/genksyms/Makefile | 4 +- scripts/kconfig/Makefile | 10 +-- scripts/mod/Makefile | 4 +- scripts/selinux/genheaders/Makefile | 4 +- scripts/selinux/mdp/Makefile | 4 +- usr/Makefile | 2 +- 49 files changed, 172 insertions(+), 190 deletions(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index b1733b877025..0e0eb2c8da7d 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -28,7 +28,6 @@ This document describes the Linux kernel Makefiles. --- 4.3 Using C++ for host programs --- 4.4 Controlling compiler options for host programs --- 4.5 When host programs are actually built - --- 4.6 Using hostprogs-$(CONFIG_FOO) === 5 Kbuild clean infrastructure @@ -595,11 +594,11 @@ compilation stage. Two steps are required in order to use a host executable. The first step is to tell kbuild that a host program exists. This is -done utilising the variable hostprogs-y. +done utilising the variable "hostprogs". 
The second step is to add an explicit dependency to the executable. This can be done in two ways. Either add the dependency in a rule, -or utilise the variable $(always). +or utilise the variable "always-y". Both possibilities are described in the following. 4.1 Simple Host Program @@ -612,7 +611,7 @@ Both possibilities are described in the following. Example:: - hostprogs-y := bin2hex + hostprogs := bin2hex Kbuild assumes in the above example that bin2hex is made from a single c-source file named bin2hex.c located in the same directory as @@ -630,7 +629,7 @@ Both possibilities are described in the following. Example:: #scripts/lxdialog/Makefile - hostprogs-y := lxdialog + hostprogs := lxdialog lxdialog-objs := checklist.o lxdialog.o Objects with extension .o are compiled from the corresponding .c @@ -650,7 +649,7 @@ Both possibilities are described in the following. Example:: #scripts/kconfig/Makefile - hostprogs-y := qconf + hostprogs := qconf qconf-cxxobjs := qconf.o In the example above the executable is composed of the C++ file @@ -662,7 +661,7 @@ Both possibilities are described in the following. Example:: #scripts/kconfig/Makefile - hostprogs-y := qconf + hostprogs := qconf qconf-cxxobjs := qconf.o qconf-objs := check.o @@ -710,7 +709,7 @@ Both possibilities are described in the following. Example:: #drivers/pci/Makefile - hostprogs-y := gen-devlist + hostprogs := gen-devlist $(obj)/devlist.h: $(src)/pci.ids $(obj)/gen-devlist ( cd $(obj); ./gen-devlist ) < $< @@ -718,47 +717,31 @@ Both possibilities are described in the following. $(obj)/gen-devlist is updated. Note that references to the host programs in special rules must be prefixed with $(obj). - (2) Use $(always) + (2) Use always-y When there is no suitable special rule, and the host program - shall be built when a makefile is entered, the $(always) + shall be built when a makefile is entered, the always-y variable shall be used. Example:: #scripts/lxdialog/Makefile - hostprogs-y := lxdialog - always := $(hostprogs-y) + hostprogs := lxdialog + always-y := $(hostprogs) This will tell kbuild to build lxdialog even if not referenced in any rule. -4.6 Using hostprogs-$(CONFIG_FOO) ---------------------------------- - - A typical pattern in a Kbuild file looks like this: - - Example:: - - #scripts/Makefile - hostprogs-$(CONFIG_KALLSYMS) += kallsyms - - Kbuild knows about both 'y' for built-in and 'm' for module. - So if a config symbol evaluates to 'm', kbuild will still build - the binary. In other words, Kbuild handles hostprogs-m exactly - like hostprogs-y. But only hostprogs-y is recommended to be used - when no CONFIG symbols are involved. - 5 Kbuild clean infrastructure ============================= "make clean" deletes most generated files in the obj tree where the kernel is compiled. This includes generated files such as host programs. -Kbuild knows targets listed in $(hostprogs-y), $(hostprogs-m), $(always), -$(extra-y) and $(targets). They are all deleted during "make clean". -Files matching the patterns "*.[oas]", "*.ko", plus some additional files -generated by kbuild are deleted all over the kernel src tree when -"make clean" is executed. +Kbuild knows targets listed in $(hostprogs), $(always-y), $(always-m), +$(always-), $(extra-y), $(extra-) and $(targets). They are all deleted +during "make clean". Files matching the patterns "*.[oas]", "*.ko", plus +some additional files generated by kbuild are deleted all over the kernel +source tree when "make clean" is executed. 
Additional files or directories can be specified in kbuild makefiles by use of $(clean-files). diff --git a/Kbuild b/Kbuild index 3109ac786e76..fa441b98c9f6 100644 --- a/Kbuild +++ b/Kbuild @@ -7,7 +7,7 @@ bounds-file := include/generated/bounds.h -always := $(bounds-file) +always-y := $(bounds-file) targets := kernel/bounds.s $(bounds-file): kernel/bounds.s FORCE @@ -28,7 +28,7 @@ $(timeconst-file): kernel/time/timeconst.bc FORCE offsets-file := include/generated/asm-offsets.h -always += $(offsets-file) +always-y += $(offsets-file) targets += arch/$(SRCARCH)/kernel/asm-offsets.s arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file) @@ -39,7 +39,7 @@ $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE ##### # Check for missing system calls -always += missing-syscalls +always-y += missing-syscalls quiet_cmd_syscalls = CALL $< cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags) @@ -50,7 +50,7 @@ missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE ##### # Check atomic headers are up-to-date -always += old-atomics +always-y += old-atomics quiet_cmd_atomics = CALL $< cmd_atomics = $(CONFIG_SHELL) $< diff --git a/arch/alpha/boot/Makefile b/arch/alpha/boot/Makefile index 991e023a6fc4..d8dba85e606c 100644 --- a/arch/alpha/boot/Makefile +++ b/arch/alpha/boot/Makefile @@ -8,7 +8,7 @@ # Copyright (C) 1994 by Linus Torvalds # -hostprogs-y := tools/mkbb tools/objstrip +hostprogs := tools/mkbb tools/objstrip targets := vmlinux.gz vmlinux \ vmlinux.nh tools/lxboot tools/bootlx tools/bootph \ tools/bootpzh bootloader bootpheader bootpzheader diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 1babb392e70a..d3c9f03e7e79 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -5,7 +5,7 @@ ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32 include $(srctree)/lib/vdso/Makefile -hostprogs-y := vdsomunge +hostprogs := vdsomunge obj-vdso := vgettimeofday.o datapage.o note.o diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 76b327f88fbb..04df57b43cb1 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -115,9 +115,9 @@ VDSO_LDFLAGS += $(call cc32-ldoption,-fuse-ld=bfd) # Borrow vdsomunge.c from the arm vDSO # We have to use a relative path because scripts/Makefile.host prefixes -# $(hostprogs-y) with $(obj) +# $(hostprogs) with $(obj) munge := ../../../arm/vdso/vdsomunge -hostprogs-y := $(munge) +hostprogs := $(munge) c-obj-vdso := note.o c-obj-vdso-gettimeofday := vgettimeofday.o diff --git a/arch/mips/boot/Makefile b/arch/mips/boot/Makefile index 4ed45ade32a1..a3da2c5d63c2 100644 --- a/arch/mips/boot/Makefile +++ b/arch/mips/boot/Makefile @@ -21,7 +21,7 @@ endif drop-sections := .reginfo .mdebug .comment .note .pdr .options .MIPS.options strip-flags := $(addprefix --remove-section=,$(drop-sections)) -hostprogs-y := elf2ecoff +hostprogs := elf2ecoff suffix-y := bin suffix-$(CONFIG_KERNEL_BZIP2) := bz2 diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index d859f079b771..0df0ee8a298d 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -84,7 +84,7 @@ $(obj)/piggy.o: $(obj)/dummy.o $(obj)/vmlinux.bin.z FORCE HOSTCFLAGS_calc_vmlinuz_load_addr.o += $(LINUXINCLUDE) # Calculate the load address of the compressed kernel image -hostprogs-y := calc_vmlinuz_load_addr +hostprogs := calc_vmlinuz_load_addr ifneq ($(zload-y),) VMLINUZ_LOAD_ADDRESS := $(zload-y) @@ -112,7 +112,7 
@@ ifdef CONFIG_MACH_DECSTATION endif # elf2ecoff can only handle 32bit image -hostprogs-y += ../elf2ecoff +hostprogs += ../elf2ecoff ifdef CONFIG_32BIT VMLINUZ = vmlinuz diff --git a/arch/mips/boot/tools/Makefile b/arch/mips/boot/tools/Makefile index 5f8e737348eb..592e05a51a4a 100644 --- a/arch/mips/boot/tools/Makefile +++ b/arch/mips/boot/tools/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs-y += relocs +hostprogs += relocs relocs-objs += relocs_32.o relocs-objs += relocs_64.o relocs-objs += relocs_main.o diff --git a/arch/mips/tools/Makefile b/arch/mips/tools/Makefile index aaef688749f5..b851e5dcc65a 100644 --- a/arch/mips/tools/Makefile +++ b/arch/mips/tools/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs-y := elf-entry +hostprogs := elf-entry PHONY += elf-entry elf-entry: $(obj)/elf-entry @: -hostprogs-$(CONFIG_CPU_LOONGSON3_WORKAROUNDS) += loongson3-llsc-check +hostprogs += loongson3-llsc-check PHONY += loongson3-llsc-check loongson3-llsc-check: $(obj)/loongson3-llsc-check @: diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index b2a2e032dc99..aa89a41dc5dd 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -100,7 +100,7 @@ $(obj)/%.so.raw: OBJCOPYFLAGS := -S $(obj)/%.so.raw: $(obj)/%.so.dbg.raw FORCE $(call if_changed,objcopy) -hostprogs-y := genvdso +hostprogs := genvdso quiet_cmd_genvdso = GENVDSO $@ define cmd_genvdso diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index dfbd7f22eef5..0556bf4fc9e9 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -224,7 +224,7 @@ $(patsubst %.S,%.o, $(filter %.S, $(src-boot))): %.o: %.S FORCE $(obj)/wrapper.a: $(obj-wlib) FORCE $(call if_changed,bootar) -hostprogs-y := addnote hack-coff mktree +hostprogs := addnote hack-coff mktree targets += $(patsubst $(obj)/%,%,$(obj-boot) wrapper.a) extra-y := $(obj)/wrapper.a $(obj-plat) $(obj)/empty.o \ @@ -464,7 +464,7 @@ WRAPPER_BINDIR := /usr/sbin INSTALL := install extra-installed := $(patsubst $(obj)/%, $(DESTDIR)$(WRAPPER_OBJDIR)/%, $(extra-y)) -hostprogs-installed := $(patsubst %, $(DESTDIR)$(WRAPPER_BINDIR)/%, $(hostprogs-y)) +hostprogs-installed := $(patsubst %, $(DESTDIR)$(WRAPPER_BINDIR)/%, $(hostprogs)) wrapper-installed := $(DESTDIR)$(WRAPPER_BINDIR)/wrapper dts-installed := $(patsubst $(dtstree)/%, $(DESTDIR)$(WRAPPER_DTSDIR)/%, $(wildcard $(dtstree)/*.dts)) diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index b5e35e8f999a..f9dd47ff9ac4 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -10,8 +10,8 @@ PHONY += kapi kapi: $(kapi-hdrs-y) -hostprogs-y += gen_facilities -hostprogs-y += gen_opcode_table +hostprogs += gen_facilities +hostprogs += gen_opcode_table HOSTCFLAGS_gen_facilities.o += $(LINUXINCLUDE) diff --git a/arch/sparc/boot/Makefile b/arch/sparc/boot/Makefile index ec8cd703b708..380e2b018992 100644 --- a/arch/sparc/boot/Makefile +++ b/arch/sparc/boot/Makefile @@ -7,7 +7,7 @@ ROOT_IMG := /usr/src/root.img ELFTOAOUT := elftoaout -hostprogs-y := piggyback +hostprogs := piggyback targets := tftpboot.img image zImage vmlinux.aout clean-files := System.map diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 997ffe46e953..708cb6304c2d 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -41,7 +41,7 @@ $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) HOST_EXTRACFLAGS += -I$(srctree)/tools/include -hostprogs-y += vdso2c +hostprogs += vdso2c quiet_cmd_vdso2c = VDSO2C 
$@ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 748b6d28a91d..012b82fc8617 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -45,8 +45,8 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs-y := tools/build -hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr +hostprogs := tools/build +hostprogs += mkcpustr HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ -include include/generated/autoconf.h \ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 56aa5fa0a66b..26050ae0b27e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -58,7 +58,7 @@ KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ endif LDFLAGS_vmlinux := -T -hostprogs-y := mkpiggy +hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 2b75e80f6b41..433a1259f61d 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -59,7 +59,7 @@ $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso_and_check) HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi -hostprogs-y += vdso2c +hostprogs += vdso2c quiet_cmd_vdso2c = VDSO2C $@ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@ diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index f60501a384f9..99b6332ba540 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -12,7 +12,7 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n -always := realmode.bin realmode.relocs +always-y := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o wakeup-objs += copy.o bioscall.o regs.o diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 09af7ff53044..55b1ab378974 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -26,7 +26,7 @@ posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity $(call cmd,posttest) $(call cmd,sanitytest) -hostprogs-y += insn_decoder_test insn_sanity +hostprogs += insn_decoder_test insn_sanity # -I needed for generated C source and C source which in the kernel tree. 
HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/ @@ -39,7 +39,7 @@ $(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/l $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c HOST_EXTRACFLAGS += -I$(srctree)/tools/include -hostprogs-y += relocs +hostprogs += relocs relocs-objs := relocs_32.o relocs_64.o relocs_common.o PHONY += relocs relocs: $(obj)/relocs diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile index 92ccd7aed0d4..c693b2ca0329 100644 --- a/drivers/gpu/drm/radeon/Makefile +++ b/drivers/gpu/drm/radeon/Makefile @@ -5,7 +5,7 @@ ccflags-y := -Idrivers/gpu/drm/amd/include -hostprogs-y := mkregtable +hostprogs := mkregtable clean-files := rn50_reg_safe.h r100_reg_safe.h r200_reg_safe.h rv515_reg_safe.h r300_reg_safe.h r420_reg_safe.h rs600_reg_safe.h r600_reg_safe.h evergreen_reg_safe.h cayman_reg_safe.h quiet_cmd_mkregtable = MKREGTABLE $@ diff --git a/drivers/tty/vt/Makefile b/drivers/tty/vt/Makefile index 329ca336b8ee..fe30ce512819 100644 --- a/drivers/tty/vt/Makefile +++ b/drivers/tty/vt/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_HW_CONSOLE) += vt.o defkeymap.o # Files generated that shall be removed upon make clean clean-files := consolemap_deftbl.c defkeymap.c -hostprogs-y += conmakehash +hostprogs += conmakehash quiet_cmd_conmk = CONMK $@ cmd_conmk = $(obj)/conmakehash $< > $@ diff --git a/drivers/video/logo/Makefile b/drivers/video/logo/Makefile index bcda657493a4..895c60b8402e 100644 --- a/drivers/video/logo/Makefile +++ b/drivers/video/logo/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_SPU_BASE) += logo_spe_clut224.o # How to generate logo's -hostprogs-y := pnmtologo +hostprogs := pnmtologo # Create commands like "pnmtologo -t mono -n logo_mac_mono -o ..." 
quiet_cmd_logo = LOGO $@ diff --git a/drivers/zorro/Makefile b/drivers/zorro/Makefile index b360ac4ea846..91ba82e633e7 100644 --- a/drivers/zorro/Makefile +++ b/drivers/zorro/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_ZORRO) += zorro.o zorro-driver.o zorro-sysfs.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_ZORRO_NAMES) += names.o -hostprogs-y := gen-devlist +hostprogs := gen-devlist # Files generated that shall be removed upon make clean clean-files := devlist.h diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index d46e9baee285..b88aecc86550 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -35,4 +35,4 @@ $(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE endif targets += utf8data.h -hostprogs-y += mkutf8data +hostprogs += mkutf8data diff --git a/lib/Makefile b/lib/Makefile index 23ca78d43d24..8508143d3b76 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -239,8 +239,8 @@ obj-$(CONFIG_ASN1) += asn1_decoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ -hostprogs-y := gen_crc32table -hostprogs-y += gen_crc64table +hostprogs := gen_crc32table +hostprogs += gen_crc64table clean-files := crc32table.h clean-files += crc64table.h diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 0083b5cc646c..b4c0df6d706d 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -10,7 +10,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o -hostprogs-y += mktables +hostprogs += mktables ifeq ($(CONFIG_ALTIVEC),y) altivec_flags := -maltivec $(call cc-option,-mabi=altivec) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index aa945ab5b655..36580301da70 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux BPFILTER layer. 
# -hostprogs-y := bpfilter_umh +hostprogs := bpfilter_umh bpfilter_umh-objs := main.o KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi HOSTCC := $(CC) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b0e8adf7eb01..79b0fee6943b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -111,65 +111,65 @@ ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) # Tell kbuild to always build the programs -always := $(tprogs-y) -always += sockex1_kern.o -always += sockex2_kern.o -always += sockex3_kern.o -always += tracex1_kern.o -always += tracex2_kern.o -always += tracex3_kern.o -always += tracex4_kern.o -always += tracex5_kern.o -always += tracex6_kern.o -always += tracex7_kern.o -always += sock_flags_kern.o -always += test_probe_write_user_kern.o -always += trace_output_kern.o -always += tcbpf1_kern.o -always += tc_l2_redirect_kern.o -always += lathist_kern.o -always += offwaketime_kern.o -always += spintest_kern.o -always += map_perf_test_kern.o -always += test_overhead_tp_kern.o -always += test_overhead_raw_tp_kern.o -always += test_overhead_kprobe_kern.o -always += parse_varlen.o parse_simple.o parse_ldabs.o -always += test_cgrp2_tc_kern.o -always += xdp1_kern.o -always += xdp2_kern.o -always += xdp_router_ipv4_kern.o -always += test_current_task_under_cgroup_kern.o -always += trace_event_kern.o -always += sampleip_kern.o -always += lwt_len_hist_kern.o -always += xdp_tx_iptunnel_kern.o -always += test_map_in_map_kern.o -always += tcp_synrto_kern.o -always += tcp_rwnd_kern.o -always += tcp_bufs_kern.o -always += tcp_cong_kern.o -always += tcp_iw_kern.o -always += tcp_clamp_kern.o -always += tcp_basertt_kern.o -always += tcp_tos_reflect_kern.o -always += tcp_dumpstats_kern.o -always += xdp_redirect_kern.o -always += xdp_redirect_map_kern.o -always += xdp_redirect_cpu_kern.o -always += xdp_monitor_kern.o -always += xdp_rxq_info_kern.o -always += xdp2skb_meta_kern.o -always += syscall_tp_kern.o -always += cpustat_kern.o -always += xdp_adjust_tail_kern.o -always += xdp_fwd_kern.o -always += task_fd_query_kern.o -always += xdp_sample_pkts_kern.o -always += ibumad_kern.o -always += hbm_out_kern.o -always += hbm_edt_kern.o -always += xdpsock_kern.o +always-y := $(tprogs-y) +always-y += sockex1_kern.o +always-y += sockex2_kern.o +always-y += sockex3_kern.o +always-y += tracex1_kern.o +always-y += tracex2_kern.o +always-y += tracex3_kern.o +always-y += tracex4_kern.o +always-y += tracex5_kern.o +always-y += tracex6_kern.o +always-y += tracex7_kern.o +always-y += sock_flags_kern.o +always-y += test_probe_write_user_kern.o +always-y += trace_output_kern.o +always-y += tcbpf1_kern.o +always-y += tc_l2_redirect_kern.o +always-y += lathist_kern.o +always-y += offwaketime_kern.o +always-y += spintest_kern.o +always-y += map_perf_test_kern.o +always-y += test_overhead_tp_kern.o +always-y += test_overhead_raw_tp_kern.o +always-y += test_overhead_kprobe_kern.o +always-y += parse_varlen.o parse_simple.o parse_ldabs.o +always-y += test_cgrp2_tc_kern.o +always-y += xdp1_kern.o +always-y += xdp2_kern.o +always-y += xdp_router_ipv4_kern.o +always-y += test_current_task_under_cgroup_kern.o +always-y += trace_event_kern.o +always-y += sampleip_kern.o +always-y += lwt_len_hist_kern.o +always-y += xdp_tx_iptunnel_kern.o +always-y += test_map_in_map_kern.o +always-y += tcp_synrto_kern.o +always-y += tcp_rwnd_kern.o +always-y += tcp_bufs_kern.o +always-y += tcp_cong_kern.o +always-y += tcp_iw_kern.o +always-y += 
tcp_clamp_kern.o +always-y += tcp_basertt_kern.o +always-y += tcp_tos_reflect_kern.o +always-y += tcp_dumpstats_kern.o +always-y += xdp_redirect_kern.o +always-y += xdp_redirect_map_kern.o +always-y += xdp_redirect_cpu_kern.o +always-y += xdp_monitor_kern.o +always-y += xdp_rxq_info_kern.o +always-y += xdp2skb_meta_kern.o +always-y += syscall_tp_kern.o +always-y += cpustat_kern.o +always-y += xdp_adjust_tail_kern.o +always-y += xdp_fwd_kern.o +always-y += task_fd_query_kern.o +always-y += xdp_sample_pkts_kern.o +always-y += ibumad_kern.o +always-y += hbm_out_kern.o +always-y += hbm_edt_kern.o +always-y += xdpsock_kern.o ifeq ($(ARCH), arm) # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux diff --git a/samples/connector/Makefile b/samples/connector/Makefile index 6ad71620e503..b785cbde5ffa 100644 --- a/samples/connector/Makefile +++ b/samples/connector/Makefile @@ -2,12 +2,8 @@ obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o # List of programs to build -ifdef CONFIG_SAMPLE_CONNECTOR -hostprogs-y := ucon -endif - -# Tell kbuild to always build the programs -always := $(hostprogs-y) +hostprogs := ucon +always-y := $(hostprogs) HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile index dec1b22adf54..8bd25f77671f 100644 --- a/samples/hidraw/Makefile +++ b/samples/hidraw/Makefile @@ -1,9 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # List of programs to build -hostprogs-y := hid-example - -# Tell kbuild to always build the programs -always := $(hostprogs-y) +hostprogs := hid-example +always-y := $(hostprogs) HOSTCFLAGS_hid-example.o += -I$(objtree)/usr/include diff --git a/samples/mei/Makefile b/samples/mei/Makefile index 27f37efdadb4..f5b9d02be2cd 100644 --- a/samples/mei/Makefile +++ b/samples/mei/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2012-2019, Intel Corporation. All rights reserved. 
-hostprogs-y := mei-amt-version +hostprogs := mei-amt-version HOSTCFLAGS_mei-amt-version.o += -I$(objtree)/usr/include -always := $(hostprogs-y) +always-y := $(hostprogs) all: mei-amt-version diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile index 0ff97784177a..ee2979849d92 100644 --- a/samples/pidfd/Makefile +++ b/samples/pidfd/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs-y := pidfd-metadata -always := $(hostprogs-y) +hostprogs := pidfd-metadata +always-y := $(hostprogs) HOSTCFLAGS_pidfd-metadata.o += -I$(objtree)/usr/include all: pidfd-metadata diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 009775b52538..89279e8b87df 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 ifndef CROSS_COMPILE -hostprogs-y := bpf-fancy dropper bpf-direct user-trap +hostprogs := bpf-fancy dropper bpf-direct user-trap HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include @@ -40,5 +40,5 @@ HOSTLDLIBS_bpf-fancy += $(MFLAG) HOSTLDLIBS_dropper += $(MFLAG) HOSTLDLIBS_user-trap += $(MFLAG) endif -always := $(hostprogs-y) +always-y := $(hostprogs) endif diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile index 8c9bc9f98d37..5f44ea40d6d5 100644 --- a/samples/uhid/Makefile +++ b/samples/uhid/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only # List of programs to build -hostprogs-y := uhid-example +hostprogs := uhid-example # Tell kbuild to always build the programs -always := $(hostprogs-y) +always-y := $(hostprogs) HOSTCFLAGS_uhid-example.o += -I$(objtree)/usr/include diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile index e21c9f6fe9be..65acdde5c117 100644 --- a/samples/vfs/Makefile +++ b/samples/vfs/Makefile @@ -1,11 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only # List of programs to build -hostprogs-y := \ +hostprogs := \ test-fsmount \ test-statx -# Tell kbuild to always build the programs -always := $(hostprogs-y) +always-y := $(hostprogs) HOSTCFLAGS_test-fsmount.o += -I$(objtree)/usr/include HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include diff --git a/scripts/Makefile b/scripts/Makefile index 4d41f48e7376..5e75802b1a44 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -7,14 +7,14 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include -hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c -hostprogs-$(CONFIG_KALLSYMS) += kallsyms -hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount -hostprogs-$(CONFIG_BUILDTIME_TABLE_SORT) += sorttable -hostprogs-$(CONFIG_ASN1) += asn1_compiler -hostprogs-$(CONFIG_MODULE_SIG_FORMAT) += sign-file -hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert -hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert +always-$(CONFIG_BUILD_BIN2C) += bin2c +always-$(CONFIG_KALLSYMS) += kallsyms +always-$(BUILD_C_RECORDMCOUNT) += recordmcount +always-$(CONFIG_BUILDTIME_TABLE_SORT) += sorttable +always-$(CONFIG_ASN1) += asn1_compiler +always-$(CONFIG_MODULE_SIG_FORMAT) += sign-file +always-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert +always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include @@ -30,10 +30,10 @@ HOSTCFLAGS_sorttable.o += -DUNWINDER_ORC_ENABLED HOSTLDLIBS_sorttable = -lpthread endif -always := $(hostprogs-y) $(hostprogs-m) +hostprogs := $(always-y) $(always-m) -# The following hostprogs-y programs are only build on demand -hostprogs-y += unifdef +# The following 
programs are only built on demand +hostprogs += unifdef subdir-$(CONFIG_GCC_PLUGINS) += gcc-plugins subdir-$(CONFIG_MODVERSIONS) += genksyms diff --git a/scripts/Makefile.build b/scripts/Makefile.build index a562d695f0fa..a1730d42e5f3 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -16,6 +16,8 @@ obj-m := lib-y := lib-m := always := +always-y := +always-m := targets := subdir-y := subdir-m := @@ -44,7 +46,7 @@ include $(kbuild-file) include scripts/Makefile.lib # Do not include host rules unless needed -ifneq ($(hostprogs-y)$(hostprogs-m)$(hostlibs-y)$(hostlibs-m)$(hostcxxlibs-y)$(hostcxxlibs-m),) +ifneq ($(hostprogs)$(hostlibs-y)$(hostlibs-m)$(hostcxxlibs-y)$(hostcxxlibs-m),) include scripts/Makefile.host endif @@ -348,7 +350,7 @@ $(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE $(call if_changed_rule,as_o_S) targets += $(filter-out $(subdir-obj-y), $(real-obj-y)) $(real-obj-m) $(lib-y) -targets += $(extra-y) $(MAKECMDGOALS) $(always) +targets += $(extra-y) $(always-y) $(MAKECMDGOALS) # Linker scripts preprocessor (.lds.S -> .lds) # --------------------------------------------------------------------------- @@ -490,7 +492,7 @@ else __build: $(if $(KBUILD_BUILTIN),$(builtin-target) $(lib-target) $(extra-y)) \ $(if $(KBUILD_MODULES),$(obj-m) $(mod-targets) $(modorder-target)) \ - $(subdir-ym) $(always) + $(subdir-ym) $(always-y) @: endif diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean index e367eb95c5c0..1e4206566a82 100644 --- a/scripts/Makefile.clean +++ b/scripts/Makefile.clean @@ -28,8 +28,8 @@ subdir-ymn := $(addprefix $(obj)/,$(subdir-ymn)) # directory __clean-files := $(extra-y) $(extra-m) $(extra-) \ - $(always) $(targets) $(clean-files) \ - $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \ + $(always) $(always-y) $(always-m) $(always-) $(targets) $(clean-files) \ + $(hostprogs) $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \ $(hostlibs-y) $(hostlibs-m) $(hostlibs-) \ $(hostcxxlibs-y) $(hostcxxlibs-m) diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 4c51c95d40f4..3b7121d43324 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -24,21 +24,21 @@ $(obj)/%.tab.c $(obj)/%.tab.h: $(src)/%.y FORCE # Both C and C++ are supported, but preferred language is C for such utilities. # # Sample syntax (see Documentation/kbuild/makefiles.rst for reference) -# hostprogs-y := bin2hex +# hostprogs := bin2hex # Will compile bin2hex.c and create an executable named bin2hex # -# hostprogs-y := lxdialog +# hostprogs := lxdialog # lxdialog-objs := checklist.o lxdialog.o # Will compile lxdialog.c and checklist.c, and then link the executable # lxdialog, based on checklist.o and lxdialog.o # -# hostprogs-y := qconf +# hostprogs := qconf # qconf-cxxobjs := qconf.o # qconf-objs := menu.o # Will compile qconf as a C++ program, and menu as a C program. 
# They are linked as C++ code to the executable qconf -__hostprogs := $(sort $(hostprogs-y) $(hostprogs-m)) +__hostprogs := $(sort $(hostprogs)) host-cshlib := $(sort $(hostlibs-y) $(hostlibs-m)) host-cxxshlib := $(sort $(hostcxxlibs-y) $(hostcxxlibs-m)) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index d10f7a03e0ee..bae62549e3d2 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -4,6 +4,8 @@ asflags-y += $(EXTRA_AFLAGS) ccflags-y += $(EXTRA_CFLAGS) cppflags-y += $(EXTRA_CPPFLAGS) ldflags-y += $(EXTRA_LDFLAGS) +always-y += $(always) +hostprogs += $(hostprogs-y) $(hostprogs-m) # flags that take effect in current and sub directories KBUILD_AFLAGS += $(subdir-asflags-y) @@ -59,6 +61,8 @@ subdir-obj-y := $(filter %/built-in.a, $(obj-y)) real-obj-y := $(foreach m, $(obj-y), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-))),$($(m:.o=-objs)) $($(m:.o=-y)),$(m))) real-obj-m := $(foreach m, $(obj-m), $(if $(strip $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)) $($(m:.o=-))),$($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m)),$(m))) +always-y += $(always-m) + # DTB # If CONFIG_OF_ALL_DTBS is enabled, all DT blobs are built extra-y += $(dtb-y) @@ -72,7 +76,7 @@ endif # Add subdir path extra-y := $(addprefix $(obj)/,$(extra-y)) -always := $(addprefix $(obj)/,$(always)) +always-y := $(addprefix $(obj)/,$(always-y)) targets := $(addprefix $(obj)/,$(targets)) modorder := $(addprefix $(obj)/,$(modorder)) obj-m := $(addprefix $(obj)/,$(obj-m)) diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 7c9cb80d097b..290dd27d2809 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -2,5 +2,5 @@ # # fixdep: used to generate dependency information during build process -hostprogs-y := fixdep -always := $(hostprogs-y) +hostprogs := fixdep +always-y := $(hostprogs) diff --git a/scripts/dtc/Makefile b/scripts/dtc/Makefile index b5a5b1c548c9..3acbb410904c 100644 --- a/scripts/dtc/Makefile +++ b/scripts/dtc/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # scripts/dtc makefile -hostprogs-$(CONFIG_DTC) := dtc -always := $(hostprogs-y) +hostprogs := dtc +always-$(CONFIG_DTC) := $(hostprogs) dtc-objs := dtc.o flattree.o fstree.o data.o livetree.o treesource.o \ srcpos.o checks.o util.o diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile index aa0d0ec6936d..f2ee8bd7abc6 100644 --- a/scripts/gcc-plugins/Makefile +++ b/scripts/gcc-plugins/Makefile @@ -23,7 +23,7 @@ $(objtree)/$(obj)/randomize_layout_seed.h: FORCE targets = randomize_layout_seed.h randomize_layout_hash.h $(HOSTLIBS)-y := $(foreach p,$(GCC_PLUGIN),$(if $(findstring /,$(p)),,$(p))) -always := $($(HOSTLIBS)-y) +always-y := $($(HOSTLIBS)-y) $(foreach p,$($(HOSTLIBS)-y:%.so=%),$(eval $(p)-objs := $(p).o)) diff --git a/scripts/genksyms/Makefile b/scripts/genksyms/Makefile index 78629f515e78..d328de1e10ee 100644 --- a/scripts/genksyms/Makefile +++ b/scripts/genksyms/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs-y := genksyms -always := $(hostprogs-y) +hostprogs := genksyms +always-y := $(hostprogs) genksyms-objs := genksyms.o parse.tab.o lex.lex.o diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index fbeb62ae3401..5887ceb6229e 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -157,11 +157,11 @@ HOSTCFLAGS_lexer.lex.o := -I $(srctree)/$(src) HOSTCFLAGS_parser.tab.o := -I $(srctree)/$(src) # conf: Used for defconfig, oldconfig and related targets -hostprogs-y += conf +hostprogs += conf conf-objs := conf.o $(common-objs) # 
nconf: Used for the nconfig target based on ncurses -hostprogs-y += nconf +hostprogs += nconf nconf-objs := nconf.o nconf.gui.o $(common-objs) HOSTLDLIBS_nconf = $(shell . $(obj)/nconf-cfg && echo $$libs) @@ -171,7 +171,7 @@ HOSTCFLAGS_nconf.gui.o = $(shell . $(obj)/nconf-cfg && echo $$cflags) $(obj)/nconf.o $(obj)/nconf.gui.o: $(obj)/nconf-cfg # mconf: Used for the menuconfig target based on lxdialog -hostprogs-y += mconf +hostprogs += mconf lxdialog := $(addprefix lxdialog/, \ checklist.o inputbox.o menubox.o textbox.o util.o yesno.o) mconf-objs := mconf.o $(lxdialog) $(common-objs) @@ -183,7 +183,7 @@ $(foreach f, mconf.o $(lxdialog), \ $(addprefix $(obj)/, mconf.o $(lxdialog)): $(obj)/mconf-cfg # qconf: Used for the xconfig target based on Qt -hostprogs-y += qconf +hostprogs += qconf qconf-cxxobjs := qconf.o qconf-objs := images.o $(common-objs) @@ -199,7 +199,7 @@ $(obj)/%.moc: $(src)/%.h $(obj)/qconf-cfg $(call cmd,moc) # gconf: Used for the gconfig target based on GTK+ -hostprogs-y += gconf +hostprogs += gconf gconf-objs := gconf.o images.o $(common-objs) HOSTLDLIBS_gconf = $(shell . $(obj)/gconf-cfg && echo $$libs) diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index 42c5d50f2bcc..296b6a3878b2 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 OBJECT_FILES_NON_STANDARD := y -hostprogs-y := modpost mk_elfconfig -always := $(hostprogs-y) empty.o +hostprogs := modpost mk_elfconfig +always-y := $(hostprogs) empty.o modpost-objs := modpost.o file2alias.o sumversion.o diff --git a/scripts/selinux/genheaders/Makefile b/scripts/selinux/genheaders/Makefile index e8c533140981..70cf8d95d07c 100644 --- a/scripts/selinux/genheaders/Makefile +++ b/scripts/selinux/genheaders/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs-y := genheaders +hostprogs := genheaders HOST_EXTRACFLAGS += \ -I$(srctree)/include/uapi -I$(srctree)/include \ -I$(srctree)/security/selinux/include -always := $(hostprogs-y) +always-y := $(hostprogs) diff --git a/scripts/selinux/mdp/Makefile b/scripts/selinux/mdp/Makefile index 8a1269a9d0ba..3026f3c2aa2b 100644 --- a/scripts/selinux/mdp/Makefile +++ b/scripts/selinux/mdp/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -hostprogs-y := mdp +hostprogs := mdp HOST_EXTRACFLAGS += \ -I$(srctree)/include/uapi -I$(srctree)/include \ -I$(srctree)/security/selinux/include -I$(objtree)/include -always := $(hostprogs-y) +always-y := $(hostprogs) clean-files := policy.* file_contexts diff --git a/usr/Makefile b/usr/Makefile index 244862bfb765..18aed2ab98da 100644 --- a/usr/Makefile +++ b/usr/Makefile @@ -52,7 +52,7 @@ ifeq ($(cpio-data),) cpio-data := $(obj)/initramfs_data.cpio -hostprogs-y := gen_init_cpio +hostprogs := gen_init_cpio # .initramfs_data.cpio.d is used to identify all files included # in initramfs and to detect if any files are added/removed. From be9f6133f8770bb9c5f4a3cb3df6d30d7d3f7e5b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2020 14:09:20 +0900 Subject: [PATCH 036/147] scripts/kallsyms: rename local variables in read_symbol() I will use 'sym' for the point to struce sym_entry in the next commit. Rename 'sym', 'stype' to 'name', 'type', which are more intuitive. 
Signed-off-by: Masahiro Yamada --- scripts/kallsyms.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 94153732ec00..5c34edd98b3e 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -176,43 +176,43 @@ static void check_symbol_range(const char *sym, unsigned long long addr, static int read_symbol(FILE *in, struct sym_entry *s) { - char sym[500], stype; + char name[500], type; int rc; - rc = fscanf(in, "%llx %c %499s\n", &s->addr, &stype, sym); + rc = fscanf(in, "%llx %c %499s\n", &s->addr, &type, name); if (rc != 3) { - if (rc != EOF && fgets(sym, 500, in) == NULL) + if (rc != EOF && fgets(name, 500, in) == NULL) fprintf(stderr, "Read error or end of file.\n"); return -1; } - if (strlen(sym) >= KSYM_NAME_LEN) { + if (strlen(name) >= KSYM_NAME_LEN) { fprintf(stderr, "Symbol %s too long for kallsyms (%zu >= %d).\n" "Please increase KSYM_NAME_LEN both in kernel and kallsyms.c\n", - sym, strlen(sym), KSYM_NAME_LEN); + name, strlen(name), KSYM_NAME_LEN); return -1; } - if (is_ignored_symbol(sym, stype)) + if (is_ignored_symbol(name, type)) return -1; /* Ignore most absolute/undefined (?) symbols. */ - if (strcmp(sym, "_text") == 0) + if (strcmp(name, "_text") == 0) _text = s->addr; - check_symbol_range(sym, s->addr, text_ranges, ARRAY_SIZE(text_ranges)); - check_symbol_range(sym, s->addr, &percpu_range, 1); + check_symbol_range(name, s->addr, text_ranges, ARRAY_SIZE(text_ranges)); + check_symbol_range(name, s->addr, &percpu_range, 1); /* include the type field in the symbol name, so that it gets * compressed together */ - s->len = strlen(sym) + 1; + s->len = strlen(name) + 1; s->sym = malloc(s->len + 1); if (!s->sym) { fprintf(stderr, "kallsyms failure: " "unable to allocate required amount of memory\n"); exit(EXIT_FAILURE); } - strcpy(sym_name(s), sym); - s->sym[0] = stype; + strcpy(sym_name(s), name); + s->sym[0] = type; s->percpu_absolute = 0; From 8d60526999aace135de37220ec94ba40bc792234 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2020 14:09:21 +0900 Subject: [PATCH 037/147] scripts/kallsyms: change table to store (strcut sym_entry *) The symbol table is extended every 10000 addition by using realloc(), where data copy might occur to the new buffer. To decrease the amount of possible data copy, let's change the table to store the pointer. The symbol type + symbol name part is appended at the end of (struct sym_entry), and allocated together with the struct body. 
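To make the layout change concrete, here is a minimal userspace sketch of
the idea; the reduced field set and the add_symbol() helper are illustrative
only, not the exact kallsyms code:

  #include <stdlib.h>
  #include <string.h>

  /* The type byte plus the symbol name live in a flexible array member,
   * allocated together with the struct body by a single malloc().
   */
  struct sym_entry {
          unsigned long long addr;
          unsigned int len;
          unsigned char sym[];
  };

  /* The table stores pointers, so growing it with realloc() copies
   * pointer-sized slots instead of whole sym_entry objects.
   */
  static struct sym_entry **table;
  static unsigned int table_cnt, table_size;

  static void add_symbol(unsigned long long addr, unsigned char type,
                         const char *name)
  {
          unsigned int len = strlen(name) + 1;    /* +1 for the type byte */
          struct sym_entry *sym = malloc(sizeof(*sym) + len);

          if (!sym)
                  exit(EXIT_FAILURE);
          sym->addr = addr;
          sym->len = len;
          sym->sym[0] = type;
          memcpy(&sym->sym[1], name, len - 1);

          if (table_cnt >= table_size) {
                  table_size += 10000;
                  table = realloc(table, sizeof(*table) * table_size);
                  if (!table)
                          exit(EXIT_FAILURE);
          }
          table[table_cnt++] = sym;
  }

Each realloc() of the pointer table now moves at most
table_size * sizeof(void *) bytes, regardless of how long the symbol names are.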
Signed-off-by: Masahiro Yamada --- scripts/kallsyms.c | 121 ++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 56 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 5c34edd98b3e..a566d8201b56 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -33,8 +33,8 @@ struct sym_entry { unsigned long long addr; unsigned int len; unsigned int start_pos; - unsigned char *sym; unsigned int percpu_absolute; + unsigned char sym[0]; }; struct addr_range { @@ -55,7 +55,7 @@ static struct addr_range percpu_range = { "__per_cpu_start", "__per_cpu_end", -1ULL, 0 }; -static struct sym_entry *table; +static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; static int absolute_percpu; @@ -174,49 +174,55 @@ static void check_symbol_range(const char *sym, unsigned long long addr, } } -static int read_symbol(FILE *in, struct sym_entry *s) +static struct sym_entry *read_symbol(FILE *in) { char name[500], type; + unsigned long long addr; + unsigned int len; + struct sym_entry *sym; int rc; - rc = fscanf(in, "%llx %c %499s\n", &s->addr, &type, name); + rc = fscanf(in, "%llx %c %499s\n", &addr, &type, name); if (rc != 3) { if (rc != EOF && fgets(name, 500, in) == NULL) fprintf(stderr, "Read error or end of file.\n"); - return -1; + return NULL; } if (strlen(name) >= KSYM_NAME_LEN) { fprintf(stderr, "Symbol %s too long for kallsyms (%zu >= %d).\n" "Please increase KSYM_NAME_LEN both in kernel and kallsyms.c\n", name, strlen(name), KSYM_NAME_LEN); - return -1; + return NULL; } if (is_ignored_symbol(name, type)) - return -1; + return NULL; /* Ignore most absolute/undefined (?) symbols. */ if (strcmp(name, "_text") == 0) - _text = s->addr; + _text = addr; - check_symbol_range(name, s->addr, text_ranges, ARRAY_SIZE(text_ranges)); - check_symbol_range(name, s->addr, &percpu_range, 1); + check_symbol_range(name, addr, text_ranges, ARRAY_SIZE(text_ranges)); + check_symbol_range(name, addr, &percpu_range, 1); /* include the type field in the symbol name, so that it gets * compressed together */ - s->len = strlen(name) + 1; - s->sym = malloc(s->len + 1); - if (!s->sym) { + + len = strlen(name) + 1; + + sym = malloc(sizeof(*sym) + len); + if (!sym) { fprintf(stderr, "kallsyms failure: " "unable to allocate required amount of memory\n"); exit(EXIT_FAILURE); } - strcpy(sym_name(s), name); - s->sym[0] = type; + sym->addr = addr; + sym->len = len; + sym->sym[0] = type; + memcpy(sym_name(sym), name, len); + sym->percpu_absolute = 0; - s->percpu_absolute = 0; - - return 0; + return sym; } static int symbol_in_range(const struct sym_entry *s, @@ -268,12 +274,12 @@ static void shrink_table(void) pos = 0; for (i = 0; i < table_cnt; i++) { - if (symbol_valid(&table[i])) { + if (symbol_valid(table[i])) { if (pos != i) table[pos] = table[i]; pos++; } else { - free(table[i].sym); + free(table[i]); } } table_cnt = pos; @@ -287,7 +293,15 @@ static void shrink_table(void) static void read_map(FILE *in) { + struct sym_entry *sym; + while (!feof(in)) { + sym = read_symbol(in); + if (!sym) + continue; + + sym->start_pos = table_cnt; + if (table_cnt >= table_size) { table_size += 10000; table = realloc(table, sizeof(*table) * table_size); @@ -296,10 +310,8 @@ static void read_map(FILE *in) exit (1); } } - if (read_symbol(in, &table[table_cnt]) == 0) { - table[table_cnt].start_pos = table_cnt; - table_cnt++; - } + + table[table_cnt++] = sym; } } @@ -387,27 +399,27 @@ static void write_src(void) int overflow; if (!absolute_percpu) { - offset = 
table[i].addr - relative_base; + offset = table[i]->addr - relative_base; overflow = (offset < 0 || offset > UINT_MAX); - } else if (symbol_absolute(&table[i])) { - offset = table[i].addr; + } else if (symbol_absolute(table[i])) { + offset = table[i]->addr; overflow = (offset < 0 || offset > INT_MAX); } else { - offset = relative_base - table[i].addr - 1; + offset = relative_base - table[i]->addr - 1; overflow = (offset < INT_MIN || offset >= 0); } if (overflow) { fprintf(stderr, "kallsyms failure: " "%s symbol value %#llx out of range in relative mode\n", - symbol_absolute(&table[i]) ? "absolute" : "relative", - table[i].addr); + symbol_absolute(table[i]) ? "absolute" : "relative", + table[i]->addr); exit(EXIT_FAILURE); } printf("\t.long\t%#x\n", (int)offset); - } else if (!symbol_absolute(&table[i])) { - output_address(table[i].addr); + } else if (!symbol_absolute(table[i])) { + output_address(table[i]->addr); } else { - printf("\tPTR\t%#llx\n", table[i].addr); + printf("\tPTR\t%#llx\n", table[i]->addr); } } printf("\n"); @@ -437,12 +449,12 @@ static void write_src(void) if ((i & 0xFF) == 0) markers[i >> 8] = off; - printf("\t.byte 0x%02x", table[i].len); - for (k = 0; k < table[i].len; k++) - printf(", 0x%02x", table[i].sym[k]); + printf("\t.byte 0x%02x", table[i]->len); + for (k = 0; k < table[i]->len; k++) + printf(", 0x%02x", table[i]->sym[k]); printf("\n"); - off += table[i].len + 1; + off += table[i]->len + 1; } printf("\n"); @@ -496,7 +508,7 @@ static void build_initial_tok_table(void) unsigned int i; for (i = 0; i < table_cnt; i++) - learn_symbol(table[i].sym, table[i].len); + learn_symbol(table[i]->sym, table[i]->len); } static unsigned char *find_token(unsigned char *str, int len, @@ -520,15 +532,15 @@ static void compress_symbols(const unsigned char *str, int idx) for (i = 0; i < table_cnt; i++) { - len = table[i].len; - p1 = table[i].sym; + len = table[i]->len; + p1 = table[i]->sym; /* find the token on the symbol */ p2 = find_token(p1, len, str); if (!p2) continue; /* decrease the counts for this symbol's tokens */ - forget_symbol(table[i].sym, len); + forget_symbol(table[i]->sym, len); size = len; @@ -547,10 +559,10 @@ static void compress_symbols(const unsigned char *str, int idx) } while (p2); - table[i].len = len; + table[i]->len = len; /* increase the counts for this symbol's new tokens */ - learn_symbol(table[i].sym, len); + learn_symbol(table[i]->sym, len); } } @@ -606,8 +618,8 @@ static void insert_real_symbols_in_table(void) unsigned int i, j, c; for (i = 0; i < table_cnt; i++) { - for (j = 0; j < table[i].len; j++) { - c = table[i].sym[j]; + for (j = 0; j < table[i]->len; j++) { + c = table[i]->sym[j]; best_table[c][0]=c; best_table_len[c]=1; } @@ -660,13 +672,10 @@ static int may_be_linker_script_provide_symbol(const struct sym_entry *se) static int compare_symbols(const void *a, const void *b) { - const struct sym_entry *sa; - const struct sym_entry *sb; + const struct sym_entry *sa = *(const struct sym_entry **)a; + const struct sym_entry *sb = *(const struct sym_entry **)b; int wa, wb; - sa = a; - sb = b; - /* sort by address first */ if (sa->addr > sb->addr) return 1; @@ -697,7 +706,7 @@ static int compare_symbols(const void *a, const void *b) static void sort_symbols(void) { - qsort(table, table_cnt, sizeof(struct sym_entry), compare_symbols); + qsort(table, table_cnt, sizeof(table[0]), compare_symbols); } static void make_percpus_absolute(void) @@ -705,14 +714,14 @@ static void make_percpus_absolute(void) unsigned int i; for (i = 0; i < table_cnt; i++) - if 
(symbol_in_range(&table[i], &percpu_range, 1)) { + if (symbol_in_range(table[i], &percpu_range, 1)) { /* * Keep the 'A' override for percpu symbols to * ensure consistent behavior compared to older * versions of this tool. */ - table[i].sym[0] = 'A'; - table[i].percpu_absolute = 1; + table[i]->sym[0] = 'A'; + table[i]->percpu_absolute = 1; } } @@ -722,12 +731,12 @@ static void record_relative_base(void) unsigned int i; for (i = 0; i < table_cnt; i++) - if (!symbol_absolute(&table[i])) { + if (!symbol_absolute(table[i])) { /* * The table is sorted by address. * Take the first non-absolute symbol value. */ - relative_base = table[i].addr; + relative_base = table[i]->addr; return; } } From baa6cf8450b72dcab11f37c47efce7c5b9b8ad0f Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Fri, 31 Jan 2020 15:45:24 +0200 Subject: [PATCH 038/147] iwlwifi: mvm: Fix thermal zone registration Use a unique name when registering a thermal zone. Otherwise, with multiple NICS, we hit the following warning during the unregistration. WARNING: CPU: 2 PID: 3525 at fs/sysfs/group.c:255 RIP: 0010:sysfs_remove_group+0x80/0x90 Call Trace: dpm_sysfs_remove+0x57/0x60 device_del+0x5a/0x350 ? sscanf+0x4e/0x70 device_unregister+0x1a/0x60 hwmon_device_unregister+0x4a/0xa0 thermal_remove_hwmon_sysfs+0x175/0x1d0 thermal_zone_device_unregister+0x188/0x1e0 iwl_mvm_thermal_exit+0xe7/0x100 [iwlmvm] iwl_op_mode_mvm_stop+0x27/0x180 [iwlmvm] _iwl_op_mode_stop.isra.3+0x2b/0x50 [iwlwifi] iwl_opmode_deregister+0x90/0xa0 [iwlwifi] __exit_compat+0x10/0x2c7 [iwlmvm] __x64_sys_delete_module+0x13f/0x270 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index b5a16f00bada..fcad25ffd811 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -734,7 +734,8 @@ static struct thermal_zone_device_ops tzone_ops = { static void iwl_mvm_thermal_zone_register(struct iwl_mvm *mvm) { int i; - char name[] = "iwlwifi"; + char name[16]; + static atomic_t counter = ATOMIC_INIT(0); if (!iwl_mvm_is_tt_in_fw(mvm)) { mvm->tz_device.tzone = NULL; @@ -744,6 +745,7 @@ static void iwl_mvm_thermal_zone_register(struct iwl_mvm *mvm) BUILD_BUG_ON(ARRAY_SIZE(name) >= THERMAL_NAME_LENGTH); + sprintf(name, "iwlwifi_%u", atomic_inc_return(&counter) & 0xFF); mvm->tz_device.tzone = thermal_zone_device_register(name, IWL_MAX_DTS_TRIPS, IWL_WRITABLE_TRIPS_MSK, From 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 31 Jan 2020 15:45:25 +0200 Subject: [PATCH 039/147] iwlwifi: don't throw error when trying to remove IGTK The IGTK keys are only removed by mac80211 after it has already removed the AP station. This causes the driver to throw an error because mac80211 is trying to remove the IGTK when the station doesn't exist anymore. The firmware is aware that the station has been removed and can deal with it the next time we try to add an IGTK for a station, so we shouldn't try to remove the key if the station ID is IWL_MVM_INVALID_STA. Do this by removing the check for mvm_sta before calling iwl_mvm_send_sta_igtk() and check return from that function gracefully if the station ID is invalid. 
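Condensed from the hunk below (illustrative only, no additional driver
logic), the resulting behaviour is:

  if (remove_key) {
          /* mac80211 has already removed the AP station by the time it
           * asks to remove the IGTK, so an invalid station id here is an
           * expected, harmless situation: the firmware dropped the
           * station and its keys on its own.
           */
          if (sta_id == IWL_MVM_INVALID_STA)
                  return 0;

          /* ... otherwise build and send the key-removal command ... */
  }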
Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 7b35f416404c..64ef3f3ba23b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -3320,6 +3320,10 @@ static int iwl_mvm_send_sta_igtk(struct iwl_mvm *mvm, igtk_cmd.sta_id = cpu_to_le32(sta_id); if (remove_key) { + /* This is a valid situation for IGTK */ + if (sta_id == IWL_MVM_INVALID_STA) + return 0; + igtk_cmd.ctrl_flags |= cpu_to_le32(STA_KEY_NOT_VALID); } else { struct ieee80211_key_seq seq; @@ -3574,9 +3578,9 @@ int iwl_mvm_remove_sta_key(struct iwl_mvm *mvm, IWL_DEBUG_WEP(mvm, "mvm remove dynamic key: idx=%d sta=%d\n", keyconf->keyidx, sta_id); - if (mvm_sta && (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC || - keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 || - keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256)) + if (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC || + keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 || + keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256) return iwl_mvm_send_sta_igtk(mvm, keyconf, sta_id, true); if (!__test_and_clear_bit(keyconf->hw_key_idx, mvm->fw_key_table)) { From 6bd5fa332a8c24c8b9079a70c44240b61858fab8 Mon Sep 17 00:00:00 2001 From: Golan Ben Ami Date: Fri, 31 Jan 2020 15:45:26 +0200 Subject: [PATCH 040/147] iwlwifi: mvm: update the DTS measurement type Till now, the driver asked the fw for a DTS measurement in automatic mode. This triggered a flow in which the fw actively measured the temperature. This is not needed anymore, as the fw performs measurements by itself, without the driver triggering them, and the current cadence in which the fw performs such measurements is sufficient. In addition, in some time-sensitive scenarios, in which the driver asks the fw for an active measurement twice in a short time (<100ms), the fw asserts with code 0x20100801. Change the DTS measurement to _WITHOUT_MEASURE instead, so the fw will respond with the last measurement it has performed. Signed-off-by: Golan Ben Ami Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index fcad25ffd811..418e59b7c671 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -8,7 +8,7 @@ * Copyright(c) 2013 - 2014, 2019 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright(c) 2019 Intel Corporation + * Copyright(c) 2019 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -31,7 +31,7 @@ * Copyright(c) 2012 - 2014, 2019 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright(c) 2019 Intel Corporation + * Copyright(c) 2019 - 2020 Intel Corporation * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -234,7 +234,7 @@ static int iwl_mvm_get_temp_cmd(struct iwl_mvm *mvm) .flags = cpu_to_le32(DTS_TRIGGER_CMD_FLAGS_TEMP), }; struct iwl_ext_dts_measurement_cmd extcmd = { - .control_mode = cpu_to_le32(DTS_AUTOMATIC), + .control_mode = cpu_to_le32(DTS_DIRECT_WITHOUT_MEASURE), }; u32 cmdid; From cc4255eff523f25187bb95561642941de0e57497 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 31 Jan 2020 15:45:27 +0200 Subject: [PATCH 041/147] iwlwifi: mvm: avoid use after free for pmsr request When a FTM request is aborted, the driver sends the abort command to the fw and waits for a response. When the response arrives, the driver calls cfg80211_pmsr_complete() for that request. However, cfg80211 frees the requested data immediately after sending the abort command, so this may lead to use after free. Fix it by clearing the request data in the driver when the abort command arrives and ignoring the fw notification that will come afterwards. Signed-off-by: Avraham Stern Fixes: fc36ffda3267 ("iwlwifi: mvm: support FTM initiator") Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c index f783d6d53b6f..6e1ea921c299 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c @@ -8,6 +8,7 @@ * Copyright(c) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 Intel Corporation * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -30,6 +31,7 @@ * Copyright(c) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 Intel Corporation * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -528,6 +530,8 @@ void iwl_mvm_ftm_abort(struct iwl_mvm *mvm, struct cfg80211_pmsr_request *req) if (req != mvm->ftm_initiator.req) return; + iwl_mvm_ftm_reset(mvm); + if (iwl_mvm_send_cmd_pdu(mvm, iwl_cmd_id(TOF_RANGE_ABORT_CMD, LOCATION_GROUP, 0), 0, sizeof(cmd), &cmd)) @@ -641,7 +645,6 @@ void iwl_mvm_ftm_range_resp(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) lockdep_assert_held(&mvm->mutex); if (!mvm->ftm_initiator.req) { - IWL_ERR(mvm, "Got FTM response but have no request?\n"); return; } From 12d47f0ea5e0aa63f19ba618da55a7c67850ca10 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Fri, 31 Jan 2020 15:45:28 +0200 Subject: [PATCH 042/147] iwlwifi: mvm: Check the sta is not NULL in iwl_mvm_cfg_he_sta() Fix a kernel panic by checking that the sta is not NULL. This could happen during a reconfig flow, as mac80211 moves the sta between all the states without really checking if the previous state was successfully set. So, if for some reason we failed to add back the station, subsequent calls to sta_state() callback will be done when the station is NULL. This would result in a following panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 IP: iwl_mvm_cfg_he_sta+0xfc/0x690 [iwlmvm] [..] 
Call Trace: iwl_mvm_mac_sta_state+0x629/0x6f0 [iwlmvm] drv_sta_state+0xf4/0x950 [mac80211] ieee80211_reconfig+0xa12/0x2180 [mac80211] ieee80211_restart_work+0xbb/0xe0 [mac80211] process_one_work+0x1e2/0x610 worker_thread+0x4d/0x3e0 [..] Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 6717f25c46b1..8ecd1f6875de 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -5,10 +5,9 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2012 - 2014, 2018 - 2020 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -28,10 +27,9 @@ * * BSD LICENSE * - * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved. * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH * Copyright(c) 2016 - 2017 Intel Deutschland GmbH - * Copyright(c) 2018 - 2019 Intel Corporation + * Copyright(c) 2012 - 2014, 2018 - 2020 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -2037,7 +2035,7 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, rcu_read_lock(); sta = rcu_dereference(mvm->fw_id_to_mac_id[sta_ctxt_cmd.sta_id]); - if (IS_ERR(sta)) { + if (IS_ERR_OR_NULL(sta)) { rcu_read_unlock(); WARN(1, "Can't find STA to configure HE\n"); return; From b5b878e36c1836c0195575132cc7c199e5a34a7b Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 31 Jan 2020 15:45:29 +0200 Subject: [PATCH 043/147] iwlwifi: mvm: fix TDLS discovery with the new firmware API I changed the API for asking for a session protection but I omitted the TDLS flows. Fix that now. Note that for the TDLS flow, we need to block until the session protection actually starts, so add this option to iwl_mvm_schedule_session_protection. This patch fixes a firmware assert in the TDLS flow since the old TIME_EVENT_CMD is not supported anymore by newer firwmare versions. 
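The blocking behaviour relies on the driver's notification-wait helpers;
condensed from the time-event.c hunk below (error handling trimmed), the
wait_for_notif=true path is essentially:

  /* Register for SESSION_PROTECTION_NOTIF before sending the command,
   * then block (with a timeout) until the firmware reports that the
   * protection has actually started.
   */
  iwl_init_notification_wait(&mvm->notif_wait, &wait_notif,
                             notif, ARRAY_SIZE(notif),
                             iwl_mvm_session_prot_notif, NULL);

  if (iwl_mvm_send_cmd_pdu(mvm, iwl_cmd_id(SESSION_PROTECTION_CMD,
                                           MAC_CONF_GROUP, 0),
                           0, sizeof(cmd), &cmd))
          iwl_remove_notification(&mvm->notif_wait, &wait_notif);
  else
          iwl_wait_notification(&mvm->notif_wait, &wait_notif,
                                TU_TO_JIFFIES(100));

The TDLS discovery path then passes wait_for_notif=true so that the
discovery frame is only sent once the protection is in place.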
Signed-off-by: Emmanuel Grumbach Fixes: fe959c7b2049 ("iwlwifi: mvm: use the new session protection command") Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tdls.c | 10 ++- .../wireless/intel/iwlwifi/mvm/time-event.c | 71 ++++++++++++++++--- .../wireless/intel/iwlwifi/mvm/time-event.h | 4 +- 4 files changed, 72 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 8ecd1f6875de..02df603b6400 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -3291,7 +3291,7 @@ static void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw, if (fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_SESSION_PROT_CMD)) iwl_mvm_schedule_session_protection(mvm, vif, 900, - min_duration); + min_duration, false); else iwl_mvm_protect_session(mvm, vif, duration, min_duration, 500, false); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c index 1851719e9f4b..d781777b6b96 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c @@ -205,9 +205,15 @@ void iwl_mvm_mac_mgd_protect_tdls_discover(struct ieee80211_hw *hw, struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); u32 duration = 2 * vif->bss_conf.dtim_period * vif->bss_conf.beacon_int; - mutex_lock(&mvm->mutex); /* Protect the session to hear the TDLS setup response on the channel */ - iwl_mvm_protect_session(mvm, vif, duration, duration, 100, true); + mutex_lock(&mvm->mutex); + if (fw_has_capa(&mvm->fw->ucode_capa, + IWL_UCODE_TLV_CAPA_SESSION_PROT_CMD)) + iwl_mvm_schedule_session_protection(mvm, vif, duration, + duration, true); + else + iwl_mvm_protect_session(mvm, vif, duration, + duration, 100, true); mutex_unlock(&mvm->mutex); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 51b138673ddb..c0b420fe5e48 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -1056,13 +1056,42 @@ int iwl_mvm_schedule_csa_period(struct iwl_mvm *mvm, return iwl_mvm_time_event_send_add(mvm, vif, te_data, &time_cmd); } +static bool iwl_mvm_session_prot_notif(struct iwl_notif_wait_data *notif_wait, + struct iwl_rx_packet *pkt, void *data) +{ + struct iwl_mvm *mvm = + container_of(notif_wait, struct iwl_mvm, notif_wait); + struct iwl_mvm_session_prot_notif *resp; + int resp_len = iwl_rx_packet_payload_len(pkt); + + if (WARN_ON(pkt->hdr.cmd != SESSION_PROTECTION_NOTIF || + pkt->hdr.group_id != MAC_CONF_GROUP)) + return true; + + if (WARN_ON_ONCE(resp_len != sizeof(*resp))) { + IWL_ERR(mvm, "Invalid SESSION_PROTECTION_NOTIF response\n"); + return true; + } + + resp = (void *)pkt->data; + + if (!resp->status) + IWL_ERR(mvm, + "TIME_EVENT_NOTIFICATION received but not executed\n"); + + return true; +} + void iwl_mvm_schedule_session_protection(struct iwl_mvm *mvm, struct ieee80211_vif *vif, - u32 duration, u32 min_duration) + u32 duration, u32 min_duration, + bool wait_for_notif) { struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); struct iwl_mvm_time_event_data *te_data = &mvmvif->time_event_data; - + const u16 notif[] = { iwl_cmd_id(SESSION_PROTECTION_NOTIF, + MAC_CONF_GROUP, 0) }; + struct iwl_notification_wait wait_notif; struct iwl_mvm_session_prot_cmd cmd = { 
.id_and_color = cpu_to_le32(FW_CMD_ID_AND_COLOR(mvmvif->id, @@ -1071,7 +1100,6 @@ void iwl_mvm_schedule_session_protection(struct iwl_mvm *mvm, .conf_id = cpu_to_le32(SESSION_PROTECT_CONF_ASSOC), .duration_tu = cpu_to_le32(MSEC_TO_TU(duration)), }; - int ret; lockdep_assert_held(&mvm->mutex); @@ -1092,14 +1120,35 @@ void iwl_mvm_schedule_session_protection(struct iwl_mvm *mvm, IWL_DEBUG_TE(mvm, "Add new session protection, duration %d TU\n", le32_to_cpu(cmd.duration_tu)); - ret = iwl_mvm_send_cmd_pdu(mvm, iwl_cmd_id(SESSION_PROTECTION_CMD, - MAC_CONF_GROUP, 0), - 0, sizeof(cmd), &cmd); - if (ret) { + if (!wait_for_notif) { + if (iwl_mvm_send_cmd_pdu(mvm, + iwl_cmd_id(SESSION_PROTECTION_CMD, + MAC_CONF_GROUP, 0), + 0, sizeof(cmd), &cmd)) { + IWL_ERR(mvm, + "Couldn't send the SESSION_PROTECTION_CMD\n"); + spin_lock_bh(&mvm->time_event_lock); + iwl_mvm_te_clear_data(mvm, te_data); + spin_unlock_bh(&mvm->time_event_lock); + } + + return; + } + + iwl_init_notification_wait(&mvm->notif_wait, &wait_notif, + notif, ARRAY_SIZE(notif), + iwl_mvm_session_prot_notif, NULL); + + if (iwl_mvm_send_cmd_pdu(mvm, + iwl_cmd_id(SESSION_PROTECTION_CMD, + MAC_CONF_GROUP, 0), + 0, sizeof(cmd), &cmd)) { IWL_ERR(mvm, - "Couldn't send the SESSION_PROTECTION_CMD: %d\n", ret); - spin_lock_bh(&mvm->time_event_lock); - iwl_mvm_te_clear_data(mvm, te_data); - spin_unlock_bh(&mvm->time_event_lock); + "Couldn't send the SESSION_PROTECTION_CMD\n"); + iwl_remove_notification(&mvm->notif_wait, &wait_notif); + } else if (iwl_wait_notification(&mvm->notif_wait, &wait_notif, + TU_TO_JIFFIES(100))) { + IWL_ERR(mvm, + "Failed to protect session until session protection\n"); } } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.h b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.h index df6832b79666..3186d7e40567 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.h @@ -250,10 +250,12 @@ iwl_mvm_te_scheduled(struct iwl_mvm_time_event_data *te_data) * @mvm: the mvm component * @vif: the virtual interface for which the protection issued * @duration: the duration of the protection + * @wait_for_notif: if true, will block until the start of the protection */ void iwl_mvm_schedule_session_protection(struct iwl_mvm *mvm, struct ieee80211_vif *vif, - u32 duration, u32 min_duration); + u32 duration, u32 min_duration, + bool wait_for_notif); /** * iwl_mvm_rx_session_protect_notif - handles %SESSION_PROTECTION_NOTIF From 577ddbee1f43a355cc733829da96c8ffb3b99cc5 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Fri, 31 Jan 2020 15:45:30 +0200 Subject: [PATCH 044/147] iwlwifi: d3: read all FW CPUs error info Continue the wakeup flow only if no FW CPUs have an error If we don't check for error in all FW CPUs the driver can think based on one CPU that the FW is operational and try to access and send commands. 
Also, handle the error_id endianness correctly as le32 Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 54 ++++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 8878409d2f07..22a32eb10f01 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -1897,27 +1897,55 @@ static void iwl_mvm_d3_disconnect_iter(void *data, u8 *mac, ieee80211_resume_disconnect(vif); } -static int iwl_mvm_check_rt_status(struct iwl_mvm *mvm, - struct ieee80211_vif *vif) +static bool iwl_mvm_rt_status(struct iwl_trans *trans, u32 base, u32 *err_id) { - u32 base = mvm->trans->dbg.lmac_error_event_table[0]; struct error_table_start { /* cf. struct iwl_error_event_table */ u32 valid; - u32 error_id; + __le32 err_id; } err_info; - iwl_trans_read_mem_bytes(mvm->trans, base, - &err_info, sizeof(err_info)); + if (!base) + return false; - if (err_info.valid && - err_info.error_id == RF_KILL_INDICATOR_FOR_WOWLAN) { - struct cfg80211_wowlan_wakeup wakeup = { - .rfkill_release = true, - }; - ieee80211_report_wowlan_wakeup(vif, &wakeup, GFP_KERNEL); + iwl_trans_read_mem_bytes(trans, base, + &err_info, sizeof(err_info)); + if (err_info.valid && err_id) + *err_id = le32_to_cpu(err_info.err_id); + + return !!err_info.valid; +} + +static bool iwl_mvm_check_rt_status(struct iwl_mvm *mvm, + struct ieee80211_vif *vif) +{ + u32 err_id; + + /* check for lmac1 error */ + if (iwl_mvm_rt_status(mvm->trans, + mvm->trans->dbg.lmac_error_event_table[0], + &err_id)) { + if (err_id == RF_KILL_INDICATOR_FOR_WOWLAN) { + struct cfg80211_wowlan_wakeup wakeup = { + .rfkill_release = true, + }; + ieee80211_report_wowlan_wakeup(vif, &wakeup, + GFP_KERNEL); + } + return true; } - return err_info.valid; + + /* check if we have lmac2 set and check for error */ + if (iwl_mvm_rt_status(mvm->trans, + mvm->trans->dbg.lmac_error_event_table[1], NULL)) + return true; + + /* check for umac error */ + if (iwl_mvm_rt_status(mvm->trans, + mvm->trans->dbg.umac_error_event_table, NULL)) + return true; + + return false; } static int __iwl_mvm_resume(struct iwl_mvm *mvm, bool test) From ebe8e6116ac49d182b060f843904caf5eacf5b87 Mon Sep 17 00:00:00 2001 From: Chin-Yen Lee Date: Mon, 3 Feb 2020 14:01:57 +0800 Subject: [PATCH 045/147] rtw88: Fix return value of rtw_wow_check_fw_status Clang warns that ret is used uninitialzed. And we found that actually the return type should be "int" instead of "bool". 
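The type change matters beyond the uninitialized-variable warning: a
negative errno returned through a bool collapses to true and the error
code is lost. A standalone illustration (not driver code; the function
names are made up for the example):

  #include <errno.h>
  #include <stdbool.h>
  #include <stdio.h>

  static bool check_status_bool(void)
  {
          return -EBUSY;          /* silently converted to true, i.e. 1 */
  }

  static int check_status_int(void)
  {
          return -EBUSY;          /* caller sees the real error code */
  }

  int main(void)
  {
          printf("bool: %d, int: %d\n", check_status_bool(), check_status_int());
          /* prints "bool: 1, int: -16" */
          return 0;
  }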
Fixes: 44bc17f7f5b3 ("rtw88: support wowlan feature for 8822c") Link: https://github.com/ClangBuiltLinux/linux/issues/850 Reported-by: Nathan Chancellor Signed-off-by: Chin-Yen Lee Signed-off-by: Yan-Hsuan Chuang Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor # build Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtw88/wow.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/wow.c b/drivers/net/wireless/realtek/rtw88/wow.c index af5c27e1bb07..4820dca958dd 100644 --- a/drivers/net/wireless/realtek/rtw88/wow.c +++ b/drivers/net/wireless/realtek/rtw88/wow.c @@ -281,27 +281,26 @@ static void rtw_wow_rx_dma_start(struct rtw_dev *rtwdev) rtw_write32_clr(rtwdev, REG_RXPKT_NUM, BIT_RW_RELEASE); } -static bool rtw_wow_check_fw_status(struct rtw_dev *rtwdev, bool wow_enable) +static int rtw_wow_check_fw_status(struct rtw_dev *rtwdev, bool wow_enable) { - bool ret; - /* wait 100ms for wow firmware to finish work */ msleep(100); if (wow_enable) { - if (!rtw_read8(rtwdev, REG_WOWLAN_WAKE_REASON)) - ret = 0; + if (rtw_read8(rtwdev, REG_WOWLAN_WAKE_REASON)) + goto wow_fail; } else { - if (rtw_read32_mask(rtwdev, REG_FE1IMR, BIT_FS_RXDONE) == 0 && - rtw_read32_mask(rtwdev, REG_RXPKT_NUM, BIT_RW_RELEASE) == 0) - ret = 0; + if (rtw_read32_mask(rtwdev, REG_FE1IMR, BIT_FS_RXDONE) || + rtw_read32_mask(rtwdev, REG_RXPKT_NUM, BIT_RW_RELEASE)) + goto wow_fail; } - if (ret) - rtw_err(rtwdev, "failed to check wow status %s\n", - wow_enable ? "enabled" : "disabled"); + return 0; - return ret; +wow_fail: + rtw_err(rtwdev, "failed to check wow status %s\n", + wow_enable ? "enabled" : "disabled"); + return -EBUSY; } static void rtw_wow_fw_security_type_iter(struct ieee80211_hw *hw, From 0f060936e490c6279dfe773d75d526d3d3d77111 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Feb 2020 21:46:43 +0200 Subject: [PATCH 046/147] SMB3: Backup intent flag missing from some more ops When "backup intent" is requested on the mount (e.g. backupuid or backupgid mount options), the corresponding flag was missing from some of the operations. Change all operations to use the macro cifs_create_options() to set the backup intent flag if needed. 
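The conversion pattern at a typical call site, sketched from the hunks
below (representative fragment only; the cifs_create_options() helper
itself is added to cifsproto.h):

  /* Before: every call site had to remember the backup-intent flag,
   * and several of them forgot it.
   */
  int create_options = CREATE_NOT_DIR;

  if (backup_cred(cifs_sb))
          create_options |= CREATE_OPEN_BACKUP_INTENT;
  oparms.create_options = create_options;

  /* After: the helper applies the flag consistently everywhere. */
  oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);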
Signed-off-by: Amir Goldstein Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 14 +++----- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsglob.h | 6 ++-- fs/cifs/cifsproto.h | 8 +++++ fs/cifs/connect.c | 2 +- fs/cifs/dir.c | 5 +-- fs/cifs/file.c | 10 ++---- fs/cifs/inode.c | 8 ++--- fs/cifs/ioctl.c | 2 +- fs/cifs/link.c | 18 +++------- fs/cifs/smb1ops.c | 19 +++++------ fs/cifs/smb2inode.c | 9 ++--- fs/cifs/smb2ops.c | 81 +++++++++++++++------------------------------ fs/cifs/smb2proto.h | 2 +- 14 files changed, 68 insertions(+), 118 deletions(-) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fb41e51dd574..440828afcdde 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1084,7 +1084,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, struct cifs_ntsd *pntsd = NULL; int oplock = 0; unsigned int xid; - int rc, create_options = 0; + int rc; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct cifs_fid fid; @@ -1096,13 +1096,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = get_xid(); - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = READ_CONTROL; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.disposition = FILE_OPEN; oparms.path = path; oparms.fid = &fid; @@ -1147,7 +1144,7 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, { int oplock = 0; unsigned int xid; - int rc, access_flags, create_options = 0; + int rc, access_flags; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); @@ -1160,9 +1157,6 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, tcon = tlink_tcon(tlink); xid = get_xid(); - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) access_flags = WRITE_OWNER; else @@ -1171,7 +1165,7 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = access_flags; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.disposition = FILE_OPEN; oparms.path = path; oparms.fid = &fid; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5492b9860baa..febab27cd838 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -275,7 +275,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = 0; /* unlimited */ if (server->ops->queryfs) - rc = server->ops->queryfs(xid, tcon, buf); + rc = server->ops->queryfs(xid, tcon, cifs_sb, buf); free_xid(xid); return 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 239338d57086..1205041fd966 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -298,7 +298,8 @@ struct smb_version_operations { const char *, struct dfs_info3_param **, unsigned int *, const struct nls_table *, int); /* informational QFS call */ - void (*qfs_tcon)(const unsigned int, struct cifs_tcon *); + void (*qfs_tcon)(const unsigned int, struct cifs_tcon *, + struct cifs_sb_info *); /* check if a path is accessible or not */ int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); @@ -409,7 +410,7 @@ struct smb_version_operations { struct cifsInodeInfo *); /* query remote filesystem */ int (*queryfs)(const unsigned int, struct cifs_tcon 
*, - struct kstatfs *); + struct cifs_sb_info *, struct kstatfs *); /* send mandatory brlock to the server */ int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64, __u64, __u32, int, int, bool); @@ -490,6 +491,7 @@ struct smb_version_operations { /* ioctl passthrough for query_info */ int (*ioctl_query_info)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, __le16 *path, int is_dir, unsigned long p); /* make unix special files (block, char, fifo, socket) */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 948bf3474db1..748bd00cb5f1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -612,4 +612,12 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, } #endif +static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) +{ + if (backup_cred(cifs_sb)) + return options | CREATE_OPEN_BACKUP_INTENT; + else + return options; +} + #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0aa3623ae0e1..a941ac7a659d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4365,7 +4365,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, /* do not care if a following call succeed - informational */ if (!tcon->pipe && server->ops->qfs_tcon) { - server->ops->qfs_tcon(*xid, tcon); + server->ops->qfs_tcon(*xid, tcon, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) { if (tcon->fsDevInfo.DeviceCharacteristics & cpu_to_le32(FILE_READ_ONLY_DEVICE)) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f3b79012ff29..0ef099442f20 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -355,13 +355,10 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, create_options); oparms.disposition = disposition; oparms.path = full_path; oparms.fid = fid; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a4e8f7d445ac..79e6f4f55b9b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -222,9 +222,6 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (!buf) return -ENOMEM; - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (f_flags & O_SYNC) create_options |= CREATE_WRITE_THROUGH; @@ -235,7 +232,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, create_options); oparms.disposition = disposition; oparms.path = full_path; oparms.fid = fid; @@ -752,9 +749,6 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) desired_access = cifs_convert_flags(cfile->f_flags); - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - /* O_SYNC also has bit for O_DSYNC so following check picks up either */ if (cfile->f_flags & O_SYNC) create_options |= CREATE_WRITE_THROUGH; @@ -768,7 +762,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = 
desired_access; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, create_options); oparms.disposition = disposition; oparms.path = full_path; oparms.fid = &cfile->fid; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b547f7f5f5d..b1383c524b98 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -472,9 +472,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_READ; - oparms.create_options = CREATE_NOT_DIR; - if (backup_cred(cifs_sb)) - oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_OPEN; oparms.path = path; oparms.fid = &fid; @@ -1284,7 +1282,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = CREATE_NOT_DIR; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_OPEN; oparms.path = full_path; oparms.fid = &fid; @@ -1822,7 +1820,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, oparms.cifs_sb = cifs_sb; /* open the file to be renamed -- we need DELETE perms */ oparms.desired_access = DELETE; - oparms.create_options = CREATE_NOT_DIR; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_OPEN; oparms.path = from_path; oparms.fid = &fid; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 1a01e108d75e..e4c935026d5e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -65,7 +65,7 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, if (tcon->ses->server->ops->ioctl_query_info) rc = tcon->ses->server->ops->ioctl_query_info( - xid, tcon, utf16_path, + xid, tcon, cifs_sb, utf16_path, filep->private_data ? 
0 : 1, p); else rc = -EOPNOTSUPP; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index b736acd3917b..852aa00ec729 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -315,7 +315,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_READ; - oparms.create_options = CREATE_NOT_DIR; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_OPEN; oparms.path = path; oparms.fid = &fid; @@ -353,15 +353,11 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cifs_open_parms oparms; struct cifs_io_parms io_parms; - int create_options = CREATE_NOT_DIR; - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_CREATE; oparms.path = path; oparms.fid = &fid; @@ -402,9 +398,7 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_READ; - oparms.create_options = CREATE_NOT_DIR; - if (backup_cred(cifs_sb)) - oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_OPEN; oparms.fid = &fid; oparms.reconnect = false; @@ -457,14 +451,10 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cifs_open_parms oparms; struct cifs_io_parms io_parms; - int create_options = CREATE_NOT_DIR; __le16 *utf16_path; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct kvec iov[2]; - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - cifs_dbg(FYI, "%s: path: %s\n", __func__, path); utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); @@ -474,7 +464,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_CREATE; oparms.fid = &fid; oparms.reconnect = false; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d70a2bb062df..eb994e313c6a 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -504,7 +504,8 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) } static void -cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb) { CIFSSMBQFSDeviceInfo(xid, tcon); CIFSSMBQFSAttributeInfo(xid, tcon); @@ -565,7 +566,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.disposition = FILE_OPEN; oparms.path = full_path; oparms.fid = &fid; @@ -793,7 +794,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = CREATE_NOT_DIR; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); oparms.disposition = FILE_OPEN; oparms.path = full_path; oparms.fid = &fid; @@ -872,7 
+873,7 @@ cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, static int cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon, - struct kstatfs *buf) + struct cifs_sb_info *cifs_sb, struct kstatfs *buf) { int rc = -EOPNOTSUPP; @@ -970,7 +971,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = OPEN_REPARSE_POINT; + oparms.create_options = cifs_create_options(cifs_sb, + OPEN_REPARSE_POINT); oparms.disposition = FILE_OPEN; oparms.path = full_path; oparms.fid = &fid; @@ -1029,7 +1031,6 @@ cifs_make_node(unsigned int xid, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; int rc = -EPERM; - int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; FILE_ALL_INFO *buf = NULL; struct cifs_io_parms io_parms; __u32 oplock = 0; @@ -1090,13 +1091,11 @@ cifs_make_node(unsigned int xid, struct inode *inode, goto out; } - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL); oparms.disposition = FILE_CREATE; oparms.path = full_path; oparms.fid = &fid; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 5ef5e97a6d13..1cf207564ff9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -99,9 +99,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = create_disposition; - oparms.create_options = create_options; - if (backup_cred(cifs_sb)) - oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.create_options = cifs_create_options(cifs_sb, create_options); oparms.fid = &fid; oparms.reconnect = false; oparms.mode = mode; @@ -457,7 +455,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, &fid); + rc = open_shroot(xid, tcon, cifs_sb, &fid); if (rc) goto out; @@ -474,9 +472,6 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, goto out; } - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 6787fce26f20..33bb86cae369 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -655,7 +655,8 @@ smb2_cached_lease_break(struct work_struct *work) /* * Open the directory at the root of a share */ -int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) +int open_shroot(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; @@ -702,7 +703,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms.tcon = tcon; - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; oparms.fid = pfid; @@ -818,7 +819,8 @@ oshr_free: } static 
void -smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb) { int rc; __le16 srch_path = 0; /* Null - open root of share */ @@ -830,7 +832,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -838,7 +840,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL); else - rc = open_shroot(xid, tcon, &fid); + rc = open_shroot(xid, tcon, cifs_sb, &fid); if (rc) return; @@ -860,7 +862,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) } static void -smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) +smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb) { int rc; __le16 srch_path = 0; /* Null - open root of share */ @@ -871,7 +874,7 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -906,10 +909,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -1151,10 +1151,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.desired_access = FILE_WRITE_EA; oparms.disposition = FILE_OPEN; - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -1422,6 +1419,7 @@ req_res_key_exit: static int smb2_ioctl_query_info(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, __le16 *path, int is_dir, unsigned long p) { @@ -1447,6 +1445,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct kvec close_iov[1]; unsigned int size[2]; void *data[2]; + int create_options = is_dir ? 
CREATE_NOT_FILE : CREATE_NOT_DIR; memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; @@ -1477,10 +1476,7 @@ smb2_ioctl_query_info(const unsigned int xid, memset(&oparms, 0, sizeof(oparms)); oparms.tcon = tcon; oparms.disposition = FILE_OPEN; - if (is_dir) - oparms.create_options = CREATE_NOT_FILE; - else - oparms.create_options = CREATE_NOT_DIR; + oparms.create_options = cifs_create_options(cifs_sb, create_options); oparms.fid = &fid; oparms.reconnect = false; @@ -2086,10 +2082,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; oparms.disposition = FILE_OPEN; - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = fid; oparms.reconnect = false; @@ -2343,10 +2336,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = FILE_OPEN; - if (cifs_sb && backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -2402,7 +2392,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, static int smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, - struct kstatfs *buf) + struct cifs_sb_info *cifs_sb, struct kstatfs *buf) { struct smb2_query_info_rsp *rsp; struct smb2_fs_full_size_info *info = NULL; @@ -2439,7 +2429,7 @@ qfs_exit: static int smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, - struct kstatfs *buf) + struct cifs_sb_info *cifs_sb, struct kstatfs *buf) { int rc; __le16 srch_path = 0; /* Null - open root of share */ @@ -2448,12 +2438,12 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; if (!tcon->posix_extensions) - return smb2_queryfs(xid, tcon, buf); + return smb2_queryfs(xid, tcon, cifs_sb, buf); oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; - oparms.create_options = 0; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -2722,6 +2712,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_create_rsp *create_rsp; struct smb2_ioctl_rsp *ioctl_rsp; struct reparse_data_buffer *reparse_buf; + int create_options = is_reparse_point ? 
OPEN_REPARSE_POINT : 0; u32 plen; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -2748,14 +2739,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; oparms.disposition = FILE_OPEN; - - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; - if (is_reparse_point) - oparms.create_options = OPEN_REPARSE_POINT; - + oparms.create_options = cifs_create_options(cifs_sb, create_options); oparms.fid = &fid; oparms.reconnect = false; @@ -2934,11 +2918,6 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = get_xid(); - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; - utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) { rc = -ENOMEM; @@ -2949,6 +2928,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.tcon = tcon; oparms.desired_access = READ_CONTROL; oparms.disposition = FILE_OPEN; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; oparms.reconnect = false; @@ -2990,11 +2970,6 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, tcon = tlink_tcon(tlink); xid = get_xid(); - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; - if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) access_flags = WRITE_OWNER; else @@ -3009,6 +2984,7 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, oparms.tcon = tcon; oparms.desired_access = access_flags; + oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.disposition = FILE_OPEN; oparms.path = path; oparms.fid = &fid; @@ -4491,7 +4467,6 @@ smb2_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; - int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; FILE_ALL_INFO *buf = NULL; struct cifs_io_parms io_parms; __u32 oplock = 0; @@ -4527,13 +4502,11 @@ smb2_make_node(unsigned int xid, struct inode *inode, goto out; } - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; + oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL); oparms.disposition = FILE_CREATE; oparms.path = full_path; oparms.fid = &fid; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 6c678e00046f..de6388ef344f 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -68,7 +68,7 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_fid *pfid); + struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid); extern void close_shroot(struct cached_fid *cfid); extern void close_shroot_lease(struct cached_fid *cfid); extern void close_shroot_lease_locked(struct cached_fid *cfid); From a525b0881de7742617343f02df4073ddc1571237 Mon Sep 17 00:00:00 2001 From: Michal Rostecki Date: Sun, 2 Feb 2020 12:02:00 +0100 Subject: [PATCH 047/147] bpftool: Remove redundant "HAVE" prefix from the large INSN limit check "HAVE" prefix is already applied by default to feature macros and before this change, the large INSN limit macro had the incorrect name with double "HAVE". 
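As an aside on the mechanics: the "HAVE_" part is prepended when the define is emitted, so the name passed in must not carry it already. A minimal, hypothetical C sketch of that composition (emit_feature_define() and its exact output format are illustrative, not bpftool's actual helper):

#include <stdbool.h>
#include <stdio.h>

static void emit_feature_define(const char *define_prefix,
                                const char *feat_name, bool available)
{
        if (available)
                printf("#define %sHAVE_%s\n", define_prefix, feat_name);
        else
                printf("/* %sHAVE_%s is not set */\n", define_prefix, feat_name);
}

/* emit_feature_define("BPF_", "LARGE_INSN_LIMIT", true) prints
 * "#define BPF_HAVE_LARGE_INSN_LIMIT"; passing "HAVE_LARGE_INSN_LIMIT"
 * instead would have produced the doubled HAVE_HAVE_LARGE_INSN_LIMIT.
 */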
Fixes: 2faef64aa6b3 ("bpftool: Add misc section and probe for large INSN limit") Signed-off-by: Michal Rostecki Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200202110200.31024-1-mrostecki@opensuse.org --- tools/bpf/bpftool/feature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 446ba891f1e2..941873d778d8 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -580,7 +580,7 @@ probe_large_insn_limit(const char *define_prefix, __u32 ifindex) res = bpf_probe_large_insn_limit(ifindex); print_bool_feature("have_large_insn_limit", "Large program size limit", - "HAVE_LARGE_INSN_LIMIT", + "LARGE_INSN_LIMIT", res, define_prefix); } From 257af63d7f84f0672aa6a24b5511871f00741f19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 31 Jan 2020 16:03:14 -0800 Subject: [PATCH 048/147] bpf: Fix modifier skipping logic Fix the way modifiers are skipped while walking pointers. Otherwise second level dereferences of 'const struct foo *' will be rejected by the verifier. Fixes: 9e15db66136a ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200201000314.261392-1-ast@kernel.org --- kernel/bpf/btf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8c9d8f266bef..805c43b083e9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3931,6 +3931,7 @@ again: if (btf_type_is_ptr(mtype)) { const struct btf_type *stype; + u32 id; if (msize != size || off != moff) { bpf_log(log, @@ -3939,12 +3940,9 @@ again: return -EACCES; } - stype = btf_type_by_id(btf_vmlinux, mtype->type); - /* skip modifiers */ - while (btf_type_is_modifier(stype)) - stype = btf_type_by_id(btf_vmlinux, stype->type); + stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); if (btf_type_is_struct(stype)) { - *next_btf_id = mtype->type; + *next_btf_id = id; return PTR_TO_BTF_ID; } } From 8fc91b972b734a62a3f3ef45db06d06617f739ea Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 1 Feb 2020 22:51:52 -0800 Subject: [PATCH 049/147] selftests/bpf: Fix trampoline_count.c selftest compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix missing braces compilation warning in trampoline_count test: .../prog_tests/trampoline_count.c: In function ‘test_trampoline_count’: .../prog_tests/trampoline_count.c:49:9: warning: missing braces around initializer [-Wmissing-braces] struct inst inst[MAX_TRAMP_PROGS] = { 0 }; ^ .../prog_tests/trampoline_count.c:49:9: warning: (near initialization for ‘inst[0]’) [-Wmissing-braces] Fixes: d633d57902a5 ("selftest/bpf: Add test for allowed trampolines count") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200202065152.2718142-1-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/trampoline_count.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 1235f3d1cc50..1f6ccdaed1ac 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -46,7 +46,7 @@ void test_trampoline_count(void) const char *fentry_name = 
"fentry/__set_task_comm"; const char *fexit_name = "fexit/__set_task_comm"; const char *object = "test_trampoline_count.o"; - struct inst inst[MAX_TRAMP_PROGS] = { 0 }; + struct inst inst[MAX_TRAMP_PROGS] = {}; int err, i = 0, duration = 0; struct bpf_object *obj; struct bpf_link *link; From cde26a6e17ec36f6f20102a7e5767c2a0096c95f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2020 14:09:22 +0900 Subject: [PATCH 050/147] kallsyms: fix type of kallsyms_token_table[] kallsyms_token_table[] only contains ASCII characters. It should be char instead of u8. Signed-off-by: Masahiro Yamada Reviewed-by: Masami Hiramatsu --- kernel/kallsyms.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 136ce049c4ad..53f84f685841 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -44,7 +44,7 @@ __attribute__((weak, section(".rodata"))); extern const unsigned long kallsyms_relative_base __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __weak; +extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; extern const unsigned int kallsyms_markers[] __weak; @@ -58,7 +58,8 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; - const u8 *tptr, *data; + const char *tptr; + const u8 *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; From 089b7d890f972f6b649fedc9259f6b93a18fb970 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 4 Feb 2020 13:08:44 +0900 Subject: [PATCH 051/147] kconfig: Invalidate all symbols after changing to y or m. Since commit 89b9060987d9 ("kconfig: Add yes2modconfig and mod2yesconfig targets.") forgot to clear SYMBOL_VALID bit after changing to y or m, these targets did not save the changes. Call sym_clear_all_valid() so that all symbols are revalidated. 
Fixes: 89b9060987d9 ("kconfig: Add yes2modconfig and mod2yesconfig targets.") Signed-off-by: Tetsuo Handa Signed-off-by: Masahiro Yamada --- scripts/kconfig/confdata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index 63d307b0d1ac..a39d93e3c6ae 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -1331,9 +1331,8 @@ void conf_rewrite_mod_or_yes(enum conf_def_mode mode) for_all_symbols(i, sym) { if (sym_get_type(sym) == S_TRISTATE && - sym->def[S_DEF_USER].tri == old_val) { + sym->def[S_DEF_USER].tri == old_val) sym->def[S_DEF_USER].tri = new_val; - sym_add_change_count(1); - } } + sym_clear_all_valid(); } From 87f93d82e0952da18af4d978e7d887b4c5326c0b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 4 Feb 2020 13:02:59 -0600 Subject: [PATCH 052/147] smb3: fix problem with null cifs super block with previous patch Add check for null cifs_sb to create_options helper Signed-off-by: Steve French Reviewed-by: Amir Goldstein Reviewed-by: Aurelien Aptel --- fs/cifs/cifsproto.h | 2 +- fs/cifs/smb2ops.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 748bd00cb5f1..89eaaf46d1ca 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -614,7 +614,7 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) { - if (backup_cred(cifs_sb)) + if (cifs_sb && (backup_cred(cifs_sb))) return options | CREATE_OPEN_BACKUP_INTENT; else return options; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 33bb86cae369..ac6628e28b12 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2407,7 +2407,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, FS_FULL_SIZE_INFORMATION, SMB2_O_INFO_FILESYSTEM, sizeof(struct smb2_fs_full_size_info), - &rsp_iov, &buftype, NULL); + &rsp_iov, &buftype, cifs_sb); if (rc) goto qfs_exit; From b0dd940e582b6a60296b9847a54012a4b080dc72 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 5 Feb 2020 11:08:01 +1000 Subject: [PATCH 053/147] cifs: fail i/o on soft mounts if sessionsetup errors out RHBZ: 1579050 If we have a soft mount we should fail commands for session-setup failures (such as the password having changed/ account being deleted/ ...) and return an error back to the application. 
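The shape of the change is a small error-mapping step in the reconnect path; distilled from the diff below into a standalone helper with made-up names (the real check sits inline in smb2_reconnect(), keyed on tcon->retry):

/* Soft mounts (retry_forever == false) turn a session-setup -EACCES
 * into -EHOSTDOWN so the I/O fails back to the application; hard
 * mounts keep retrying as before.
 */
static int map_sess_setup_error(int rc, bool retry_forever)
{
        if (rc == -EACCES && !retry_forever)
                return -EHOSTDOWN;
        return rc;
}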
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2pdu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 14f209f7376f..7996d81230aa 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -350,9 +350,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } rc = cifs_negotiate_protocol(0, tcon->ses); - if (!rc && tcon->ses->need_reconnect) + if (!rc && tcon->ses->need_reconnect) { rc = cifs_setup_session(0, tcon->ses, nls_codepage); - + if ((rc == -EACCES) && !tcon->retry) { + rc = -EHOSTDOWN; + mutex_unlock(&tcon->ses->session_mutex); + goto failed; + } + } if (rc || !tcon->need_reconnect) { mutex_unlock(&tcon->ses->session_mutex); goto out; @@ -397,6 +402,7 @@ out: case SMB2_SET_INFO: rc = -EAGAIN; } +failed: unload_nls(nls_codepage); return rc; } From 9784e619f0a0ada7421ffd777872a199cdcb73cd Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 4 Feb 2020 14:58:20 +0100 Subject: [PATCH 054/147] net: sgi: ioc3-eth: Remove leftover free_irq() Commit 0ce5ebd24d25 ("mfd: ioc3: Add driver for SGI IOC3 chip") moved request_irq() from ioc3_open into probe function, but forgot to remove free_irq() from ioc3_close. Fixes: 0ce5ebd24d25 ("mfd: ioc3: Add driver for SGI IOC3 chip") Signed-off-by: Thomas Bogendoerfer Signed-off-by: David S. Miller --- drivers/net/ethernet/sgi/ioc3-eth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index e61eb891c0f7..db6b2988e632 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -823,7 +823,6 @@ static int ioc3_close(struct net_device *dev) netif_stop_queue(dev); ioc3_stop(ip); - free_irq(dev->irq, dev); ioc3_free_rx_bufs(ip); ioc3_clean_tx_ring(ip); From 34611e6996f331399ee26382fdbc2bd009f9cdd6 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Tue, 4 Feb 2020 22:22:02 +0800 Subject: [PATCH 055/147] netdevsim: fix ptr_ret.cocci warnings drivers/net/netdevsim/dev.c:937:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Fixes: 6556ff32f12d ("netdevsim: use IS_ERR instead of IS_ERR_OR_NULL for debugfs") CC: Taehee Yoo Signed-off-by: kbuild test robot Signed-off-by: David S. Miller --- drivers/net/netdevsim/dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 5c5427c840b6..d7706a0346f2 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -934,9 +934,7 @@ int nsim_dev_port_del(struct nsim_bus_dev *nsim_bus_dev, int nsim_dev_init(void) { nsim_dev_ddir = debugfs_create_dir(DRV_NAME, NULL); - if (IS_ERR(nsim_dev_ddir)) - return PTR_ERR(nsim_dev_ddir); - return 0; + return PTR_ERR_OR_ZERO(nsim_dev_ddir); } void nsim_dev_exit(void) From e22e0790595dad409d610651f4eb17742607e35d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2020 11:29:02 +0200 Subject: [PATCH 056/147] net: dsa: b53: Platform data shan't include kernel.h Replace with appropriate types.h. Signed-off-by: Andy Shevchenko Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/platform_data/b53.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/platform_data/b53.h b/include/linux/platform_data/b53.h index c3b61ead41f2..6f6fed2b171d 100644 --- a/include/linux/platform_data/b53.h +++ b/include/linux/platform_data/b53.h @@ -19,7 +19,7 @@ #ifndef __B53_H #define __B53_H -#include +#include #include struct b53_platform_data { From 8b7a07c7d4420c7d684f661697c6a878040c8b65 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2020 11:29:03 +0200 Subject: [PATCH 057/147] net: dsa: microchip: Platform data shan't include kernel.h Replace with appropriate types.h. Signed-off-by: Andy Shevchenko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/platform_data/microchip-ksz.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index 84789ca634aa..ea1cc6d829e9 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -19,7 +19,7 @@ #ifndef __MICROCHIP_KSZ_H #define __MICROCHIP_KSZ_H -#include +#include struct ksz_platform_data { u32 chip_id; From 2c22c06ce426a0d025a3dacd17cd8868f3dbe96b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 4 Feb 2020 18:12:30 +0100 Subject: [PATCH 058/147] mptcp: fix use-after-free on tcp fallback When an mptcp socket connects to a tcp peer or when a middlebox interferes with tcp options, mptcp needs to fall back to plain tcp. Problem is that mptcp is trying to be too clever in this case: It attempts to close the mptcp meta sk and transparently replace it with the (only) subflow tcp sk. Unfortunately, this is racy -- the socket is already exposed to userspace. Any parallel calls to send/recv/setsockopt etc. can cause use-after-free: BUG: KASAN: use-after-free in atomic_try_cmpxchg include/asm-generic/atomic-instrumented.h:693 [inline] CPU: 1 PID: 2083 Comm: syz-executor.1 Not tainted 5.5.0 #2 atomic_try_cmpxchg include/asm-generic/atomic-instrumented.h:693 [inline] queued_spin_lock include/asm-generic/qspinlock.h:78 [inline] do_raw_spin_lock include/linux/spinlock.h:181 [inline] __raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline] _raw_spin_lock_bh+0x71/0xd0 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:343 [inline] __lock_sock+0x105/0x190 net/core/sock.c:2414 lock_sock_nested+0x10f/0x140 net/core/sock.c:2938 lock_sock include/net/sock.h:1516 [inline] mptcp_setsockopt+0x2f/0x1f0 net/mptcp/protocol.c:800 __sys_setsockopt+0x152/0x240 net/socket.c:2130 __do_sys_setsockopt net/socket.c:2146 [inline] __se_sys_setsockopt net/socket.c:2143 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2143 do_syscall_64+0xb7/0x3d0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x44/0xa9 While the use-after-free can be resolved, there is another problem: sock->ops and sock->sk assignments are not atomic, i.e. we may get calls into mptcp functions with sock->sk already pointing at the subflow socket, or calls into tcp functions with a mptcp meta sk. Remove the fallback code and call the relevant functions for the (only) subflow in case the mptcp socket is connected to tcp peer. Reported-by: Christoph Paasch Diagnosed-by: Paolo Abeni Signed-off-by: Florian Westphal Reviewed-by: Mat Martineau Tested-by: Christoph Paasch Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 76 ++++---------------------------------------- 1 file changed, 6 insertions(+), 70 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3bccee455688..353f2d16b986 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -24,58 +24,6 @@ #define MPTCP_SAME_STATE TCP_MAX_STATES -static void __mptcp_close(struct sock *sk, long timeout); - -static const struct proto_ops *tcp_proto_ops(struct sock *sk) -{ -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (sk->sk_family == AF_INET6) - return &inet6_stream_ops; -#endif - return &inet_stream_ops; -} - -/* MP_CAPABLE handshake failed, convert msk to plain tcp, replacing - * socket->sk and stream ops and destroying msk - * return the msk socket, as we can't access msk anymore after this function - * completes - * Called with msk lock held, releases such lock before returning - */ -static struct socket *__mptcp_fallback_to_tcp(struct mptcp_sock *msk, - struct sock *ssk) -{ - struct mptcp_subflow_context *subflow; - struct socket *sock; - struct sock *sk; - - sk = (struct sock *)msk; - sock = sk->sk_socket; - subflow = mptcp_subflow_ctx(ssk); - - /* detach the msk socket */ - list_del_init(&subflow->node); - sock_orphan(sk); - sock->sk = NULL; - - /* socket is now TCP */ - lock_sock(ssk); - sock_graft(ssk, sock); - if (subflow->conn) { - /* We can't release the ULP data on a live socket, - * restore the tcp callback - */ - mptcp_subflow_tcp_fallback(ssk, subflow); - sock_put(subflow->conn); - subflow->conn = NULL; - } - release_sock(ssk); - sock->ops = tcp_proto_ops(ssk); - - /* destroy the left-over msk sock */ - __mptcp_close(sk, 0); - return sock; -} - /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -93,10 +41,6 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) return msk->first && !sk_is_mptcp(msk->first); } -/* if the mp_capable handshake has failed, it fallbacks msk to plain TCP, - * releases the socket lock and returns a reference to the now TCP socket. 
- * Otherwise returns NULL - */ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); @@ -105,15 +49,11 @@ static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) return NULL; if (msk->subflow) { - /* the first subflow is an active connection, discart the - * paired socket - */ - msk->subflow->sk = NULL; - sock_release(msk->subflow); - msk->subflow = NULL; + release_sock((struct sock *)msk); + return msk->subflow; } - return __mptcp_fallback_to_tcp(msk, msk->first); + return NULL; } static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk) @@ -640,12 +580,14 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how) } /* Called with msk lock held, releases such lock before returning */ -static void __mptcp_close(struct sock *sk, long timeout) +static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); LIST_HEAD(conn_list); + lock_sock(sk); + mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); @@ -662,12 +604,6 @@ static void __mptcp_close(struct sock *sk, long timeout) sk_common_release(sk); } -static void mptcp_close(struct sock *sk, long timeout) -{ - lock_sock(sk); - __mptcp_close(sk, timeout); -} - static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) From 52b5ae501c045010aeeb1d5ac0373ff161a88291 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 4 Feb 2020 11:10:12 -0800 Subject: [PATCH 059/147] net_sched: fix a resource leak in tcindex_set_parms() Jakub noticed there is a potential resource leak in tcindex_set_parms(): when tcindex_filter_result_init() fails and it jumps to 'errout1' which doesn't release the memory and resources allocated by tcindex_alloc_perfect_hash(). We should just jump to 'errout_alloc' which calls tcindex_free_perfect_hash(). Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Reported-by: Jakub Kicinski Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 0323aee03de7..09b7dc5fe7e0 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -365,7 +365,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcindex_filter_result_init(&new_filter_result, net); if (err < 0) - goto errout1; + goto errout_alloc; if (old_r) cr = r->res; @@ -484,7 +484,6 @@ errout_alloc: tcindex_free_perfect_hash(cp); else if (balloc == 2) kfree(cp->h); -errout1: tcf_exts_destroy(&new_filter_result.exts); errout: kfree(cp); From 9981159fc3b677b357f84e069a11de5a5ec8a2a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Feb 2020 22:17:25 +0100 Subject: [PATCH 060/147] wireguard: allowedips: fix use-after-free in root_remove_peer_lists In the unlikely case a new node could not be allocated, we need to remove @newnode from @peer->allowedips_list before freeing it. 
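Distilled from the diff below, the fix follows the usual unlink-before-free rule for an object that is already linked on a list (fragment for illustration, not a standalone build):

        /* newnode already sits on peer->allowedips_list at this point */
        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (unlikely(!node)) {
                list_del(&newnode->peer_list);  /* unlink before ... */
                kfree(newnode);                 /* ... freeing */
                return -ENOMEM;
        }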
syzbot reported: BUG: KASAN: use-after-free in __list_del_entry_valid+0xdc/0xf5 lib/list_debug.c:54 Read of size 8 at addr ffff88809881a538 by task syz-executor.4/30133 CPU: 0 PID: 30133 Comm: syz-executor.4 Not tainted 5.5.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:639 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135 __list_del_entry_valid+0xdc/0xf5 lib/list_debug.c:54 __list_del_entry include/linux/list.h:132 [inline] list_del include/linux/list.h:146 [inline] root_remove_peer_lists+0x24f/0x4b0 drivers/net/wireguard/allowedips.c:65 wg_allowedips_free+0x232/0x390 drivers/net/wireguard/allowedips.c:300 wg_peer_remove_all+0xd5/0x620 drivers/net/wireguard/peer.c:187 wg_set_device+0xd01/0x1350 drivers/net/wireguard/netlink.c:542 genl_family_rcv_msg_doit net/netlink/genetlink.c:672 [inline] genl_family_rcv_msg net/netlink/genetlink.c:717 [inline] genl_rcv_msg+0x67d/0xea0 net/netlink/genetlink.c:734 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 genl_rcv+0x29/0x40 net/netlink/genetlink.c:745 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __do_sys_sendmsg net/socket.c:2439 [inline] __se_sys_sendmsg net/socket.c:2437 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x45b399 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f99a9bcdc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f99a9bce6d4 RCX: 000000000045b399 RDX: 0000000000000000 RSI: 0000000020001340 RDI: 0000000000000003 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004 R13: 00000000000009ba R14: 00000000004cb2b8 R15: 0000000000000009 Allocated by task 30103: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] __kasan_kmalloc mm/kasan/common.c:513 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:486 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:527 kmem_cache_alloc_trace+0x158/0x790 mm/slab.c:3551 kmalloc include/linux/slab.h:556 [inline] kzalloc include/linux/slab.h:670 [inline] add+0x70a/0x1970 drivers/net/wireguard/allowedips.c:236 wg_allowedips_insert_v4+0xf6/0x160 drivers/net/wireguard/allowedips.c:320 set_allowedip drivers/net/wireguard/netlink.c:343 [inline] set_peer+0xfb9/0x1150 drivers/net/wireguard/netlink.c:468 wg_set_device+0xbd4/0x1350 drivers/net/wireguard/netlink.c:591 genl_family_rcv_msg_doit net/netlink/genetlink.c:672 [inline] genl_family_rcv_msg net/netlink/genetlink.c:717 [inline] genl_rcv_msg+0x67d/0xea0 net/netlink/genetlink.c:734 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 
genl_rcv+0x29/0x40 net/netlink/genetlink.c:745 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __do_sys_sendmsg net/socket.c:2439 [inline] __se_sys_sendmsg net/socket.c:2437 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 30103: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] kasan_set_free_info mm/kasan/common.c:335 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:474 kasan_slab_free+0xe/0x10 mm/kasan/common.c:483 __cache_free mm/slab.c:3426 [inline] kfree+0x10a/0x2c0 mm/slab.c:3757 add+0x12d2/0x1970 drivers/net/wireguard/allowedips.c:266 wg_allowedips_insert_v4+0xf6/0x160 drivers/net/wireguard/allowedips.c:320 set_allowedip drivers/net/wireguard/netlink.c:343 [inline] set_peer+0xfb9/0x1150 drivers/net/wireguard/netlink.c:468 wg_set_device+0xbd4/0x1350 drivers/net/wireguard/netlink.c:591 genl_family_rcv_msg_doit net/netlink/genetlink.c:672 [inline] genl_family_rcv_msg net/netlink/genetlink.c:717 [inline] genl_rcv_msg+0x67d/0xea0 net/netlink/genetlink.c:734 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 genl_rcv+0x29/0x40 net/netlink/genetlink.c:745 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __do_sys_sendmsg net/socket.c:2439 [inline] __se_sys_sendmsg net/socket.c:2437 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff88809881a500 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 56 bytes inside of 64-byte region [ffff88809881a500, ffff88809881a540) The buggy address belongs to the page: page:ffffea0002620680 refcount:1 mapcount:0 mapping:ffff8880aa400380 index:0x0 raw: 00fffe0000000200 ffffea000250b748 ffffea000254bac8 ffff8880aa400380 raw: 0000000000000000 ffff88809881a000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88809881a400: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff88809881a480: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc >ffff88809881a500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff88809881a580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff88809881a600: 00 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jason A. Donenfeld Cc: wireguard@lists.zx2c4.com Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. 
Miller --- drivers/net/wireguard/allowedips.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c index 121d9ea0f135..3725e9cd85f4 100644 --- a/drivers/net/wireguard/allowedips.c +++ b/drivers/net/wireguard/allowedips.c @@ -263,6 +263,7 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key, } else { node = kzalloc(sizeof(*node), GFP_KERNEL); if (unlikely(!node)) { + list_del(&newnode->peer_list); kfree(newnode); return -ENOMEM; } From ec31c2676a10e064878927b243fada8c2fb0c03c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 4 Feb 2020 22:17:26 +0100 Subject: [PATCH 061/147] wireguard: noise: reject peers with low order public keys Our static-static calculation returns a failure if the public key is of low order. We check for this when peers are added, and don't allow them to be added if they're low order, except in the case where we haven't yet been given a private key. In that case, we would defer the removal of the peer until we're given a private key, since at that point we're doing new static-static calculations which incur failures we can act on. This meant, however, that we wound up removing peers rather late in the configuration flow. Syzkaller points out that peer_remove calls flush_workqueue, which in turn might then wait for sending a handshake initiation to complete. Since handshake initiation needs the static identity lock, holding the static identity lock while calling peer_remove can result in a rare deadlock. We have precisely this case in this situation of late-stage peer removal based on an invalid public key. We can't drop the lock when removing, because then incoming handshakes might interact with a bogus static-static calculation. While the band-aid patch for this would involve breaking up the peer removal into two steps like wg_peer_remove_all does, in order to solve the locking issue, there's actually a much more elegant way of fixing this: If the static-static calculation succeeds with one private key, it *must* succeed with all others, because all 32-byte strings map to valid private keys, thanks to clamping. That means we can get rid of this silly dance and locking headaches of removing peers late in the configuration flow, and instead just reject them early on, regardless of whether the device has yet been assigned a private key. For the case where the device doesn't yet have a private key, we safely use zeros just for the purposes of checking for low order points by way of checking the output of the calculation. 
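The screening itself is tiny; a fragment in the shape of the noise.c change below, with remote_static standing in for peer->handshake.remote_static:

        u8 empty[NOISE_PUBLIC_KEY_LEN] = { 0 };

        /* curve25519() reports failure when the computed point is
         * all-zero, which happens for low-order remote keys no matter
         * which clamped private key is used, so a zeroed scratch key
         * is enough to screen the peer before a real key exists.
         */
        if (!curve25519(empty, empty, remote_static))
                return false;   /* reject this peer up front */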
The following PoC will trigger the deadlock: ip link add wg0 type wireguard ip addr add 10.0.0.1/24 dev wg0 ip link set wg0 up ping -f 10.0.0.2 & while true; do wg set wg0 private-key /dev/null peer AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= allowed-ips 10.0.0.0/24 endpoint 10.0.0.3:1234 wg set wg0 private-key <(echo AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=) done [ 0.949105] ====================================================== [ 0.949550] WARNING: possible circular locking dependency detected [ 0.950143] 5.5.0-debug+ #18 Not tainted [ 0.950431] ------------------------------------------------------ [ 0.950959] wg/89 is trying to acquire lock: [ 0.951252] ffff8880333e2128 ((wq_completion)wg-kex-wg0){+.+.}, at: flush_workqueue+0xe3/0x12f0 [ 0.951865] [ 0.951865] but task is already holding lock: [ 0.952280] ffff888032819bc0 (&wg->static_identity.lock){++++}, at: wg_set_device+0x95d/0xcc0 [ 0.953011] [ 0.953011] which lock already depends on the new lock. [ 0.953011] [ 0.953651] [ 0.953651] the existing dependency chain (in reverse order) is: [ 0.954292] [ 0.954292] -> #2 (&wg->static_identity.lock){++++}: [ 0.954804] lock_acquire+0x127/0x350 [ 0.955133] down_read+0x83/0x410 [ 0.955428] wg_noise_handshake_create_initiation+0x97/0x700 [ 0.955885] wg_packet_send_handshake_initiation+0x13a/0x280 [ 0.956401] wg_packet_handshake_send_worker+0x10/0x20 [ 0.956841] process_one_work+0x806/0x1500 [ 0.957167] worker_thread+0x8c/0xcb0 [ 0.957549] kthread+0x2ee/0x3b0 [ 0.957792] ret_from_fork+0x24/0x30 [ 0.958234] [ 0.958234] -> #1 ((work_completion)(&peer->transmit_handshake_work)){+.+.}: [ 0.958808] lock_acquire+0x127/0x350 [ 0.959075] process_one_work+0x7ab/0x1500 [ 0.959369] worker_thread+0x8c/0xcb0 [ 0.959639] kthread+0x2ee/0x3b0 [ 0.959896] ret_from_fork+0x24/0x30 [ 0.960346] [ 0.960346] -> #0 ((wq_completion)wg-kex-wg0){+.+.}: [ 0.960945] check_prev_add+0x167/0x1e20 [ 0.961351] __lock_acquire+0x2012/0x3170 [ 0.961725] lock_acquire+0x127/0x350 [ 0.961990] flush_workqueue+0x106/0x12f0 [ 0.962280] peer_remove_after_dead+0x160/0x220 [ 0.962600] wg_set_device+0xa24/0xcc0 [ 0.962994] genl_rcv_msg+0x52f/0xe90 [ 0.963298] netlink_rcv_skb+0x111/0x320 [ 0.963618] genl_rcv+0x1f/0x30 [ 0.963853] netlink_unicast+0x3f6/0x610 [ 0.964245] netlink_sendmsg+0x700/0xb80 [ 0.964586] __sys_sendto+0x1dd/0x2c0 [ 0.964854] __x64_sys_sendto+0xd8/0x1b0 [ 0.965141] do_syscall_64+0x90/0xd9a [ 0.965408] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 0.965769] [ 0.965769] other info that might help us debug this: [ 0.965769] [ 0.966337] Chain exists of: [ 0.966337] (wq_completion)wg-kex-wg0 --> (work_completion)(&peer->transmit_handshake_work) --> &wg->static_identity.lock [ 0.966337] [ 0.967417] Possible unsafe locking scenario: [ 0.967417] [ 0.967836] CPU0 CPU1 [ 0.968155] ---- ---- [ 0.968497] lock(&wg->static_identity.lock); [ 0.968779] lock((work_completion)(&peer->transmit_handshake_work)); [ 0.969345] lock(&wg->static_identity.lock); [ 0.969809] lock((wq_completion)wg-kex-wg0); [ 0.970146] [ 0.970146] *** DEADLOCK *** [ 0.970146] [ 0.970531] 5 locks held by wg/89: [ 0.970908] #0: ffffffff827433c8 (cb_lock){++++}, at: genl_rcv+0x10/0x30 [ 0.971400] #1: ffffffff82743480 (genl_mutex){+.+.}, at: genl_rcv_msg+0x642/0xe90 [ 0.971924] #2: ffffffff827160c0 (rtnl_mutex){+.+.}, at: wg_set_device+0x9f/0xcc0 [ 0.972488] #3: ffff888032819de0 (&wg->device_update_lock){+.+.}, at: wg_set_device+0xb0/0xcc0 [ 0.973095] #4: ffff888032819bc0 (&wg->static_identity.lock){++++}, at: wg_set_device+0x95d/0xcc0 [ 0.973653] [ 0.973653] stack 
backtrace: [ 0.973932] CPU: 1 PID: 89 Comm: wg Not tainted 5.5.0-debug+ #18 [ 0.974476] Call Trace: [ 0.974638] dump_stack+0x97/0xe0 [ 0.974869] check_noncircular+0x312/0x3e0 [ 0.975132] ? print_circular_bug+0x1f0/0x1f0 [ 0.975410] ? __kernel_text_address+0x9/0x30 [ 0.975727] ? unwind_get_return_address+0x51/0x90 [ 0.976024] check_prev_add+0x167/0x1e20 [ 0.976367] ? graph_lock+0x70/0x160 [ 0.976682] __lock_acquire+0x2012/0x3170 [ 0.976998] ? register_lock_class+0x1140/0x1140 [ 0.977323] lock_acquire+0x127/0x350 [ 0.977627] ? flush_workqueue+0xe3/0x12f0 [ 0.977890] flush_workqueue+0x106/0x12f0 [ 0.978147] ? flush_workqueue+0xe3/0x12f0 [ 0.978410] ? find_held_lock+0x2c/0x110 [ 0.978662] ? lock_downgrade+0x6e0/0x6e0 [ 0.978919] ? queue_rcu_work+0x60/0x60 [ 0.979166] ? netif_napi_del+0x151/0x3b0 [ 0.979501] ? peer_remove_after_dead+0x160/0x220 [ 0.979871] peer_remove_after_dead+0x160/0x220 [ 0.980232] wg_set_device+0xa24/0xcc0 [ 0.980516] ? deref_stack_reg+0x8e/0xc0 [ 0.980801] ? set_peer+0xe10/0xe10 [ 0.981040] ? __ww_mutex_check_waiters+0x150/0x150 [ 0.981430] ? __nla_validate_parse+0x163/0x270 [ 0.981719] ? genl_family_rcv_msg_attrs_parse+0x13f/0x310 [ 0.982078] genl_rcv_msg+0x52f/0xe90 [ 0.982348] ? genl_family_rcv_msg_attrs_parse+0x310/0x310 [ 0.982690] ? register_lock_class+0x1140/0x1140 [ 0.983049] netlink_rcv_skb+0x111/0x320 [ 0.983298] ? genl_family_rcv_msg_attrs_parse+0x310/0x310 [ 0.983645] ? netlink_ack+0x880/0x880 [ 0.983888] genl_rcv+0x1f/0x30 [ 0.984168] netlink_unicast+0x3f6/0x610 [ 0.984443] ? netlink_detachskb+0x60/0x60 [ 0.984729] ? find_held_lock+0x2c/0x110 [ 0.984976] netlink_sendmsg+0x700/0xb80 [ 0.985220] ? netlink_broadcast_filtered+0xa60/0xa60 [ 0.985533] __sys_sendto+0x1dd/0x2c0 [ 0.985763] ? __x64_sys_getpeername+0xb0/0xb0 [ 0.986039] ? sockfd_lookup_light+0x17/0x160 [ 0.986397] ? __sys_recvmsg+0x8c/0xf0 [ 0.986711] ? __sys_recvmsg_sock+0xd0/0xd0 [ 0.987018] __x64_sys_sendto+0xd8/0x1b0 [ 0.987283] ? lockdep_hardirqs_on+0x39b/0x5a0 [ 0.987666] do_syscall_64+0x90/0xd9a [ 0.987903] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 0.988223] RIP: 0033:0x7fe77c12003e [ 0.988508] Code: c3 8b 07 85 c0 75 24 49 89 fb 48 89 f0 48 89 d7 48 89 ce 4c 89 c2 4d 89 ca 4c 8b 44 24 08 4c 8b 4c 24 10 4c 4 [ 0.989666] RSP: 002b:00007fffada2ed58 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 0.990137] RAX: ffffffffffffffda RBX: 00007fe77c159d48 RCX: 00007fe77c12003e [ 0.990583] RDX: 0000000000000040 RSI: 000055fd1d38e020 RDI: 0000000000000004 [ 0.991091] RBP: 000055fd1d38e020 R08: 000055fd1cb63358 R09: 000000000000000c [ 0.991568] R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000002c [ 0.992014] R13: 0000000000000004 R14: 000055fd1d38e020 R15: 0000000000000001 Signed-off-by: Jason A. Donenfeld Reported-by: syzbot Signed-off-by: David S. 
Miller --- drivers/net/wireguard/netlink.c | 6 ++---- drivers/net/wireguard/noise.c | 10 +++++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c index 0fdbd1c45977..bda26405497c 100644 --- a/drivers/net/wireguard/netlink.c +++ b/drivers/net/wireguard/netlink.c @@ -569,10 +569,8 @@ static int wg_set_device(struct sk_buff *skb, struct genl_info *info) private_key); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { - if (wg_noise_precompute_static_static(peer)) - wg_noise_expire_current_peer_keypairs(peer); - else - wg_peer_remove(peer); + BUG_ON(!wg_noise_precompute_static_static(peer)); + wg_noise_expire_current_peer_keypairs(peer); } wg_cookie_checker_precompute_device_keys(&wg->cookie_checker); up_write(&wg->static_identity.lock); diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index d71c8db68a8c..919d9d866446 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -46,17 +46,21 @@ void __init wg_noise_init(void) /* Must hold peer->handshake.static_identity->lock */ bool wg_noise_precompute_static_static(struct wg_peer *peer) { - bool ret = true; + bool ret; down_write(&peer->handshake.lock); - if (peer->handshake.static_identity->has_identity) + if (peer->handshake.static_identity->has_identity) { ret = curve25519( peer->handshake.precomputed_static_static, peer->handshake.static_identity->static_private, peer->handshake.remote_static); - else + } else { + u8 empty[NOISE_PUBLIC_KEY_LEN] = { 0 }; + + ret = curve25519(empty, empty, peer->handshake.remote_static); memset(peer->handshake.precomputed_static_static, 0, NOISE_PUBLIC_KEY_LEN); + } up_write(&peer->handshake.lock); return ret; } From f9398acba6a4ae9cb98bfe4d56414d376eff8d57 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 4 Feb 2020 22:17:27 +0100 Subject: [PATCH 062/147] wireguard: selftests: ensure non-addition of peers with failed precomputation Ensure that peers with low order points are ignored, both in the case where we already have a device private key and in the case where we do not. This adds points that naturally give a zero output. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/netns.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index d5c85c7494f2..b03647d1bbf6 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -516,6 +516,12 @@ n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0,10.0.0.0/8,100.0.0.0/10,172.16. 
n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0 n0 wg set wg0 peer "$pub2" allowed-ips ::/0,1700::/111,5000::/4,e000::/37,9000::/75 n0 wg set wg0 peer "$pub2" allowed-ips ::/0 +n0 wg set wg0 peer "$pub2" remove +low_order_points=( AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38= ) +n0 wg set wg0 private-key /dev/null ${low_order_points[@]/#/peer } +[[ -z $(n0 wg show wg0 peers) ]] +n0 wg set wg0 private-key <(echo "$key1") ${low_order_points[@]/#/peer } +[[ -z $(n0 wg show wg0 peers) ]] ip0 link del wg0 declare -A objects From 4a2ef721e60f0d2babeb29dbcfe05904b53ea19e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 4 Feb 2020 22:17:28 +0100 Subject: [PATCH 063/147] wireguard: selftests: cleanup CONFIG_ENABLE_WARN_DEPRECATED CONFIG_ENABLE_WARN_DEPRECATED is gone since commit 771c035372a0 ("deprecate the '__deprecated' attribute warnings entirely and for good"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/qemu/debug.config | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index b9c72706fe4d..5909e7ef2a5c 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -1,5 +1,4 @@ CONFIG_LOCALVERSION="-debug" -CONFIG_ENABLE_WARN_DEPRECATED=y CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_POINTER=y CONFIG_STACK_VALIDATION=y From 88f404a9b1d75388225b1c67b6dd327cb2182777 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 4 Feb 2020 22:17:29 +0100 Subject: [PATCH 064/147] wireguard: selftests: tie socket waiting to target pid Without this, we wind up proceeding too early sometimes when the previous process has just used the same listening port. So, we tie the listening socket query to the specific pid we're interested in. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. 
Miller --- tools/testing/selftests/wireguard/netns.sh | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index b03647d1bbf6..f5ab1cda8bb5 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -38,9 +38,8 @@ ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; } ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; } ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; } sleep() { read -t "$1" -N 1 || true; } -waitiperf() { pretty "${1//*-}" "wait for iperf:5201"; while [[ $(ss -N "$1" -tlp 'sport = 5201') != *iperf3* ]]; do sleep 0.1; done; } -waitncatudp() { pretty "${1//*-}" "wait for udp:1111"; while [[ $(ss -N "$1" -ulp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; } -waitncattcp() { pretty "${1//*-}" "wait for tcp:1111"; while [[ $(ss -N "$1" -tlp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; } +waitiperf() { pretty "${1//*-}" "wait for iperf:5201 pid $2"; while [[ $(ss -N "$1" -tlpH 'sport = 5201') != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; } +waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; } waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; } cleanup() { @@ -119,22 +118,22 @@ tests() { # TCP over IPv4 n2 iperf3 -s -1 -B 192.168.241.2 & - waitiperf $netns2 + waitiperf $netns2 $! n1 iperf3 -Z -t 3 -c 192.168.241.2 # TCP over IPv6 n1 iperf3 -s -1 -B fd00::1 & - waitiperf $netns1 + waitiperf $netns1 $! n2 iperf3 -Z -t 3 -c fd00::1 # UDP over IPv4 n1 iperf3 -s -1 -B 192.168.241.1 & - waitiperf $netns1 + waitiperf $netns1 $! n2 iperf3 -Z -t 3 -b 0 -u -c 192.168.241.1 # UDP over IPv6 n2 iperf3 -s -1 -B fd00::2 & - waitiperf $netns2 + waitiperf $netns2 $! n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2 } @@ -207,7 +206,7 @@ n1 ping -W 1 -c 1 192.168.241.2 n1 wg set wg0 peer "$pub2" allowed-ips 192.168.241.0/24 exec 4< <(n1 ncat -l -u -p 1111) ncat_pid=$! -waitncatudp $netns1 +waitncatudp $netns1 $ncat_pid n2 ncat -u 192.168.241.1 1111 <<<"X" read -r -N 1 -t 1 out <&4 && [[ $out == "X" ]] kill $ncat_pid @@ -216,7 +215,7 @@ n1 wg set wg0 peer "$more_specific_key" allowed-ips 192.168.241.2/32 n2 wg set wg0 listen-port 9997 exec 4< <(n1 ncat -l -u -p 1111) ncat_pid=$! -waitncatudp $netns1 +waitncatudp $netns1 $ncat_pid n2 ncat -u 192.168.241.1 1111 <<<"X" ! read -r -N 1 -t 1 out <&4 || false kill $ncat_pid From 33e2b32b5df2b544ac5d43c4de2194bcc822b1b5 Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Tue, 4 Feb 2020 15:01:18 -0800 Subject: [PATCH 065/147] net: ethernet: dec: tulip: Fix length mask in receive length calculation The receive frame length calculation uses a wrong mask to calculate the length of the received frames. Per spec table 4-1 the length is contained in the FL (Frame Length) field in bits 30:16. This didn't show up as an issue so far since frames were limited to 1500 bytes which falls within the 11 bit window. Signed-off-by: Moritz Fischer Signed-off-by: David S. 
Miller --- drivers/net/ethernet/dec/tulip/de2104x.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/dec/tulip/de2104x.c b/drivers/net/ethernet/dec/tulip/de2104x.c index d305d1b24b0a..42b798a3fad4 100644 --- a/drivers/net/ethernet/dec/tulip/de2104x.c +++ b/drivers/net/ethernet/dec/tulip/de2104x.c @@ -417,7 +417,10 @@ static void de_rx (struct de_private *de) if (status & DescOwn) break; - len = ((status >> 16) & 0x7ff) - 4; + /* the length is actually a 15 bit value here according + * to Table 4-1 in the DE2104x spec so mask is 0x7fff + */ + len = ((status >> 16) & 0x7fff) - 4; mapping = de->rx_skb[rx_tail].mapping; if (unlikely(drop)) { From d5b90e99e1d51b7b5d2b74fbc4c2db236a510913 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 4 Feb 2020 15:59:50 -0800 Subject: [PATCH 066/147] devlink: report 0 after hitting end in region read commit fdd41ec21e15 ("devlink: Return right error code in case of errors for region read") modified the region read code to report errors properly in unexpected cases. In the case where the start_offset and ret_offset match, it unilaterally converted this into an error. This causes an issue for the "dump" version of the command. In this case, the devlink region dump will always report an invalid argument: 000000000000ffd0 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 000000000000ffe0 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff devlink answers: Invalid argument 000000000000fff0 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff This occurs because the expected flow for the dump is to return 0 after there is no further data. The simplest fix would be to stop converting the error code to -EINVAL if start_offset == ret_offset. However, avoid unnecessary work by checking for when start_offset is larger than the region size and returning 0 upfront. Fixes: fdd41ec21e15 ("devlink: Return right error code in case of errors for region read") Signed-off-by: Jacob Keller Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/devlink.c b/net/core/devlink.c index ca1df0ec3c97..549ee56b7a21 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3986,6 +3986,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + /* return 0 if there is no further data to read */ + if (start_offset >= region->size) { + err = 0; + goto out_unlock; + } + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); From 38f88c45404293bbc027b956def6c10cbd45c616 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Feb 2020 19:26:05 -0800 Subject: [PATCH 067/147] bonding/alb: properly access headers in bond_alb_xmit() syzbot managed to send an IPX packet through bond_alb_xmit() and af_packet and triggered a use-after-free. First, bond_alb_xmit() was using ipx_hdr() helper to reach the IPX header, but ipx_hdr() was using the transport offset instead of the network offset. In the particular syzbot report transport offset was 0xFFFF This patch removes ipx_hdr() since it was only (mis)used from bonding. Then we need to make sure IPv4/IPv6/IPX headers are pulled in skb->head before dereferencing anything. BUG: KASAN: use-after-free in bond_alb_xmit+0x153a/0x1590 drivers/net/bonding/bond_alb.c:1452 Read of size 2 at addr ffff8801ce56dfff by task syz-executor.2/18108 (if (ipx_hdr(skb)->ipx_checksum != IPX_NO_CHECKSUM) ...) 
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: [] __dump_stack lib/dump_stack.c:17 [inline] [] dump_stack+0x14d/0x20b lib/dump_stack.c:53 [] print_address_description+0x6f/0x20b mm/kasan/report.c:282 [] kasan_report_error mm/kasan/report.c:380 [inline] [] kasan_report mm/kasan/report.c:438 [inline] [] kasan_report.cold+0x8c/0x2a0 mm/kasan/report.c:422 [] __asan_report_load_n_noabort+0xf/0x20 mm/kasan/report.c:469 [] bond_alb_xmit+0x153a/0x1590 drivers/net/bonding/bond_alb.c:1452 [] __bond_start_xmit drivers/net/bonding/bond_main.c:4199 [inline] [] bond_start_xmit+0x4f4/0x1570 drivers/net/bonding/bond_main.c:4224 [] __netdev_start_xmit include/linux/netdevice.h:4525 [inline] [] netdev_start_xmit include/linux/netdevice.h:4539 [inline] [] xmit_one net/core/dev.c:3611 [inline] [] dev_hard_start_xmit+0x168/0x910 net/core/dev.c:3627 [] __dev_queue_xmit+0x1f55/0x33b0 net/core/dev.c:4238 [] dev_queue_xmit+0x18/0x20 net/core/dev.c:4278 [] packet_snd net/packet/af_packet.c:3226 [inline] [] packet_sendmsg+0x4919/0x70b0 net/packet/af_packet.c:3252 [] sock_sendmsg_nosec net/socket.c:673 [inline] [] sock_sendmsg+0x12c/0x160 net/socket.c:684 [] __sys_sendto+0x262/0x380 net/socket.c:1996 [] SYSC_sendto net/socket.c:2008 [inline] [] SyS_sendto+0x40/0x60 net/socket.c:2004 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: David S. Miller --- drivers/net/bonding/bond_alb.c | 44 ++++++++++++++++++++++++---------- include/net/ipx.h | 5 ---- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 4f2e6910c623..1cc2cd894f87 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1383,26 +1383,31 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) bool do_tx_balance = true; u32 hash_index = 0; const u8 *hash_start = NULL; - struct ipv6hdr *ip6hdr; skb_reset_mac_header(skb); eth_data = eth_hdr(skb); switch (ntohs(skb->protocol)) { case ETH_P_IP: { - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; if (is_broadcast_ether_addr(eth_data->h_dest) || - iph->daddr == ip_bcast || - iph->protocol == IPPROTO_IGMP) { + !pskb_network_may_pull(skb, sizeof(*iph))) { + do_tx_balance = false; + break; + } + iph = ip_hdr(skb); + if (iph->daddr == ip_bcast || iph->protocol == IPPROTO_IGMP) { do_tx_balance = false; break; } hash_start = (char *)&(iph->daddr); hash_size = sizeof(iph->daddr); - } break; - case ETH_P_IPV6: + } + case ETH_P_IPV6: { + const struct ipv6hdr *ip6hdr; + /* IPv6 doesn't really use broadcast mac address, but leave * that here just in case. */ @@ -1419,7 +1424,11 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) break; } - /* Additianally, DAD probes should not be tx-balanced as that + if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) { + do_tx_balance = false; + break; + } + /* Additionally, DAD probes should not be tx-balanced as that * will lead to false positives for duplicate addresses and * prevent address configuration from working. 
*/ @@ -1429,17 +1438,26 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) break; } - hash_start = (char *)&(ipv6_hdr(skb)->daddr); - hash_size = sizeof(ipv6_hdr(skb)->daddr); + hash_start = (char *)&ip6hdr->daddr; + hash_size = sizeof(ip6hdr->daddr); break; - case ETH_P_IPX: - if (ipx_hdr(skb)->ipx_checksum != IPX_NO_CHECKSUM) { + } + case ETH_P_IPX: { + const struct ipxhdr *ipxhdr; + + if (pskb_network_may_pull(skb, sizeof(*ipxhdr))) { + do_tx_balance = false; + break; + } + ipxhdr = (struct ipxhdr *)skb_network_header(skb); + + if (ipxhdr->ipx_checksum != IPX_NO_CHECKSUM) { /* something is wrong with this packet */ do_tx_balance = false; break; } - if (ipx_hdr(skb)->ipx_type != IPX_TYPE_NCP) { + if (ipxhdr->ipx_type != IPX_TYPE_NCP) { /* The only protocol worth balancing in * this family since it has an "ARP" like * mechanism @@ -1448,9 +1466,11 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev) break; } + eth_data = eth_hdr(skb); hash_start = (char *)eth_data->h_dest; hash_size = ETH_ALEN; break; + } case ETH_P_ARP: do_tx_balance = false; if (bond_info->rlb_enabled) diff --git a/include/net/ipx.h b/include/net/ipx.h index baf090390998..9d1342807b59 100644 --- a/include/net/ipx.h +++ b/include/net/ipx.h @@ -47,11 +47,6 @@ struct ipxhdr { /* From af_ipx.c */ extern int sysctl_ipx_pprop_broadcasting; -static __inline__ struct ipxhdr *ipx_hdr(struct sk_buff *skb) -{ - return (struct ipxhdr *)skb_transport_header(skb); -} - struct ipx_interface { /* IPX address */ __be32 if_netnum; From 41c1ef978c8d0259c6636e6d2d854777e92650eb Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Wed, 5 Feb 2020 18:08:11 +0530 Subject: [PATCH 068/147] net: macb: Remove unnecessary alignment check for TSO The IP TSO implementation does NOT require the length to be a multiple of 8. That is only a requirement for UFO as per IP documentation. Hence, exit macb_features_check function in the beginning if the protocol is not UDP. Only when it is UDP, proceed further to the alignment checks. Update comments to reflect the same. Also remove dead code checking for protocol TCP when calculating header length. Fixes: 1629dd4f763c ("cadence: Add LSO support.") Signed-off-by: Harini Katakam Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 7a2fe63d1136..11311923e166 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1791,16 +1791,14 @@ static netdev_features_t macb_features_check(struct sk_buff *skb, /* Validate LSO compatibility */ - /* there is only one buffer */ - if (!skb_is_nonlinear(skb)) + /* there is only one buffer or protocol is not UDP */ + if (!skb_is_nonlinear(skb) || (ip_hdr(skb)->protocol != IPPROTO_UDP)) return features; /* length of header */ hdrlen = skb_transport_offset(skb); - if (ip_hdr(skb)->protocol == IPPROTO_TCP) - hdrlen += tcp_hdrlen(skb); - /* For LSO: + /* For UFO only: * When software supplies two or more payload buffers all payload buffers * apart from the last must be a multiple of 8 bytes in size. 
*/ From f822e9c4ffa511a5c681cf866287d9383a3b6f1b Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Wed, 5 Feb 2020 18:08:12 +0530 Subject: [PATCH 069/147] net: macb: Limit maximum GEM TX length in TSO GEM_MAX_TX_LEN currently resolves to 0x3FF8 for any IP version supporting TSO with full 14bits of length field in payload descriptor. But an IP errata causes false amba_error (bit 6 of ISR) when length in payload descriptors is specified above 16387. The error occurs because the DMA falsely concludes that there is not enough space in SRAM for incoming payload. These errors were observed continuously under stress of large packets using iperf on a version where SRAM was 16K for each queue. This errata will be documented shortly and affects all versions since TSO functionality was added. Hence limit the max length to 0x3FC0 (rounded). Signed-off-by: Harini Katakam Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 11311923e166..4508f0d150da 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -73,7 +73,11 @@ struct sifive_fu540_macb_mgmt { /* Max length of transmit frame must be a multiple of 8 bytes */ #define MACB_TX_LEN_ALIGN 8 #define MACB_MAX_TX_LEN ((unsigned int)((1 << MACB_TX_FRMLEN_SIZE) - 1) & ~((unsigned int)(MACB_TX_LEN_ALIGN - 1))) -#define GEM_MAX_TX_LEN ((unsigned int)((1 << GEM_TX_FRMLEN_SIZE) - 1) & ~((unsigned int)(MACB_TX_LEN_ALIGN - 1))) +/* Limit maximum TX length as per Cadence TSO errata. This is to avoid a + * false amba_error in TX path from the DMA assuming there is not enough + * space in the SRAM (16KB) even when there is. + */ +#define GEM_MAX_TX_LEN (unsigned int)(0x3FC0) #define GEM_MTU_MIN_SIZE ETH_MIN_MTU #define MACB_NETIF_LSO NETIF_F_TSO From 0202d293c2faecba791ba4afc5aec086249c393d Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 5 Feb 2020 05:10:55 -0800 Subject: [PATCH 070/147] qed: Fix timestamping issue for L2 unicast ptp packets. commit cedeac9df4b8 ("qed: Add support for Timestamping the unicast PTP packets.") handles the timestamping of L4 ptp packets only. This patch adds driver changes to detect/timestamp both L2/L4 unicast PTP packets. Fixes: cedeac9df4b8 ("qed: Add support for Timestamping the unicast PTP packets.") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_ptp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ptp.c b/drivers/net/ethernet/qlogic/qed/qed_ptp.c index 0dacf2c18c09..3e613058e225 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ptp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ptp.c @@ -44,8 +44,8 @@ /* Add/subtract the Adjustment_Value when making a Drift adjustment */ #define QED_DRIFT_CNTR_DIRECTION_SHIFT 31 #define QED_TIMESTAMP_MASK BIT(16) -/* Param mask for Hardware to detect/timestamp the unicast PTP packets */ -#define QED_PTP_UCAST_PARAM_MASK 0xF +/* Param mask for Hardware to detect/timestamp the L2/L4 unicast PTP packets */ +#define QED_PTP_UCAST_PARAM_MASK 0x70F static enum qed_resc_lock qed_ptcdev_to_resc(struct qed_hwfn *p_hwfn) { From fc9e34f8de001c92e6cfc6481bb30a34dbda4f5c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 4 Feb 2020 13:50:37 -0800 Subject: [PATCH 071/147] tools/bpf/runqslower: Rebuild libbpf.a on libbpf source change Add missing dependency of $(BPFOBJ) to $(LIBBPF_SRC), so that running make in runqslower/ will rebuild libbpf.a when there is change in libbpf/. Fixes: 9c01546d26d2 ("tools/bpf: Add runqslower tool to tools/bpf") Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200204215037.2258698-1-songliubraving@fb.com --- tools/bpf/runqslower/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 87eae5be9bcd..39edd68afa8e 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -75,7 +75,7 @@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) fi $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ -$(BPFOBJ): | $(OUTPUT) +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ OUTPUT=$(abspath $(dir $@))/ $(abspath $@) From c77e9f09143822623dd71a0fdc84331129e97c3a Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 5 Feb 2020 05:58:32 +0100 Subject: [PATCH 072/147] i40e: Relax i40e_xsk_wakeup's return value when PF is busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return -EAGAIN instead of -ENETDOWN to provide a slightly milder information to user space so that an application will know to retry the syscall when __I40E_CONFIG_BUSY bit is set on pf->state. 
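As a user-space illustration of why -EAGAIN is friendlier here, a minimal sketch of an AF_XDP TX kick that retries on transient errors could look like the following (not taken from the patch; the xsk_fd argument and the exact retry policy are assumptions):

    #include <errno.h>
    #include <stddef.h>
    #include <sys/socket.h>

    /* Kick the kernel to service the TX ring of an already-bound AF_XDP
     * socket.  Transient errors are retried; anything else is fatal. */
    static int kick_tx_retry(int xsk_fd)
    {
            for (;;) {
                    if (sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) >= 0)
                            return 0;
                    if (errno == EAGAIN || errno == EBUSY || errno == ENOBUFS)
                            continue;   /* driver busy: retry (or poll/back off) */
                    return -errno;      /* e.g. ENETDOWN: report to the caller */
            }
    }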
Fixes: b3873a5be757 ("net/i40e: Fix concurrency issues between config flow and XSK") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200205045834.56795-2-maciej.fijalkowski@intel.com --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 42058fad6a3c..0b7d29192b2c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -791,7 +791,7 @@ int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) struct i40e_ring *ring; if (test_bit(__I40E_CONFIG_BUSY, pf->state)) - return -ENETDOWN; + return -EAGAIN; if (test_bit(__I40E_VSI_DOWN, vsi->state)) return -ENETDOWN; From 32c92c15ad3dca6f7cca8643766ad79fa3ab3d17 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 5 Feb 2020 05:58:33 +0100 Subject: [PATCH 073/147] samples: bpf: Drop doubled variable declaration in xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seems that by accident there is a doubled declaration of global variable opt_xdp_bind_flags in xdpsock_user.c. The second one is uninitialized so compiler was simply ignoring it. To keep things clean, drop the doubled variable. Fixes: c543f5469822 ("samples/bpf: add unaligned chunks mode support to xdpsock") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200205045834.56795-3-maciej.fijalkowski@intel.com --- samples/bpf/xdpsock_user.c | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 0b5acd722306..bab7a850e864 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -83,7 +83,6 @@ static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; static u32 opt_umem_flags; static int opt_unaligned_chunks; static int opt_mmap_flags; -static u32 opt_xdp_bind_flags; static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; static int opt_timeout = 1000; static bool opt_need_wakeup = true; From 8ed47e140867a6e7d56170f325c8d4fdee6d6b66 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 5 Feb 2020 05:58:34 +0100 Subject: [PATCH 074/147] samples: bpf: Allow for -ENETDOWN in xdpsock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ndo_xsk_wakeup() can return -ENETDOWN and there's no particular reason to bail the whole application out on that case. Let's check in kick_tx() whether errno was set to mentioned value and basically allow application to further process frames. 
Fixes: 248c7f9c0e21 ("samples/bpf: convert xdpsock to use libbpf for AF_XDP access") Reported-by: Cameron Elliott Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20200205045834.56795-4-maciej.fijalkowski@intel.com --- samples/bpf/xdpsock_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index bab7a850e864..c91e91362a0c 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -788,7 +788,8 @@ static void kick_tx(struct xsk_socket_info *xsk) int ret; ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); - if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY) + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || + errno == EBUSY || errno == ENETDOWN) return; exit_with_error(errno); } From f566e1fbadb686e28f1c307e356114b2865ef588 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 5 Feb 2020 15:51:52 +0900 Subject: [PATCH 075/147] kbuild: make multiple directory targets work Currently, the single-target build does not work when two or more sub-directories are given: $ make fs/ kernel/ lib/ CALL scripts/checksyscalls.sh CALL scripts/atomic/check-atomics.sh DESCEND objtool make[2]: Nothing to be done for 'kernel/'. make[2]: Nothing to be done for 'fs/'. make[2]: Nothing to be done for 'lib/'. Make it work properly. Reported-by: Linus Torvalds Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 65a5dc653deb..ef8913a8eb2a 100644 --- a/Makefile +++ b/Makefile @@ -1679,7 +1679,7 @@ PHONY += descend $(build-dirs) descend: $(build-dirs) $(build-dirs): prepare $(Q)$(MAKE) $(build)=$@ \ - single-build=$(if $(filter-out $@/, $(single-no-ko)),1) \ + single-build=$(if $(filter-out $@/, $(filter $@/%, $(single-no-ko))),1) \ need-builtin=1 need-modorder=1 clean-dirs := $(addprefix _clean_, $(clean-dirs)) From d6fd41905ec577851734623fb905b1763801f5ef Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 5 Feb 2020 16:52:11 -0600 Subject: [PATCH 076/147] cifs: log warning message (once) if out of disk space We ran into a confusing problem where an application wasn't checking return code on close and so user didn't realize that the application ran out of disk space. log a warning message (once) in these cases. For example: [ 8407.391909] Out of space writing to \\oleg-server\small-share Signed-off-by: Steve French Reported-by: Oleg Kravtsov Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2pdu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7996d81230aa..1a732ff71de4 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -4029,6 +4029,9 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, wdata->result); + if (wdata->result == -ENOSPC) + printk_once(KERN_WARNING "Out of space writing to %s\n", + tcon->treeName); } else trace_smb3_write_done(0 /* no xid */, wdata->cfile->fid.persistent_fid, From f2bf09e97b47c7b13e8a918f560f6082e9bc8f8a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 5 Feb 2020 18:22:37 -0600 Subject: [PATCH 077/147] cifs: Add tracepoints for errors on flush or fsync Makes it easier to debug errors on writeback that happen later, and are being returned on flush or fsync For example: writetest-17829 [002] .... 
13583.407859: cifs_flush_err: ino=90 rc=-28 Signed-off-by: Steve French --- fs/cifs/file.c | 7 +++++-- fs/cifs/trace.h | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 79e6f4f55b9b..99ea7b2a06a5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2632,8 +2632,10 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); rc = file_write_and_wait_range(file, start, end); - if (rc) + if (rc) { + trace_cifs_fsync_err(file_inode(file)->i_ino, rc); return rc; + } xid = get_xid(); @@ -2666,7 +2668,8 @@ int cifs_flush(struct file *file, fl_owner_t id) rc = filemap_write_and_wait(inode->i_mapping); cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc); - + if (rc) + trace_cifs_flush_err(inode->i_ino, rc); return rc; } diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index e7e350b13d6a..4cb0d5f7ce45 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -547,6 +547,33 @@ DEFINE_EVENT(smb3_exit_err_class, smb3_##name, \ DEFINE_SMB3_EXIT_ERR_EVENT(exit_err); + +DECLARE_EVENT_CLASS(smb3_sync_err_class, + TP_PROTO(unsigned long ino, + int rc), + TP_ARGS(ino, rc), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(int, rc) + ), + TP_fast_assign( + __entry->ino = ino; + __entry->rc = rc; + ), + TP_printk("\tino=%lu rc=%d", + __entry->ino, __entry->rc) +) + +#define DEFINE_SMB3_SYNC_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_sync_err_class, cifs_##name, \ + TP_PROTO(unsigned long ino, \ + int rc), \ + TP_ARGS(ino, rc)) + +DEFINE_SMB3_SYNC_ERR_EVENT(fsync_err); +DEFINE_SMB3_SYNC_ERR_EVENT(flush_err); + + DECLARE_EVENT_CLASS(smb3_enter_exit_class, TP_PROTO(unsigned int xid, const char *func_name), From b0519de8b3f1caf10632aca55def999ec2d2f1bc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 6 Feb 2020 00:39:37 +0100 Subject: [PATCH 078/147] mptcp: fix use-after-free for ipv6 Turns out that when we accept a new subflow, the newly created inet_sk(tcp_sk)->pinet6 points at the ipv6_pinfo structure of the listener socket. This wasn't caught by the selftest because it closes the accepted fd before the listening one. adding a close(listenfd) after accept returns is enough: BUG: KASAN: use-after-free in inet6_getname+0x6ba/0x790 Read of size 1 at addr ffff88810e310866 by task mptcp_connect/2518 Call Trace: inet6_getname+0x6ba/0x790 __sys_getpeername+0x10b/0x250 __x64_sys_getpeername+0x6f/0xb0 also alter test program to exercise this. Reported-by: Christoph Paasch Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 36 +++++++++++++++++-- .../selftests/net/mptcp/mptcp_connect.c | 9 +++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 353f2d16b986..73780b4cb108 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -24,6 +24,13 @@ #define MPTCP_SAME_STATE TCP_MAX_STATES +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +struct mptcp6_sock { + struct mptcp_sock msk; + struct ipv6_pinfo np; +}; +#endif + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. 
@@ -627,6 +634,30 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) +{ + unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} +#endif + +struct sock *mptcp_sk_clone_lock(const struct sock *sk) +{ + struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); + + if (!nsk) + return NULL; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (nsk->sk_family == AF_INET6) + inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); +#endif + + return nsk; +} + static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, bool kern) { @@ -657,7 +688,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, lock_sock(sk); local_bh_disable(); - new_mptcp_sock = sk_clone_lock(sk, GFP_ATOMIC); + new_mptcp_sock = mptcp_sk_clone_lock(sk); if (!new_mptcp_sock) { *err = -ENOBUFS; local_bh_enable(); @@ -1206,8 +1237,7 @@ int mptcp_proto_v6_init(void) strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.destroy = mptcp_v6_destroy; - mptcp_v6_prot.obj_size = sizeof(struct mptcp_sock) + - sizeof(struct ipv6_pinfo); + mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); err = proto_register(&mptcp_v6_prot, 1); if (err) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index a3dccd816ae4..99579c0223c1 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -634,6 +634,14 @@ static void check_getpeername_connect(int fd) cfg_host, a, cfg_port, b); } +static void maybe_close(int fd) +{ + unsigned int r = rand(); + + if (r & 1) + close(fd); +} + int main_loop_s(int listensock) { struct sockaddr_storage ss; @@ -657,6 +665,7 @@ int main_loop_s(int listensock) salen = sizeof(ss); remotesock = accept(listensock, (struct sockaddr *)&ss, &salen); if (remotesock >= 0) { + maybe_close(listensock); check_sockaddr(pf, &ss, salen); check_getpeername(remotesock, &ss, salen); From 45a8317ed2c6f0d33a89367ec5d36e4e3aa46576 Mon Sep 17 00:00:00 2001 From: Devulapally Shiva Krishna Date: Thu, 6 Feb 2020 14:04:43 +0530 Subject: [PATCH 079/147] cxgb4: Added tls stats prints. Added debugfs entry to show the tls stats. Signed-off-by: Devulapally Shiva Krishna Signed-off-by: Vinay Kumar Yadav Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 9d1f2f88b945..de30d61af065 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -3403,6 +3403,13 @@ static int chcr_stats_show(struct seq_file *seq, void *v) atomic_read(&adap->chcr_stats.fallback)); seq_printf(seq, "IPSec PDU: %10u\n", atomic_read(&adap->chcr_stats.ipsec_cnt)); + seq_printf(seq, "TLS PDU Tx: %10u\n", + atomic_read(&adap->chcr_stats.tls_pdu_tx)); + seq_printf(seq, "TLS PDU Rx: %10u\n", + atomic_read(&adap->chcr_stats.tls_pdu_rx)); + seq_printf(seq, "TLS Keys (DDR) Count: %10u\n", + atomic_read(&adap->chcr_stats.tls_key)); + return 0; } DEFINE_SHOW_ATTRIBUTE(chcr_stats); From c35947b8ff8acca33134ee39c31708233765c31a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 6 Feb 2020 10:14:39 +0100 Subject: [PATCH 080/147] net: mvneta: move rx_dropped and rx_errors in per-cpu stats Move rx_dropped and rx_errors counters in mvneta_pcpu_stats in order to avoid possible races updating statistics Fixes: 562e2f467e71 ("net: mvneta: Improve the buffer allocation method for SWBM") Fixes: dc35a10f68d3 ("net: mvneta: bm: add support for hardware buffer management") Fixes: c5aff18204da ("net: mvneta: driver for Marvell Armada 370/XP network unit") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 31 +++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 037e054b01a2..98017e7d5dd0 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -401,6 +401,8 @@ struct mvneta_pcpu_stats { struct u64_stats_sync syncp; u64 rx_packets; u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; u64 tx_packets; u64 tx_bytes; }; @@ -738,6 +740,8 @@ mvneta_get_stats64(struct net_device *dev, struct mvneta_pcpu_stats *cpu_stats; u64 rx_packets; u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; u64 tx_packets; u64 tx_bytes; @@ -746,19 +750,20 @@ mvneta_get_stats64(struct net_device *dev, start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); rx_packets = cpu_stats->rx_packets; rx_bytes = cpu_stats->rx_bytes; + rx_dropped = cpu_stats->rx_dropped; + rx_errors = cpu_stats->rx_errors; tx_packets = cpu_stats->tx_packets; tx_bytes = cpu_stats->tx_bytes; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; + stats->rx_dropped += rx_dropped; + stats->rx_errors += rx_errors; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; } - stats->rx_errors = dev->stats.rx_errors; - stats->rx_dropped = dev->stats.rx_dropped; - stats->tx_dropped = dev->stats.tx_dropped; } @@ -1736,8 +1741,14 @@ static u32 mvneta_txq_desc_csum(int l3_offs, int l3_proto, static void mvneta_rx_error(struct mvneta_port *pp, struct mvneta_rx_desc *rx_desc) { + struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats); u32 status = rx_desc->status; + /* update per-cpu counter */ + u64_stats_update_begin(&stats->syncp); + stats->rx_errors++; + u64_stats_update_end(&stats->syncp); + switch (status & MVNETA_RXD_ERR_CODE_MASK) { case MVNETA_RXD_ERR_CRC: netdev_err(pp->dev, "bad rx status %08x (crc error), size=%d\n", @@ -2179,11 +2190,15 @@ mvneta_swbm_rx_frame(struct 
mvneta_port *pp, rxq->skb = build_skb(xdp->data_hard_start, PAGE_SIZE); if (unlikely(!rxq->skb)) { - netdev_err(dev, - "Can't allocate skb on queue %d\n", - rxq->id); - dev->stats.rx_dropped++; + struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats); + + netdev_err(dev, "Can't allocate skb on queue %d\n", rxq->id); rxq->skb_alloc_err++; + + u64_stats_update_begin(&stats->syncp); + stats->rx_dropped++; + u64_stats_update_end(&stats->syncp); + return -ENOMEM; } page_pool_release_page(rxq->page_pool, page); @@ -2270,7 +2285,6 @@ static int mvneta_rx_swbm(struct napi_struct *napi, /* Check errors only for FIRST descriptor */ if (rx_status & MVNETA_RXD_ERR_SUMMARY) { mvneta_rx_error(pp, rx_desc); - dev->stats.rx_errors++; /* leave the descriptor untouched */ continue; } @@ -2372,7 +2386,6 @@ err_drop_frame_ret_pool: mvneta_bm_pool_put_bp(pp->bm_priv, bm_pool, rx_desc->buf_phys_addr); err_drop_frame: - dev->stats.rx_errors++; mvneta_rx_error(pp, rx_desc); /* leave the descriptor untouched */ continue; From 86b18aaa2b5b5bb48e609cd591b3d2d0fdbe0442 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 4 Feb 2020 13:40:29 -0500 Subject: [PATCH 081/147] skbuff: fix a data race in skb_queue_len() sk_buff.qlen can be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_dgram_sendmsg read to 0xffff8a1b1d8a81c0 of 4 bytes by task 5371 on cpu 96: unix_dgram_sendmsg+0x9a9/0xb70 include/linux/skbuff.h:1821 net/unix/af_unix.c:1761 ____sys_sendmsg+0x33e/0x370 ___sys_sendmsg+0xa6/0xf0 __sys_sendmsg+0x69/0xf0 __x64_sys_sendmsg+0x51/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe write to 0xffff8a1b1d8a81c0 of 4 bytes by task 1 on cpu 99: __skb_try_recv_from_queue+0x327/0x410 include/linux/skbuff.h:2029 __skb_try_recv_datagram+0xbe/0x220 unix_dgram_recvmsg+0xee/0x850 ____sys_recvmsg+0x1fb/0x210 ___sys_recvmsg+0xa2/0xf0 __sys_recvmsg+0x66/0xf0 __x64_sys_recvmsg+0x51/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe Since only the read is operating as lockless, it could introduce a logic bug in unix_recvq_full() due to the load tearing. Fix it by adding a lockless variant of skb_queue_len() and unix_recvq_full() where READ_ONCE() is on the read while WRITE_ONCE() is on the write similar to the commit d7d16a89350a ("net: add skb_queue_empty_lockless()"). Signed-off-by: Qian Cai Signed-off-by: David S. Miller --- include/linux/skbuff.h | 14 +++++++++++++- net/unix/af_unix.c | 11 +++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3d13a4b717e9..ca8806b69388 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1821,6 +1821,18 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) return list_->qlen; } +/** + * skb_queue_len_lockless - get queue length + * @list_: list to measure + * + * Return the length of an &sk_buff queue. + * This variant can be used in lockless contexts. 
+ */ +static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) +{ + return READ_ONCE(list_->qlen); +} + /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize @@ -2026,7 +2038,7 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; - list->qlen--; + WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 321af97c7bbe..62c12cb5763e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -189,11 +189,17 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk) return unix_peer(osk) == NULL || unix_our_peer(sk, osk); } -static inline int unix_recvq_full(struct sock const *sk) +static inline int unix_recvq_full(const struct sock *sk) { return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; } +static inline int unix_recvq_full_lockless(const struct sock *sk) +{ + return skb_queue_len_lockless(&sk->sk_receive_queue) > + READ_ONCE(sk->sk_max_ack_backlog); +} + struct sock *unix_peer_get(struct sock *s) { struct sock *peer; @@ -1758,7 +1764,8 @@ restart_locked: * - unix_peer(sk) == sk by time of get but disconnected before lock */ if (other != sk && - unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + unlikely(unix_peer(other) != sk && + unix_recvq_full_lockless(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); From 7a02ea65027523386ab4ba4af0ab93497b3073df Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Feb 2020 14:53:30 +0300 Subject: [PATCH 082/147] net: sched: prevent a use after free The bug is that we call kfree_skb(skb) and then pass "skb" to qdisc_pkt_len(skb) on the next line, which is a use after free. Also Cong Wang points out that it's better to delay the actual frees until we drop the rtnl lock so we should use rtnl_kfree_skbs() instead of kfree_skb(). Cc: Cong Wang Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Signed-off-by: Dan Carpenter Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_fq_pie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index bbd0dea6b6b9..214657eb3dfd 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -349,9 +349,9 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit) { struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); - kfree_skb(skb); len_dropped += qdisc_pkt_len(skb); num_dropped += 1; + rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); From 21b5f672fb2eb1366dedc4ac9d32431146b378d3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 5 Feb 2020 21:22:46 +0100 Subject: [PATCH 083/147] r8169: fix performance regression related to PCIe max read request size It turned out that on low performance systems the original change can cause lower tx performance. On a N3450-based mini-PC tx performance in iperf3 was reduced from 950Mbps to ~900Mbps. Therefore effectively revert the original change, just use pcie_set_readrq() now instead of changing the PCIe capability register directly. Fixes: 2df49d365498 ("r8169: remove fiddling with the PCIe max read request size") Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index aaa316be6183..a2168a14794c 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2477,15 +2477,18 @@ static void rtl_hw_jumbo_enable(struct rtl8169_private *tp) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_12: case RTL_GIGA_MAC_VER_17: + pcie_set_readrq(tp->pci_dev, 512); r8168b_1_hw_jumbo_enable(tp); break; case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_26: + pcie_set_readrq(tp->pci_dev, 512); r8168c_hw_jumbo_enable(tp); break; case RTL_GIGA_MAC_VER_27 ... RTL_GIGA_MAC_VER_28: r8168dp_hw_jumbo_enable(tp); break; case RTL_GIGA_MAC_VER_31 ... RTL_GIGA_MAC_VER_33: + pcie_set_readrq(tp->pci_dev, 512); r8168e_hw_jumbo_enable(tp); break; default: @@ -2515,6 +2518,9 @@ static void rtl_hw_jumbo_disable(struct rtl8169_private *tp) break; } rtl_lock_config_regs(tp); + + if (pci_is_pcie(tp->pci_dev) && tp->supports_gmii) + pcie_set_readrq(tp->pci_dev, 4096); } static void rtl_jumbo_config(struct rtl8169_private *tp, int mtu) From 263a425a482fc495d6d3f9a29b9103a664c38b69 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 5 Feb 2020 12:32:04 -0800 Subject: [PATCH 084/147] net: systemport: Avoid RBUF stuck in Wake-on-LAN mode After a number of suspend and resume cycles, it is possible for the RBUF to be stuck in Wake-on-LAN mode, despite the MPD enable bit being cleared which instructed the RBUF to exit that mode. Avoid creating that problematic condition by clearing the RX_EN and TX_EN bits in the UniMAC prior to disable the Magic Packet Detector logic which is guaranteed to make the RBUF exit Wake-on-LAN mode. Fixes: 83e82f4c706b ("net: systemport: add Wake-on-LAN support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index f07ac0e0af59..e0611cba87f9 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2736,6 +2736,9 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) umac_reset(priv); + /* Disable the UniMAC RX/TX */ + umac_enable_set(priv, CMD_RX_EN | CMD_TX_EN, 0); + /* We may have been suspended and never received a WOL event that * would turn off MPD detection, take care of that now */ From 09c40b15351c1bb1af035f7f6df167366d552790 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 6 Feb 2020 13:55:19 +1000 Subject: [PATCH 085/147] cifs: fix soft mounts hanging in the reconnect code RHBZ: 1795423 This is the SMB1 version of a patch we already have for SMB2 In recent DFS updates we have a new variable controlling how many times we will retry to reconnect the share. If DFS is not used, then this variable is initialized to 0 in: static inline int dfs_cache_get_nr_tgts(const struct dfs_cache_tgt_list *tl) { return tl ? tl->tl_numtgts : 0; } This means that in the reconnect loop in smb2_reconnect() we will immediately wrap retries to -1 and never actually get to pass this conditional: if (--retries) continue; The effect is that we no longer reach the point where we fail the commands with -EHOSTDOWN and basically the kernel threads are virtually hung and unkillable. 
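The arithmetic behind the hang is easy to reproduce in a stand-alone sketch (illustrative only, not kernel code): with retries initialized to 0 the pre-decrement wraps to -1, which is still non-zero, so the give-up branch is never reached, while the fixed check short-circuits on 0:

    #include <stdio.h>

    int main(void)
    {
            int retries = 0;            /* dfs_cache_get_nr_tgts() returned 0 */

            if (--retries)              /* retries is now -1, which is "true" */
                    printf("old check: keep retrying forever (the hang)\n");

            retries = 0;
            if (retries && --retries)   /* fixed check: 0 short-circuits */
                    printf("not reached\n");
            else
                    printf("new check: fail the command with -EHOSTDOWN\n");
            return 0;
    }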
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Reviewed-by: Paulo Alcantara (SUSE) --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a481296f417f..3c89569e7210 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -260,7 +260,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) if (server->tcpStatus != CifsNeedReconnect) break; - if (--retries) + if (retries && --retries) continue; /* From 343a1b777a92af3022c95ab6c188bef4a6e9ff97 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 6 Feb 2020 10:19:11 +0100 Subject: [PATCH 086/147] cifs: make multichannel warning more visible When no interfaces are returned by the server we cannot open multiple channels. Make it more obvious by reporting that to the user at the VFS log level. Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index f0795c856d8f..43a88e26d26b 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -101,7 +101,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) iface_count = ses->iface_count; if (iface_count <= 0) { spin_unlock(&ses->iface_lock); - cifs_dbg(FYI, "no iface list available to open channels\n"); + cifs_dbg(VFS, "no iface list available to open channels\n"); return 0; } ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), From d26c2ddd33569667e3eeb577c4c1d966ca9192e2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 6 Feb 2020 06:00:14 -0600 Subject: [PATCH 087/147] cifs: add SMB3 change notification support A commonly used SMB3 feature is change notification, allowing an app to be notified about changes to a directory. The SMB3 Notify request blocks until the server detects a change to that directory or its contents that matches the completion flags that were passed in and the "watch_tree" flag (which indicates whether subdirectories under this directory should be also included). See MS-SMB2 2.2.35 for additional detail. To use this simply pass in the following structure to ioctl: struct __attribute__((__packed__)) smb3_notify { uint32_t completion_filter; bool watch_tree; } __packed; using CIFS_IOC_NOTIFY 0x4005cf09 or equivalently _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) SMB3 change notification is supported by all major servers. The ioctl will block until the server detects a change to that directory or its subdirectories (if watch_tree is set). 
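A minimal user-space caller of the new ioctl might look like the sketch below; it is illustrative only (the mount point and the completion filter value are assumptions), while the packed structure and the ioctl number are the ones defined by the patch:

    #include <fcntl.h>
    #include <linux/ioctl.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    struct __attribute__((__packed__)) smb3_notify {
            uint32_t completion_filter;
            bool watch_tree;
    };

    #define CIFS_IOCTL_MAGIC 0xCF
    #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)

    int main(void)
    {
            struct smb3_notify notify = {
                    .completion_filter = 0x3, /* example: file + dir name changes */
                    .watch_tree = true,       /* also watch subdirectories */
            };
            int fd = open("/mnt/cifs/some-dir", O_RDONLY | O_DIRECTORY);

            if (fd < 0)
                    return 1;
            if (ioctl(fd, CIFS_IOC_NOTIFY, &notify) == 0) /* blocks until a change */
                    printf("directory changed\n");
            close(fd);
            return 0;
    }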
Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Acked-by: Paulo Alcantara (SUSE) --- fs/cifs/cifs_ioctl.h | 6 +++++ fs/cifs/cifsglob.h | 2 ++ fs/cifs/ioctl.c | 16 ++++++++++++ fs/cifs/smb2ops.c | 62 ++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 1 + 5 files changed, 87 insertions(+) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 0f0dc1c1fe41..153d5c842a9b 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -65,6 +65,11 @@ struct smb3_key_debug_info { __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; +struct smb3_notify { + __u32 completion_filter; + bool watch_tree; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) @@ -72,3 +77,4 @@ struct smb3_key_debug_info { #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) +#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1205041fd966..de82cfa44b1a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -431,6 +431,8 @@ struct smb_version_operations { struct cifsFileInfo *src_file); int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *src_file, void __user *); + int (*notify)(const unsigned int xid, struct file *pfile, + void __user *pbuf); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index e4c935026d5e..4a73e63c4d43 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -169,6 +169,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; __u64 ExtAttrBits = 0; __u64 caps; @@ -299,6 +300,21 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = 0; break; + case CIFS_IOC_NOTIFY: + if (!S_ISDIR(inode->i_mode)) { + /* Notify can only be done on directories */ + rc = -EOPNOTSUPP; + break; + } + cifs_sb = CIFS_SB(inode->i_sb); + tcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); + if (tcon && tcon->ses->server->ops->notify) { + rc = tcon->ses->server->ops->notify(xid, + filep, (void __user *)arg); + cifs_dbg(FYI, "ioctl notify rc %d\n", rc); + } else + rc = -EOPNOTSUPP; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ac6628e28b12..baa825f4cec0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2045,6 +2045,66 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, return rc; } + + +static int +smb3_notify(const unsigned int xid, struct file *pfile, + void __user *ioc_buf) +{ + struct smb3_notify notify; + struct dentry *dentry = pfile->f_path.dentry; + struct inode *inode = file_inode(pfile); + struct cifs_sb_info *cifs_sb; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct cifs_tcon *tcon; + unsigned char *path = NULL; + __le16 *utf16_path = NULL; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + int rc = 0; + + path = build_path_from_dentry(dentry); + if (path == NULL) + return -ENOMEM; + + cifs_sb = CIFS_SB(inode->i_sb); + + utf16_path = cifs_convert_path_to_utf16(path + 1, 
cifs_sb); + if (utf16_path == NULL) { + rc = -ENOMEM; + goto notify_exit; + } + + if (copy_from_user(&notify, ioc_buf, sizeof(struct smb3_notify))) { + rc = -EFAULT; + goto notify_exit; + } + + tcon = cifs_sb_master_tcon(cifs_sb); + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = cifs_create_options(cifs_sb, 0); + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); + if (rc) + goto notify_exit; + + rc = SMB2_change_notify(xid, tcon, fid.persistent_fid, fid.volatile_fid, + notify.watch_tree, notify.completion_filter); + + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + + cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); + +notify_exit: + kfree(path); + kfree(utf16_path); + return rc; +} + static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, @@ -4841,6 +4901,7 @@ struct smb_version_operations smb30_operations = { .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, .enum_snapshots = smb3_enum_snapshots, + .notify = smb3_notify, .init_transform_rq = smb3_init_transform_rq, .is_transform_hdr = smb3_is_transform_hdr, .receive_transform = smb3_receive_transform, @@ -4951,6 +5012,7 @@ struct smb_version_operations smb311_operations = { .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, .enum_snapshots = smb3_enum_snapshots, + .notify = smb3_notify, .init_transform_rq = smb3_init_transform_rq, .is_transform_hdr = smb3_is_transform_hdr, .receive_transform = smb3_receive_transform, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1a732ff71de4..47cce0bd1afe 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3363,6 +3363,7 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + /* See note 354 of MS-SMB2, 64K max */ req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); req->CompletionFilter = cpu_to_le32(completion_filter); From cc95b6772790c476b2c2de09abec739120593d98 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 6 Feb 2020 13:49:26 +0100 Subject: [PATCH 088/147] cifs: fix channel signing The server var was accidentally used as an iterator over the global list of connections, thus overwriting the passed argument. This resulted in the wrong signing key being returned for extra channels. Fix this by using a separate var to iterate.
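The underlying hazard is the general one of reusing a function parameter as a list-iteration cursor; a stand-alone sketch (illustrative only, using plain C lists rather than the kernel's list_for_each_entry()) shows how the caller's pointer is silently lost:

    #include <stdio.h>

    struct node { int id; struct node *next; };

    /* Buggy: the 'server' argument doubles as the loop cursor, so after the
     * walk it no longer points at what the caller passed in (here: NULL). */
    static void walk_buggy(struct node *server, struct node *head)
    {
            for (server = head; server; server = server->next)
                    ;
            printf("buggy: server is now %p\n", (void *)server);
    }

    /* Fixed: iterate with a separate variable and leave 'server' untouched. */
    static void walk_fixed(struct node *server, struct node *head)
    {
            struct node *it;

            for (it = head; it; it = it->next)
                    ;
            printf("fixed: server still %p\n", (void *)server);
    }

    int main(void)
    {
            struct node b = { 2, NULL }, a = { 1, &b }, srv = { 9, NULL };

            walk_buggy(&srv, &a);
            walk_fixed(&srv, &a);
            return 0;
    }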
Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2transport.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index fe6acfce3390..08b703b7a15e 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -104,13 +104,14 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { struct cifs_chan *chan; struct cifs_ses *ses = NULL; + struct TCP_Server_Info *it = NULL; int i; int rc = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(it, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &it->smb_ses_list, smb_ses_list) { if (ses->Suid == ses_id) goto found; } From c1948390d78b5183ee9b7dd831efd7f6ac496ab0 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 27 Jan 2020 09:27:51 +0200 Subject: [PATCH 089/147] net/mlx5: Fix deadlock in fs_core free_match_list could be called when the flow table is already locked. We need to pass this notation to tree_put_node. It fixes the following lockdep warnning: [ 1797.268537] ============================================ [ 1797.276837] WARNING: possible recursive locking detected [ 1797.285101] 5.5.0-rc5+ #10 Not tainted [ 1797.291641] -------------------------------------------- [ 1797.299917] handler10/9296 is trying to acquire lock: [ 1797.307885] ffff889ad399a0a0 (&node->lock){++++}, at: tree_put_node+0x1d5/0x210 [mlx5_core] [ 1797.319694] [ 1797.319694] but task is already holding lock: [ 1797.330904] ffff889ad399a0a0 (&node->lock){++++}, at: nested_down_write_ref_node.part.33+0x1a/0x60 [mlx5_core] [ 1797.344707] [ 1797.344707] other info that might help us debug this: [ 1797.356952] Possible unsafe locking scenario: [ 1797.356952] [ 1797.368333] CPU0 [ 1797.373357] ---- [ 1797.378364] lock(&node->lock); [ 1797.384222] lock(&node->lock); [ 1797.390031] [ 1797.390031] *** DEADLOCK *** [ 1797.390031] [ 1797.403003] May be due to missing lock nesting notation [ 1797.403003] [ 1797.414691] 3 locks held by handler10/9296: [ 1797.421465] #0: ffff889cf2c5a110 (&block->cb_lock){++++}, at: tc_setup_cb_add+0x70/0x250 [ 1797.432810] #1: ffff88a030081490 (&comp->sem){++++}, at: mlx5_devcom_get_peer_data+0x4c/0xb0 [mlx5_core] [ 1797.445829] #2: ffff889ad399a0a0 (&node->lock){++++}, at: nested_down_write_ref_node.part.33+0x1a/0x60 [mlx5_core] [ 1797.459913] [ 1797.459913] stack backtrace: [ 1797.469436] CPU: 1 PID: 9296 Comm: handler10 Kdump: loaded Not tainted 5.5.0-rc5+ #10 [ 1797.480643] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 [ 1797.491480] Call Trace: [ 1797.496701] dump_stack+0x96/0xe0 [ 1797.502864] __lock_acquire.cold.63+0xf8/0x212 [ 1797.510301] ? lockdep_hardirqs_on+0x250/0x250 [ 1797.517701] ? mark_held_locks+0x55/0xa0 [ 1797.524547] ? quarantine_put+0xb7/0x160 [ 1797.531422] ? lockdep_hardirqs_on+0x17d/0x250 [ 1797.538913] lock_acquire+0xd6/0x1f0 [ 1797.545529] ? tree_put_node+0x1d5/0x210 [mlx5_core] [ 1797.553701] down_write+0x94/0x140 [ 1797.560206] ? tree_put_node+0x1d5/0x210 [mlx5_core] [ 1797.568464] ? down_write_killable_nested+0x170/0x170 [ 1797.576925] ? del_hw_flow_group+0xde/0x1f0 [mlx5_core] [ 1797.585629] tree_put_node+0x1d5/0x210 [mlx5_core] [ 1797.593891] ? 
free_match_list.part.25+0x147/0x170 [mlx5_core] [ 1797.603389] free_match_list.part.25+0xe0/0x170 [mlx5_core] [ 1797.612654] _mlx5_add_flow_rules+0x17e2/0x20b0 [mlx5_core] [ 1797.621838] ? lock_acquire+0xd6/0x1f0 [ 1797.629028] ? esw_get_prio_table+0xb0/0x3e0 [mlx5_core] [ 1797.637981] ? alloc_insert_flow_group+0x420/0x420 [mlx5_core] [ 1797.647459] ? try_to_wake_up+0x4c7/0xc70 [ 1797.654881] ? lock_downgrade+0x350/0x350 [ 1797.662271] ? __mutex_unlock_slowpath+0xb1/0x3f0 [ 1797.670396] ? find_held_lock+0xac/0xd0 [ 1797.677540] ? mlx5_add_flow_rules+0xdc/0x360 [mlx5_core] [ 1797.686467] mlx5_add_flow_rules+0xdc/0x360 [mlx5_core] [ 1797.695134] ? _mlx5_add_flow_rules+0x20b0/0x20b0 [mlx5_core] [ 1797.704270] ? irq_exit+0xa5/0x170 [ 1797.710764] ? retint_kernel+0x10/0x10 [ 1797.717698] ? mlx5_eswitch_set_rule_source_port.isra.9+0x122/0x230 [mlx5_core] [ 1797.728708] mlx5_eswitch_add_offloaded_rule+0x465/0x6d0 [mlx5_core] [ 1797.738713] ? mlx5_eswitch_get_prio_range+0x30/0x30 [mlx5_core] [ 1797.748384] ? mlx5_fc_stats_work+0x670/0x670 [mlx5_core] [ 1797.757400] mlx5e_tc_offload_fdb_rules.isra.27+0x24/0x90 [mlx5_core] [ 1797.767665] mlx5e_tc_add_fdb_flow+0xaf8/0xd40 [mlx5_core] [ 1797.776886] ? mlx5e_encap_put+0xd0/0xd0 [mlx5_core] [ 1797.785562] ? mlx5e_alloc_flow.isra.43+0x18c/0x1c0 [mlx5_core] [ 1797.795353] __mlx5e_add_fdb_flow+0x2e2/0x440 [mlx5_core] [ 1797.804558] ? mlx5e_tc_update_neigh_used_value+0x8c0/0x8c0 [mlx5_core] [ 1797.815093] ? wait_for_completion+0x260/0x260 [ 1797.823272] mlx5e_configure_flower+0xe94/0x1620 [mlx5_core] [ 1797.832792] ? __mlx5e_add_fdb_flow+0x440/0x440 [mlx5_core] [ 1797.842096] ? down_read+0x11a/0x2e0 [ 1797.849090] ? down_write+0x140/0x140 [ 1797.856142] ? mlx5e_rep_indr_setup_block_cb+0xc0/0xc0 [mlx5_core] [ 1797.866027] tc_setup_cb_add+0x11a/0x250 [ 1797.873339] fl_hw_replace_filter+0x25e/0x320 [cls_flower] [ 1797.882385] ? fl_hw_destroy_filter+0x1c0/0x1c0 [cls_flower] [ 1797.891607] fl_change+0x1d54/0x1fb6 [cls_flower] [ 1797.899772] ? __rhashtable_insert_fast.constprop.50+0x9f0/0x9f0 [cls_flower] [ 1797.910728] ? lock_downgrade+0x350/0x350 [ 1797.918187] ? __radix_tree_lookup+0xa5/0x130 [ 1797.926046] ? fl_set_key+0x1590/0x1590 [cls_flower] [ 1797.934611] ? __rhashtable_insert_fast.constprop.50+0x9f0/0x9f0 [cls_flower] [ 1797.945673] tc_new_tfilter+0xcd1/0x1240 [ 1797.953138] ? tc_del_tfilter+0xb10/0xb10 [ 1797.960688] ? avc_has_perm_noaudit+0x92/0x320 [ 1797.968721] ? avc_has_perm_noaudit+0x1df/0x320 [ 1797.976816] ? avc_has_extended_perms+0x990/0x990 [ 1797.985090] ? mark_lock+0xaa/0x9e0 [ 1797.991988] ? match_held_lock+0x1b/0x240 [ 1797.999457] ? match_held_lock+0x1b/0x240 [ 1798.006859] ? find_held_lock+0xac/0xd0 [ 1798.014045] ? symbol_put_addr+0x40/0x40 [ 1798.021317] ? rcu_read_lock_sched_held+0xd0/0xd0 [ 1798.029460] ? tc_del_tfilter+0xb10/0xb10 [ 1798.036810] rtnetlink_rcv_msg+0x4d5/0x620 [ 1798.044236] ? rtnl_bridge_getlink+0x460/0x460 [ 1798.052034] ? lockdep_hardirqs_on+0x250/0x250 [ 1798.059837] ? match_held_lock+0x1b/0x240 [ 1798.067146] ? find_held_lock+0xac/0xd0 [ 1798.074246] netlink_rcv_skb+0xc6/0x1f0 [ 1798.081339] ? rtnl_bridge_getlink+0x460/0x460 [ 1798.089104] ? netlink_ack+0x440/0x440 [ 1798.096061] netlink_unicast+0x2d4/0x3b0 [ 1798.103189] ? netlink_attachskb+0x3f0/0x3f0 [ 1798.110724] ? _copy_from_iter_full+0xda/0x370 [ 1798.118415] netlink_sendmsg+0x3ba/0x6a0 [ 1798.125478] ? netlink_unicast+0x3b0/0x3b0 [ 1798.132705] ? 
netlink_unicast+0x3b0/0x3b0 [ 1798.139880] sock_sendmsg+0x94/0xa0 [ 1798.146332] ____sys_sendmsg+0x36c/0x3f0 [ 1798.153251] ? copy_msghdr_from_user+0x165/0x230 [ 1798.160941] ? kernel_sendmsg+0x30/0x30 [ 1798.167738] ___sys_sendmsg+0xeb/0x150 [ 1798.174411] ? sendmsg_copy_msghdr+0x30/0x30 [ 1798.181649] ? lock_downgrade+0x350/0x350 [ 1798.188559] ? rcu_read_lock_sched_held+0xd0/0xd0 [ 1798.196239] ? __fget+0x21d/0x320 [ 1798.202335] ? do_dup2+0x2a0/0x2a0 [ 1798.208499] ? lock_downgrade+0x350/0x350 [ 1798.215366] ? __fget_light+0xd6/0xf0 [ 1798.221808] ? syscall_trace_enter+0x369/0x5d0 [ 1798.229112] __sys_sendmsg+0xd3/0x160 [ 1798.235511] ? __sys_sendmsg_sock+0x60/0x60 [ 1798.242478] ? syscall_trace_enter+0x233/0x5d0 [ 1798.249721] ? syscall_slow_exit_work+0x280/0x280 [ 1798.257211] ? do_syscall_64+0x1e/0x2e0 [ 1798.263680] do_syscall_64+0x72/0x2e0 [ 1798.269950] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: bd71b08ec2ee ("net/mlx5: Support multiple updates of steering rules in parallel") Signed-off-by: Maor Gottlieb Signed-off-by: Alaa Hleihel Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index c7a16ae05fa8..9dc24241dc91 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1582,16 +1582,16 @@ struct match_list_head { struct match_list first; }; -static void free_match_list(struct match_list_head *head) +static void free_match_list(struct match_list_head *head, bool ft_locked) { if (!list_empty(&head->list)) { struct match_list *iter, *match_tmp; list_del(&head->first.list); - tree_put_node(&head->first.g->node, false); + tree_put_node(&head->first.g->node, ft_locked); list_for_each_entry_safe(iter, match_tmp, &head->list, list) { - tree_put_node(&iter->g->node, false); + tree_put_node(&iter->g->node, ft_locked); list_del(&iter->list); kfree(iter); } @@ -1600,7 +1600,8 @@ static void free_match_list(struct match_list_head *head) static int build_match_list(struct match_list_head *match_head, struct mlx5_flow_table *ft, - const struct mlx5_flow_spec *spec) + const struct mlx5_flow_spec *spec, + bool ft_locked) { struct rhlist_head *tmp, *list; struct mlx5_flow_group *g; @@ -1625,7 +1626,7 @@ static int build_match_list(struct match_list_head *match_head, curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC); if (!curr_match) { - free_match_list(match_head); + free_match_list(match_head, ft_locked); err = -ENOMEM; goto out; } @@ -1805,7 +1806,7 @@ search_again_locked: version = atomic_read(&ft->node.version); /* Collect all fgs which has a matching match_criteria */ - err = build_match_list(&match_head, ft, spec); + err = build_match_list(&match_head, ft, spec, take_write); if (err) { if (take_write) up_write_ref_node(&ft->node, false); @@ -1819,7 +1820,7 @@ search_again_locked: rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest, dest_num, version); - free_match_list(&match_head); + free_match_list(&match_head, take_write); if (!IS_ERR(rule) || (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) { if (take_write) From 0dc2c534f17c05bed0622b37a744bc38b48ca88a Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 24 Dec 2019 09:54:45 +0200 Subject: [PATCH 090/147] net/mlx5: IPsec, Fix esp modify function attribute The function 
mlx5_fpga_esp_validate_xfrm_attrs is wrongly used with a logical negation: a zero return value indicates success, but the call site treats it as a failure. Fix by removing the unary not (!) negation operator. Fixes: 05564d0ae075 ("net/mlx5: Add flow-steering commands for FPGA IPSec implementation") Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index e4ec0e03c289..4ed4d4d8e073 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -1478,7 +1478,7 @@ int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, if (!memcmp(&xfrm->attrs, attrs, sizeof(xfrm->attrs))) return 0; - if (!mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) { + if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) { mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n"); return -EOPNOTSUPP; } From 08db2cf577487f5123aebcc2f913e0b8a2c14b43 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Wed, 23 Oct 2019 16:41:21 +0300 Subject: [PATCH 091/147] net/mlx5: IPsec, fix memory leak at mlx5_fpga_ipsec_delete_sa_ctx SA context is allocated at mlx5_fpga_ipsec_create_sa_ctx, however the counterpart mlx5_fpga_ipsec_delete_sa_ctx function nullifies the sa_ctx pointer without freeing the memory allocated, hence the memory leak. Fix by freeing the SA context when the SA is released. Fixes: d6c4f0298cec ("net/mlx5: Refactor accel IPSec code") Signed-off-by: Raed Salem Reviewed-by: Boris Pismenny Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index 4ed4d4d8e073..4c61d25d2e88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -850,6 +850,7 @@ void mlx5_fpga_ipsec_delete_sa_ctx(void *context) mutex_lock(&fpga_xfrm->lock); if (!--fpga_xfrm->num_rules) { mlx5_fpga_ipsec_release_sa_ctx(fpga_xfrm->sa_ctx); + kfree(fpga_xfrm->sa_ctx); fpga_xfrm->sa_ctx = NULL; } mutex_unlock(&fpga_xfrm->lock); From b57e66ad42d051ed31319c28ed1b62b191299a29 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 9 Jan 2020 15:53:37 +0200 Subject: [PATCH 092/147] net/mlx5e: TX, Error completion is for last WQE in batch For a cyclic work queue, when not requesting a completion per WQE, a single CQE might indicate the completion of several WQEs. However, in case some WQE in the batch causes an error, then an error completion is issued, breaking the batch, and pointing to the offending WQE in the wqe_counter field. Hence, WQE-specific error CQE handling (like printing, breaking, etc...) should be performed only for the last WQE in batch.
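The batching rule can be simulated in a few lines of user space (an illustrative model only, with made-up numbers rather than the driver's real data structures): one completion covers every WQE up to cqe->wqe_counter, so the error-specific handling belongs to the final iteration of the drain loop:

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int sqcc = 0;          /* SQ consumer counter */
            unsigned int wqe_counter = 3;   /* this CQE completes WQEs 0..3 */
            bool cqe_is_error = true;       /* pretend it is an error CQE */
            bool last_wqe;

            do {
                    last_wqe = (sqcc == wqe_counter);
                    if (last_wqe && cqe_is_error)
                            printf("dump/recover only for offending WQE %u\n", sqcc);
                    printf("complete WQE %u\n", sqcc);
                    sqcc++;
            } while (!last_wqe);
            return 0;
    }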
Fixes: 130c7b46c93d ("net/mlx5e: TX, Dump WQs wqe descriptors on CQE with error events") Fixes: fd9b4be8002c ("net/mlx5e: RX, Support multiple outstanding UMR posts") Signed-off-by: Tariq Toukan Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 16 +++++---- .../net/ethernet/mellanox/mlx5/core/en_tx.c | 33 ++++++++----------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 9e9960146e5b..1c3ab69cbd96 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -613,13 +613,6 @@ void mlx5e_poll_ico_cq(struct mlx5e_cq *cq) wqe_counter = be16_to_cpu(cqe->wqe_counter); - if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { - netdev_WARN_ONCE(cq->channel->netdev, - "Bad OP in ICOSQ CQE: 0x%x\n", get_cqe_opcode(cqe)); - if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) - queue_work(cq->channel->priv->wq, &sq->recover_work); - break; - } do { struct mlx5e_sq_wqe_info *wi; u16 ci; @@ -629,6 +622,15 @@ void mlx5e_poll_ico_cq(struct mlx5e_cq *cq) ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); wi = &sq->db.ico_wqe[ci]; + if (last_wqe && unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { + netdev_WARN_ONCE(cq->channel->netdev, + "Bad OP in ICOSQ CQE: 0x%x\n", + get_cqe_opcode(cqe)); + if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) + queue_work(cq->channel->priv->wq, &sq->recover_work); + break; + } + if (likely(wi->opcode == MLX5_OPCODE_UMR)) { sqcc += MLX5E_UMR_WQEBBS; wi->umr.rq->mpwqe.umr_completed++; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 2565ba8692d9..ee60383adc5b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -451,34 +451,17 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) i = 0; do { + struct mlx5e_tx_wqe_info *wi; u16 wqe_counter; bool last_wqe; + u16 ci; mlx5_cqwq_pop(&cq->wq); wqe_counter = be16_to_cpu(cqe->wqe_counter); - if (unlikely(get_cqe_opcode(cqe) == MLX5_CQE_REQ_ERR)) { - if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, - &sq->state)) { - struct mlx5e_tx_wqe_info *wi; - u16 ci; - - ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); - wi = &sq->db.wqe_info[ci]; - mlx5e_dump_error_cqe(sq, - (struct mlx5_err_cqe *)cqe); - mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); - queue_work(cq->channel->priv->wq, - &sq->recover_work); - } - stats->cqe_err++; - } - do { - struct mlx5e_tx_wqe_info *wi; struct sk_buff *skb; - u16 ci; int j; last_wqe = (sqcc == wqe_counter); @@ -516,6 +499,18 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) napi_consume_skb(skb, napi_budget); } while (!last_wqe); + if (unlikely(get_cqe_opcode(cqe) == MLX5_CQE_REQ_ERR)) { + if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, + &sq->state)) { + mlx5e_dump_error_cqe(sq, + (struct mlx5_err_cqe *)cqe); + mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); + queue_work(cq->channel->priv->wq, + &sq->recover_work); + } + stats->cqe_err++; + } + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); stats->cqes += i; From 61c00cca41aeeaa8e5263c2f81f28534bc1efafb Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 27 Jan 2020 14:18:14 +0200 Subject: [PATCH 093/147] net/mlx5: Deprecate usage of generic TLS HW capability bit Deprecate the generic TLS cap bit, use the new TX-specific TLS cap bit 
instead. Fixes: a12ff35e0fb7 ("net/mlx5: Introduce TLS TX offload hardware bits and structures") Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h index d787bc0a4155..e09bc3858d57 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h @@ -45,7 +45,7 @@ void mlx5_ktls_destroy_key(struct mlx5_core_dev *mdev, u32 key_id); static inline bool mlx5_accel_is_ktls_device(struct mlx5_core_dev *mdev) { - if (!MLX5_CAP_GEN(mdev, tls)) + if (!MLX5_CAP_GEN(mdev, tls_tx)) return false; if (!MLX5_CAP_GEN(mdev, log_max_dek)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c index 71384ad1a443..ef1ed15a53b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c @@ -269,7 +269,7 @@ struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev, int datalen; u32 skb_seq; - if (MLX5_CAP_GEN(sq->channel->mdev, tls)) { + if (MLX5_CAP_GEN(sq->channel->mdev, tls_tx)) { skb = mlx5e_ktls_handle_tx_skb(netdev, sq, skb, wqe, pi); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index d89ff1d09119..909a7f284614 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -242,7 +242,7 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } - if (MLX5_CAP_GEN(dev, tls)) { + if (MLX5_CAP_GEN(dev, tls_tx)) { err = mlx5_core_get_caps(dev, MLX5_CAP_TLS); if (err) return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 032cd6630720..ff8c9d527bb4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1448,14 +1448,15 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_440[0x20]; - u8 tls[0x1]; - u8 reserved_at_461[0x2]; + u8 reserved_at_460[0x3]; u8 log_max_uctx[0x5]; u8 reserved_at_468[0x3]; u8 log_max_umem[0x5]; u8 max_num_eqs[0x10]; - u8 reserved_at_480[0x3]; + u8 reserved_at_480[0x1]; + u8 tls_tx[0x1]; + u8 reserved_at_482[0x1]; u8 log_max_l2_table[0x5]; u8 reserved_at_488[0x8]; u8 log_uar_page_sz[0x10]; From e3e056c35108661e418c803adfc054bf683426e7 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 6 Feb 2020 18:16:55 +0100 Subject: [PATCH 094/147] cifs: fix mode bits from dir listing when mounted with modefromsid When mounting with -o modefromsid, the mode bits are stored in an ACE. Directory enumeration (e.g. ls -l /mnt) triggers an SMB Query Dir which does not include ACEs in its response. The mode bits in this case are silently set to a default value of 755 instead. This patch marks the dentry created during the directory enumeration as needing re-evaluation (i.e. additional Query Info with ACEs) so that the mode bits can be properly extracted. 
Quick repro: $ mount.cifs //win19.test/data /mnt -o ...,modefromsid $ touch /mnt/foo && chmod 751 /mnt/foo $ stat /mnt/foo # reports 751 (OK) $ sleep 2 # dentry older than 1s by default get invalidated $ ls -l /mnt # since dentry invalid, ls does a Query Dir # and reports foo as 755 (WRONG) Signed-off-by: Aurelien Aptel Signed-off-by: Steve French CC: Stable Reviewed-by: Pavel Shilovsky --- fs/cifs/readdir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index d17587c2c4ab..ba9dadf3be24 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -196,7 +196,8 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * may look wrong since the inodes may not have timed out by the time * "ls" does a stat() call on them. */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && From 2391ca41b476078da5cf72c2fe82ae9f03da8b38 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 6 Feb 2020 16:04:59 -0600 Subject: [PATCH 095/147] smb3: add one more dynamic tracepoint missing from strict fsync path We didn't have a dynamic trace point for catching errors in file_write_and_wait_range error cases in cifs_strict_fsync. Since not all apps check for write behind errors, it can be important for debugging to be able to trace these error paths. Suggested-and-reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 99ea7b2a06a5..bc9516ab4b34 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2593,8 +2593,10 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); rc = file_write_and_wait_range(file, start, end); - if (rc) + if (rc) { + trace_cifs_fsync_err(inode->i_ino, rc); return rc; + } xid = get_xid(); From ab3459d8f0ef52c38119ed58c4c29139efc7022c Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 6 Feb 2020 17:31:56 -0600 Subject: [PATCH 096/147] smb3: print warning once if posix context returned on open SMB3.1.1 POSIX Context processing is not complete yet - so print warning (once) if server returns it on open. 
Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/smb2pdu.c | 22 ++++++++++++++++++++++ fs/cifs/smb2pdu.h | 8 ++++++++ 2 files changed, 30 insertions(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 47cce0bd1afe..1234f9ccab03 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1939,6 +1939,16 @@ parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf) buf->IndexNumber = pdisk_id->DiskFileId; } +static void +parse_posix_ctxt(struct create_context *cc, struct smb_posix_info *pposix_inf) +{ + /* struct smb_posix_info *ppinf = (struct smb_posix_info *)cc; */ + + /* TODO: Need to add parsing for the context and return */ + printk_once(KERN_WARNING + "SMB3 3.11 POSIX response context not completed yet\n"); +} + void smb2_parse_contexts(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, @@ -1950,6 +1960,9 @@ smb2_parse_contexts(struct TCP_Server_Info *server, unsigned int next; unsigned int remaining; char *name; + const char smb3_create_tag_posix[] = {0x93, 0xAD, 0x25, 0x50, 0x9C, + 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, + 0xDE, 0x96, 0x8B, 0xCD, 0x7C}; *oplock = 0; data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); @@ -1969,6 +1982,15 @@ smb2_parse_contexts(struct TCP_Server_Info *server, else if (buf && (le16_to_cpu(cc->NameLength) == 4) && strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0) parse_query_id_ctxt(cc, buf); + else if ((le16_to_cpu(cc->NameLength) == 16)) { + if (memcmp(name, smb3_create_tag_posix, 16) == 0) + parse_posix_ctxt(cc, NULL); + } + /* else { + cifs_dbg(FYI, "Context not matched with len %d\n", + le16_to_cpu(cc->NameLength)); + cifs_dump_mem("Cctxt name: ", name, 4); + } */ next = le32_to_cpu(cc->Next); if (!next) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4c43dbd1e089..ca2123d7f199 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1595,4 +1595,12 @@ struct smb2_file_network_open_info { extern char smb2_padding[7]; +/* equivalent of the contents of SMB3.1.1 POSIX open context response */ +struct smb_posix_info { + __le32 nlink; + __le32 reparse_tag; + __le32 mode; + kuid_t uid; + kuid_t gid; +}; #endif /* _SMB2PDU_H */ From 51d92d69f77b121d8093af82eb3bd2e8e631be55 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 6 Feb 2020 12:32:03 -0600 Subject: [PATCH 097/147] smb3: Add defines for new information level, FileIdInformation See MS-FSCC 2.4.43. Valid to be queried from most Windows servers (among others). 
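For orientation (not part of the patch): MS-FSCC 2.4.43 lays out FILE_ID_INFORMATION as an 8-byte VolumeSerialNumber followed by a 16-byte FileId, 24 bytes in total, which is what the packed structure added below encodes. A hypothetical compile-time sanity check, shown only as a sketch, would be:

	/* Hypothetical check; struct smb2_file_id_information is added below. */
	BUILD_BUG_ON(sizeof(struct smb2_file_id_information) != 24);
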
Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/smb2pdu.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ca2123d7f199..fa03df130f1a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1519,6 +1519,7 @@ struct smb3_fs_vol_info { #define FILE_NORMALIZED_NAME_INFORMATION 48 #define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 #define FILE_STANDARD_LINK_INFORMATION 54 +#define FILE_ID_INFORMATION 59 struct smb2_file_internal_info { __le64 IndexNumber; @@ -1593,6 +1594,13 @@ struct smb2_file_network_open_info { __le32 Reserved; } __packed; /* level 34 Query also similar returned in close rsp and open rsp */ +/* See MS-FSCC 2.4.43 */ +struct smb2_file_id_information { + __le64 VolumeSerialNumber; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ +} __packed; /* level 59 */ + extern char smb2_padding[7]; /* equivalent of the contents of SMB3.1.1 POSIX open context response */ From 8dcc1a9d90c10fa4143e5c17821082e5e60e46a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 25 Dec 2019 16:07:44 +0900 Subject: [PATCH 098/147] fs: New zonefs file system zonefs is a very simple file system exposing each zone of a zoned block device as a file. Unlike a regular file system with zoned block device support (e.g. f2fs), zonefs does not hide the sequential write constraint of zoned block devices to the user. Files representing sequential write zones of the device must be written sequentially starting from the end of the file (append only writes). As such, zonefs is in essence closer to a raw block device access interface than to a full featured POSIX file system. The goal of zonefs is to simplify the implementation of zoned block device support in applications by replacing raw block device file accesses with a richer file API, avoiding relying on direct block device file ioctls which may be more obscure to developers. One example of this approach is the implementation of LSM (log-structured merge) tree structures (such as used in RocksDB and LevelDB) on zoned block devices by allowing SSTables to be stored in a zone file similarly to a regular file system rather than as a range of sectors of a zoned device. The introduction of the higher level construct "one file is one zone" can help reducing the amount of changes needed in the application as well as introducing support for different application programming languages. Zonefs on-disk metadata is reduced to an immutable super block to persistently store a magic number and optional feature flags and values. On mount, zonefs uses blkdev_report_zones() to obtain the device zone configuration and populates the mount point with a static file tree solely based on this information. E.g. file sizes come from the device zone type and write pointer offset managed by the device itself. The zone files created on mount have the following characteristics. 1) Files representing zones of the same type are grouped together under a common sub-directory: * For conventional zones, the sub-directory "cnv" is used. * For sequential write zones, the sub-directory "seq" is used. These two directories are the only directories that exist in zonefs. Users cannot create other directories and cannot rename nor delete the "cnv" and "seq" sub-directories. 2) The name of zone files is the number of the file within the zone type sub-directory, in order of increasing zone start sector. 3) The size of conventional zone files is fixed to the device zone size. 
Conventional zone files cannot be truncated. 4) The size of sequential zone files represent the file's zone write pointer position relative to the zone start sector. Truncating these files is allowed only down to 0, in which case, the zone is reset to rewind the zone write pointer position to the start of the zone, or up to the zone size, in which case the file's zone is transitioned to the FULL state (finish zone operation). 5) All read and write operations to files are not allowed beyond the file zone size. Any access exceeding the zone size is failed with the -EFBIG error. 6) Creating, deleting, renaming or modifying any attribute of files and sub-directories is not allowed. 7) There are no restrictions on the type of read and write operations that can be issued to conventional zone files. Buffered, direct and mmap read & write operations are accepted. For sequential zone files, there are no restrictions on read operations, but all write operations must be direct IO append writes. mmap write of sequential files is not allowed. Several optional features of zonefs can be enabled at format time. * Conventional zone aggregation: ranges of contiguous conventional zones can be aggregated into a single larger file instead of the default one file per zone. * File ownership: The owner UID and GID of zone files is by default 0 (root) but can be changed to any valid UID/GID. * File access permissions: the default 640 access permissions can be changed. The mkzonefs tool is used to format zoned block devices for use with zonefs. This tool is available on Github at: git@github.com:damien-lemoal/zonefs-tools.git. zonefs-tools also includes a test suite which can be run against any zoned block device, including null_blk block device created with zoned mode. Example: the following formats a 15TB host-managed SMR HDD with 256 MB zones with the conventional zones aggregation feature enabled. $ sudo mkzonefs -o aggr_cnv /dev/sdX $ sudo mount -t zonefs /dev/sdX /mnt $ ls -l /mnt/ total 0 dr-xr-xr-x 2 root root 1 Nov 25 13:23 cnv dr-xr-xr-x 2 root root 55356 Nov 25 13:23 seq The size of the zone files sub-directories indicate the number of files existing for each type of zones. In this example, there is only one conventional zone file (all conventional zones are aggregated under a single file). $ ls -l /mnt/cnv total 137101312 -rw-r----- 1 root root 140391743488 Nov 25 13:23 0 This aggregated conventional zone file can be used as a regular file. $ sudo mkfs.ext4 /mnt/cnv/0 $ sudo mount -o loop /mnt/cnv/0 /data The "seq" sub-directory grouping files for sequential write zones has in this example 55356 zones. $ ls -lv /mnt/seq total 14511243264 -rw-r----- 1 root root 0 Nov 25 13:23 0 -rw-r----- 1 root root 0 Nov 25 13:23 1 -rw-r----- 1 root root 0 Nov 25 13:23 2 ... -rw-r----- 1 root root 0 Nov 25 13:23 55354 -rw-r----- 1 root root 0 Nov 25 13:23 55355 For sequential write zone files, the file size changes as data is appended at the end of the file, similarly to any regular file system. $ dd if=/dev/zero of=/mnt/seq/0 bs=4K count=1 conv=notrunc oflag=direct 1+0 records in 1+0 records out 4096 bytes (4.1 kB, 4.0 KiB) copied, 0.000452219 s, 9.1 MB/s $ ls -l /mnt/seq/0 -rw-r----- 1 root root 4096 Nov 25 13:23 /mnt/seq/0 The written file can be truncated to the zone size, preventing any further write operation. 
$ truncate -s 268435456 /mnt/seq/0 $ ls -l /mnt/seq/0 -rw-r----- 1 root root 268435456 Nov 25 13:49 /mnt/seq/0 Truncation to 0 size allows freeing the file zone storage space and restart append-writes to the file. $ truncate -s 0 /mnt/seq/0 $ ls -l /mnt/seq/0 -rw-r----- 1 root root 0 Nov 25 13:49 /mnt/seq/0 Since files are statically mapped to zones on the disk, the number of blocks of a file as reported by stat() and fstat() indicates the size of the file zone. $ stat /mnt/seq/0 File: /mnt/seq/0 Size: 0 Blocks: 524288 IO Block: 4096 regular empty file Device: 870h/2160d Inode: 50431 Links: 1 Access: (0640/-rw-r-----) Uid: ( 0/ root) Gid: ( 0/ root) Access: 2019-11-25 13:23:57.048971997 +0900 Modify: 2019-11-25 13:52:25.553805765 +0900 Change: 2019-11-25 13:52:25.553805765 +0900 Birth: - The number of blocks of the file ("Blocks") in units of 512B blocks gives the maximum file size of 524288 * 512 B = 256 MB, corresponding to the device zone size in this example. Of note is that the "IO block" field always indicates the minimum IO size for writes and corresponds to the device physical sector size. This code contains contributions from: * Johannes Thumshirn , * Darrick J. Wong , * Christoph Hellwig , * Chaitanya Kulkarni and * Ting Yao . Signed-off-by: Damien Le Moal Reviewed-by: Dave Chinner --- MAINTAINERS | 9 + fs/Kconfig | 1 + fs/Makefile | 1 + fs/zonefs/Kconfig | 9 + fs/zonefs/Makefile | 4 + fs/zonefs/super.c | 1439 ++++++++++++++++++++++++++++++++++++ fs/zonefs/zonefs.h | 189 +++++ include/uapi/linux/magic.h | 1 + 8 files changed, 1653 insertions(+) create mode 100644 fs/zonefs/Kconfig create mode 100644 fs/zonefs/Makefile create mode 100644 fs/zonefs/super.c create mode 100644 fs/zonefs/zonefs.h diff --git a/MAINTAINERS b/MAINTAINERS index 56765f542244..089fd879632a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18303,6 +18303,15 @@ L: linux-kernel@vger.kernel.org S: Maintained F: arch/x86/kernel/cpu/zhaoxin.c +ZONEFS FILESYSTEM +M: Damien Le Moal +M: Naohiro Aota +R: Johannes Thumshirn +L: linux-fsdevel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git +S: Maintained +F: fs/zonefs/ + ZPOOL COMPRESSED PAGE STORAGE API M: Dan Streetman L: linux-mm@kvack.org diff --git a/fs/Kconfig b/fs/Kconfig index 7b623e9fc1b0..a3f97ca2bd46 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" +source "fs/zonefs/Kconfig" config FS_DAX bool "Direct Access (DAX) support" diff --git a/fs/Makefile b/fs/Makefile index 1148c555c4d3..527f228a5e8a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -133,3 +133,4 @@ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ obj-$(CONFIG_EROFS_FS) += erofs/ +obj-$(CONFIG_ZONEFS_FS) += zonefs/ diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig new file mode 100644 index 000000000000..fb87ad372e29 --- /dev/null +++ b/fs/zonefs/Kconfig @@ -0,0 +1,9 @@ +config ZONEFS_FS + tristate "zonefs filesystem support" + depends on BLOCK + depends on BLK_DEV_ZONED + help + zonefs is a simple file system which exposes zones of a zoned block + device (e.g. host-managed or host-aware SMR disk drives) as files. + + If unsure, say N. 
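Usage note (not part of the patch): with the Kconfig entry above, zonefs only needs the zoned block device core enabled to be selectable, for example with a .config fragment such as

	CONFIG_BLK_DEV_ZONED=y
	CONFIG_ZONEFS_FS=m

where building as a module is just an illustration; =y works equally well.
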
diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile new file mode 100644 index 000000000000..75a380aa1ae1 --- /dev/null +++ b/fs/zonefs/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ZONEFS_FS) += zonefs.o + +zonefs-y := super.o diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c new file mode 100644 index 000000000000..8bc6ef82d693 --- /dev/null +++ b/fs/zonefs/super.c @@ -0,0 +1,1439 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zonefs.h" + +static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All I/Os should always be within the file maximum size */ + if (WARN_ON_ONCE(offset + length > zi->i_max_size)) + return -EIO; + + /* + * Sequential zones can only accept direct writes. This is already + * checked when writes are issued, so warn if we see a page writeback + * operation. + */ + if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && + (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* + * For conventional zones, all blocks are always mapped. For sequential + * zones, all blocks after always mapped below the inode size (zone + * write pointer) and unwriten beyond. + */ + mutex_lock(&zi->i_truncate_mutex); + isize = i_size_read(inode); + if (offset >= isize) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + if (flags & IOMAP_WRITE) + length = zi->i_max_size - offset; + else + length = min(length, isize - offset); + mutex_unlock(&zi->i_truncate_mutex); + + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset; + iomap->bdev = inode->i_sb->s_bdev; + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + + return 0; +} + +static const struct iomap_ops zonefs_iomap_ops = { + .iomap_begin = zonefs_iomap_begin, +}; + +static int zonefs_readpage(struct file *unused, struct page *page) +{ + return iomap_readpage(page, &zonefs_iomap_ops); +} + +static int zonefs_readpages(struct file *unused, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); +} + +/* + * Map blocks for page writeback. This is used only on conventional zone files, + * which implies that the page range can only be within the fixed inode size. 
+ */ +static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; + + /* If the mapping is already OK, nothing needs to be done */ + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops zonefs_writeback_ops = { + .map_blocks = zonefs_map_blocks, +}; + +static int zonefs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops); +} + +static int zonefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); +} + +static const struct address_space_operations zonefs_file_aops = { + .readpage = zonefs_readpage, + .readpages = zonefs_readpages, + .writepage = zonefs_writepage, + .writepages = zonefs_writepages, + .set_page_dirty = iomap_set_page_dirty, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, + .migratepage = iomap_migrate_page, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .direct_IO = noop_direct_IO, +}; + +static void zonefs_update_stats(struct inode *inode, loff_t new_isize) +{ + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + loff_t old_isize = i_size_read(inode); + loff_t nr_blocks; + + if (new_isize == old_isize) + return; + + spin_lock(&sbi->s_lock); + + /* + * This may be called for an update after an IO error. + * So beware of the values seen. + */ + if (new_isize < old_isize) { + nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits; + if (sbi->s_used_blocks > nr_blocks) + sbi->s_used_blocks -= nr_blocks; + else + sbi->s_used_blocks = 0; + } else { + sbi->s_used_blocks += + (new_isize - old_isize) >> sb->s_blocksize_bits; + if (sbi->s_used_blocks > sbi->s_blocks) + sbi->s_used_blocks = sbi->s_blocks; + } + + spin_unlock(&sbi->s_lock); +} + +/* + * Check a zone condition and adjust its file inode access permissions for + * offline and readonly zones. Return the inode size corresponding to the + * amount of readable data in the zone. + */ +static loff_t zonefs_check_zone_condition(struct inode *inode, + struct blk_zone *zone, bool warn) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + switch (zone->cond) { + case BLK_ZONE_COND_OFFLINE: + /* + * Dead zone: make the inode immutable, disable all accesses + * and set the file size to 0 (zone wp set to zone start). 
+ */ + if (warn) + zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", + inode->i_ino); + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0777; + zone->wp = zone->start; + return 0; + case BLK_ZONE_COND_READONLY: + /* Do not allow writes in read-only zones */ + if (warn) + zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", + inode->i_ino); + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0222; + /* fallthrough */ + default: + if (zi->i_ztype == ZONEFS_ZTYPE_CNV) + return zi->i_max_size; + return (zone->wp - zone->start) << SECTOR_SHIFT; + } +} + +struct zonefs_ioerr_data { + struct inode *inode; + bool write; +}; + +static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct zonefs_ioerr_data *err = data; + struct inode *inode = err->inode; + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + loff_t isize, data_size; + + /* + * Check the zone condition: if the zone is not "bad" (offline or + * read-only), read errors are simply signaled to the IO issuer as long + * as there is no inconsistency between the inode size and the amount of + * data writen in the zone (data_size). + */ + data_size = zonefs_check_zone_condition(inode, zone, true); + isize = i_size_read(inode); + if (zone->cond != BLK_ZONE_COND_OFFLINE && + zone->cond != BLK_ZONE_COND_READONLY && + !err->write && isize == data_size) + return 0; + + /* + * At this point, we detected either a bad zone or an inconsistency + * between the inode size and the amount of data written in the zone. + * For the latter case, the cause may be a write IO error or an external + * action on the device. Two error patterns exist: + * 1) The inode size is lower than the amount of data in the zone: + * a write operation partially failed and data was writen at the end + * of the file. This can happen in the case of a large direct IO + * needing several BIOs and/or write requests to be processed. + * 2) The inode size is larger than the amount of data in the zone: + * this can happen with a deferred write error with the use of the + * device side write cache after getting successful write IO + * completions. Other possibilities are (a) an external corruption, + * e.g. an application reset the zone directly, or (b) the device + * has a serious problem (e.g. firmware bug). + * + * In all cases, warn about inode size inconsistency and handle the + * IO error according to the zone condition and to the mount options. + */ + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) + zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", + inode->i_ino, isize, data_size); + + /* + * First handle bad zones signaled by hardware. The mount options + * errors=zone-ro and errors=zone-offline result in changing the + * zone condition to read-only and offline respectively, as if the + * condition was signaled by the hardware. 
+ */ + if (zone->cond == BLK_ZONE_COND_OFFLINE || + sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) { + zonefs_warn(sb, "inode %lu: read/write access disabled\n", + inode->i_ino); + if (zone->cond != BLK_ZONE_COND_OFFLINE) { + zone->cond = BLK_ZONE_COND_OFFLINE; + data_size = zonefs_check_zone_condition(inode, zone, + false); + } + } else if (zone->cond == BLK_ZONE_COND_READONLY || + sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) { + zonefs_warn(sb, "inode %lu: write access disabled\n", + inode->i_ino); + if (zone->cond != BLK_ZONE_COND_READONLY) { + zone->cond = BLK_ZONE_COND_READONLY; + data_size = zonefs_check_zone_condition(inode, zone, + false); + } + } + + /* + * If error=remount-ro was specified, any error result in remounting + * the volume as read-only. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) { + zonefs_warn(sb, "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + + /* + * Update block usage stats and the inode size to prevent access to + * invalid data. + */ + zonefs_update_stats(inode, data_size); + i_size_write(inode, data_size); + zi->i_wpoffset = data_size; + + return 0; +} + +/* + * When an file IO error occurs, check the file zone to see if there is a change + * in the zone condition (e.g. offline or read-only). For a failed write to a + * sequential zone, the zone write pointer position must also be checked to + * eventually correct the file size and zonefs inode write pointer offset + * (which can be out of sync with the drive due to partial write failures). + */ +static void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + unsigned int noio_flag; + unsigned int nr_zones = + zi->i_max_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + struct zonefs_ioerr_data err = { + .inode = inode, + .write = write, + }; + int ret; + + mutex_lock(&zi->i_truncate_mutex); + + /* + * Memory allocations in blkdev_report_zones() can trigger a memory + * reclaim which may in turn cause a recursion into zonefs as well as + * struct request allocations for the same device. The former case may + * end up in a deadlock on the inode truncate mutex, while the latter + * may prevent IO forward progress. Executing the report zones under + * the GFP_NOIO context avoids both problems. + */ + noio_flag = memalloc_noio_save(); + ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, + zonefs_io_error_cb, &err); + if (ret != nr_zones) + zonefs_err(sb, "Get inode %lu zone information failed %d\n", + inode->i_ino, ret); + memalloc_noio_restore(noio_flag); + + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_truncate(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t old_isize; + enum req_opf op; + int ret = 0; + + /* + * Only sequential zone files can be truncated and truncation is allowed + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. 
+ */ + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return -EPERM; + + if (!isize) + op = REQ_OP_ZONE_RESET; + else if (isize == zi->i_max_size) + op = REQ_OP_ZONE_FINISH; + else + return -EPERM; + + inode_dio_wait(inode); + + /* Serialize against page faults */ + down_write(&zi->i_mmap_sem); + + /* Serialize against zonefs_iomap_begin() */ + mutex_lock(&zi->i_truncate_mutex); + + old_isize = i_size_read(inode); + if (isize == old_isize) + goto unlock; + + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, + zi->i_max_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { + zonefs_err(inode->i_sb, + "Zone management operation at %llu failed %d", + zi->i_zsector, ret); + goto unlock; + } + + zonefs_update_stats(inode, isize); + truncate_setsize(inode, isize); + zi->i_wpoffset = isize; + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + up_write(&zi->i_mmap_sem); + + return ret; +} + +static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + ret = setattr_prepare(dentry, iattr); + if (ret) + return ret; + + /* + * Since files and directories cannot be created nor deleted, do not + * allow setting any write attributes on the sub-directories grouping + * files by zone type. + */ + if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && + (iattr->ia_mode & 0222)) + return -EPERM; + + if (((iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->i_uid)) || + ((iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->i_gid))) { + ret = dquot_transfer(inode, iattr); + if (ret) + return ret; + } + + if (iattr->ia_valid & ATTR_SIZE) { + ret = zonefs_file_truncate(inode, iattr->ia_size); + if (ret) + return ret; + } + + setattr_copy(inode, iattr); + + return 0; +} + +static const struct inode_operations zonefs_file_inode_operations = { + .setattr = zonefs_inode_setattr, +}; + +static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file_inode(file); + int ret = 0; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + /* + * Since only direct writes are allowed in sequential files, page cache + * flush is needed only for conventional zone files. + */ + if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) + ret = file_write_and_wait_range(file, start, end); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + if (ret) + zonefs_io_error(inode, true); + + return ret; +} + +static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf) +{ + struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file)); + vm_fault_t ret; + + down_read(&zi->i_mmap_sem); + ret = filemap_fault(vmf); + up_read(&zi->i_mmap_sem); + + return ret; +} + +static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + /* + * Sanity check: only conventional zone files can have shared + * writeable mappings. 
+ */ + if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) + return VM_FAULT_NOPAGE; + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + + /* Serialize against truncates */ + down_read(&zi->i_mmap_sem); + ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); + up_read(&zi->i_mmap_sem); + + sb_end_pagefault(inode->i_sb); + return ret; +} + +static const struct vm_operations_struct zonefs_file_vm_ops = { + .fault = zonefs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = zonefs_filemap_page_mkwrite, +}; + +static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* + * Conventional zones accept random writes, so their files can support + * shared writable mappings. For sequential zone files, only read + * mappings are possible since there are no guarantees for write + * ordering between msync() and page cache writeback. + */ + if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + + file_accessed(file); + vma->vm_ops = &zonefs_file_vm_ops; + + return 0; +} + +static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t isize = i_size_read(file_inode(file)); + + /* + * Seeks are limited to below the zone size for conventional zones + * and below the zone write pointer for sequential zones. In both + * cases, this limit is the inode size. + */ + return generic_file_llseek_size(file, offset, whence, isize, isize); +} + +static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (error) { + zonefs_io_error(inode, true); + return error; + } + + if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed + * successfully necessarily means that all preceding writes + * were also successful. So we can safely increase the inode + * size to the write end location. + */ + mutex_lock(&zi->i_truncate_mutex); + if (i_size_read(inode) < iocb->ki_pos + size) { + zonefs_update_stats(inode, iocb->ki_pos + size); + i_size_write(inode, iocb->ki_pos + size); + } + mutex_unlock(&zi->i_truncate_mutex); + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_write_dio_ops = { + .end_io = zonefs_file_write_dio_end_io, +}; + +/* + * Handle direct writes. For sequential zone files, this is the only possible + * write path. For these files, check that the user is issuing writes + * sequentially from the end of the file. This code assumes that the block layer + * delivers write requests to the device in sequential order. This is always the + * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE + * elevator feature is being used (e.g. mq-deadline). The block layer always + * automatically select such an elevator for zoned block devices during the + * device initialization. + */ +static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + size_t count; + ssize_t ret; + + /* + * For async direct IOs to sequential zone files, ignore IOCB_NOWAIT + * as this can cause write reordering (e.g. the first aio gets EAGAIN + * on the inode lock but the second goes through but is now unaligned). 
+ */ + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) + && (iocb->ki_flags & IOCB_NOWAIT)) + iocb->ki_flags &= ~IOCB_NOWAIT; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto inode_unlock; + + iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos); + count = iov_iter_count(from); + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + + /* Enforce sequential writes (append only) in sequential zones */ + mutex_lock(&zi->i_truncate_mutex); + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && iocb->ki_pos != zi->i_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; + } + mutex_unlock(&zi->i_truncate_mutex); + + ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, + &zonefs_write_dio_ops, is_sync_kiocb(iocb)); + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; + mutex_lock(&zi->i_truncate_mutex); + zi->i_wpoffset += count; + mutex_unlock(&zi->i_truncate_mutex); + } + +inode_unlock: + inode_unlock(inode); + + return ret; +} + +static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + ssize_t ret; + + /* + * Direct IO writes are mandatory for sequential zone files so that the + * write IO issuing order is preserved. + */ + if (zi->i_ztype != ZONEFS_ZTYPE_CNV) + return -EIO; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto inode_unlock; + + iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos); + + ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); + if (ret > 0) + iocb->ki_pos += ret; + else if (ret == -EIO) + zonefs_io_error(inode, true); + +inode_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (sb_rdonly(inode->i_sb)) + return -EROFS; + + /* Write operations beyond the zone size are not allowed */ + if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) + return -EFBIG; + + if (iocb->ki_flags & IOCB_DIRECT) + return zonefs_file_dio_write(iocb, from); + + return zonefs_file_buffered_write(iocb, from); +} + +static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + if (error) { + zonefs_io_error(file_inode(iocb->ki_filp), false); + return error; + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_read_dio_ops = { + .end_io = zonefs_file_read_dio_end_io, +}; + +static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + ssize_t ret; + + /* Offline zones cannot be read */ + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) + return -EPERM; + + if (iocb->ki_pos >= zi->i_max_size) + return 0; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + /* Limit 
read operations to written data */ + mutex_lock(&zi->i_truncate_mutex); + isize = i_size_read(inode); + if (iocb->ki_pos >= isize) { + mutex_unlock(&zi->i_truncate_mutex); + ret = 0; + goto inode_unlock; + } + iov_iter_truncate(to, isize - iocb->ki_pos); + mutex_unlock(&zi->i_truncate_mutex); + + if (iocb->ki_flags & IOCB_DIRECT) { + size_t count = iov_iter_count(to); + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + file_accessed(iocb->ki_filp); + ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, + &zonefs_read_dio_ops, is_sync_kiocb(iocb)); + } else { + ret = generic_file_read_iter(iocb, to); + if (ret == -EIO) + zonefs_io_error(inode, false); + } + +inode_unlock: + inode_unlock_shared(inode); + + return ret; +} + +static const struct file_operations zonefs_file_operations = { + .open = generic_file_open, + .fsync = zonefs_file_fsync, + .mmap = zonefs_file_mmap, + .llseek = zonefs_file_llseek, + .read_iter = zonefs_file_read_iter, + .write_iter = zonefs_file_write_iter, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .iopoll = iomap_dio_iopoll, +}; + +static struct kmem_cache *zonefs_inode_cachep; + +static struct inode *zonefs_alloc_inode(struct super_block *sb) +{ + struct zonefs_inode_info *zi; + + zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL); + if (!zi) + return NULL; + + inode_init_once(&zi->i_vnode); + mutex_init(&zi->i_truncate_mutex); + init_rwsem(&zi->i_mmap_sem); + + return &zi->i_vnode; +} + +static void zonefs_free_inode(struct inode *inode) +{ + kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode)); +} + +/* + * File system stat. + */ +static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype t; + u64 fsid; + + buf->f_type = ZONEFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_namelen = ZONEFS_NAME_MAX; + + spin_lock(&sbi->s_lock); + + buf->f_blocks = sbi->s_blocks; + if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks)) + buf->f_bfree = 0; + else + buf->f_bfree = buf->f_blocks - sbi->s_used_blocks; + buf->f_bavail = buf->f_bfree; + + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { + if (sbi->s_nr_files[t]) + buf->f_files += sbi->s_nr_files[t] + 1; + } + buf->f_ffree = 0; + + spin_unlock(&sbi->s_lock); + + fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ + le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); + buf->f_fsid.val[0] = (u32)fsid; + buf->f_fsid.val[1] = (u32)(fsid >> 32); + + return 0; +} + +enum { + Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, + Opt_err, +}; + +static const match_table_t tokens = { + { Opt_errors_ro, "errors=remount-ro"}, + { Opt_errors_zro, "errors=zone-ro"}, + { Opt_errors_zol, "errors=zone-offline"}, + { Opt_errors_repair, "errors=repair"}, + { Opt_err, NULL} +}; + +static int zonefs_parse_options(struct super_block *sb, char *options) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_errors_ro: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO; + break; + case Opt_errors_zro: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO; + break; + case Opt_errors_zol: + sbi->s_mount_opts &= 
~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL; + break; + case Opt_errors_repair: + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; + sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; + break; + default: + return -EINVAL; + } + } + + return 0; +} + +static int zonefs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb); + + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) + seq_puts(seq, ",errors=remount-ro"); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) + seq_puts(seq, ",errors=zone-ro"); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) + seq_puts(seq, ",errors=zone-offline"); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR) + seq_puts(seq, ",errors=repair"); + + return 0; +} + +static int zonefs_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + + return zonefs_parse_options(sb, data); +} + +static const struct super_operations zonefs_sops = { + .alloc_inode = zonefs_alloc_inode, + .free_inode = zonefs_free_inode, + .statfs = zonefs_statfs, + .remount_fs = zonefs_remount, + .show_options = zonefs_show_options, +}; + +static const struct inode_operations zonefs_dir_inode_operations = { + .lookup = simple_lookup, + .setattr = zonefs_inode_setattr, +}; + +static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, + enum zonefs_ztype type) +{ + struct super_block *sb = parent->i_sb; + + inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1; + inode_init_owner(inode, parent, S_IFDIR | 0555); + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + inc_nlink(parent); +} + +static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) +{ + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; + inode->i_mode = S_IFREG | sbi->s_perm; + + zi->i_ztype = type; + zi->i_zsector = zone->start; + zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, + zone->len << SECTOR_SHIFT); + zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true); + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; + inode->i_size = zi->i_wpoffset; + inode->i_blocks = zone->len; + + inode->i_op = &zonefs_file_inode_operations; + inode->i_fop = &zonefs_file_operations; + inode->i_mapping->a_ops = &zonefs_file_aops; + + sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); + sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; + sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; +} + +static struct dentry *zonefs_create_inode(struct dentry *parent, + const char *name, struct blk_zone *zone, + enum zonefs_ztype type) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + struct inode *inode; + + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + inode = new_inode(parent->d_sb); + if (!inode) + goto dput; + + inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + if (zone) + zonefs_init_file_inode(inode, zone, type); + else + zonefs_init_dir_inode(dir, inode, type); + d_add(dentry, inode); + dir->i_size++; + + return dentry; + +dput: + dput(dentry); + + return NULL; +} + +struct zonefs_zone_data { + struct super_block *sb; + unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; + struct blk_zone *zones; +}; + +/* + * Create a zone group and populate it with zone files. 
+ */ +static int zonefs_create_zgroup(struct zonefs_zone_data *zd, + enum zonefs_ztype type) +{ + struct super_block *sb = zd->sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct blk_zone *zone, *next, *end; + const char *zgroup_name; + char *file_name; + struct dentry *dir; + unsigned int n = 0; + int ret = -ENOMEM; + + /* If the group is empty, there is nothing to do */ + if (!zd->nr_zones[type]) + return 0; + + file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); + if (!file_name) + return -ENOMEM; + + if (type == ZONEFS_ZTYPE_CNV) + zgroup_name = "cnv"; + else + zgroup_name = "seq"; + + dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); + if (!dir) + goto free; + + /* + * The first zone contains the super block: skip it. + */ + end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk); + for (zone = &zd->zones[1]; zone < end; zone = next) { + + next = zone + 1; + if (zonefs_zone_type(zone) != type) + continue; + + /* + * For conventional zones, contiguous zones can be aggregated + * together to form larger files. Note that this overwrites the + * length of the first zone of the set of contiguous zones + * aggregated together. If one offline or read-only zone is + * found, assume that all zones aggregated have the same + * condition. + */ + if (type == ZONEFS_ZTYPE_CNV && + (sbi->s_features & ZONEFS_F_AGGRCNV)) { + for (; next < end; next++) { + if (zonefs_zone_type(next) != type) + break; + zone->len += next->len; + if (next->cond == BLK_ZONE_COND_READONLY && + zone->cond != BLK_ZONE_COND_OFFLINE) + zone->cond = BLK_ZONE_COND_READONLY; + else if (next->cond == BLK_ZONE_COND_OFFLINE) + zone->cond = BLK_ZONE_COND_OFFLINE; + } + } + + /* + * Use the file number within its group as file name. + */ + snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); + if (!zonefs_create_inode(dir, file_name, zone, type)) + goto free; + + n++; + } + + zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", + zgroup_name, n, n > 1 ? "s" : ""); + + sbi->s_nr_files[type] = n; + ret = 0; + +free: + kfree(file_name); + + return ret; +} + +static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct zonefs_zone_data *zd = data; + + /* + * Count the number of usable zones: the first zone at index 0 contains + * the super block and is ignored. 
+ */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + zone->wp = zone->start + zone->len; + if (idx) + zd->nr_zones[ZONEFS_ZTYPE_CNV]++; + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (idx) + zd->nr_zones[ZONEFS_ZTYPE_SEQ]++; + break; + default: + zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", + zone->type); + return -EIO; + } + + memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); + + return 0; +} + +static int zonefs_get_zone_info(struct zonefs_zone_data *zd) +{ + struct block_device *bdev = zd->sb->s_bdev; + int ret; + + zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk), + sizeof(struct blk_zone), GFP_KERNEL); + if (!zd->zones) + return -ENOMEM; + + /* Get zones information from the device */ + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, + zonefs_get_zone_info_cb, zd); + if (ret < 0) { + zonefs_err(zd->sb, "Zone report failed %d\n", ret); + return ret; + } + + if (ret != blkdev_nr_zones(bdev->bd_disk)) { + zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", + ret, blkdev_nr_zones(bdev->bd_disk)); + return -EIO; + } + + return 0; +} + +static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd) +{ + kvfree(zd->zones); +} + +/* + * Read super block information from the device. + */ +static int zonefs_read_super(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct zonefs_super *super; + u32 crc, stored_crc; + struct page *page; + struct bio_vec bio_vec; + struct bio bio; + int ret; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + bio_init(&bio, &bio_vec, 1); + bio.bi_iter.bi_sector = 0; + bio.bi_opf = REQ_OP_READ; + bio_set_dev(&bio, sb->s_bdev); + bio_add_page(&bio, page, PAGE_SIZE, 0); + + ret = submit_bio_wait(&bio); + if (ret) + goto free_page; + + super = kmap(page); + + ret = -EINVAL; + if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) + goto unmap; + + stored_crc = le32_to_cpu(super->s_crc); + super->s_crc = 0; + crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super)); + if (crc != stored_crc) { + zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)", + crc, stored_crc); + goto unmap; + } + + sbi->s_features = le64_to_cpu(super->s_features); + if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { + zonefs_err(sb, "Unknown features set 0x%llx\n", + sbi->s_features); + goto unmap; + } + + if (sbi->s_features & ZONEFS_F_UID) { + sbi->s_uid = make_kuid(current_user_ns(), + le32_to_cpu(super->s_uid)); + if (!uid_valid(sbi->s_uid)) { + zonefs_err(sb, "Invalid UID feature\n"); + goto unmap; + } + } + + if (sbi->s_features & ZONEFS_F_GID) { + sbi->s_gid = make_kgid(current_user_ns(), + le32_to_cpu(super->s_gid)); + if (!gid_valid(sbi->s_gid)) { + zonefs_err(sb, "Invalid GID feature\n"); + goto unmap; + } + } + + if (sbi->s_features & ZONEFS_F_PERM) + sbi->s_perm = le32_to_cpu(super->s_perm); + + if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) { + zonefs_err(sb, "Reserved area is being used\n"); + goto unmap; + } + + uuid_copy(&sbi->s_uuid, (uuid_t *)super->s_uuid); + ret = 0; + +unmap: + kunmap(page); +free_page: + __free_page(page); + + return ret; +} + +/* + * Check that the device is zoned. If it is, get the list of zones and create + * sub-directories and files according to the device zone configuration and + * format options. 
+ */ +static int zonefs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct zonefs_zone_data zd; + struct zonefs_sb_info *sbi; + struct inode *inode; + enum zonefs_ztype t; + int ret; + + if (!bdev_is_zoned(sb->s_bdev)) { + zonefs_err(sb, "Not a zoned block device\n"); + return -EINVAL; + } + + /* + * Initialize super block information: the maximum file size is updated + * when the zone files are created so that the format option + * ZONEFS_F_AGGRCNV which increases the maximum file size of a file + * beyond the zone size is taken into account. + */ + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + spin_lock_init(&sbi->s_lock); + sb->s_fs_info = sbi; + sb->s_magic = ZONEFS_MAGIC; + sb->s_maxbytes = 0; + sb->s_op = &zonefs_sops; + sb->s_time_gran = 1; + + /* + * The block size is set to the device physical sector size to ensure + * that write operations on 512e devices (512B logical block and 4KB + * physical block) are always aligned to the device physical blocks, + * as mandated by the ZBC/ZAC specifications. + */ + sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev)); + sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); + sbi->s_uid = GLOBAL_ROOT_UID; + sbi->s_gid = GLOBAL_ROOT_GID; + sbi->s_perm = 0640; + sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; + + ret = zonefs_read_super(sb); + if (ret) + return ret; + + ret = zonefs_parse_options(sb, data); + if (ret) + return ret; + + memset(&zd, 0, sizeof(struct zonefs_zone_data)); + zd.sb = sb; + ret = zonefs_get_zone_info(&zd); + if (ret) + goto cleanup; + + zonefs_info(sb, "Mounting %u zones", + blkdev_nr_zones(sb->s_bdev->bd_disk)); + + /* Create root directory inode */ + ret = -ENOMEM; + inode = new_inode(sb); + if (!inode) + goto cleanup; + + inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk); + inode->i_mode = S_IFDIR | 0555; + inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + goto cleanup; + + /* Create and populate files in zone groups directories */ + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { + ret = zonefs_create_zgroup(&zd, t); + if (ret) + break; + } + +cleanup: + zonefs_cleanup_zone_info(&zd); + + return ret; +} + +static struct dentry *zonefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super); +} + +static void zonefs_kill_super(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + + if (sb->s_root) + d_genocide(sb->s_root); + kill_block_super(sb); + kfree(sbi); +} + +/* + * File system definition and registration. + */ +static struct file_system_type zonefs_type = { + .owner = THIS_MODULE, + .name = "zonefs", + .mount = zonefs_mount, + .kill_sb = zonefs_kill_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init zonefs_init_inodecache(void) +{ + zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache", + sizeof(struct zonefs_inode_info), 0, + (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), + NULL); + if (zonefs_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void zonefs_destroy_inodecache(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy the inode cache. 
+ */ + rcu_barrier(); + kmem_cache_destroy(zonefs_inode_cachep); +} + +static int __init zonefs_init(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); + + ret = zonefs_init_inodecache(); + if (ret) + return ret; + + ret = register_filesystem(&zonefs_type); + if (ret) { + zonefs_destroy_inodecache(); + return ret; + } + + return 0; +} + +static void __exit zonefs_exit(void) +{ + zonefs_destroy_inodecache(); + unregister_filesystem(&zonefs_type); +} + +MODULE_AUTHOR("Damien Le Moal"); +MODULE_DESCRIPTION("Zone file system for zoned block devices"); +MODULE_LICENSE("GPL"); +module_init(zonefs_init); +module_exit(zonefs_exit); diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h new file mode 100644 index 000000000000..ad17fef7ce91 --- /dev/null +++ b/fs/zonefs/zonefs.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Simple zone file system for zoned block devices. + * + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + */ +#ifndef __ZONEFS_H__ +#define __ZONEFS_H__ + +#include +#include +#include +#include +#include + +/* + * Maximum length of file names: this only needs to be large enough to fit + * the zone group directory names and a decimal zone number for file names. + * 16 characters is plenty. + */ +#define ZONEFS_NAME_MAX 16 + +/* + * Zone types: ZONEFS_ZTYPE_SEQ is used for all sequential zone types + * defined in linux/blkzoned.h, that is, BLK_ZONE_TYPE_SEQWRITE_REQ and + * BLK_ZONE_TYPE_SEQWRITE_PREF. + */ +enum zonefs_ztype { + ZONEFS_ZTYPE_CNV, + ZONEFS_ZTYPE_SEQ, + ZONEFS_ZTYPE_MAX, +}; + +static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) +{ + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return ZONEFS_ZTYPE_CNV; + return ZONEFS_ZTYPE_SEQ; +} + +/* + * In-memory inode data. + */ +struct zonefs_inode_info { + struct inode i_vnode; + + /* File zone type */ + enum zonefs_ztype i_ztype; + + /* File zone start sector (512B unit) */ + sector_t i_zsector; + + /* File zone write pointer position (sequential zones only) */ + loff_t i_wpoffset; + + /* File maximum size */ + loff_t i_max_size; + + /* + * To serialise fully against both syscall and mmap based IO and + * sequential file truncation, two locks are used. For serializing + * zonefs_seq_file_truncate() against zonefs_iomap_begin(), that is, + * file truncate operations against block mapping, i_truncate_mutex is + * used. i_truncate_mutex also protects against concurrent accesses + * and changes to the inode private data, and in particular changes to + * a sequential file size on completion of direct IO writes. + * Serialization of mmap read IOs with truncate and syscall IO + * operations is done with i_mmap_sem in addition to i_truncate_mutex. + * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first, + * i_truncate_mutex second). + */ + struct mutex i_truncate_mutex; + struct rw_semaphore i_mmap_sem; +}; + +static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) +{ + return container_of(inode, struct zonefs_inode_info, i_vnode); +} + +/* + * On-disk super block (block 0). 
+ */ +#define ZONEFS_LABEL_LEN 64 +#define ZONEFS_UUID_SIZE 16 +#define ZONEFS_SUPER_SIZE 4096 + +struct zonefs_super { + + /* Magic number */ + __le32 s_magic; + + /* Checksum */ + __le32 s_crc; + + /* Volume label */ + char s_label[ZONEFS_LABEL_LEN]; + + /* 128-bit uuid */ + __u8 s_uuid[ZONEFS_UUID_SIZE]; + + /* Features */ + __le64 s_features; + + /* UID/GID to use for files */ + __le32 s_uid; + __le32 s_gid; + + /* File permissions */ + __le32 s_perm; + + /* Padding to ZONEFS_SUPER_SIZE bytes */ + __u8 s_reserved[3988]; + +} __packed; + +/* + * Feature flags: specified in the s_features field of the on-disk super + * block struct zonefs_super and in-memory in the s_feartures field of + * struct zonefs_sb_info. + */ +enum zonefs_features { + /* + * Aggregate contiguous conventional zones into a single file. + */ + ZONEFS_F_AGGRCNV = 1ULL << 0, + /* + * Use super block specified UID for files instead of default 0. + */ + ZONEFS_F_UID = 1ULL << 1, + /* + * Use super block specified GID for files instead of default 0. + */ + ZONEFS_F_GID = 1ULL << 2, + /* + * Use super block specified file permissions instead of default 640. + */ + ZONEFS_F_PERM = 1ULL << 3, +}; + +#define ZONEFS_F_DEFINED_FEATURES \ + (ZONEFS_F_AGGRCNV | ZONEFS_F_UID | ZONEFS_F_GID | ZONEFS_F_PERM) + +/* + * Mount options for zone write pointer error handling. + */ +#define ZONEFS_MNTOPT_ERRORS_RO (1 << 0) /* Make zone file readonly */ +#define ZONEFS_MNTOPT_ERRORS_ZRO (1 << 1) /* Make zone file offline */ +#define ZONEFS_MNTOPT_ERRORS_ZOL (1 << 2) /* Make zone file offline */ +#define ZONEFS_MNTOPT_ERRORS_REPAIR (1 << 3) /* Remount read-only */ +#define ZONEFS_MNTOPT_ERRORS_MASK \ + (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ + ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) + +/* + * In-memory Super block information. + */ +struct zonefs_sb_info { + + unsigned long s_mount_opts; + + spinlock_t s_lock; + + unsigned long long s_features; + kuid_t s_uid; + kgid_t s_gid; + umode_t s_perm; + uuid_t s_uuid; + unsigned int s_zone_sectors_shift; + + unsigned int s_nr_files[ZONEFS_ZTYPE_MAX]; + + loff_t s_blocks; + loff_t s_used_blocks; +}; + +static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +#define zonefs_info(sb, format, args...) \ + pr_info("zonefs (%s): " format, sb->s_id, ## args) +#define zonefs_err(sb, format, args...) \ + pr_err("zonefs (%s) ERROR: " format, sb->s_id, ## args) +#define zonefs_warn(sb, format, args...) \ + pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) + +#endif diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 3ac436376d79..d78064007b17 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -87,6 +87,7 @@ #define NSFS_MAGIC 0x6e736673 #define BPF_FS_MAGIC 0xcafe4a11 #define AAFS_MAGIC 0x5a3c69f0 +#define ZONEFS_MAGIC 0x5a4f4653 /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346 From fcb9c24bef3d1d0942c50fb25fbb8ab45c7c3753 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 25 Dec 2019 16:11:09 +0900 Subject: [PATCH 099/147] zonefs: Add documentation Add the new file Documentation/filesystems/zonefs.txt to document zonefs principles and user-space tool usage. 
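For readers who want to try the documented workflow without an SMR drive, a zoned null_blk device can stand in for real hardware. The following sketch uses null_blk's configfs interface (attribute names as exposed by the driver; the sizes are arbitrary examples, and mkzonefs comes from the zonefs-tools package mentioned earlier in this series):

	$ modprobe null_blk nr_devices=0
	$ mkdir /sys/kernel/config/nullb/nullb0
	$ echo 4096 > /sys/kernel/config/nullb/nullb0/size          # MB
	$ echo 1 > /sys/kernel/config/nullb/nullb0/zoned
	$ echo 1 > /sys/kernel/config/nullb/nullb0/memory_backed
	$ echo 1 > /sys/kernel/config/nullb/nullb0/power
	$ mkzonefs /dev/nullb0 && mount -t zonefs /dev/nullb0 /mnt
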
Signed-off-by: Damien Le Moal Reviewed-by: Dave Chinner --- Documentation/filesystems/zonefs.txt | 404 +++++++++++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 405 insertions(+) create mode 100644 Documentation/filesystems/zonefs.txt diff --git a/Documentation/filesystems/zonefs.txt b/Documentation/filesystems/zonefs.txt new file mode 100644 index 000000000000..935bf22031ca --- /dev/null +++ b/Documentation/filesystems/zonefs.txt @@ -0,0 +1,404 @@ +ZoneFS - Zone filesystem for Zoned block devices + +Introduction +============ + +zonefs is a very simple file system exposing each zone of a zoned block device +as a file. Unlike a regular POSIX-compliant file system with native zoned block +device support (e.g. f2fs), zonefs does not hide the sequential write +constraint of zoned block devices to the user. Files representing sequential +write zones of the device must be written sequentially starting from the end +of the file (append only writes). + +As such, zonefs is in essence closer to a raw block device access interface +than to a full-featured POSIX file system. The goal of zonefs is to simplify +the implementation of zoned block device support in applications by replacing +raw block device file accesses with a richer file API, avoiding relying on +direct block device file ioctls which may be more obscure to developers. One +example of this approach is the implementation of LSM (log-structured merge) +tree structures (such as used in RocksDB and LevelDB) on zoned block devices +by allowing SSTables to be stored in a zone file similarly to a regular file +system rather than as a range of sectors of the entire disk. The introduction +of the higher level construct "one file is one zone" can help reducing the +amount of changes needed in the application as well as introducing support for +different application programming languages. + +Zoned block devices +------------------- + +Zoned storage devices belong to a class of storage devices with an address +space that is divided into zones. A zone is a group of consecutive LBAs and all +zones are contiguous (there are no LBA gaps). Zones may have different types. +* Conventional zones: there are no access constraints to LBAs belonging to + conventional zones. Any read or write access can be executed, similarly to a + regular block device. +* Sequential zones: these zones accept random reads but must be written + sequentially. Each sequential zone has a write pointer maintained by the + device that keeps track of the mandatory start LBA position of the next write + to the device. As a result of this write constraint, LBAs in a sequential zone + cannot be overwritten. Sequential zones must first be erased using a special + command (zone reset) before rewriting. + +Zoned storage devices can be implemented using various recording and media +technologies. The most common form of zoned storage today uses the SCSI Zoned +Block Commands (ZBC) and Zoned ATA Commands (ZAC) interfaces on Shingled +Magnetic Recording (SMR) HDDs. + +Solid State Disks (SSD) storage devices can also implement a zoned interface +to, for instance, reduce internal write amplification due to garbage collection. +The NVMe Zoned NameSpace (ZNS) is a technical proposal of the NVMe standard +committee aiming at adding a zoned storage interface to the NVMe protocol. + +Zonefs Overview +=============== + +Zonefs exposes the zones of a zoned block device as files. The files +representing zones are grouped by zone type, which are themselves represented +by sub-directories. 
This file structure is built entirely using zone information +provided by the device and so does not require any complex on-disk metadata +structure. + +On-disk metadata +---------------- + +zonefs on-disk metadata is reduced to an immutable super block which +persistently stores a magic number and optional feature flags and values. On +mount, zonefs uses blkdev_report_zones() to obtain the device zone configuration +and populates the mount point with a static file tree solely based on this +information. File sizes come from the device zone type and write pointer +position managed by the device itself. + +The super block is always written on disk at sector 0. The first zone of the +device storing the super block is never exposed as a zone file by zonefs. If +the zone containing the super block is a sequential zone, the mkzonefs format +tool always "finishes" the zone, that is, it transitions the zone to a full +state to make it read-only, preventing any data write. + +Zone type sub-directories +------------------------- + +Files representing zones of the same type are grouped together under the same +sub-directory automatically created on mount. + +For conventional zones, the sub-directory "cnv" is used. This directory is +however created if and only if the device has usable conventional zones. If +the device only has a single conventional zone at sector 0, the zone will not +be exposed as a file as it will be used to store the zonefs super block. For +such devices, the "cnv" sub-directory will not be created. + +For sequential write zones, the sub-directory "seq" is used. + +These two directories are the only directories that exist in zonefs. Users +cannot create other directories and cannot rename nor delete the "cnv" and +"seq" sub-directories. + +The size of the directories indicated by the st_size field of struct stat, +obtained with the stat() or fstat() system calls, indicates the number of files +existing under the directory. + +Zone files +---------- + +Zone files are named using the number of the zone they represent within the set +of zones of a particular type. That is, both the "cnv" and "seq" directories +contain files named "0", "1", "2", ... The file numbers also represent +increasing zone start sector on the device. + +All read and write operations to zone files are not allowed beyond the file +maximum size, that is, beyond the zone size. Any access exceeding the zone +size is failed with the -EFBIG error. + +Creating, deleting, renaming or modifying any attribute of files and +sub-directories is not allowed. + +The number of blocks of a file as reported by stat() and fstat() indicates the +size of the file zone, or in other words, the maximum file size. + +Conventional zone files +----------------------- + +The size of conventional zone files is fixed to the size of the zone they +represent. Conventional zone files cannot be truncated. + +These files can be randomly read and written using any type of I/O operation: +buffered I/Os, direct I/Os, memory mapped I/Os (mmap), etc. There are no I/O +constraint for these files beyond the file size limit mentioned above. + +Sequential zone files +--------------------- + +The size of sequential zone files grouped in the "seq" sub-directory represents +the file's zone write pointer position relative to the zone start sector. + +Sequential zone files can only be written sequentially, starting from the file +end, that is, write operations can only be append writes. 
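A minimal sketch of such an append write (assuming a zonefs volume mounted at /mnt, a 4096-byte logical block size, and a "seq" file named 0; as explained below, only direct I/O writes are accepted on sequential files, so the buffer must be suitably aligned):

/* Sketch: append one 4096-byte block to the sequential zone file /mnt/seq/0
 * at its current end, i.e. at the zone write pointer position.
 */
#define _GNU_SOURCE        /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const size_t blksz = 4096;  /* assumed logical block size */
	struct stat st;
	void *buf;
	int fd;

	fd = open("/mnt/seq/0", O_WRONLY | O_DIRECT);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("/mnt/seq/0");
		return 1;
	}

	if (posix_memalign(&buf, blksz, blksz))
		return 1;
	memset(buf, 0, blksz);

	/* The write must start exactly at the current file end (st_size) */
	if (pwrite(fd, buf, blksz, st.st_size) != (ssize_t)blksz)
		perror("append write");

	free(buf);
	close(fd);
	return 0;
}

After a successful write, the file size reported by fstat() advances by the amount written, mirroring the zone write pointer.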
Zonefs makes no +attempt at accepting random writes and will fail any write request that has a +start offset not corresponding to the end of the file, or to the end of the last +write issued and still in-flight (for asynchronous I/O operations). + +Since dirty page writeback by the page cache does not guarantee a sequential +write pattern, zonefs prevents buffered writes and writeable shared mappings +on sequential files. Only direct I/O writes are accepted for these files. +zonefs relies on the sequential delivery of write I/O requests to the device +implemented by the block layer elevator. An elevator implementing the sequential +write feature for zoned block devices (ELEVATOR_F_ZBD_SEQ_WRITE elevator feature) +must be used. This type of elevator (e.g. mq-deadline) is set by default +for zoned block devices on device initialization. + +There are no restrictions on the type of I/O used for read operations in +sequential zone files. Buffered I/Os, direct I/Os and shared read mappings are +all accepted. + +Truncating sequential zone files is allowed only down to 0, in which case, the +zone is reset to rewind the file zone write pointer position to the start of +the zone, or up to the zone size, in which case the file's zone is transitioned +to the FULL state (finish zone operation). + +Format options +-------------- + +Several optional features of zonefs can be enabled at format time. +* Conventional zone aggregation: ranges of contiguous conventional zones can be + aggregated into a single larger file instead of the default one file per zone. +* File ownership: The owner UID and GID of zone files is by default 0 (root) + but can be changed to any valid UID/GID. +* File access permissions: the default 640 access permissions can be changed. + +IO error handling +----------------- + +Zoned block devices may fail I/O requests for reasons similar to regular block +devices, e.g. due to bad sectors. However, in addition to such known I/O +failure patterns, the standards governing zoned block device behavior define +additional conditions that result in I/O errors. + +* A zone may transition to the read-only condition (BLK_ZONE_COND_READONLY): + While the data already written in the zone is still readable, the zone can + no longer be written. No user action on the zone (zone management command or + read/write access) can change the zone condition back to a normal read/write + state. While the reasons for the device to transition a zone to read-only + state are not defined by the standards, a typical cause for such transition + would be a defective write head on an HDD (all zones under this head are + changed to read-only). + +* A zone may transition to the offline condition (BLK_ZONE_COND_OFFLINE): + An offline zone cannot be read nor written. No user action can transition an + offline zone back to an operational good state. Similarly to zone read-only + transitions, the reasons for a drive to transition a zone to the offline + condition are undefined. A typical cause would be a defective read-write head + on an HDD causing all zones on the platter under the broken head to be + inaccessible. + +* Unaligned write errors: These errors result from the host issuing write + requests with a start sector that does not correspond to a zone write pointer + position when the write request is executed by the device.
Even though zonefs + enforces sequential file write for sequential zones, unaligned write errors + may still happen in the case of a partial failure of a very large direct I/O + operation split into multiple BIOs/requests or asynchronous I/O operations. + If one of the write requests within the set of sequential write requests + issued to the device fails, all write requests queued after it will + become unaligned and fail. + +* Delayed write errors: similarly to regular block devices, if the device side + write cache is enabled, write errors may occur in ranges of previously + completed writes when the device write cache is flushed, e.g. on fsync(). + Similarly to the previous immediate unaligned write error case, delayed write + errors can propagate through a stream of cached sequential data for a zone + causing all data to be dropped after the sector that caused the error. + +All I/O errors detected by zonefs are notified to the user with an error code + returned for the system call that triggered or detected the error. The recovery + actions taken by zonefs in response to I/O errors depend on the I/O type (read + vs write) and on the reason for the error (bad sector, unaligned writes or zone + condition change). + +* For read I/O errors, zonefs does not execute any particular recovery action, + but only if the file zone is still in a good condition and there is no + inconsistency between the file inode size and its zone write pointer position. + If a problem is detected, I/O error recovery is executed (see the table below). + +* For write I/O errors, zonefs I/O error recovery is always executed. + +* A zone condition change to read-only or offline also always triggers zonefs + I/O error recovery. + +Zonefs minimal I/O error recovery may change a file's size and access +permissions. + +* File size changes: + Immediate or delayed write errors in a sequential zone file may cause the file + inode size to be inconsistent with the amount of data successfully written in + the file zone. For instance, the partial failure of a multi-BIO large write + operation will cause the zone write pointer to advance partially, even though + the entire write operation will be reported as failed to the user. In such + a case, the file inode size must be advanced to reflect the zone write pointer + change and eventually allow the user to restart writing at the end of the + file. + A file size may also be reduced to reflect a delayed write error detected on + fsync(): in this case, the amount of data effectively written in the zone may + be less than originally indicated by the file inode size. After such an I/O + error, zonefs always fixes the file inode size to reflect the amount of data + persistently stored in the file zone. + +* Access permission changes: + A zone condition change to read-only is indicated with a change in the file + access permissions to render the file read-only. This disables changes to the + file attributes and data modification. For offline zones, all permissions + (read and write) to the file are disabled. + +Further action taken by zonefs I/O error recovery can be controlled by the user +with the "errors=xxx" mount option. The table below summarizes the result of +zonefs I/O error processing depending on the mount option and on the zone +conditions.
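Before the table, one practical note: after a failed write to a sequential zone file, an application should re-read the file size to learn where writing can resume. A minimal sketch, assuming fd refers to a "seq" file opened with O_DIRECT and that buf and len satisfy the direct I/O alignment constraints:

/* Sketch: append to a sequential zone file and, on error, refresh the write
 * position from the file size that zonefs corrected to match the zone write
 * pointer.
 */
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

ssize_t zone_append(int fd, const void *buf, size_t len, off_t *wpos)
{
	ssize_t ret = pwrite(fd, buf, len, *wpos);

	if (ret < 0) {
		struct stat st;

		/* The on-disk state is authoritative after an error */
		if (fstat(fd, &st) == 0)
			*wpos = st.st_size;
		return ret;
	}

	*wpos += ret;
	return ret;
}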
+
+ +--------------+-----------+-----------------------------------------+
+ |              |           |            Post error state             |
+ | "errors=xxx" |  device   |           access permissions            |
+ |    mount     |   zone    | file       file        device zone      |
+ |    option    | condition | size   read   write    read   write     |
+ +--------------+-----------+-----------------------------------------+
+ |              | good      | fixed  yes    no       yes    yes       |
+ | remount-ro   | read-only | fixed  yes    no       yes    no        |
+ | (default)    | offline   | 0      no     no       no     no        |
+ +--------------+-----------+-----------------------------------------+
+ |              | good      | fixed  yes    no       yes    yes       |
+ | zone-ro      | read-only | fixed  yes    no       yes    no        |
+ |              | offline   | 0      no     no       no     no        |
+ +--------------+-----------+-----------------------------------------+
+ |              | good      | 0      no     no       yes    yes       |
+ | zone-offline | read-only | 0      no     no       yes    no        |
+ |              | offline   | 0      no     no       no     no        |
+ +--------------+-----------+-----------------------------------------+
+ |              | good      | fixed  yes    yes      yes    yes       |
+ | repair       | read-only | fixed  yes    no       yes    no        |
+ |              | offline   | 0      no     no       no     no        |
+ +--------------+-----------+-----------------------------------------+
+ +Further notes: +* The "errors=remount-ro" mount option is the default behavior of zonefs I/O + error processing if no errors mount option is specified. +* With the "errors=remount-ro" mount option, the change of the file access + permissions to read-only applies to all files. The file system is remounted + read-only. +* Access permission and file size changes due to the device transitioning zones + to the offline condition are permanent. Remounting or reformatting the device + with mkfs.zonefs (mkzonefs) will not change back offline zone files to a good + state. +* File access permission changes to read-only due to the device transitioning + zones to the read-only condition are permanent. Remounting or reformatting + the device will not re-enable file write access. +* File access permission changes implied by the remount-ro, zone-ro and + zone-offline mount options are temporary for zones in a good condition. + Unmounting and remounting the file system will restore the previous default + (format time values) access rights to the files affected. +* The repair mount option triggers only the minimal set of I/O error recovery + actions, that is, file size fixes for zones in a good condition. Zones + indicated as being read-only or offline by the device still imply changes to + the zone file access permissions as noted in the table above. + +Mount options +------------- + +zonefs defines the "errors=" mount option to allow the user to specify +zonefs behavior in response to I/O errors, inode size inconsistencies or zone +condition changes. The defined behaviors are as follows: +* remount-ro (default) +* zone-ro +* zone-offline +* repair + +The I/O error actions defined for each behavior are detailed in the previous +section. + +Zonefs User Space Tools +======================= + +The mkzonefs tool is used to format zoned block devices for use with zonefs. +This tool is available on Github at: + +https://github.com/damien-lemoal/zonefs-tools + +zonefs-tools also includes a test suite which can be run against any zoned +block device, including null_blk block devices created in zoned mode. + +Examples +-------- + +The following formats a 15TB host-managed SMR HDD with 256 MB zones +with the conventional zones aggregation feature enabled.
+ +# mkzonefs -o aggr_cnv /dev/sdX +# mount -t zonefs /dev/sdX /mnt +# ls -l /mnt/ +total 0 +dr-xr-xr-x 2 root root 1 Nov 25 13:23 cnv +dr-xr-xr-x 2 root root 55356 Nov 25 13:23 seq + +The size of the zone files sub-directories indicate the number of files +existing for each type of zones. In this example, there is only one +conventional zone file (all conventional zones are aggregated under a single +file). + +# ls -l /mnt/cnv +total 137101312 +-rw-r----- 1 root root 140391743488 Nov 25 13:23 0 + +This aggregated conventional zone file can be used as a regular file. + +# mkfs.ext4 /mnt/cnv/0 +# mount -o loop /mnt/cnv/0 /data + +The "seq" sub-directory grouping files for sequential write zones has in this +example 55356 zones. + +# ls -lv /mnt/seq +total 14511243264 +-rw-r----- 1 root root 0 Nov 25 13:23 0 +-rw-r----- 1 root root 0 Nov 25 13:23 1 +-rw-r----- 1 root root 0 Nov 25 13:23 2 +... +-rw-r----- 1 root root 0 Nov 25 13:23 55354 +-rw-r----- 1 root root 0 Nov 25 13:23 55355 + +For sequential write zone files, the file size changes as data is appended at +the end of the file, similarly to any regular file system. + +# dd if=/dev/zero of=/mnt/seq/0 bs=4096 count=1 conv=notrunc oflag=direct +1+0 records in +1+0 records out +4096 bytes (4.1 kB, 4.0 KiB) copied, 0.00044121 s, 9.3 MB/s + +# ls -l /mnt/seq/0 +-rw-r----- 1 root root 4096 Nov 25 13:23 /mnt/seq/0 + +The written file can be truncated to the zone size, preventing any further +write operation. + +# truncate -s 268435456 /mnt/seq/0 +# ls -l /mnt/seq/0 +-rw-r----- 1 root root 268435456 Nov 25 13:49 /mnt/seq/0 + +Truncation to 0 size allows freeing the file zone storage space and restart +append-writes to the file. + +# truncate -s 0 /mnt/seq/0 +# ls -l /mnt/seq/0 +-rw-r----- 1 root root 0 Nov 25 13:49 /mnt/seq/0 + +Since files are statically mapped to zones on the disk, the number of blocks of +a file as reported by stat() and fstat() indicates the size of the file zone. + +# stat /mnt/seq/0 + File: /mnt/seq/0 + Size: 0 Blocks: 524288 IO Block: 4096 regular empty file +Device: 870h/2160d Inode: 50431 Links: 1 +Access: (0640/-rw-r-----) Uid: ( 0/ root) Gid: ( 0/ root) +Access: 2019-11-25 13:23:57.048971997 +0900 +Modify: 2019-11-25 13:52:25.553805765 +0900 +Change: 2019-11-25 13:52:25.553805765 +0900 + Birth: - + +The number of blocks of the file ("Blocks") in units of 512B blocks gives the +maximum file size of 524288 * 512 B = 256 MB, corresponding to the device zone +size in this example. Of note is that the "IO block" field always indicates the +minimum I/O size for writes and corresponds to the device physical sector size. diff --git a/MAINTAINERS b/MAINTAINERS index 089fd879632a..e9dcf8952573 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18311,6 +18311,7 @@ L: linux-fsdevel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs.git S: Maintained F: fs/zonefs/ +F: Documentation/filesystems/zonefs.txt ZPOOL COMPRESSED PAGE STORAGE API M: Dan Streetman From b39a934ec72fa2b5a74123891f25273a38378b90 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Feb 2020 13:55:01 +0000 Subject: [PATCH 100/147] rxrpc: Fix service call disconnection The recent patch that substituted a flag on an rxrpc_call for the connection pointer being NULL as an indication that a call was disconnected puts the set_bit in the wrong place for service calls. This is only a problem if a call is implicitly terminated by a new call coming in on the same connection channel instead of a terminating ACK packet. 
In such a case, rxrpc_input_implicit_end_call() calls __rxrpc_disconnect_call(), which is now (incorrectly) setting the disconnection bit, meaning that when rxrpc_release_call() is later called, it doesn't call rxrpc_disconnect_call() and so the call isn't removed from the peer's error distribution list and the list gets corrupted. KASAN finds the issue as an access after release on a call, but the position at which it occurs is confusing as it appears to be related to a different call (the call site is where the latter call is being removed from the error distribution list and either the next or pprev pointer points to a previously released call). Fix this by moving the setting of the flag from __rxrpc_disconnect_call() to rxrpc_disconnect_call() in the same place that the connection pointer was being cleared. Fixes: 5273a191dca6 ("rxrpc: Fix NULL pointer deref due to call->conn being cleared on disconnect") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/conn_object.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c0b3154f7a7e..19e141eeed17 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -171,8 +171,6 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, _enter("%d,%x", conn->debug_id, call->cid); - set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); - if (rcu_access_pointer(chan->call) == call) { /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. @@ -225,6 +223,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) __rxrpc_disconnect_call(conn, call); spin_unlock(&conn->channel_lock); + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); conn->idle_timestamp = jiffies; } From 963485d436ccc2810177a7b08af22336ec2af67b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Feb 2020 13:57:40 +0000 Subject: [PATCH 101/147] rxrpc: Fix call RCU cleanup using non-bh-safe locks rxrpc_rcu_destroy_call(), which is called as an RCU callback to clean up a put call, calls rxrpc_put_connection() which, deep in its bowels, takes a number of spinlocks in a non-BH-safe way, including rxrpc_conn_id_lock and local->client_conns_lock. RCU callbacks, however, are normally called from softirq context, which can cause lockdep to notice the locking inconsistency. To get lockdep to detect this, it's necessary to have the connection cleaned up on the put at the end of the last of its calls, though normally the clean up is deferred. This can be induced, however, by starting a call on an AF_RXRPC socket and then closing the socket without reading the reply. Fix this by having rxrpc_rcu_destroy_call() punt the destruction to a workqueue if in softirq-mode and defer the destruction to process context. Note that another way to fix this could be to add a bunch of bh-disable annotations to the spinlocks concerned - and there might be more than just those two - but that means spending more time with BHs disabled. Note also that some of these places were covered by bh-disable spinlocks belonging to the rxrpc_transport object, but these got removed without the _bh annotation being retained on the next lock in. Fixes: 999b69f89241 ("rxrpc: Kill the client connection bundle concept") Reported-by: syzbot+d82f3ac8d87e7ccbb2c9@syzkaller.appspotmail.com Reported-by: syzbot+3f1fd6b8cbf8702d134e@syzkaller.appspotmail.com Signed-off-by: David Howells cc: Hillf Danton Signed-off-by: David S. 
Miller --- net/rxrpc/call_object.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index dbdbc4f18b5e..c9f34b0a11df 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -562,11 +562,11 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) } /* - * Final call destruction under RCU. + * Final call destruction - but must be done in process context. */ -static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +static void rxrpc_destroy_call(struct work_struct *work) { - struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); struct rxrpc_net *rxnet = call->rxnet; rxrpc_put_connection(call->conn); @@ -578,6 +578,22 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) wake_up_var(&rxnet->nr_calls); } +/* + * Final call destruction under RCU. + */ +static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +{ + struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + + if (in_softirq()) { + INIT_WORK(&call->processor, rxrpc_destroy_call); + if (!rxrpc_queue_work(&call->processor)) + BUG(); + } else { + rxrpc_destroy_call(&call->processor); + } +} + /* * clean up a call */ From 7d10f0774f9e32aa2f2e012f7fcb312a2ce422b9 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Thu, 6 Feb 2020 23:29:17 +0800 Subject: [PATCH 102/147] net: stmmac: fix a possible endless loop It forgot to reduce the value of the variable retry in a while loop in the ethqos_configure() function. It may cause an endless loop and without timeout. Fixes: a7c30e62d4b8 ("net: stmmac: Add driver for Qualcomm ethqos") Signed-off-by: Dejin Zheng Acked-by: Vinod Koul Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 7ec895407d23..e0a5fe83d8e0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -413,6 +413,7 @@ static int ethqos_configure(struct qcom_ethqos *ethqos) dll_lock = rgmii_readl(ethqos, SDC4_STATUS); if (dll_lock & SDC4_STATUS_DLL_LOCK) break; + retry--; } while (retry > 0); if (!retry) dev_err(ðqos->pdev->dev, From df373702bc0f8f2d83980ea441e71639fc1efcf8 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 6 Feb 2020 11:07:45 -0800 Subject: [PATCH 103/147] net: dsa: b53: Always use dev->vlan_enabled in b53_configure_vlan() b53_configure_vlan() is called by the bcm_sf2 driver upon setup and indirectly through resume as well. During the initial setup, we are guaranteed that dev->vlan_enabled is false, so there is no change in behavior, however during suspend, we may have enabled VLANs before, so we do want to restore that setting. Fixes: dad8d7c6452b ("net: dsa: b53: Properly account for VLAN filtering") Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 060497512159..449a22172e07 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -693,7 +693,7 @@ int b53_configure_vlan(struct dsa_switch *ds) b53_do_vlan_op(dev, VTA_CMD_CLEAR); } - b53_enable_vlan(dev, false, ds->vlan_filtering); + b53_enable_vlan(dev, dev->vlan_enabled, ds->vlan_filtering); b53_for_each_port(dev, i) b53_write16(dev, B53_VLAN_PAGE, From de34d7084edd069dac5aa010cfe32bd8c4619fa6 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 6 Feb 2020 11:23:52 -0800 Subject: [PATCH 104/147] net: dsa: bcm_sf2: Only 7278 supports 2Gb/sec IMP port The 7445 switch clocking profiles do not allow us to run the IMP port at 2Gb/sec in a way that it is reliable and consistent. Make sure that the setting is only applied to the 7278 family. Fixes: 8f1880cbe8d0 ("net: dsa: bcm_sf2: Configure IMP port for 2Gb/sec") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 3e8635311d0d..d1955543acd1 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -68,7 +68,9 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) /* Force link status for IMP port */ reg = core_readl(priv, offset); - reg |= (MII_SW_OR | LINK_STS | GMII_SPEED_UP_2G); + reg |= (MII_SW_OR | LINK_STS); + if (priv->type == BCM7278_DEVICE_ID) + reg |= GMII_SPEED_UP_2G; core_writel(priv, reg, offset); /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */ From 5652e63df3303c2a702bac25fbf710b9cb64dfba Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 6 Feb 2020 13:46:06 -0800 Subject: [PATCH 105/147] taprio: Fix enabling offload with wrong number of traffic classes If the driver implementing taprio offloading depends on the value of the network device number of traffic classes (dev->num_tc) for whatever reason, it was going to receive the value zero. The value was only set after the offloading function is called. So, moving setting the number of traffic classes to before the offloading function is called fixes this issue. This is safe because this only happens when taprio is instantiated (we don't allow this configuration to be changed without first removing taprio). Fixes: 9c66d1564676 ("taprio: Add support for hardware offloading") Reported-by: Po Liu Signed-off-by: Vinicius Costa Gomes Acked-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- net/sched/sch_taprio.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c609373c8661..ad0dadcfcdba 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1444,6 +1444,19 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, taprio_set_picos_per_byte(dev, q); + if (mqprio) { + netdev_set_num_tc(dev, mqprio->num_tc); + for (i = 0; i < mqprio->num_tc; i++) + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); + + /* Always use supplied priority mappings */ + for (i = 0; i <= TC_BITMASK; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); + } + if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) err = taprio_enable_offload(dev, mqprio, q, new_admin, extack); else @@ -1471,19 +1484,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->advance_timer.function = advance_sched; } - if (mqprio) { - netdev_set_num_tc(dev, mqprio->num_tc); - for (i = 0; i < mqprio->num_tc; i++) - netdev_set_tc_queue(dev, i, - mqprio->count[i], - mqprio->offset[i]); - - /* Always use supplied priority mappings */ - for (i = 0; i <= TC_BITMASK; i++) - netdev_set_prio_tc_map(dev, i, - mqprio->prio_tc_map[i]); - } - if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) { q->dequeue = taprio_dequeue_offload; q->peek = taprio_peek_offload; From a9d6227436f32142209f4428f2dc616761485112 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 6 Feb 2020 13:46:07 -0800 Subject: [PATCH 106/147] taprio: Fix still allowing changing the flags during runtime Because 'q->flags' starts as zero, and zero is a valid value, we aren't able to detect the transition from zero to something else during "runtime". The solution is to initialize 'q->flags' with an invalid value, so we can detect if 'q->flags' was set by the user or not. To better solidify the behavior, 'flags' handling is moved to a separate function. The behavior is: - 'flags' if unspecified by the user, is assumed to be zero; - 'flags' cannot change during "runtime" (i.e. a change() request cannot modify it); With this new function we can remove taprio_flags, which should reduce the risk of future accidents. Allowing flags to be changed was causing the following RCU stall: [ 1730.558249] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: [ 1730.558258] rcu: 6-...0: (190 ticks this GP) idle=922/0/0x1 softirq=25580/25582 fqs=16250 [ 1730.558264] (detected by 2, t=65002 jiffies, g=33017, q=81) [ 1730.558269] Sending NMI from CPU 2 to CPUs 6: [ 1730.559277] NMI backtrace for cpu 6 [ 1730.559277] CPU: 6 PID: 0 Comm: swapper/6 Tainted: G E 5.5.0-rc6+ #35 [ 1730.559278] Hardware name: Gigabyte Technology Co., Ltd. 
Z390 AORUS ULTRA/Z390 AORUS ULTRA-CF, BIOS F7 03/14/2019 [ 1730.559278] RIP: 0010:__hrtimer_run_queues+0xe2/0x440 [ 1730.559278] Code: 48 8b 43 28 4c 89 ff 48 8b 75 c0 48 89 45 c8 e8 f4 bb 7c 00 0f 1f 44 00 00 65 8b 05 40 31 f0 68 89 c0 48 0f a3 05 3e 5c 25 01 <0f> 82 fc 01 00 00 48 8b 45 c8 48 89 df ff d0 89 45 c8 0f 1f 44 00 [ 1730.559279] RSP: 0018:ffff9970802d8f10 EFLAGS: 00000083 [ 1730.559279] RAX: 0000000000000006 RBX: ffff8b31645bff38 RCX: 0000000000000000 [ 1730.559280] RDX: 0000000000000000 RSI: ffffffff9710f2ec RDI: ffffffff978daf0e [ 1730.559280] RBP: ffff9970802d8f68 R08: 0000000000000000 R09: 0000000000000000 [ 1730.559280] R10: 0000018336d7944e R11: 0000000000000001 R12: ffff8b316e39f9c0 [ 1730.559281] R13: ffff8b316e39f940 R14: ffff8b316e39f998 R15: ffff8b316e39f7c0 [ 1730.559281] FS: 0000000000000000(0000) GS:ffff8b316e380000(0000) knlGS:0000000000000000 [ 1730.559281] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1730.559281] CR2: 00007f1105303760 CR3: 0000000227210005 CR4: 00000000003606e0 [ 1730.559282] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1730.559282] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1730.559282] Call Trace: [ 1730.559282] [ 1730.559283] ? taprio_dequeue_soft+0x2d0/0x2d0 [sch_taprio] [ 1730.559283] hrtimer_interrupt+0x104/0x220 [ 1730.559283] ? irqtime_account_irq+0x34/0xa0 [ 1730.559283] smp_apic_timer_interrupt+0x6d/0x230 [ 1730.559284] apic_timer_interrupt+0xf/0x20 [ 1730.559284] [ 1730.559284] RIP: 0010:cpu_idle_poll+0x35/0x1a0 [ 1730.559285] Code: 88 82 ff 65 44 8b 25 12 7d 73 68 0f 1f 44 00 00 e8 90 c3 89 ff fb 65 48 8b 1c 25 c0 7e 01 00 48 8b 03 a8 08 74 0b eb 1c f3 90 <48> 8b 03 a8 08 75 13 8b 05 be a8 a8 00 85 c0 75 ed e8 75 48 84 ff [ 1730.559285] RSP: 0018:ffff997080137ea8 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13 [ 1730.559285] RAX: 0000000000000001 RBX: ffff8b316bc3c580 RCX: 0000000000000000 [ 1730.559286] RDX: 0000000000000001 RSI: 000000002819aad9 RDI: ffffffff978da730 [ 1730.559286] RBP: ffff997080137ec0 R08: 0000018324a6d387 R09: 0000000000000000 [ 1730.559286] R10: 0000000000000400 R11: 0000000000000001 R12: 0000000000000006 [ 1730.559286] R13: ffff8b316bc3c580 R14: 0000000000000000 R15: 0000000000000000 [ 1730.559287] ? cpu_idle_poll+0x20/0x1a0 [ 1730.559287] ? cpu_idle_poll+0x20/0x1a0 [ 1730.559287] do_idle+0x4d/0x1f0 [ 1730.559287] ? complete+0x44/0x50 [ 1730.559288] cpu_startup_entry+0x1b/0x20 [ 1730.559288] start_secondary+0x142/0x180 [ 1730.559288] secondary_startup_64+0xb6/0xc0 [ 1776.686313] nvme nvme0: I/O 96 QID 1 timeout, completion polled Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. 
Miller --- net/sched/sch_taprio.c | 61 ++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index ad0dadcfcdba..e2d4283bea6d 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -31,6 +31,7 @@ static DEFINE_SPINLOCK(taprio_list_lock); #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) +#define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { struct list_head list; @@ -1367,6 +1368,33 @@ static int taprio_mqprio_cmp(const struct net_device *dev, return 0; } +/* The semantics of the 'flags' argument in relation to 'change()' + * requests, are interpreted following two rules (which are applied in + * this order): (1) an omitted 'flags' argument is interpreted as + * zero; (2) the 'flags' of a "running" taprio instance cannot be + * changed. + */ +static int taprio_new_flags(const struct nlattr *attr, u32 old, + struct netlink_ext_ack *extack) +{ + u32 new = 0; + + if (attr) + new = nla_get_u32(attr); + + if (old != TAPRIO_FLAGS_INVALID && old != new) { + NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } + + if (!taprio_flags_valid(new)) { + NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); + return -EINVAL; + } + + return new; +} + static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1375,7 +1403,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; - u32 taprio_flags = 0; unsigned long flags; ktime_t start; int i, err; @@ -1388,21 +1415,14 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - if (tb[TCA_TAPRIO_ATTR_FLAGS]) { - taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]); + err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], + q->flags, extack); + if (err < 0) + return err; - if (q->flags != 0 && q->flags != taprio_flags) { - NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); - return -EOPNOTSUPP; - } else if (!taprio_flags_valid(taprio_flags)) { - NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); - return -EINVAL; - } + q->flags = err; - q->flags = taprio_flags; - } - - err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags); + err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; @@ -1457,7 +1477,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, mqprio->prio_tc_map[i]); } - if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, mqprio, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); @@ -1477,14 +1497,14 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } - if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && - !FULL_OFFLOAD_IS_ENABLED(taprio_flags) && + if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && + !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } - if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) { + if 
(FULL_OFFLOAD_IS_ENABLED(q->flags)) { q->dequeue = taprio_dequeue_offload; q->peek = taprio_peek_offload; } else { @@ -1501,7 +1521,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) { + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { setup_txtime(q, new_admin, start); if (!oper) { @@ -1528,7 +1548,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, spin_unlock_irqrestore(&q->current_entry_lock, flags); - if (FULL_OFFLOAD_IS_ENABLED(taprio_flags)) + if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } @@ -1597,6 +1617,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, * and get the valid one on taprio_change(). */ q->clockid = -1; + q->flags = TAPRIO_FLAGS_INVALID; spin_lock(&taprio_list_lock); list_add(&q->taprio_list, &taprio_list); From 49c684d79cfdc3032344bf6f3deeea81c4efedbf Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 6 Feb 2020 13:46:08 -0800 Subject: [PATCH 107/147] taprio: Add missing policy validation for flags netlink policy validation for the 'flags' argument was missing. Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e2d4283bea6d..b82a9769ab40 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -767,6 +767,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, }; static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, From 7c16680a08ee1e444a67d232c679ccf5b30fad16 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 6 Feb 2020 13:46:09 -0800 Subject: [PATCH 108/147] taprio: Use taprio_reset_tc() to reset Traffic Classes configuration When destroying the current taprio instance, which can happen when the creation of one fails, we should reset the traffic class configuration back to the default state. netdev_reset_tc() is a better way because in addition to setting the number of traffic classes to zero, it also resets the priority to traffic classes mapping to the default value. Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. 
Miller --- net/sched/sch_taprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index b82a9769ab40..21df69071df2 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1588,7 +1588,7 @@ static void taprio_destroy(struct Qdisc *sch) } q->qdiscs = NULL; - netdev_set_num_tc(dev, 0); + netdev_reset_tc(dev); if (q->oper_sched) call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb); From bfabd41da34180d05382312533a3adc2e012dee0 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 6 Feb 2020 13:46:10 -0800 Subject: [PATCH 109/147] taprio: Fix dropping packets when using taprio + ETF offloading When using taprio offloading together with ETF offloading, configured like this, for example: $ tc qdisc replace dev $IFACE parent root handle 100 taprio \ num_tc 4 \ map 2 2 1 0 3 2 2 2 2 2 2 2 2 2 2 2 \ queues 1@0 1@1 1@2 1@3 \ base-time $BASE_TIME \ sched-entry S 01 1000000 \ sched-entry S 0e 1000000 \ flags 0x2 $ tc qdisc replace dev $IFACE parent 100:1 etf \ offload delta 300000 clockid CLOCK_TAI During enqueue, it works out that the verification added for the "txtime" assisted mode is run when using taprio + ETF offloading, the only thing missing is initializing the 'next_txtime' of all the cycle entries. (if we don't set 'next_txtime' all packets from SO_TXTIME sockets are dropped) Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 21df69071df2..660fc45ee40f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1522,9 +1522,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { - setup_txtime(q, new_admin, start); + setup_txtime(q, new_admin, start); + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; From 184367dce4f744bde54377203305ccc8889aa79f Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 6 Feb 2020 14:01:05 -0800 Subject: [PATCH 110/147] hv_netvsc: Fix XDP refcnt for synthetic and VF NICs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The caller of XDP_SETUP_PROG has already incremented refcnt in __bpf_prog_get(), so drivers should only increment refcnt by num_queues - 1. To fix the issue, update netvsc_xdp_set() to add the correct number to refcnt. Hold a refcnt in netvsc_xdp_set()’s other caller, netvsc_attach(). And, do the same in netvsc_vf_setxdp(). Otherwise, every time when VF is removed and added from the host side, the refcnt will be decreased by one, which may cause page fault when unloading xdp program. Fixes: 351e1581395f ("hv_netvsc: Add XDP support") Signed-off-by: Haiyang Zhang Signed-off-by: David S. 
Miller --- drivers/net/hyperv/netvsc_bpf.c | 13 +++++++++++-- drivers/net/hyperv/netvsc_drv.c | 5 ++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 20adfe544294..b86611041db6 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -120,7 +120,7 @@ int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog, } if (prog) - bpf_prog_add(prog, nvdev->num_chn); + bpf_prog_add(prog, nvdev->num_chn - 1); for (i = 0; i < nvdev->num_chn; i++) rcu_assign_pointer(nvdev->chan_table[i].bpf_prog, prog); @@ -136,6 +136,7 @@ int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) { struct netdev_bpf xdp; bpf_op_t ndo_bpf; + int ret; ASSERT_RTNL(); @@ -148,10 +149,18 @@ int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) memset(&xdp, 0, sizeof(xdp)); + if (prog) + bpf_prog_inc(prog); + xdp.command = XDP_SETUP_PROG; xdp.prog = prog; - return ndo_bpf(vf_netdev, &xdp); + ret = ndo_bpf(vf_netdev, &xdp); + + if (ret && prog) + bpf_prog_put(prog); + + return ret; } static u32 netvsc_xdp_query(struct netvsc_device *nvdev) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 8fc71bd49894..65e12cb07f45 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1059,9 +1059,12 @@ static int netvsc_attach(struct net_device *ndev, prog = dev_info->bprog; if (prog) { + bpf_prog_inc(prog); ret = netvsc_xdp_set(ndev, prog, NULL, nvdev); - if (ret) + if (ret) { + bpf_prog_put(prog); goto err1; + } } /* In any case device is now ready */ From 9eeeb3c9de4e3aeaa2bec097162f09305dd9f4c3 Mon Sep 17 00:00:00 2001 From: "Tan, Tee Min" Date: Fri, 7 Feb 2020 15:33:20 +0800 Subject: [PATCH 111/147] net: stmmac: fix incorrect GMAC_VLAN_TAG register writting in GMAC4+ It should always do a read of current value of GMAC_VLAN_TAG instead of directly overwriting the register value. Fixes: c1be0022df0d ("net: stmmac: Add VLAN HASH filtering support in GMAC4+") Signed-off-by: Tan, Tee Min Signed-off-by: Ong Boon Leong Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index f0c0ea616032..4d8eef9ff137 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -736,11 +736,14 @@ static void dwmac4_update_vlan_hash(struct mac_device_info *hw, u32 hash, __le16 perfect_match, bool is_double) { void __iomem *ioaddr = hw->pcsr; + u32 value; writel(hash, ioaddr + GMAC_VLAN_HASH_TABLE); + value = readl(ioaddr + GMAC_VLAN_TAG); + if (hash) { - u32 value = GMAC_VLAN_VTHM | GMAC_VLAN_ETV; + value |= GMAC_VLAN_VTHM | GMAC_VLAN_ETV; if (is_double) { value |= GMAC_VLAN_EDVLP; value |= GMAC_VLAN_ESVL; @@ -759,8 +762,6 @@ static void dwmac4_update_vlan_hash(struct mac_device_info *hw, u32 hash, writel(value | perfect_match, ioaddr + GMAC_VLAN_TAG); } else { - u32 value = readl(ioaddr + GMAC_VLAN_TAG); - value &= ~(GMAC_VLAN_VTHM | GMAC_VLAN_ETV); value &= ~(GMAC_VLAN_EDVLP | GMAC_VLAN_ESVL); value &= ~GMAC_VLAN_DOVLTC; From 907a076881f171254219faad05f46ac5baabedfb Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Fri, 7 Feb 2020 15:33:40 +0800 Subject: [PATCH 112/147] net: stmmac: xgmac: fix incorrect XGMAC_VLAN_TAG register writting We should always do a read of current value of XGMAC_VLAN_TAG instead of directly overwriting the register value. Fixes: 3cd1cfcba26e2 ("net: stmmac: Implement VLAN Hash Filtering in XGMAC") Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 2af3ac5409b7..a0e67d178280 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -569,7 +569,9 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash, writel(value, ioaddr + XGMAC_PACKET_FILTER); - value = XGMAC_VLAN_VTHM | XGMAC_VLAN_ETV; + value = readl(ioaddr + XGMAC_VLAN_TAG); + + value |= XGMAC_VLAN_VTHM | XGMAC_VLAN_ETV; if (is_double) { value |= XGMAC_VLAN_EDVLP; value |= XGMAC_VLAN_ESVL; @@ -584,7 +586,9 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash, writel(value, ioaddr + XGMAC_PACKET_FILTER); - value = XGMAC_VLAN_ETV; + value = readl(ioaddr + XGMAC_VLAN_TAG); + + value |= XGMAC_VLAN_ETV; if (is_double) { value |= XGMAC_VLAN_EDVLP; value |= XGMAC_VLAN_ESVL; From 2ba31cd93784b61813226d259fd94a221ecd9d61 Mon Sep 17 00:00:00 2001 From: "Verma, Aashish" Date: Fri, 7 Feb 2020 15:33:54 +0800 Subject: [PATCH 113/147] net: stmmac: fix missing IFF_MULTICAST check in dwmac4_set_filter Without checking for IFF_MULTICAST flag, it is wrong to assume multicast filtering is always enabled. By checking against IFF_MULTICAST, now the driver behaves correctly when the multicast support is toggled by below command:- ip link set multicast off|on Fixes: 477286b53f55 ("stmmac: add GMAC4 core support") Signed-off-by: Verma, Aashish Tested-by: Tan, Tee Min Signed-off-by: Ong Boon Leong Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 4d8eef9ff137..dc09d2131e40 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -420,7 +420,7 @@ static void dwmac4_set_filter(struct mac_device_info *hw, value |= GMAC_PACKET_FILTER_PM; /* Set all the bits of the HASH tab */ memset(mc_filter, 0xff, sizeof(mc_filter)); - } else if (!netdev_mc_empty(dev)) { + } else if (!netdev_mc_empty(dev) && (dev->flags & IFF_MULTICAST)) { struct netdev_hw_addr *ha; /* Hash filter for multicast */ From 2f633d5820e4ed870f408957322acb9263bce2f4 Mon Sep 17 00:00:00 2001 From: "Tan, Tee Min" Date: Fri, 7 Feb 2020 15:34:15 +0800 Subject: [PATCH 114/147] net: stmmac: xgmac: fix missing IFF_MULTICAST checki in dwxgmac2_set_filter Without checking for IFF_MULTICAST flag, it is wrong to assume multicast filtering is always enabled. By checking against IFF_MULTICAST, now the driver behaves correctly when the multicast support is toggled by below command:- ip link set multicast off|on Fixes: 0efedbf11f07a ("net: stmmac: xgmac: Fix XGMAC selftests") Signed-off-by: Tan, Tee Min Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index a0e67d178280..67b754a56288 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -458,7 +458,7 @@ static void dwxgmac2_set_filter(struct mac_device_info *hw, for (i = 0; i < XGMAC_MAX_HASH_TABLE; i++) writel(~0x0, ioaddr + XGMAC_HASH_TABLE(i)); - } else if (!netdev_mc_empty(dev)) { + } else if (!netdev_mc_empty(dev) && (dev->flags & IFF_MULTICAST)) { struct netdev_hw_addr *ha; value |= XGMAC_FILTER_HMC; From 909c1dde67c433f1e4122f2619cbd8ac370fcf0a Mon Sep 17 00:00:00 2001 From: Voon Weifeng Date: Fri, 7 Feb 2020 15:34:28 +0800 Subject: [PATCH 115/147] net: stmmac: update pci platform data to use phy_interface The recent patch to support passive mode converter did not take care the phy interface configuration in PCI platform data. Hence, converting all the PCI platform data from plat->interface to plat->phy_interface as the default mode is meant for PHY. Fixes: 0060c8783330 ("net: stmmac: implement support for passive mode converters via dt") Signed-off-by: Voon Weifeng Tested-by: Tan, Tee Min Signed-off-by: Ong Boon Leong Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 623521052152..fe2c9fa6a71c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -95,7 +95,7 @@ static int stmmac_default_data(struct pci_dev *pdev, plat->bus_id = 1; plat->phy_addr = 0; - plat->interface = PHY_INTERFACE_MODE_GMII; + plat->phy_interface = PHY_INTERFACE_MODE_GMII; plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; @@ -217,7 +217,8 @@ static int ehl_sgmii_data(struct pci_dev *pdev, { plat->bus_id = 1; plat->phy_addr = 0; - plat->interface = PHY_INTERFACE_MODE_SGMII; + plat->phy_interface = PHY_INTERFACE_MODE_SGMII; + return ehl_common_data(pdev, plat); } @@ -230,7 +231,8 @@ static int ehl_rgmii_data(struct pci_dev *pdev, { plat->bus_id = 1; plat->phy_addr = 0; - plat->interface = PHY_INTERFACE_MODE_RGMII; + plat->phy_interface = PHY_INTERFACE_MODE_RGMII; + return ehl_common_data(pdev, plat); } @@ -258,7 +260,7 @@ static int tgl_sgmii_data(struct pci_dev *pdev, { plat->bus_id = 1; plat->phy_addr = 0; - plat->interface = PHY_INTERFACE_MODE_SGMII; + plat->phy_interface = PHY_INTERFACE_MODE_SGMII; return tgl_common_data(pdev, plat); } @@ -358,7 +360,7 @@ static int quark_default_data(struct pci_dev *pdev, plat->bus_id = pci_dev_id(pdev); plat->phy_addr = ret; - plat->interface = PHY_INTERFACE_MODE_RMII; + plat->phy_interface = PHY_INTERFACE_MODE_RMII; plat->dma_cfg->pbl = 16; plat->dma_cfg->pblx8 = true; @@ -415,7 +417,7 @@ static int snps_gmac5_default_data(struct pci_dev *pdev, plat->bus_id = 1; plat->phy_addr = -1; - plat->interface = PHY_INTERFACE_MODE_GMII; + plat->phy_interface = PHY_INTERFACE_MODE_GMII; plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; From 73a21fa817f0cc8022dc6226250a86bca727a56d Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Tue, 4 Feb 2020 12:08:58 +0200 Subject: [PATCH 116/147] dpaa_eth: support all modes with rate adapting PHYs Stop removing modes that are not supported on the system interface when the connected PHY is capable of rate adaptation. This addresses an issue with the LS1046ARDB board 10G interface no longer working with an 1G link partner after autonegotiation support was added for the Aquantia PHY on board in commit 09c4c57f7bc4 ("net: phy: aquantia: add support for auto-negotiation configuration") Before this commit the values advertised by the PHY were not influenced by the dpaa_eth driver removal of system-side unsupported modes as the aqr_config_aneg() was basically a no-op. After this commit, the modes removed by the dpaa_eth driver were no longer advertised thus autonegotiation with 1G link partners failed. Reported-by: Mian Yousaf Kaukab Signed-off-by: Madalin Bucur Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 09dbcd819d84..fd93d542f497 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2453,6 +2453,9 @@ static void dpaa_adjust_link(struct net_device *net_dev) mac_dev->adjust_link(mac_dev); } +/* The Aquantia PHYs are capable of performing rate adaptation */ +#define PHY_VEND_AQUANTIA 0x03a1b400 + static int dpaa_phy_init(struct net_device *net_dev) { __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; @@ -2471,9 +2474,14 @@ static int dpaa_phy_init(struct net_device *net_dev) return -ENODEV; } - /* Remove any features not supported by the controller */ - ethtool_convert_legacy_u32_to_link_mode(mask, mac_dev->if_support); - linkmode_and(phy_dev->supported, phy_dev->supported, mask); + /* Unless the PHY is capable of rate adaptation */ + if (mac_dev->phy_if != PHY_INTERFACE_MODE_XGMII || + ((phy_dev->drv->phy_id & GENMASK(31, 10)) != PHY_VEND_AQUANTIA)) { + /* remove any features not supported by the controller */ + ethtool_convert_legacy_u32_to_link_mode(mask, + mac_dev->if_support); + linkmode_and(phy_dev->supported, phy_dev->supported, mask); + } phy_support_asym_pause(phy_dev); From 0f378d73d429d5f73fe2f00be4c9a15dbe9779ee Mon Sep 17 00:00:00 2001 From: Tony W Wang-oc Date: Wed, 15 Jan 2020 16:22:19 +0800 Subject: [PATCH 117/147] x86/apic: Mask IOAPIC entries when disabling the local APIC When a system suspends, the local APIC is disabled in the suspend sequence, but the IOAPIC is left in the current state. This means unmasked interrupt lines stay unmasked. This is usually the case for IOAPIC pin 9 to which the ACPI interrupt is connected. That means that in suspended state the IOAPIC can respond to an external interrupt, e.g. the wakeup via keyboard/RTC/ACPI, but the interrupt message cannot be handled by the disabled local APIC. As a consequence the Remote IRR bit is set, but the local APIC does not send an EOI to acknowledge it. This causes the affected interrupt line to become stale and the stale Remote IRR bit will cause a hang when __synchronize_hardirq() is invoked for that interrupt line. To prevent this, mask all IOAPIC entries before disabling the local APIC. The resume code already has the unmask operation inside. [ tglx: Massaged changelog ] Signed-off-by: Tony W Wang-oc Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1579076539-7267-1-git-send-email-TonyWWang-oc@zhaoxin.com --- arch/x86/kernel/apic/apic.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4b0f9117e1cd..5f973fed3c9f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2639,6 +2639,13 @@ static int lapic_suspend(void) #endif local_irq_save(flags); + + /* + * Mask IOAPIC before disabling the local APIC to prevent stale IRR + * entries on some implementations. + */ + mask_ioapic_entries(); + disable_local_APIC(); irq_remapping_disable(); From 1e474b28e78897d0d170fab3b28ba683149cb9ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Feb 2020 06:34:09 -0800 Subject: [PATCH 118/147] smp/up: Make smp_call_function_single() match SMP semantics In CONFIG_SMP=y kernels, smp_call_function_single() returns -ENXIO when invoked for a non-existent CPU. 
In contrast, in CONFIG_SMP=n kernels, a splat is emitted and smp_call_function_single() otherwise silently ignores its "cpu" argument, instead pretending that the caller intended to have something happen on CPU 0. Given that there is now code that expects smp_call_function_single() to return an error if a bad CPU was specified, this difference in semantics needs to be addressed. Bring the semantics of the CONFIG_SMP=n version of smp_call_function_single() into alignment with its CONFIG_SMP=y counterpart. Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200205143409.GA7021@paulmck-ThinkPad-P72 --- kernel/up.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/up.c b/kernel/up.c index 53144d056252..c6f323dcd45b 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -14,7 +14,8 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { unsigned long flags; - WARN_ON(cpu != 0); + if (cpu != 0) + return -ENXIO; local_irq_save(flags); func(info); From db3fa271022dacb9f741b96ea4714461a8911bb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2020 07:16:37 -0800 Subject: [PATCH 119/147] ipv6/addrconf: fix potential NULL deref in inet6_set_link_af() __in6_dev_get(dev) called from inet6_set_link_af() can return NULL. The needed check has been recently removed, let's add it back. While do_setlink() does call validate_linkmsg() : ... err = validate_linkmsg(dev, tb); /* OK at this point */ ... It is possible that the following call happening before the ->set_link_af() removes IPv6 if MTU is less than 1280 : if (tb[IFLA_MTU]) { err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } ... if (tb[IFLA_AF_SPEC]) { ... err = af_ops->set_link_af(dev, af); ->inet6_set_link_af() // CRASH because idev is NULL Please note that IPv4 is immune to the bug since inet_set_link_af() does : struct in_device *in_dev = __in_dev_get_rcu(dev); if (!in_dev) return -EAFNOSUPPORT; This problem has been mentioned in commit cf7afbfeb8ce ("rtnl: make link af-specific updates atomic") changelog : This method is not fail proof, while it is currently sufficient to make set_link_af() inerrable and thus 100% atomic, the validation function method will not be able to detect all error scenarios in the future, there will likely always be errors depending on states which are f.e. not protected by rtnl_mutex and thus may change between validation and setting. 
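For illustration only, here is a minimal, hedged sketch of the guard this fix adds, mirroring the IPv4 behaviour quoted above; the function name is hypothetical and the real change is the one-line check in the diff further below.

    /* Hedged sketch; example_set_link_af() is a made-up name. */
    static int example_set_link_af(struct net_device *dev,
                                   const struct nlattr *nla)
    {
            struct inet6_dev *idev = __in6_dev_get(dev);

            /* A preceding IFLA_MTU change below 1280 can tear down IPv6
             * on the device, leaving no inet6_dev behind.
             */
            if (!idev)
                    return -EAFNOSUPPORT;

            /* ... parse and apply the IFLA_INET6_* attributes ... */
            return 0;
    }

The syzbot report below shows the crash that this check prevents.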
IPv6: ADDRCONF(NETDEV_CHANGE): lo: link becomes ready general protection fault, probably for non-canonical address 0xdffffc0000000056: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000002b0-0x00000000000002b7] CPU: 0 PID: 9698 Comm: syz-executor712 Not tainted 5.5.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:inet6_set_link_af+0x66e/0xae0 net/ipv6/addrconf.c:5733 Code: 38 d0 7f 08 84 c0 0f 85 20 03 00 00 48 8d bb b0 02 00 00 45 0f b6 64 24 04 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 1a 03 00 00 44 89 a3 b0 02 00 RSP: 0018:ffffc90005b06d40 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff86df39a6 RDX: 0000000000000056 RSI: ffffffff86df3e74 RDI: 00000000000002b0 RBP: ffffc90005b06e70 R08: ffff8880a2ac0380 R09: ffffc90005b06db0 R10: fffff52000b60dbe R11: ffffc90005b06df7 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8880a1fcc424 R15: dffffc0000000000 FS: 0000000000c46880(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f0494ca0d0 CR3: 000000009e4ac000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: do_setlink+0x2a9f/0x3720 net/core/rtnetlink.c:2754 rtnl_group_changelink net/core/rtnetlink.c:3103 [inline] __rtnl_newlink+0xdd1/0x1790 net/core/rtnetlink.c:3257 rtnl_newlink+0x69/0xa0 net/core/rtnetlink.c:3377 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __do_sys_sendmsg net/socket.c:2439 [inline] __se_sys_sendmsg net/socket.c:2437 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4402e9 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffd62fbcf8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004402e9 RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000008 R09: 00000000004002c8 R10: 0000000000000005 R11: 0000000000000246 R12: 0000000000401b70 R13: 0000000000401c00 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: ---[ end trace cfa7664b8fdcdff3 ]--- RIP: 0010:inet6_set_link_af+0x66e/0xae0 net/ipv6/addrconf.c:5733 Code: 38 d0 7f 08 84 c0 0f 85 20 03 00 00 48 8d bb b0 02 00 00 45 0f b6 64 24 04 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 1a 03 00 00 44 89 a3 b0 02 00 RSP: 0018:ffffc90005b06d40 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff86df39a6 RDX: 0000000000000056 RSI: ffffffff86df3e74 RDI: 00000000000002b0 RBP: ffffc90005b06e70 R08: ffff8880a2ac0380 R09: ffffc90005b06db0 
R10: fffff52000b60dbe R11: ffffc90005b06df7 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8880a1fcc424 R15: dffffc0000000000 FS: 0000000000c46880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000004 CR3: 000000009e4ac000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 7dc2bccab0ee ("Validate required parameters in inet6_validate_link_af") Signed-off-by: Eric Dumazet Bisected-and-reported-by: syzbot Cc: Maxim Mikityanskiy Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 39d861d00377..cb493e15959c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5718,6 +5718,9 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) struct nlattr *tb[IFLA_INET6_MAX + 1]; int err; + if (!idev) + return -EAFNOSUPPORT; + if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) BUG(); From f8c2afa66d5397b0b9293c4347dac6dabb327685 Mon Sep 17 00:00:00 2001 From: Razvan Stefanescu Date: Fri, 7 Feb 2020 17:44:04 +0200 Subject: [PATCH 120/147] net: dsa: microchip: enable module autoprobe This matches /sys/devices/.../spi1.0/modalias content. Fixes: 9b2d9f05cddf ("net: dsa: microchip: add ksz9567 to ksz9477 driver") Fixes: d9033ae95cf4 ("net: dsa: microchip: add KSZ8563 compatibility string") Fixes: 8c29bebb1f8a ("net: dsa: microchip: add KSZ9893 switch support") Fixes: 45316818371d ("net: dsa: add support for ksz9897 ethernet switch") Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477") Signed-off-by: Razvan Stefanescu Signed-off-by: Codrin Ciubotariu Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz9477_spi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz9477_spi.c b/drivers/net/dsa/microchip/ksz9477_spi.c index c5f64959a184..1142768969c2 100644 --- a/drivers/net/dsa/microchip/ksz9477_spi.c +++ b/drivers/net/dsa/microchip/ksz9477_spi.c @@ -101,6 +101,12 @@ static struct spi_driver ksz9477_spi_driver = { module_spi_driver(ksz9477_spi_driver); +MODULE_ALIAS("spi:ksz9477"); +MODULE_ALIAS("spi:ksz9897"); +MODULE_ALIAS("spi:ksz9893"); +MODULE_ALIAS("spi:ksz9563"); +MODULE_ALIAS("spi:ksz8563"); +MODULE_ALIAS("spi:ksz9567"); MODULE_AUTHOR("Woojung Huh "); MODULE_DESCRIPTION("Microchip KSZ9477 Series Switch SPI access Driver"); MODULE_LICENSE("GPL"); From 0508ff8934f40b52a78313049b96eec29a46ba49 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 7 Feb 2020 19:26:24 +0200 Subject: [PATCH 121/147] mlxsw: spectrum_router: Prevent incorrect replacement of local table routes The driver uses the same table to represent both the main and local routing tables. Prevent routes in the main table from replacing routes in the local table to reflect the fact that the local table is consulted first during lookup. Fixes: b6a1d871d37a ("mlxsw: spectrum_router: Start using new IPv4 route notifications") Fixes: dacad7b34b59 ("mlxsw: spectrum_router: Start using new IPv6 route notifications") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index ce707723f8cf..f8b3869672c3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4844,6 +4844,23 @@ mlxsw_sp_fib_node_entry_unlink(struct mlxsw_sp *mlxsw_sp, fib_node->fib_entry = NULL; } +static bool mlxsw_sp_fib4_allow_replace(struct mlxsw_sp_fib4_entry *fib4_entry) +{ + struct mlxsw_sp_fib_node *fib_node = fib4_entry->common.fib_node; + struct mlxsw_sp_fib4_entry *fib4_replaced; + + if (!fib_node->fib_entry) + return true; + + fib4_replaced = container_of(fib_node->fib_entry, + struct mlxsw_sp_fib4_entry, common); + if (fib4_entry->tb_id == RT_TABLE_MAIN && + fib4_replaced->tb_id == RT_TABLE_LOCAL) + return false; + + return true; +} + static int mlxsw_sp_router_fib4_replace(struct mlxsw_sp *mlxsw_sp, const struct fib_entry_notifier_info *fen_info) @@ -4872,6 +4889,12 @@ mlxsw_sp_router_fib4_replace(struct mlxsw_sp *mlxsw_sp, goto err_fib4_entry_create; } + if (!mlxsw_sp_fib4_allow_replace(fib4_entry)) { + mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); + return 0; + } + replaced = fib_node->fib_entry; err = mlxsw_sp_fib_node_entry_link(mlxsw_sp, &fib4_entry->common); if (err) { @@ -4908,7 +4931,7 @@ static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp, return; fib4_entry = mlxsw_sp_fib4_entry_lookup(mlxsw_sp, fen_info); - if (WARN_ON(!fib4_entry)) + if (!fib4_entry) return; fib_node = fib4_entry->common.fib_node; @@ -5408,6 +5431,27 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp, return NULL; } +static bool mlxsw_sp_fib6_allow_replace(struct mlxsw_sp_fib6_entry *fib6_entry) +{ + struct mlxsw_sp_fib_node *fib_node = fib6_entry->common.fib_node; + struct mlxsw_sp_fib6_entry *fib6_replaced; + struct fib6_info *rt, *rt_replaced; + + if (!fib_node->fib_entry) + return true; + + fib6_replaced = container_of(fib_node->fib_entry, + struct mlxsw_sp_fib6_entry, + common); + rt = mlxsw_sp_fib6_entry_rt(fib6_entry); + rt_replaced = mlxsw_sp_fib6_entry_rt(fib6_replaced); + if (rt->fib6_table->tb6_id == RT_TABLE_MAIN && + rt_replaced->fib6_table->tb6_id == RT_TABLE_LOCAL) + return false; + + return true; +} + static int mlxsw_sp_router_fib6_replace(struct mlxsw_sp *mlxsw_sp, struct fib6_info **rt_arr, unsigned int nrt6) @@ -5442,6 +5486,12 @@ static int mlxsw_sp_router_fib6_replace(struct mlxsw_sp *mlxsw_sp, goto err_fib6_entry_create; } + if (!mlxsw_sp_fib6_allow_replace(fib6_entry)) { + mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); + mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); + return 0; + } + replaced = fib_node->fib_entry; err = mlxsw_sp_fib_node_entry_link(mlxsw_sp, &fib6_entry->common); if (err) From 6c05ca26f1153b4346ffd03cf65910ecc9cabbbf Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 7 Feb 2020 19:26:25 +0200 Subject: [PATCH 122/147] selftests: mlxsw: Add test cases for local table route replacement Test that routes in the main table do not replace identical routes in the local table and that routes in the local table do replace identical routes in the main table. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../selftests/drivers/net/mlxsw/fib.sh | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tools/testing/selftests/drivers/net/mlxsw/fib.sh b/tools/testing/selftests/drivers/net/mlxsw/fib.sh index 45115f81c2b1..eab79b9e58cd 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/fib.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/fib.sh @@ -14,6 +14,7 @@ ALL_TESTS=" ipv4_plen ipv4_replay ipv4_flush + ipv4_local_replace ipv6_add ipv6_metric ipv6_append_single @@ -26,6 +27,7 @@ ALL_TESTS=" ipv6_delete_multipath ipv6_replay_single ipv6_replay_multipath + ipv6_local_replace " NUM_NETIFS=0 source $lib_dir/lib.sh @@ -89,6 +91,43 @@ ipv4_flush() fib_ipv4_flush_test "testns1" } +ipv4_local_replace() +{ + local ns="testns1" + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add table local 192.0.2.1/32 dev dummy1 + fib4_trap_check $ns "table local 192.0.2.1/32 dev dummy1" false + check_err $? "Local table route not in hardware when should" + + ip -n $ns route add table main 192.0.2.1/32 dev dummy1 + fib4_trap_check $ns "table main 192.0.2.1/32 dev dummy1" true + check_err $? "Main table route in hardware when should not" + + fib4_trap_check $ns "table local 192.0.2.1/32 dev dummy1" false + check_err $? "Local table route was replaced when should not" + + # Test that local routes can replace routes in main table. + ip -n $ns route add table main 192.0.2.2/32 dev dummy1 + fib4_trap_check $ns "table main 192.0.2.2/32 dev dummy1" false + check_err $? "Main table route not in hardware when should" + + ip -n $ns route add table local 192.0.2.2/32 dev dummy1 + fib4_trap_check $ns "table local 192.0.2.2/32 dev dummy1" false + check_err $? "Local table route did not replace route in main table when should" + + fib4_trap_check $ns "table main 192.0.2.2/32 dev dummy1" true + check_err $? "Main table route was not replaced when should" + + log_test "IPv4 local table route replacement" + + ip -n $ns link del dev dummy1 +} + ipv6_add() { fib_ipv6_add_test "testns1" @@ -149,6 +188,43 @@ ipv6_replay_multipath() fib_ipv6_replay_multipath_test "testns1" "$DEVLINK_DEV" } +ipv6_local_replace() +{ + local ns="testns1" + + RET=0 + + ip -n $ns link add name dummy1 type dummy + ip -n $ns link set dev dummy1 up + + ip -n $ns route add table local 2001:db8:1::1/128 dev dummy1 + fib6_trap_check $ns "table local 2001:db8:1::1/128 dev dummy1" false + check_err $? "Local table route not in hardware when should" + + ip -n $ns route add table main 2001:db8:1::1/128 dev dummy1 + fib6_trap_check $ns "table main 2001:db8:1::1/128 dev dummy1" true + check_err $? "Main table route in hardware when should not" + + fib6_trap_check $ns "table local 2001:db8:1::1/128 dev dummy1" false + check_err $? "Local table route was replaced when should not" + + # Test that local routes can replace routes in main table. + ip -n $ns route add table main 2001:db8:1::2/128 dev dummy1 + fib6_trap_check $ns "table main 2001:db8:1::2/128 dev dummy1" false + check_err $? "Main table route not in hardware when should" + + ip -n $ns route add table local 2001:db8:1::2/128 dev dummy1 + fib6_trap_check $ns "table local 2001:db8:1::2/128 dev dummy1" false + check_err $? "Local route route did not replace route in main table when should" + + fib6_trap_check $ns "table main 2001:db8:1::2/128 dev dummy1" true + check_err $? 
"Main table route was not replaced when should" + + log_test "IPv6 local table route replacement" + + ip -n $ns link del dev dummy1 +} + setup_prepare() { ip netns add testns1 From 490f0542a7f75572c841afa92da83b549e7d47f9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 7 Feb 2020 19:26:26 +0200 Subject: [PATCH 123/147] mlxsw: spectrum_router: Clear offload indication from IPv6 nexthops on abort Unlike IPv4, in IPv6 there is no unique structure to represent the nexthop and both the route and nexthop information are squashed to the same structure ('struct fib6_info'). In order to improve resource utilization the driver consolidates identical nexthop groups to the same internal representation of a nexthop group. Therefore, when the offload indication of a nexthop changes, the driver needs to iterate over all the linked fib6_info and toggle their offload flag accordingly. During abort, all the routes are removed from the device and unlinked from their nexthop group. The offload indication is cleared just before the group is destroyed, but by that time no fib6_info is linked to the group and the offload indication remains set. Fix this by clearing the offload indication just before dropping the reference from the nexthop. Fixes: ee5a0448e72b ("mlxsw: spectrum_router: Set hardware flags for routes") Reported-by: Alex Kushnarov Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Tested-by: Alex Kushnarov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index f8b3869672c3..4a77b511ead2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4993,6 +4993,9 @@ static void mlxsw_sp_rt6_release(struct fib6_info *rt) static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) { + struct fib6_nh *fib6_nh = mlxsw_sp_rt6->rt->fib6_nh; + + fib6_nh->fib_nh_flags &= ~RTNH_F_OFFLOAD; mlxsw_sp_rt6_release(mlxsw_sp_rt6->rt); kfree(mlxsw_sp_rt6); } From 36844c855b896f90bab51ccecf72940eb7e3cfe1 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Fri, 7 Feb 2020 19:26:27 +0200 Subject: [PATCH 124/147] mlxsw: core: Add validation of hardware device types for MGPIR register When reading the number of gearboxes from the hardware, the driver does not validate the returned 'device type' field. The driver can therefore wrongly assume that the queried devices are gearboxes. On Spectrum-3 systems that support different types of devices, this can prevent the driver from loading, as it will try to query the temperature sensors from devices which it assumes are gearboxes and in fact are not. For example: [ 218.129230] mlxsw_minimal 2-0048: Reg cmd access status failed (status=7(bad parameter)) [ 218.138282] mlxsw_minimal 2-0048: Reg cmd access failed (reg_id=900a(mtmp),type=write) [ 218.147131] mlxsw_minimal 2-0048: Failed to setup temp sensor number 256 [ 218.534480] mlxsw_minimal 2-0048: Fail to register core bus [ 218.540714] mlxsw_minimal: probe of 2-0048 failed with error -5 Fix this by validating the 'device type' field. Fixes: 2e265a8b6c094 ("mlxsw: core: Extend hwmon interface with inter-connect temperature attributes") Fixes: f14f4e621b1b4 ("mlxsw: core: Extend thermal core with per inter-connect device thermal zones") Signed-off-by: Vadim Pasternak Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c | 6 ++++-- drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c index 9bf8da5f6daf..3fe878d7c94c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c @@ -573,6 +573,7 @@ static int mlxsw_hwmon_module_init(struct mlxsw_hwmon *mlxsw_hwmon) static int mlxsw_hwmon_gearbox_init(struct mlxsw_hwmon *mlxsw_hwmon) { + enum mlxsw_reg_mgpir_device_type device_type; int index, max_index, sensor_index; char mgpir_pl[MLXSW_REG_MGPIR_LEN]; char mtmp_pl[MLXSW_REG_MTMP_LEN]; @@ -584,8 +585,9 @@ static int mlxsw_hwmon_gearbox_init(struct mlxsw_hwmon *mlxsw_hwmon) if (err) return err; - mlxsw_reg_mgpir_unpack(mgpir_pl, &gbox_num, NULL, NULL, NULL); - if (!gbox_num) + mlxsw_reg_mgpir_unpack(mgpir_pl, &gbox_num, &device_type, NULL, NULL); + if (device_type != MLXSW_REG_MGPIR_DEVICE_TYPE_GEARBOX_DIE || + !gbox_num) return 0; index = mlxsw_hwmon->module_sensor_max; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index c721b171bd8d..ce0a6837daa3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -895,8 +895,10 @@ static int mlxsw_thermal_gearboxes_init(struct device *dev, struct mlxsw_core *core, struct mlxsw_thermal *thermal) { + enum mlxsw_reg_mgpir_device_type device_type; struct mlxsw_thermal_module *gearbox_tz; char mgpir_pl[MLXSW_REG_MGPIR_LEN]; + u8 gbox_num; int i; int err; @@ -908,11 +910,13 @@ mlxsw_thermal_gearboxes_init(struct device *dev, struct mlxsw_core *core, if (err) return err; - mlxsw_reg_mgpir_unpack(mgpir_pl, &thermal->tz_gearbox_num, NULL, NULL, + mlxsw_reg_mgpir_unpack(mgpir_pl, &gbox_num, &device_type, NULL, NULL); - if (!thermal->tz_gearbox_num) + if (device_type != MLXSW_REG_MGPIR_DEVICE_TYPE_GEARBOX_DIE || + !gbox_num) return 0; + thermal->tz_gearbox_num = gbox_num; thermal->tz_gearbox_arr = kcalloc(thermal->tz_gearbox_num, sizeof(*thermal->tz_gearbox_arr), GFP_KERNEL); From 3a99cbb6fa7bca1995586ec2dc21b0368aad4937 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 7 Feb 2020 19:26:28 +0200 Subject: [PATCH 125/147] mlxsw: spectrum_dpipe: Add missing error path In case devlink_dpipe_entry_ctx_prepare() failed, release RTNL that was previously taken and free the memory allocated by mlxsw_sp_erif_entry_prepare(). Fixes: 2ba5999f009d ("mlxsw: spectrum: Add Support for erif table entries access") Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index 49933818c6f5..2dc0978428e6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -215,7 +215,7 @@ mlxsw_sp_dpipe_table_erif_entries_dump(void *priv, bool counters_enabled, start_again: err = devlink_dpipe_entry_ctx_prepare(dump_ctx); if (err) - return err; + goto err_ctx_prepare; j = 0; for (; i < rif_count; i++) { struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i); @@ -247,6 +247,7 @@ start_again: return 0; err_entry_append: err_entry_get: +err_ctx_prepare: rtnl_unlock(); devlink_dpipe_entry_clear(&entry); return err; From dfa7f709596be5ca46c070d4f8acbb344322056a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 7 Feb 2020 19:29:28 +0200 Subject: [PATCH 126/147] drop_monitor: Do not cancel uninitialized work item Drop monitor uses a work item that takes care of constructing and sending netlink notifications to user space. In case drop monitor never started to monitor, then the work item is uninitialized and not associated with a function. Therefore, a stop command from user space results in canceling an uninitialized work item which leads to the following warning [1]. Fix this by not processing a stop command if drop monitor is not currently monitoring. [1] [ 31.735402] ------------[ cut here ]------------ [ 31.736470] WARNING: CPU: 0 PID: 143 at kernel/workqueue.c:3032 __flush_work+0x89f/0x9f0 ... [ 31.738120] CPU: 0 PID: 143 Comm: dwdump Not tainted 5.5.0-custom-09491-g16d4077796b8 #727 [ 31.741968] RIP: 0010:__flush_work+0x89f/0x9f0 ... [ 31.760526] Call Trace: [ 31.771689] __cancel_work_timer+0x2a6/0x3b0 [ 31.776809] net_dm_cmd_trace+0x300/0xef0 [ 31.777549] genl_rcv_msg+0x5c6/0xd50 [ 31.781005] netlink_rcv_skb+0x13b/0x3a0 [ 31.784114] genl_rcv+0x29/0x40 [ 31.784720] netlink_unicast+0x49f/0x6a0 [ 31.787148] netlink_sendmsg+0x7cf/0xc80 [ 31.790426] ____sys_sendmsg+0x620/0x770 [ 31.793458] ___sys_sendmsg+0xfd/0x170 [ 31.802216] __sys_sendmsg+0xdf/0x1a0 [ 31.806195] do_syscall_64+0xa0/0x540 [ 31.806885] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 8e94c3bc922e ("drop_monitor: Allow user to start monitoring hardware drops") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ea46fc6aa883..31700e0c3928 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1000,8 +1000,10 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { int cpu; - if (!monitor_hw) + if (!monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); + return; + } monitor_hw = false; From f9f21cea311340f38074ff93a8d89b4a9cae6bcc Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 6 Feb 2020 11:15:21 -0800 Subject: [PATCH 127/147] genirq: Clarify that irq wake state is orthogonal to enable/disable There's some confusion around if an irq that's disabled with disable_irq() can still wake the system from sleep states such as "suspend to RAM". Clarify this in the kernel documentation for irq_set_irq_wake() so that it's clear that an irq can be disabled and still wake the system if it has been marked for wakeup. 
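As a concrete illustration of the clarified semantics, a driver can keep its interrupt disabled across suspend and still use it as a wake source. The sketch below is hypothetical (the "foo" driver and its fields are invented) and only demonstrates the intended API usage.

    /* Hedged sketch: wake configuration set with enable_irq_wake() stays
     * in effect even while the irq is disabled with disable_irq().
     */
    static int foo_suspend(struct device *dev)
    {
            struct foo_chip *chip = dev_get_drvdata(dev);

            if (device_may_wakeup(dev))
                    enable_irq_wake(chip->irq);     /* arm as wake source */

            disable_irq(chip->irq);                 /* stop normal handling */
            return 0;
    }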
Signed-off-by: Stephen Boyd Signed-off-by: Thomas Gleixner Reviewed-by: Douglas Anderson Link: https://lkml.kernel.org/r/20200206191521.94559-1-swboyd@chromium.org --- kernel/irq/manage.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 818b2802d3e7..3089a60ea8f9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -731,6 +731,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". + * + * Note: irq enable/disable state is completely orthogonal + * to the enable/disable state of irq wake. An irq can be + * disabled with disable_irq() and still wake the system as + * long as the irq has wake enabled. If this does not hold, + * then the underlying irq chip and the related driver need + * to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { From 85b8ac01a421791d66c3a458a7f83cfd173fe3fa Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 7 Feb 2020 10:37:12 +0000 Subject: [PATCH 128/147] bpf, sockmap: Check update requirements after locking It's currently possible to insert sockets in unexpected states into a sockmap, due to a TOCTTOU when updating the map from a syscall. sock_map_update_elem checks that sk->sk_state == TCP_ESTABLISHED, locks the socket and then calls sock_map_update_common. At this point, the socket may have transitioned into another state, and the earlier assumptions don't hold anymore. Crucially, it's conceivable (though very unlikely) that a socket has become unhashed. This breaks the sockmap's assumption that it will get a callback via sk->sk_prot->unhash. Fix this by checking the (fixed) sk_type and sk_protocol without the lock, followed by a locked check of sk_state. Unfortunately it's not possible to push the check down into sock_(map|hash)_update_common, since BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB run before the socket has transitioned from TCP_SYN_RECV into TCP_ESTABLISHED. 
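The resulting order of operations can be summarised by the hedged sketch below; it is a simplified paraphrase of the change (the real code uses the sockmap's own acquire/release helpers), not the patch itself.

    /* Check the immutable fields unlocked, then recheck state under the lock. */
    static int example_sock_map_update(struct bpf_map *map, u32 idx,
                                       struct sock *sk, u64 flags)
    {
            int ret;

            if (!sock_map_sk_is_suitable(sk))       /* sk_type/sk_protocol are fixed */
                    return -EOPNOTSUPP;

            lock_sock(sk);
            if (sk->sk_state != TCP_ESTABLISHED)    /* may have changed, recheck */
                    ret = -EOPNOTSUPP;
            else
                    ret = sock_map_update_common(map, idx, sk, flags);
            release_sock(sk);

            return ret;
    }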
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200207103713.28175-1-lmb@cloudflare.com --- net/core/sock_map.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 8998e356f423..36a2433e183f 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -416,14 +416,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_map_update_common(map, idx, sk, flags); + if (sk->sk_state != TCP_ESTABLISHED) + ret = -EOPNOTSUPP; + else + ret = sock_map_update_common(map, idx, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); @@ -739,14 +741,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key, ret = -EINVAL; goto out; } - if (!sock_map_sk_is_suitable(sk) || - sk->sk_state != TCP_ESTABLISHED) { + if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); - ret = sock_hash_update_common(map, key, sk, flags); + if (sk->sk_state != TCP_ESTABLISHED) + ret = -EOPNOTSUPP; + else + ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: fput(sock->file); From d95f1e8b462c4372ac409886070bb8719d8a4d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 6 Feb 2020 11:29:06 +0100 Subject: [PATCH 129/147] bpftool: Don't crash on missing xlated program instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out the xlated program instructions can also be missing if kptr_restrict sysctl is set. This means that the previous fix to check the jited_prog_insns pointer was insufficient; add another check of the xlated_prog_insns pointer as well. Fixes: 5b79bcdf0362 ("bpftool: Don't crash on missing jited insns or ksyms") Fixes: cae73f233923 ("bpftool: use bpf_program__get_prog_info_linear() in prog.c:do_dump()") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200206102906.112551-1-toke@redhat.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index a3521deca869..b352ab041160 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -536,7 +536,7 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, buf = (unsigned char *)(info->jited_prog_insns); member_len = info->jited_prog_len; } else { /* DUMP_XLATED */ - if (info->xlated_prog_len == 0) { + if (info->xlated_prog_len == 0 || !info->xlated_prog_insns) { p_err("error retrieving insn dump: kernel.kptr_restrict set?"); return -1; } From db6a5018b6e008c1d69c6628cdaa9541b8e70940 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 6 Feb 2020 12:16:50 +0100 Subject: [PATCH 130/147] bpf, sockmap: Don't sleep while holding RCU lock on tear-down rcu_read_lock is needed to protect access to psock inside sock_map_unref when tearing down the map. However, we can't afford to sleep in lock_sock while in RCU read-side critical section. Grab the RCU lock only after we have locked the socket. 
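A minimal sketch of the resulting ordering, assuming the sockmap teardown context described above (illustrative only, not the exact patched code):

    /* lock_sock() may sleep, so it must be taken before entering the RCU
     * read-side critical section; keep the RCU section as small as possible.
     */
    static void example_teardown_one(struct sock *sk, void *link)
    {
            lock_sock(sk);          /* can sleep - outside rcu_read_lock() */
            rcu_read_lock();        /* only the psock access needs RCU */
            sock_map_unref(sk, link);
            rcu_read_unlock();
            release_sock(sk);
    }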
This fixes RCU warnings triggerable on a VM with 1 vCPU when free'ing a sockmap/sockhash that contains at least one socket: | ============================= | WARNING: suspicious RCU usage | 5.5.0-04005-g8fc91b972b73 #450 Not tainted | ----------------------------- | include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section! | | other info that might help us debug this: | | | rcu_scheduler_active = 2, debug_locks = 1 | 4 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_map_free+0x5/0x170 | #3: ffff8881368c5df8 (&stab->lock){+...}, at: sock_map_free+0x64/0x170 | | stack backtrace: | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73 #450 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep+0x105/0x190 | lock_sock_nested+0x28/0x90 | sock_map_free+0x95/0x170 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 | ============================= | WARNING: suspicious RCU usage | 5.5.0-04005-g8fc91b972b73-dirty #452 Not tainted | ----------------------------- | include/linux/rcupdate.h:272 Illegal context switch in RCU read-side critical section! | | other info that might help us debug this: | | | rcu_scheduler_active = 2, debug_locks = 1 | 4 locks held by kworker/0:1/62: | #0: ffff88813b019748 ((wq_completion)events){+.+.}, at: process_one_work+0x1d7/0x5e0 | #1: ffffc900000abe50 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x1d7/0x5e0 | #2: ffffffff82065d20 (rcu_read_lock){....}, at: sock_hash_free+0x5/0x1d0 | #3: ffff888139966e00 (&htab->buckets[i].lock){+...}, at: sock_hash_free+0x92/0x1d0 | | stack backtrace: | CPU: 0 PID: 62 Comm: kworker/0:1 Not tainted 5.5.0-04005-g8fc91b972b73-dirty #452 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x71/0xa0 | ___might_sleep+0x105/0x190 | lock_sock_nested+0x28/0x90 | sock_hash_free+0xec/0x1d0 | bpf_map_free_deferred+0x58/0x80 | process_one_work+0x260/0x5e0 | worker_thread+0x4d/0x3e0 | kthread+0x108/0x140 | ? process_one_work+0x5e0/0x5e0 | ? 
kthread_park+0x90/0x90 | ret_from_fork+0x3a/0x50 Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200206111652.694507-2-jakub@cloudflare.com --- net/core/sock_map.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 36a2433e183f..992504478e68 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -234,7 +234,6 @@ static void sock_map_free(struct bpf_map *map) int i; synchronize_rcu(); - rcu_read_lock(); raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; @@ -243,12 +242,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { lock_sock(sk); + rcu_read_lock(); sock_map_unref(sk, psk); + rcu_read_unlock(); release_sock(sk); } } raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); synchronize_rcu(); @@ -863,19 +863,19 @@ static void sock_hash_free(struct bpf_map *map) int i; synchronize_rcu(); - rcu_read_lock(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); lock_sock(elem->sk); + rcu_read_lock(); sock_map_unref(elem->sk, elem); + rcu_read_unlock(); release_sock(elem->sk); } raw_spin_unlock_bh(&bucket->lock); } - rcu_read_unlock(); bpf_map_area_free(htab->buckets); kfree(htab); From 0b2dc83906cf1e694e48003eae5df8fa63f76fd9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 6 Feb 2020 12:16:51 +0100 Subject: [PATCH 131/147] bpf, sockhash: Synchronize_rcu before free'ing map We need to have a synchronize_rcu before free'ing the sockhash because any outstanding psock references will have a pointer to the map and when they use it, this could trigger a use after free. This is a sister fix for sockhash, following commit 2bb90e5cc90e ("bpf: sockmap, synchronize_rcu before free'ing map") which addressed sockmap, which comes from a manual audit. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200206111652.694507-3-jakub@cloudflare.com --- net/core/sock_map.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 992504478e68..085cef5857bb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -250,6 +250,7 @@ static void sock_map_free(struct bpf_map *map) } raw_spin_unlock_bh(&stab->lock); + /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); @@ -877,6 +878,9 @@ static void sock_hash_free(struct bpf_map *map) raw_spin_unlock_bh(&bucket->lock); } + /* wait for psock readers accessing its map link */ + synchronize_rcu(); + bpf_map_area_free(htab->buckets); kfree(htab); } From 5d3919a953c3c96c02fc7a337f8376cde43ae31f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 6 Feb 2020 12:16:52 +0100 Subject: [PATCH 132/147] selftests/bpf: Test freeing sockmap/sockhash with a socket in it Commit 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") introduced sleeping issues inside RCU critical sections and while holding a spinlock on sockmap/sockhash tear-down. There has to be at least one socket in the map for the problem to surface. 
This adds a test that triggers the warnings for broken locking rules. Not a fix per se, but rather tooling to verify the accompanying fixes. Run on a VM with 1 vCPU to reproduce the warnings. Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200206111652.694507-4-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sockmap_basic.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c new file mode 100644 index 000000000000..07f5b462c2ef --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Cloudflare + +#include "test_progs.h" + +static int connected_socket_v4(void) +{ + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons(80), + .sin_addr = { inet_addr("127.0.0.1") }, + }; + socklen_t len = sizeof(addr); + int s, repair, err; + + s = socket(AF_INET, SOCK_STREAM, 0); + if (CHECK_FAIL(s == -1)) + goto error; + + repair = TCP_REPAIR_ON; + err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair)); + if (CHECK_FAIL(err)) + goto error; + + err = connect(s, (struct sockaddr *)&addr, len); + if (CHECK_FAIL(err)) + goto error; + + repair = TCP_REPAIR_OFF_NO_WP; + err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair)); + if (CHECK_FAIL(err)) + goto error; + + return s; +error: + perror(__func__); + close(s); + return -1; +} + +/* Create a map, populate it with one socket, and free the map. */ +static void test_sockmap_create_update_free(enum bpf_map_type map_type) +{ + const int zero = 0; + int s, map, err; + + s = connected_socket_v4(); + if (CHECK_FAIL(s == -1)) + return; + + map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); + if (CHECK_FAIL(map == -1)) { + perror("bpf_create_map"); + goto out; + } + + err = bpf_map_update_elem(map, &zero, &s, BPF_NOEXIST); + if (CHECK_FAIL(err)) { + perror("bpf_map_update"); + goto out; + } + +out: + close(map); + close(s); +} + +void test_sockmap_basic(void) +{ + if (test__start_subtest("sockmap create_update_free")) + test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash create_update_free")) + test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH); +} From 88d6f130e5632bbf419a2e184ec7adcbe241260b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 7 Feb 2020 00:18:10 -0800 Subject: [PATCH 133/147] bpf: Improve bucket_log calculation logic It was reported that the max_t, ilog2, and roundup_pow_of_two macros have exponential effects on the number of states in the sparse checker. This patch breaks them up by calculating the "nbuckets" first so that the "bucket_log" only needs to take ilog2(). In addition, Linus mentioned: Patch looks good, but I'd like to point out that it's not just sparse. You can see it with a simple make net/core/bpf_sk_storage.i grep 'smap->bucket_log = ' net/core/bpf_sk_storage.i | wc and see the end result: 1 365071 2686974 That's one line (the assignment line) that is 2,686,974 characters in length. 
Now, sparse does happen to react particularly badly to that (I didn't look to why, but I suspect it's just that evaluating all the types that don't actually ever end up getting used ends up being much more expensive than it should be), but I bet it's not good for gcc either. Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") Reported-by: Randy Dunlap Reported-by: Luc Van Oostenryck Suggested-by: Linus Torvalds Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Luc Van Oostenryck Link: https://lore.kernel.org/bpf/20200207081810.3918919-1-kafai@fb.com --- net/core/bpf_sk_storage.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 458be6b3eda9..3ab23f698221 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -643,9 +643,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); + nbuckets = roundup_pow_of_two(num_possible_cpus()); /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); - nbuckets = 1U << smap->bucket_log; + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); ret = bpf_map_charge_init(&smap->map.memory, cost); From d08f3010f4a32eec3c8aa771f03a1b342a1472fa Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 7 Feb 2020 11:29:51 +0100 Subject: [PATCH 134/147] mt76: mt7615: fix max_nss in mt7615_eeprom_parse_hw_cap Fix the u8 cast when reading max_nss from the MT_TOP_STRAP_STA register in the mt7615_eeprom_parse_hw_cap() routine. Fixes: acf5457fd99db ("mt76: mt7615: read {tx,rx} mask from eeprom") Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c b/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c index eccad4987ac8..17e277bf39e0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/eeprom.c @@ -92,8 +92,9 @@ static int mt7615_check_eeprom(struct mt76_dev *dev) static void mt7615_eeprom_parse_hw_cap(struct mt7615_dev *dev) { - u8 val, *eeprom = dev->mt76.eeprom.data; + u8 *eeprom = dev->mt76.eeprom.data; u8 tx_mask, rx_mask, max_nss; + u32 val; val = FIELD_GET(MT_EE_NIC_WIFI_CONF_BAND_SEL, eeprom[MT_EE_WIFI_CONF]); From e88bd316e5971fe78884ad1f466b9fc576575e5f Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 6 Feb 2020 15:57:06 +0800 Subject: [PATCH 135/147] irqchip/gic-v4.1: Fix programming of GICR_VPROPBASER_4_1_SIZE The Size field of the GICv4.1 VPROPBASER register indicates the number of pages minus one, and Page_Size and Size together control the vPEID width. Let's respect this requirement of the architecture.
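In other words, Size is a "minus one" field: a vPE configuration table of npg pages must be programmed as npg - 1 and decoded by adding 1 back. A hedged sketch, with invented helper names, of the symmetric encode/decode:

    /* Illustrative only: e.g. a 4-page table is programmed as Size = 3. */
    static inline u64 example_vpropbaser_size_encode(u64 npg)
    {
            return FIELD_PREP(GICR_VPROPBASER_4_1_SIZE, npg - 1);
    }

    static inline u64 example_vpropbaser_size_decode(u64 val)
    {
            return FIELD_GET(GICR_VPROPBASER_4_1_SIZE, val) + 1;
    }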
Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200206075711.1275-2-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index e5a25d97f8db..992bc72cab6f 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2531,7 +2531,7 @@ static int allocate_vpe_l1_table(void) npg = 1; } - val |= FIELD_PREP(GICR_VPROPBASER_4_1_SIZE, npg); + val |= FIELD_PREP(GICR_VPROPBASER_4_1_SIZE, npg - 1); /* Right, that's the number of CPU pages we need for L1 */ np = DIV_ROUND_UP(npg * psz, PAGE_SIZE); From 8b718d403c5cdc7f0ea492c33ec88169f3e76462 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 6 Feb 2020 15:57:07 +0800 Subject: [PATCH 136/147] irqchip/gic-v4.1: Set vpe_l1_base for all redistributors Currently, we will not set vpe_l1_page for the current RD if we can inherit the vPE configuration table from another RD (or ITS), which results in an inconsistency between RDs within the same CommonLPIAff group. Let's rename it to vpe_l1_base to indicate the base address of the vPE configuration table of this RD, and set it properly for *all* v4.1 redistributors. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200206075711.1275-3-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 5 ++++- include/linux/irqchip/arm-gic-v3.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 992bc72cab6f..0f1fe56ce0af 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2376,6 +2376,8 @@ static u64 inherit_vpe_l1_table_from_its(void) continue; /* We have a winner! */ + gic_data_rdist()->vpe_l1_base = its->tables[2].base; + val = GICR_VPROPBASER_4_1_VALID; if (baser & GITS_BASER_INDIRECT) val |= GICR_VPROPBASER_4_1_INDIRECT; @@ -2432,6 +2434,7 @@ static u64 inherit_vpe_l1_table_from_rd(cpumask_t **mask) val = gits_read_vpropbaser(base + SZ_128K + GICR_VPROPBASER); val &= ~GICR_VPROPBASER_4_1_Z; + gic_data_rdist()->vpe_l1_base = gic_data_rdist_cpu(cpu)->vpe_l1_base; *mask = gic_data_rdist_cpu(cpu)->vpe_table_mask; return val; @@ -2542,7 +2545,7 @@ static int allocate_vpe_l1_table(void) if (!page) return -ENOMEM; - gic_data_rdist()->vpe_l1_page = page; + gic_data_rdist()->vpe_l1_base = page_address(page); pa = virt_to_phys(page_address(page)); WARN_ON(!IS_ALIGNED(pa, psz)); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index f0b8ca766e7d..83439bfb6c5b 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -652,10 +652,10 @@ struct rdists { struct { void __iomem *rd_base; struct page *pend_page; - struct page *vpe_l1_page; phys_addr_t phys_base; bool lpi_enabled; cpumask_t *vpe_table_mask; + void *vpe_l1_base; } __percpu *rdist; phys_addr_t prop_table_pa; void *prop_table_va; From 4e6437f12d6e929e802f5599a2d50dfcf92d0f50 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 6 Feb 2020 15:57:08 +0800 Subject: [PATCH 137/147] irqchip/gic-v4.1: Ensure L2 vPE table is allocated at RD level In GICv4, we will ensure that level2 vPE table memory is allocated for the specified vpe_id on all v4 ITS, in its_alloc_vpe_table(). This still works well for the typical GICv4.1 implementation, where the new vPE table is shared between the ITSs and the RDs. 
To make it explicit, let us introduce allocate_vpe_l2_table() to make sure that the L2 tables are allocated on all v4.1 RDs. We likely don't need to allocate memory in it, because the vPE table is shared and the L2 table is already allocated at the ITS level, except for the case where the ITS doesn't share anything (say SVPET == 0, practically unlikely but architecturally allowed). The implementation of allocate_vpe_l2_table() is mostly copied from its_alloc_table_entry(). Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200206075711.1275-4-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 80 ++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0f1fe56ce0af..ae4e7b355b46 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2443,6 +2443,72 @@ static u64 inherit_vpe_l1_table_from_rd(cpumask_t **mask) return 0; } +static bool allocate_vpe_l2_table(int cpu, u32 id) +{ + void __iomem *base = gic_data_rdist_cpu(cpu)->rd_base; + u64 val, gpsz, npg; + unsigned int psz, esz, idx; + struct page *page; + __le64 *table; + + if (!gic_rdists->has_rvpeid) + return true; + + val = gits_read_vpropbaser(base + SZ_128K + GICR_VPROPBASER); + + esz = FIELD_GET(GICR_VPROPBASER_4_1_ENTRY_SIZE, val) + 1; + gpsz = FIELD_GET(GICR_VPROPBASER_4_1_PAGE_SIZE, val); + npg = FIELD_GET(GICR_VPROPBASER_4_1_SIZE, val) + 1; + + switch (gpsz) { + default: + WARN_ON(1); + /* fall through */ + case GIC_PAGE_SIZE_4K: + psz = SZ_4K; + break; + case GIC_PAGE_SIZE_16K: + psz = SZ_16K; + break; + case GIC_PAGE_SIZE_64K: + psz = SZ_64K; + break; + } + + /* Don't allow vpe_id that exceeds single, flat table limit */ + if (!(val & GICR_VPROPBASER_4_1_INDIRECT)) + return (id < (npg * psz / (esz * SZ_8))); + + /* Compute 1st level table index & check if that exceeds table limit */ + idx = id >> ilog2(psz / (esz * SZ_8)); + if (idx >= (npg * psz / GITS_LVL1_ENTRY_SIZE)) + return false; + + table = gic_data_rdist_cpu(cpu)->vpe_l1_base; + + /* Allocate memory for 2nd level table */ + if (!table[idx]) { + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); + if (!page) + return false; + + /* Flush Lvl2 table to PoC if hw doesn't support coherency */ + if (!(val & GICR_VPROPBASER_SHAREABILITY_MASK)) + gic_flush_dcache_to_poc(page_address(page), psz); + + table[idx] = cpu_to_le64(page_to_phys(page) | GITS_BASER_VALID); + + /* Flush Lvl1 entry to PoC if hw doesn't support coherency */ + if (!(val & GICR_VPROPBASER_SHAREABILITY_MASK)) + gic_flush_dcache_to_poc(table + idx, GITS_LVL1_ENTRY_SIZE); + + /* Ensure updated table contents are visible to RD hardware */ + dsb(sy); + } + + return true; +} + static int allocate_vpe_l1_table(void) { void __iomem *vlpi_base = gic_data_rdist_vlpi_base(); @@ -2957,6 +3023,7 @@ static bool its_alloc_device_table(struct its_node *its, u32 dev_id) static bool its_alloc_vpe_table(u32 vpe_id) { struct its_node *its; + int cpu; /* * Make sure the L2 tables are allocated on *all* v4 ITSs. We @@ -2979,6 +3046,19 @@ static bool its_alloc_vpe_table(u32 vpe_id) return false; } + /* Non v4.1? No need to iterate RDs and go back early. */ + if (!gic_rdists->has_rvpeid) + return true; + + /* + * Make sure the L2 tables are allocated for all copies of + * the L1 table on *all* v4.1 RDs.
+ */ + for_each_possible_cpu(cpu) { + if (!allocate_vpe_l2_table(cpu, vpe_id)) + return false; + } + return true; } From 4bccf1d715fe8f5fe10bd6202c8caa0ae6104cd2 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 6 Feb 2020 15:57:09 +0800 Subject: [PATCH 138/147] irqchip/gic-v4.1: Drop 'tmp' in inherit_vpe_l1_table_from_rd() The variable 'tmp' in inherit_vpe_l1_table_from_rd() is actually not needed, drop it. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200206075711.1275-5-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index ae4e7b355b46..8405ebdd9ffb 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2415,14 +2415,12 @@ static u64 inherit_vpe_l1_table_from_rd(cpumask_t **mask) for_each_possible_cpu(cpu) { void __iomem *base = gic_data_rdist_cpu(cpu)->rd_base; - u32 tmp; if (!base || cpu == smp_processor_id()) continue; val = gic_read_typer(base + GICR_TYPER); - tmp = compute_common_aff(val); - if (tmp != aff) + if (aff != compute_common_aff(val)) continue; /* From b46353250ba3b4946adcbbabead23546fcb758b0 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 6 Feb 2020 15:57:10 +0800 Subject: [PATCH 139/147] irqchip/gic-v3-its: Remove superfluous WARN_ON "ITS virtual pending table not cleaning" is already complained inside its_clear_vpend_valid(), there's no need to trigger a WARN_ON again. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200206075711.1275-6-yuzenghui@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 8405ebdd9ffb..811875bf3abb 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2857,7 +2857,6 @@ static void its_cpu_init_lpis(void) * corrupting memory. */ val = its_clear_vpend_valid(vlpi_base, 0, 0); - WARN_ON(val & GICR_VPENDBASER_Dirty); } if (allocate_vpe_l1_table()) { From 5186a6cc3ef5a3fa327c258924ef098b0de77006 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 6 Feb 2020 15:57:11 +0800 Subject: [PATCH 140/147] irqchip/gic-v3-its: Rename VPENDBASER/VPROPBASER accessors V{PEND,PROP}BASER registers are actually located in VLPI_base frame of the *redistributor*. Rename their accessors to reflect this fact. No functional changes. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200206075711.1275-7-yuzenghui@huawei.com --- arch/arm/include/asm/arch_gicv3.h | 12 ++++++------ arch/arm64/include/asm/arch_gicv3.h | 8 ++++---- drivers/irqchip/irq-gic-v3-its.c | 28 ++++++++++++++-------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index b5752f0e8936..c815477b4303 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -326,16 +326,16 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) #define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c) /* - * GITS_VPROPBASER - hi and lo bits may be accessed independently. + * GICR_VPROPBASER - hi and lo bits may be accessed independently. 
*/ -#define gits_read_vpropbaser(c) __gic_readq_nonatomic(c) -#define gits_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c) +#define gicr_read_vpropbaser(c) __gic_readq_nonatomic(c) +#define gicr_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c) /* - * GITS_VPENDBASER - the Valid bit must be cleared before changing + * GICR_VPENDBASER - the Valid bit must be cleared before changing * anything else. */ -static inline void gits_write_vpendbaser(u64 val, void __iomem *addr) +static inline void gicr_write_vpendbaser(u64 val, void __iomem *addr) { u32 tmp; @@ -352,7 +352,7 @@ static inline void gits_write_vpendbaser(u64 val, void __iomem *addr) __gic_writeq_nonatomic(val, addr); } -#define gits_read_vpendbaser(c) __gic_readq_nonatomic(c) +#define gicr_read_vpendbaser(c) __gic_readq_nonatomic(c) static inline bool gic_prio_masking_enabled(void) { diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 4750fc8030c3..25fec4bde43a 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -140,11 +140,11 @@ static inline u32 gic_read_rpr(void) #define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) #define gicr_read_pendbaser(c) readq_relaxed(c) -#define gits_write_vpropbaser(v, c) writeq_relaxed(v, c) -#define gits_read_vpropbaser(c) readq_relaxed(c) +#define gicr_write_vpropbaser(v, c) writeq_relaxed(v, c) +#define gicr_read_vpropbaser(c) readq_relaxed(c) -#define gits_write_vpendbaser(v, c) writeq_relaxed(v, c) -#define gits_read_vpendbaser(c) readq_relaxed(c) +#define gicr_write_vpendbaser(v, c) writeq_relaxed(v, c) +#define gicr_read_vpendbaser(c) readq_relaxed(c) static inline bool gic_prio_masking_enabled(void) { diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 811875bf3abb..1ee95f546cb0 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2429,7 +2429,7 @@ static u64 inherit_vpe_l1_table_from_rd(cpumask_t **mask) * ours wrt CommonLPIAff. Let's use its own VPROPBASER. * Make sure we don't write the Z bit in that case. */ - val = gits_read_vpropbaser(base + SZ_128K + GICR_VPROPBASER); + val = gicr_read_vpropbaser(base + SZ_128K + GICR_VPROPBASER); val &= ~GICR_VPROPBASER_4_1_Z; gic_data_rdist()->vpe_l1_base = gic_data_rdist_cpu(cpu)->vpe_l1_base; @@ -2452,7 +2452,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id) if (!gic_rdists->has_rvpeid) return true; - val = gits_read_vpropbaser(base + SZ_128K + GICR_VPROPBASER); + val = gicr_read_vpropbaser(base + SZ_128K + GICR_VPROPBASER); esz = FIELD_GET(GICR_VPROPBASER_4_1_ENTRY_SIZE, val) + 1; gpsz = FIELD_GET(GICR_VPROPBASER_4_1_PAGE_SIZE, val); @@ -2524,8 +2524,8 @@ static int allocate_vpe_l1_table(void) * effect of making sure no doorbell will be generated and we can * then safely clear VPROPBASER.Valid. 
*/ - if (gits_read_vpendbaser(vlpi_base + GICR_VPENDBASER) & GICR_VPENDBASER_Valid) - gits_write_vpendbaser(GICR_VPENDBASER_PendingLast, + if (gicr_read_vpendbaser(vlpi_base + GICR_VPENDBASER) & GICR_VPENDBASER_Valid) + gicr_write_vpendbaser(GICR_VPENDBASER_PendingLast, vlpi_base + GICR_VPENDBASER); /* @@ -2548,8 +2548,8 @@ static int allocate_vpe_l1_table(void) /* First probe the page size */ val = FIELD_PREP(GICR_VPROPBASER_4_1_PAGE_SIZE, GIC_PAGE_SIZE_64K); - gits_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); - val = gits_read_vpropbaser(vlpi_base + GICR_VPROPBASER); + gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); + val = gicr_read_vpropbaser(vlpi_base + GICR_VPROPBASER); gpsz = FIELD_GET(GICR_VPROPBASER_4_1_PAGE_SIZE, val); esz = FIELD_GET(GICR_VPROPBASER_4_1_ENTRY_SIZE, val); @@ -2620,7 +2620,7 @@ static int allocate_vpe_l1_table(void) val |= GICR_VPROPBASER_4_1_VALID; out: - gits_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); + gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); cpumask_set_cpu(smp_processor_id(), gic_data_rdist()->vpe_table_mask); pr_debug("CPU%d: VPROPBASER = %llx %*pbl\n", @@ -2727,14 +2727,14 @@ static u64 its_clear_vpend_valid(void __iomem *vlpi_base, u64 clr, u64 set) bool clean; u64 val; - val = gits_read_vpendbaser(vlpi_base + GICR_VPENDBASER); + val = gicr_read_vpendbaser(vlpi_base + GICR_VPENDBASER); val &= ~GICR_VPENDBASER_Valid; val &= ~clr; val |= set; - gits_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); + gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); do { - val = gits_read_vpendbaser(vlpi_base + GICR_VPENDBASER); + val = gicr_read_vpendbaser(vlpi_base + GICR_VPENDBASER); clean = !(val & GICR_VPENDBASER_Dirty); if (!clean) { count--; @@ -2849,7 +2849,7 @@ static void its_cpu_init_lpis(void) val = (LPI_NRBITS - 1) & GICR_VPROPBASER_IDBITS_MASK; pr_debug("GICv4: CPU%d: Init IDbits to 0x%llx for GICR_VPROPBASER\n", smp_processor_id(), val); - gits_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); + gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); /* * Also clear Valid bit of GICR_VPENDBASER, in case some @@ -3523,7 +3523,7 @@ static void its_vpe_schedule(struct its_vpe *vpe) val |= (LPI_NRBITS - 1) & GICR_VPROPBASER_IDBITS_MASK; val |= GICR_VPROPBASER_RaWb; val |= GICR_VPROPBASER_InnerShareable; - gits_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); + gicr_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); val = virt_to_phys(page_address(vpe->vpt_page)) & GENMASK_ULL(51, 16); @@ -3541,7 +3541,7 @@ static void its_vpe_schedule(struct its_vpe *vpe) val |= GICR_VPENDBASER_PendingLast; val |= vpe->idai ? GICR_VPENDBASER_IDAI : 0; val |= GICR_VPENDBASER_Valid; - gits_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); + gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); } static void its_vpe_deschedule(struct its_vpe *vpe) @@ -3741,7 +3741,7 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe, val |= info->g1en ? 
GICR_VPENDBASER_4_1_VGRP1EN : 0; val |= FIELD_PREP(GICR_VPENDBASER_4_1_VPEID, vpe->vpe_id); - gits_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); + gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); } static void its_vpe_4_1_deschedule(struct its_vpe *vpe, From 9dc086f1e9ef39dd823bd27954b884b2062f9e70 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 7 Feb 2020 22:15:46 +1100 Subject: [PATCH 141/147] powerpc/futex: Fix incorrect user access blocking The early versions of our kernel user access prevention (KUAP) were written by Russell and Christophe, and didn't have separate read/write access. At some point I picked up the series and added the read/write access, but I failed to update the usages in futex.h to correctly allow read and write. However we didn't notice because of another bug which was causing the low-level code to always enable read and write. That bug was fixed recently in commit 1d8f739b07bd ("powerpc/kuap: Fix set direction in allow/prevent_user_access()"). futex_atomic_cmpxchg_inatomic() is passed the user address as %3 and does: 1: lwarx %1, 0, %3 cmpw 0, %1, %4 bne- 3f 2: stwcx. %5, 0, %3 Which clearly loads and stores from/to %3. The logic in arch_futex_atomic_op_inuser() is similar, so fix both of them to use allow_read_write_user(). Without this fix, and with PPC_KUAP_DEBUG=y, we see eg: Bug: Read fault blocked by AMR! WARNING: CPU: 94 PID: 149215 at arch/powerpc/include/asm/book3s/64/kup-radix.h:126 __do_page_fault+0x600/0xf30 CPU: 94 PID: 149215 Comm: futex_requeue_p Tainted: G W 5.5.0-rc7-gcc9x-g4c25df5640ae #1 ... NIP [c000000000070680] __do_page_fault+0x600/0xf30 LR [c00000000007067c] __do_page_fault+0x5fc/0xf30 Call Trace: [c00020138e5637e0] [c00000000007067c] __do_page_fault+0x5fc/0xf30 (unreliable) [c00020138e5638c0] [c00000000000ada8] handle_page_fault+0x10/0x30 --- interrupt: 301 at cmpxchg_futex_value_locked+0x68/0xd0 LR = futex_lock_pi_atomic+0xe0/0x1f0 [c00020138e563bc0] [c000000000217b50] futex_lock_pi_atomic+0x80/0x1f0 (unreliable) [c00020138e563c30] [c00000000021b668] futex_requeue+0x438/0xb60 [c00020138e563d60] [c00000000021c6cc] do_futex+0x1ec/0x2b0 [c00020138e563d90] [c00000000021c8b8] sys_futex+0x128/0x200 [c00020138e563e20] [c00000000000b7ac] system_call+0x5c/0x68 Fixes: de78a9c42a79 ("powerpc: Add a framework for Kernel Userspace Access Protection") Cc: stable@vger.kernel.org # v5.2+ Reported-by: syzbot+e808452bad7c375cbee6@syzkaller-ppc64.appspotmail.com Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy Link: https://lore.kernel.org/r/20200207122145.11928-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/futex.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index eea28ca679db..bc7d9d06a6d9 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -35,7 +35,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, { int oldval = 0, ret; - allow_write_to_user(uaddr, sizeof(*uaddr)); + allow_read_write_user(uaddr, uaddr, sizeof(*uaddr)); pagefault_disable(); switch (op) { @@ -62,7 +62,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, *oval = oldval; - prevent_write_to_user(uaddr, sizeof(*uaddr)); + prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr)); return ret; } @@ -76,7 +76,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; - allow_write_to_user(uaddr, 
sizeof(*uaddr)); + allow_read_write_user(uaddr, uaddr, sizeof(*uaddr)); + __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ @@ -97,7 +98,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = prev; - prevent_write_to_user(uaddr, sizeof(*uaddr)); + prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr)); + return ret; }
From d4bf905307a1c90a27714ff7a9fd29b0a2ceed98 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 7 Feb 2020 17:20:57 +0000 Subject: [PATCH 142/147] powerpc: Fix CONFIG_TRACE_IRQFLAGS with CONFIG_VMAP_STACK When CONFIG_PROVE_LOCKING is selected together with (now default) CONFIG_VMAP_STACK, the kernel enters a deadlock during boot. At the point of checking whether interrupts are enabled or not, the value of the MSR saved on the stack is read using the physical address of the stack. But at this point, when using VMAP stack the DATA MMU translation has already been re-enabled, leading to deadlock. Don't use the physical address of the stack when CONFIG_VMAP_STACK is set. Signed-off-by: Christophe Leroy Reported-by: Guenter Roeck Fixes: 028474876f47 ("powerpc/32: prepare for CONFIG_VMAP_STACK") Tested-by: Guenter Roeck Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/daeacdc0dec0416d1c587cc9f9e7191ad3068dc0.1581095957.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 77abbc34bbe0..0713daa651d9 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -214,7 +214,7 @@ transfer_to_handler_cont: * To speed up the syscall path where interrupts stay on, let's check * first if we are changing the MSR value at all. */ - tophys(r12, r1) + tophys_novmstack r12, r1 lwz r12,_MSR(r12) andi. r12,r12,MSR_EE bne 1f
From 29ca3b31756fb7cfbfc85f2d82a0475bf38cc1ed Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Fri, 7 Feb 2020 12:40:26 -0800 Subject: [PATCH 143/147] net: thunderx: use proper interface type for RGMII The configuration of the OCTEONTX XCV_DLL_CTL register via xcv_init_hw() is such that the RGMII RX delay is bypassed leaving the RGMII TX delay enabled in the MAC: /* Configure DLL - enable or bypass * TX no bypass, RX bypass */ cfg = readq_relaxed(xcv->reg_base + XCV_DLL_CTL); cfg &= ~0xFF03; cfg |= CLKRX_BYP; writeq_relaxed(cfg, xcv->reg_base + XCV_DLL_CTL); This would correspond to an interface type of PHY_INTERFACE_MODE_RGMII_RXID and not PHY_INTERFACE_MODE_RGMII. Fixing this allows RGMII PHY drivers to do the right thing (enable RX delay in the PHY) instead of erroneously enabling both delays in the PHY. Signed-off-by: Tim Harvey Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index c4f6ec0cd183..17a4110c2e49 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1039,7 +1039,7 @@ static int phy_interface_mode(u8 lmac_type) if (lmac_type == BGX_MODE_QSGMII) return PHY_INTERFACE_MODE_QSGMII; if (lmac_type == BGX_MODE_RGMII) - return PHY_INTERFACE_MODE_RGMII; + return PHY_INTERFACE_MODE_RGMII_RXID; return PHY_INTERFACE_MODE_SGMII; } From b0ef7cda8d9be41ba198207c30a037bc470d9aef Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 8 Feb 2020 14:19:39 -0800 Subject: [PATCH 144/147] Fix up remaining devm_ioremap_nocache() in SGI IOC3 8250 UART driver This is a merge error on my part - the driver was merged into mainline by commit c5951e7c8ee5 ("Merge tag 'mips_5.6' of git://../mips/linux") over a week ago, but nobody apparently noticed that it didn't actually build due to still having a reference to the devm_ioremap_nocache() function, removed a few days earlier through commit 6a1000bd2703 ("Merge tag 'ioremap-5.6' of git://../ioremap"). Apparently this didn't get any build testing anywhere. Not perhaps all that surprising: it's restricted to 64-bit MIPS only, and only with the new SGI_MFD_IOC3 support enabled. I only noticed because the ioremap conflicts in the ARM SoC driver update made me check there weren't any others hiding, and I found this one. Signed-off-by: Linus Torvalds --- drivers/tty/serial/8250/8250_ioc3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_ioc3.c b/drivers/tty/serial/8250/8250_ioc3.c index 4c405f1b9c67..d5a39e105a76 100644 --- a/drivers/tty/serial/8250/8250_ioc3.c +++ b/drivers/tty/serial/8250/8250_ioc3.c @@ -47,7 +47,7 @@ static int serial8250_ioc3_probe(struct platform_device *pdev) if (!data) return -ENOMEM; - membase = devm_ioremap_nocache(&pdev->dev, r->start, resource_size(r)); + membase = devm_ioremap(&pdev->dev, r->start, resource_size(r)); if (!membase) return -ENOMEM; From 0fd169576648452725fa2949bf391d10883d3991 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 12 Dec 2019 15:09:14 +0100 Subject: [PATCH 145/147] fs: Add VirtualBox guest shared folder (vboxsf) support VirtualBox hosts can share folders with guests, this commit adds a VFS driver implementing the Linux-guest side of this, allowing folders exported by the host to be mounted under Linux. This driver depends on the guest <-> host IPC functions exported by the vboxguest driver. 
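As a usage sketch only, not taken from this patch: a guest process could mount an exported share through the mount(2) system call. The share name "myshare", the mount point "/mnt/shared", and the uid/gid values below are placeholders; the "vboxsf" filesystem type and the uid=/gid= mount options correspond to what the driver added here registers and parses.

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <sys/mount.h>

    int main(void)
    {
    	/* Mount the host-exported share "myshare" (placeholder name) on
    	 * /mnt/shared, mapping ownership of all files to uid/gid 1000.
    	 * Assumes the vboxsf module is loaded and the share is configured
    	 * on the VirtualBox host.
    	 */
    	if (mount("myshare", "/mnt/shared", "vboxsf", 0,
    		  "uid=1000,gid=1000") != 0) {
    		fprintf(stderr, "mount failed: %s\n", strerror(errno));
    		return 1;
    	}
    	return 0;
    }

The same mount is normally done from user space with mount -t vboxsf, passing the same option string.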
Acked-by: Christoph Hellwig Signed-off-by: Hans de Goede Signed-off-by: Al Viro --- MAINTAINERS | 6 + fs/Kconfig | 1 + fs/Makefile | 1 + fs/vboxsf/Kconfig | 10 + fs/vboxsf/Makefile | 5 + fs/vboxsf/dir.c | 427 +++++++++++++++++ fs/vboxsf/file.c | 379 +++++++++++++++ fs/vboxsf/shfl_hostintf.h | 901 ++++++++++++++++++++++++++++++++++++ fs/vboxsf/super.c | 491 ++++++++++++++++++++ fs/vboxsf/utils.c | 551 ++++++++++++++++++++++ fs/vboxsf/vboxsf_wrappers.c | 371 +++++++++++++++ fs/vboxsf/vfsmod.h | 137 ++++++ 12 files changed, 3280 insertions(+) create mode 100644 fs/vboxsf/Kconfig create mode 100644 fs/vboxsf/Makefile create mode 100644 fs/vboxsf/dir.c create mode 100644 fs/vboxsf/file.c create mode 100644 fs/vboxsf/shfl_hostintf.h create mode 100644 fs/vboxsf/super.c create mode 100644 fs/vboxsf/utils.c create mode 100644 fs/vboxsf/vboxsf_wrappers.c create mode 100644 fs/vboxsf/vfsmod.h diff --git a/MAINTAINERS b/MAINTAINERS index 4017e6b760be..8c87c7eba3f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17610,6 +17610,12 @@ F: include/linux/vbox_utils.h F: include/uapi/linux/vbox*.h F: drivers/virt/vboxguest/ +VIRTUAL BOX SHARED FOLDER VFS DRIVER: +M: Hans de Goede +L: linux-fsdevel@vger.kernel.org +S: Maintained +F: fs/vboxsf/* + VIRTUAL SERIO DEVICE DRIVER M: Stephen Chandler Paul S: Maintained diff --git a/fs/Kconfig b/fs/Kconfig index 7b623e9fc1b0..8493a3f0c4b1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -264,6 +264,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/erofs/Kconfig" +source "fs/vboxsf/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 1148c555c4d3..b807cbff3790 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -133,3 +133,4 @@ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ obj-$(CONFIG_EROFS_FS) += erofs/ +obj-$(CONFIG_VBOXSF_FS) += vboxsf/ diff --git a/fs/vboxsf/Kconfig b/fs/vboxsf/Kconfig new file mode 100644 index 000000000000..b84586ae08b3 --- /dev/null +++ b/fs/vboxsf/Kconfig @@ -0,0 +1,10 @@ +config VBOXSF_FS + tristate "VirtualBox guest shared folder (vboxsf) support" + depends on X86 && VBOXGUEST + select NLS + help + VirtualBox hosts can share folders with guests, this driver + implements the Linux-guest side of this allowing folders exported + by the host to be mounted under Linux. + + If you want to use shared folders in VirtualBox guests, answer Y or M. 
diff --git a/fs/vboxsf/Makefile b/fs/vboxsf/Makefile new file mode 100644 index 000000000000..9e4328e79623 --- /dev/null +++ b/fs/vboxsf/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: MIT + +obj-$(CONFIG_VBOXSF_FS) += vboxsf.o + +vboxsf-y := dir.o file.o utils.o vboxsf_wrappers.o super.o diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c new file mode 100644 index 000000000000..dd147b490982 --- /dev/null +++ b/fs/vboxsf/dir.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: MIT +/* + * VirtualBox Guest Shared Folders support: Directory inode and file operations + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#include +#include +#include "vfsmod.h" + +static int vboxsf_dir_open(struct inode *inode, struct file *file) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb); + struct shfl_createparms params = {}; + struct vboxsf_dir_info *sf_d; + int err; + + sf_d = vboxsf_dir_info_alloc(); + if (!sf_d) + return -ENOMEM; + + params.handle = SHFL_HANDLE_NIL; + params.create_flags = SHFL_CF_DIRECTORY | SHFL_CF_ACT_OPEN_IF_EXISTS | + SHFL_CF_ACT_FAIL_IF_NEW | SHFL_CF_ACCESS_READ; + + err = vboxsf_create_at_dentry(file_dentry(file), ¶ms); + if (err) + goto err_free_dir_info; + + if (params.result != SHFL_FILE_EXISTS) { + err = -ENOENT; + goto err_close; + } + + err = vboxsf_dir_read_all(sbi, sf_d, params.handle); + if (err) + goto err_close; + + vboxsf_close(sbi->root, params.handle); + file->private_data = sf_d; + return 0; + +err_close: + vboxsf_close(sbi->root, params.handle); +err_free_dir_info: + vboxsf_dir_info_free(sf_d); + return err; +} + +static int vboxsf_dir_release(struct inode *inode, struct file *file) +{ + if (file->private_data) + vboxsf_dir_info_free(file->private_data); + + return 0; +} + +static unsigned int vboxsf_get_d_type(u32 mode) +{ + unsigned int d_type; + + switch (mode & SHFL_TYPE_MASK) { + case SHFL_TYPE_FIFO: + d_type = DT_FIFO; + break; + case SHFL_TYPE_DEV_CHAR: + d_type = DT_CHR; + break; + case SHFL_TYPE_DIRECTORY: + d_type = DT_DIR; + break; + case SHFL_TYPE_DEV_BLOCK: + d_type = DT_BLK; + break; + case SHFL_TYPE_FILE: + d_type = DT_REG; + break; + case SHFL_TYPE_SYMLINK: + d_type = DT_LNK; + break; + case SHFL_TYPE_SOCKET: + d_type = DT_SOCK; + break; + case SHFL_TYPE_WHITEOUT: + d_type = DT_WHT; + break; + default: + d_type = DT_UNKNOWN; + break; + } + return d_type; +} + +static bool vboxsf_dir_emit(struct file *dir, struct dir_context *ctx) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(file_inode(dir)->i_sb); + struct vboxsf_dir_info *sf_d = dir->private_data; + struct shfl_dirinfo *info; + struct vboxsf_dir_buf *b; + unsigned int d_type; + loff_t i, cur = 0; + ino_t fake_ino; + void *end; + int err; + + list_for_each_entry(b, &sf_d->info_list, head) { +try_next_entry: + if (ctx->pos >= cur + b->entries) { + cur += b->entries; + continue; + } + + /* + * Note the vboxsf_dir_info objects we are iterating over here + * are variable sized, so the info pointer may end up being + * unaligned. This is how we get the data from the host. + * Since vboxsf is only supported on x86 machines this is not + * a problem. + */ + for (i = 0, info = b->buf; i < ctx->pos - cur; i++) { + end = &info->name.string.utf8[info->name.size]; + /* Only happens if the host gives us corrupt data */ + if (WARN_ON(end > (b->buf + b->used))) + return false; + info = end; + } + + end = &info->name.string.utf8[info->name.size]; + if (WARN_ON(end > (b->buf + b->used))) + return false; + + /* Info now points to the right entry, emit it. 
*/ + d_type = vboxsf_get_d_type(info->info.attr.mode); + + /* + * On 32 bit systems pos is 64 signed, while ino is 32 bit + * unsigned so fake_ino may overflow, check for this. + */ + if ((ino_t)(ctx->pos + 1) != (u64)(ctx->pos + 1)) { + vbg_err("vboxsf: fake ino overflow, truncating dir\n"); + return false; + } + fake_ino = ctx->pos + 1; + + if (sbi->nls) { + char d_name[NAME_MAX]; + + err = vboxsf_nlscpy(sbi, d_name, NAME_MAX, + info->name.string.utf8, + info->name.length); + if (err) { + /* skip erroneous entry and proceed */ + ctx->pos += 1; + goto try_next_entry; + } + + return dir_emit(ctx, d_name, strlen(d_name), + fake_ino, d_type); + } + + return dir_emit(ctx, info->name.string.utf8, info->name.length, + fake_ino, d_type); + } + + return false; +} + +static int vboxsf_dir_iterate(struct file *dir, struct dir_context *ctx) +{ + bool emitted; + + do { + emitted = vboxsf_dir_emit(dir, ctx); + if (emitted) + ctx->pos += 1; + } while (emitted); + + return 0; +} + +const struct file_operations vboxsf_dir_fops = { + .open = vboxsf_dir_open, + .iterate = vboxsf_dir_iterate, + .release = vboxsf_dir_release, + .read = generic_read_dir, + .llseek = generic_file_llseek, +}; + +/* + * This is called during name resolution/lookup to check if the @dentry in + * the cache is still valid. the job is handled by vboxsf_inode_revalidate. + */ +static int vboxsf_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (d_really_is_positive(dentry)) + return vboxsf_inode_revalidate(dentry) == 0; + else + return vboxsf_stat_dentry(dentry, NULL) == -ENOENT; +} + +const struct dentry_operations vboxsf_dentry_ops = { + .d_revalidate = vboxsf_dentry_revalidate +}; + +/* iops */ + +static struct dentry *vboxsf_dir_lookup(struct inode *parent, + struct dentry *dentry, + unsigned int flags) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb); + struct shfl_fsobjinfo fsinfo; + struct inode *inode; + int err; + + dentry->d_time = jiffies; + + err = vboxsf_stat_dentry(dentry, &fsinfo); + if (err) { + inode = (err == -ENOENT) ? NULL : ERR_PTR(err); + } else { + inode = vboxsf_new_inode(parent->i_sb); + if (!IS_ERR(inode)) + vboxsf_init_inode(sbi, inode, &fsinfo); + } + + return d_splice_alias(inode, dentry); +} + +static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry, + struct shfl_fsobjinfo *info) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb); + struct vboxsf_inode *sf_i; + struct inode *inode; + + inode = vboxsf_new_inode(parent->i_sb); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + sf_i = VBOXSF_I(inode); + /* The host may have given us different attr then requested */ + sf_i->force_restat = 1; + vboxsf_init_inode(sbi, inode, info); + + d_instantiate(dentry, inode); + + return 0; +} + +static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry, + umode_t mode, int is_dir) +{ + struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent); + struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb); + struct shfl_createparms params = {}; + int err; + + params.handle = SHFL_HANDLE_NIL; + params.create_flags = SHFL_CF_ACT_CREATE_IF_NEW | + SHFL_CF_ACT_FAIL_IF_EXISTS | + SHFL_CF_ACCESS_READWRITE | + (is_dir ? SHFL_CF_DIRECTORY : 0); + params.info.attr.mode = (mode & 0777) | + (is_dir ? 
SHFL_TYPE_DIRECTORY : SHFL_TYPE_FILE); + params.info.attr.additional = SHFLFSOBJATTRADD_NOTHING; + + err = vboxsf_create_at_dentry(dentry, ¶ms); + if (err) + return err; + + if (params.result != SHFL_FILE_CREATED) + return -EPERM; + + vboxsf_close(sbi->root, params.handle); + + err = vboxsf_dir_instantiate(parent, dentry, ¶ms.info); + if (err) + return err; + + /* parent directory access/change time changed */ + sf_parent_i->force_restat = 1; + + return 0; +} + +static int vboxsf_dir_mkfile(struct inode *parent, struct dentry *dentry, + umode_t mode, bool excl) +{ + return vboxsf_dir_create(parent, dentry, mode, 0); +} + +static int vboxsf_dir_mkdir(struct inode *parent, struct dentry *dentry, + umode_t mode) +{ + return vboxsf_dir_create(parent, dentry, mode, 1); +} + +static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb); + struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent); + struct inode *inode = d_inode(dentry); + struct shfl_string *path; + u32 flags; + int err; + + if (S_ISDIR(inode->i_mode)) + flags = SHFL_REMOVE_DIR; + else + flags = SHFL_REMOVE_FILE; + + if (S_ISLNK(inode->i_mode)) + flags |= SHFL_REMOVE_SYMLINK; + + path = vboxsf_path_from_dentry(sbi, dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + + err = vboxsf_remove(sbi->root, path, flags); + __putname(path); + if (err) + return err; + + /* parent directory access/change time changed */ + sf_parent_i->force_restat = 1; + + return 0; +} + +static int vboxsf_dir_rename(struct inode *old_parent, + struct dentry *old_dentry, + struct inode *new_parent, + struct dentry *new_dentry, + unsigned int flags) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(old_parent->i_sb); + struct vboxsf_inode *sf_old_parent_i = VBOXSF_I(old_parent); + struct vboxsf_inode *sf_new_parent_i = VBOXSF_I(new_parent); + u32 shfl_flags = SHFL_RENAME_FILE | SHFL_RENAME_REPLACE_IF_EXISTS; + struct shfl_string *old_path, *new_path; + int err; + + if (flags) + return -EINVAL; + + old_path = vboxsf_path_from_dentry(sbi, old_dentry); + if (IS_ERR(old_path)) + return PTR_ERR(old_path); + + new_path = vboxsf_path_from_dentry(sbi, new_dentry); + if (IS_ERR(new_path)) { + err = PTR_ERR(new_path); + goto err_put_old_path; + } + + if (d_inode(old_dentry)->i_mode & S_IFDIR) + shfl_flags = 0; + + err = vboxsf_rename(sbi->root, old_path, new_path, shfl_flags); + if (err == 0) { + /* parent directories access/change time changed */ + sf_new_parent_i->force_restat = 1; + sf_old_parent_i->force_restat = 1; + } + + __putname(new_path); +err_put_old_path: + __putname(old_path); + return err; +} + +static int vboxsf_dir_symlink(struct inode *parent, struct dentry *dentry, + const char *symname) +{ + struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent); + struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb); + int symname_size = strlen(symname) + 1; + struct shfl_string *path, *ssymname; + struct shfl_fsobjinfo info; + int err; + + path = vboxsf_path_from_dentry(sbi, dentry); + if (IS_ERR(path)) + return PTR_ERR(path); + + ssymname = kmalloc(SHFLSTRING_HEADER_SIZE + symname_size, GFP_KERNEL); + if (!ssymname) { + __putname(path); + return -ENOMEM; + } + ssymname->length = symname_size - 1; + ssymname->size = symname_size; + memcpy(ssymname->string.utf8, symname, symname_size); + + err = vboxsf_symlink(sbi->root, path, ssymname, &info); + kfree(ssymname); + __putname(path); + if (err) { + /* -EROFS means symlinks are note support -> -EPERM */ + return (err == -EROFS) ? 
-EPERM : err; + } + + err = vboxsf_dir_instantiate(parent, dentry, &info); + if (err) + return err; + + /* parent directory access/change time changed */ + sf_parent_i->force_restat = 1; + return 0; +} + +const struct inode_operations vboxsf_dir_iops = { + .lookup = vboxsf_dir_lookup, + .create = vboxsf_dir_mkfile, + .mkdir = vboxsf_dir_mkdir, + .rmdir = vboxsf_dir_unlink, + .unlink = vboxsf_dir_unlink, + .rename = vboxsf_dir_rename, + .symlink = vboxsf_dir_symlink, + .getattr = vboxsf_getattr, + .setattr = vboxsf_setattr, +}; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c new file mode 100644 index 000000000000..c4ab5996d97a --- /dev/null +++ b/fs/vboxsf/file.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: MIT +/* + * VirtualBox Guest Shared Folders support: Regular file inode and file ops. + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#include +#include +#include +#include +#include +#include "vfsmod.h" + +struct vboxsf_handle { + u64 handle; + u32 root; + u32 access_flags; + struct kref refcount; + struct list_head head; +}; + +static int vboxsf_file_open(struct inode *inode, struct file *file) +{ + struct vboxsf_inode *sf_i = VBOXSF_I(inode); + struct shfl_createparms params = {}; + struct vboxsf_handle *sf_handle; + u32 access_flags = 0; + int err; + + sf_handle = kmalloc(sizeof(*sf_handle), GFP_KERNEL); + if (!sf_handle) + return -ENOMEM; + + /* + * We check the value of params.handle afterwards to find out if + * the call succeeded or failed, as the API does not seem to cleanly + * distinguish error and informational messages. + * + * Furthermore, we must set params.handle to SHFL_HANDLE_NIL to + * make the shared folders host service use our mode parameter. + */ + params.handle = SHFL_HANDLE_NIL; + if (file->f_flags & O_CREAT) { + params.create_flags |= SHFL_CF_ACT_CREATE_IF_NEW; + /* + * We ignore O_EXCL, as the Linux kernel seems to call create + * beforehand itself, so O_EXCL should always fail. + */ + if (file->f_flags & O_TRUNC) + params.create_flags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS; + else + params.create_flags |= SHFL_CF_ACT_OPEN_IF_EXISTS; + } else { + params.create_flags |= SHFL_CF_ACT_FAIL_IF_NEW; + if (file->f_flags & O_TRUNC) + params.create_flags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS; + } + + switch (file->f_flags & O_ACCMODE) { + case O_RDONLY: + access_flags |= SHFL_CF_ACCESS_READ; + break; + + case O_WRONLY: + access_flags |= SHFL_CF_ACCESS_WRITE; + break; + + case O_RDWR: + access_flags |= SHFL_CF_ACCESS_READWRITE; + break; + + default: + WARN_ON(1); + } + + if (file->f_flags & O_APPEND) + access_flags |= SHFL_CF_ACCESS_APPEND; + + params.create_flags |= access_flags; + params.info.attr.mode = inode->i_mode; + + err = vboxsf_create_at_dentry(file_dentry(file), ¶ms); + if (err == 0 && params.handle == SHFL_HANDLE_NIL) + err = (params.result == SHFL_FILE_EXISTS) ? 
-EEXIST : -ENOENT; + if (err) { + kfree(sf_handle); + return err; + } + + /* the host may have given us different attr then requested */ + sf_i->force_restat = 1; + + /* init our handle struct and add it to the inode's handles list */ + sf_handle->handle = params.handle; + sf_handle->root = VBOXSF_SBI(inode->i_sb)->root; + sf_handle->access_flags = access_flags; + kref_init(&sf_handle->refcount); + + mutex_lock(&sf_i->handle_list_mutex); + list_add(&sf_handle->head, &sf_i->handle_list); + mutex_unlock(&sf_i->handle_list_mutex); + + file->private_data = sf_handle; + return 0; +} + +static void vboxsf_handle_release(struct kref *refcount) +{ + struct vboxsf_handle *sf_handle = + container_of(refcount, struct vboxsf_handle, refcount); + + vboxsf_close(sf_handle->root, sf_handle->handle); + kfree(sf_handle); +} + +static int vboxsf_file_release(struct inode *inode, struct file *file) +{ + struct vboxsf_inode *sf_i = VBOXSF_I(inode); + struct vboxsf_handle *sf_handle = file->private_data; + + /* + * When a file is closed on our (the guest) side, we want any subsequent + * accesses done on the host side to see all changes done from our side. + */ + filemap_write_and_wait(inode->i_mapping); + + mutex_lock(&sf_i->handle_list_mutex); + list_del(&sf_handle->head); + mutex_unlock(&sf_i->handle_list_mutex); + + kref_put(&sf_handle->refcount, vboxsf_handle_release); + return 0; +} + +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void vboxsf_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct vboxsf_file_vm_ops = { + .close = vboxsf_vma_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int vboxsf_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + err = generic_file_mmap(file, vma); + if (!err) + vma->vm_ops = &vboxsf_file_vm_ops; + + return err; +} + +/* + * Note that since we are accessing files on the host's filesystem, files + * may always be changed underneath us by the host! + * + * The vboxsf API between the guest and the host does not offer any functions + * to deal with this. There is no inode-generation to check for changes, no + * events / callback on changes and no way to lock files. + * + * To avoid returning stale data when a file gets *opened* on our (the guest) + * side, we do a "stat" on the host side, then compare the mtime with the + * last known mtime and invalidate the page-cache if they differ. + * This is done from vboxsf_inode_revalidate(). + * + * When reads are done through the read_iter fop, it is possible to do + * further cache revalidation then, there are 3 options to deal with this: + * + * 1) Rely solely on the revalidation done at open time + * 2) Do another "stat" and compare mtime again. Unfortunately the vboxsf + * host API does not allow stat on handles, so we would need to use + * file->f_path.dentry and the stat will then fail if the file was unlinked + * or renamed (and there is no thing like NFS' silly-rename). So we get: + * 2a) "stat" and compare mtime, on stat failure invalidate the cache + * 2b) "stat" and compare mtime, on stat failure do nothing + * 3) Simply always call invalidate_inode_pages2_range on the range of the read + * + * Currently we are keeping things KISS and using option 1. this allows + * directly using generic_file_read_iter without wrapping it. 
+ * + * This means that only data written on the host side before open() on + * the guest side is guaranteed to be seen by the guest. If necessary + * we may provide other read-cache strategies in the future and make this + * configurable through a mount option. + */ +const struct file_operations vboxsf_reg_fops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = vboxsf_file_mmap, + .open = vboxsf_file_open, + .release = vboxsf_file_release, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations vboxsf_reg_iops = { + .getattr = vboxsf_getattr, + .setattr = vboxsf_setattr +}; + +static int vboxsf_readpage(struct file *file, struct page *page) +{ + struct vboxsf_handle *sf_handle = file->private_data; + loff_t off = page_offset(page); + u32 nread = PAGE_SIZE; + u8 *buf; + int err; + + buf = kmap(page); + + err = vboxsf_read(sf_handle->root, sf_handle->handle, off, &nread, buf); + if (err == 0) { + memset(&buf[nread], 0, PAGE_SIZE - nread); + flush_dcache_page(page); + SetPageUptodate(page); + } else { + SetPageError(page); + } + + kunmap(page); + unlock_page(page); + return err; +} + +static struct vboxsf_handle *vboxsf_get_write_handle(struct vboxsf_inode *sf_i) +{ + struct vboxsf_handle *h, *sf_handle = NULL; + + mutex_lock(&sf_i->handle_list_mutex); + list_for_each_entry(h, &sf_i->handle_list, head) { + if (h->access_flags == SHFL_CF_ACCESS_WRITE || + h->access_flags == SHFL_CF_ACCESS_READWRITE) { + kref_get(&h->refcount); + sf_handle = h; + break; + } + } + mutex_unlock(&sf_i->handle_list_mutex); + + return sf_handle; +} + +static int vboxsf_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct vboxsf_inode *sf_i = VBOXSF_I(inode); + struct vboxsf_handle *sf_handle; + loff_t off = page_offset(page); + loff_t size = i_size_read(inode); + u32 nwrite = PAGE_SIZE; + u8 *buf; + int err; + + if (off + PAGE_SIZE > size) + nwrite = size & ~PAGE_MASK; + + sf_handle = vboxsf_get_write_handle(sf_i); + if (!sf_handle) + return -EBADF; + + buf = kmap(page); + err = vboxsf_write(sf_handle->root, sf_handle->handle, + off, &nwrite, buf); + kunmap(page); + + kref_put(&sf_handle->refcount, vboxsf_handle_release); + + if (err == 0) { + ClearPageError(page); + /* mtime changed */ + sf_i->force_restat = 1; + } else { + ClearPageUptodate(page); + } + + unlock_page(page); + return err; +} + +static int vboxsf_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct vboxsf_handle *sf_handle = file->private_data; + unsigned int from = pos & ~PAGE_MASK; + u32 nwritten = len; + u8 *buf; + int err; + + /* zero the stale part of the page if we did a short copy */ + if (!PageUptodate(page) && copied < len) + zero_user(page, from + copied, len - copied); + + buf = kmap(page); + err = vboxsf_write(sf_handle->root, sf_handle->handle, + pos, &nwritten, buf + from); + kunmap(page); + + if (err) { + nwritten = 0; + goto out; + } + + /* mtime changed */ + VBOXSF_I(inode)->force_restat = 1; + + if (!PageUptodate(page) && nwritten == PAGE_SIZE) + SetPageUptodate(page); + + pos += nwritten; + if (pos > inode->i_size) + i_size_write(inode, pos); + +out: + unlock_page(page); + put_page(page); + + return nwritten; +} + +/* + * Note simple_write_begin does not read the page from disk on partial writes + * 
this is ok since vboxsf_write_end only writes the written parts of the + * page and it does not call SetPageUptodate for partial writes. + */ +const struct address_space_operations vboxsf_reg_aops = { + .readpage = vboxsf_readpage, + .writepage = vboxsf_writepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = simple_write_begin, + .write_end = vboxsf_write_end, +}; + +static const char *vboxsf_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb); + struct shfl_string *path; + char *link; + int err; + + if (!dentry) + return ERR_PTR(-ECHILD); + + path = vboxsf_path_from_dentry(sbi, dentry); + if (IS_ERR(path)) + return ERR_CAST(path); + + link = kzalloc(PATH_MAX, GFP_KERNEL); + if (!link) { + __putname(path); + return ERR_PTR(-ENOMEM); + } + + err = vboxsf_readlink(sbi->root, path, PATH_MAX, link); + __putname(path); + if (err) { + kfree(link); + return ERR_PTR(err); + } + + set_delayed_call(done, kfree_link, link); + return link; +} + +const struct inode_operations vboxsf_lnk_iops = { + .get_link = vboxsf_get_link +}; diff --git a/fs/vboxsf/shfl_hostintf.h b/fs/vboxsf/shfl_hostintf.h new file mode 100644 index 000000000000..aca829062c12 --- /dev/null +++ b/fs/vboxsf/shfl_hostintf.h @@ -0,0 +1,901 @@ +/* SPDX-License-Identifier: MIT */ +/* + * VirtualBox Shared Folders: host interface definition. + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#ifndef SHFL_HOSTINTF_H +#define SHFL_HOSTINTF_H + +#include + +/* The max in/out buffer size for a FN_READ or FN_WRITE call */ +#define SHFL_MAX_RW_COUNT (16 * SZ_1M) + +/* + * Structures shared between guest and the service + * can be relocated and use offsets to point to variable + * length parts. + * + * Shared folders protocol works with handles. + * Before doing any action on a file system object, + * one have to obtain the object handle via a SHFL_FN_CREATE + * request. A handle must be closed with SHFL_FN_CLOSE. + */ + +enum { + SHFL_FN_QUERY_MAPPINGS = 1, /* Query mappings changes. */ + SHFL_FN_QUERY_MAP_NAME = 2, /* Query map name. */ + SHFL_FN_CREATE = 3, /* Open/create object. */ + SHFL_FN_CLOSE = 4, /* Close object handle. */ + SHFL_FN_READ = 5, /* Read object content. */ + SHFL_FN_WRITE = 6, /* Write new object content. */ + SHFL_FN_LOCK = 7, /* Lock/unlock a range in the object. */ + SHFL_FN_LIST = 8, /* List object content. */ + SHFL_FN_INFORMATION = 9, /* Query/set object information. */ + /* Note function number 10 is not used! */ + SHFL_FN_REMOVE = 11, /* Remove object */ + SHFL_FN_MAP_FOLDER_OLD = 12, /* Map folder (legacy) */ + SHFL_FN_UNMAP_FOLDER = 13, /* Unmap folder */ + SHFL_FN_RENAME = 14, /* Rename object */ + SHFL_FN_FLUSH = 15, /* Flush file */ + SHFL_FN_SET_UTF8 = 16, /* Select UTF8 filename encoding */ + SHFL_FN_MAP_FOLDER = 17, /* Map folder */ + SHFL_FN_READLINK = 18, /* Read symlink dest (as of VBox 4.0) */ + SHFL_FN_SYMLINK = 19, /* Create symlink (as of VBox 4.0) */ + SHFL_FN_SET_SYMLINKS = 20, /* Ask host to show symlinks (4.0+) */ +}; + +/* Root handles for a mapping are of type u32, Root handles are unique. */ +#define SHFL_ROOT_NIL UINT_MAX + +/* Shared folders handle for an opened object are of type u64. */ +#define SHFL_HANDLE_NIL ULLONG_MAX + +/* Hardcoded maximum length (in chars) of a shared folder name. */ +#define SHFL_MAX_LEN (256) +/* Hardcoded maximum number of shared folder mapping available to the guest. */ +#define SHFL_MAX_MAPPINGS (64) + +/** Shared folder string buffer structure. 
*/ +struct shfl_string { + /** Allocated size of the string member in bytes. */ + u16 size; + + /** Length of string without trailing nul in bytes. */ + u16 length; + + /** UTF-8 or UTF-16 string. Nul terminated. */ + union { + u8 utf8[2]; + u16 utf16[1]; + u16 ucs2[1]; /* misnomer, use utf16. */ + } string; +}; +VMMDEV_ASSERT_SIZE(shfl_string, 6); + +/* The size of shfl_string w/o the string part. */ +#define SHFLSTRING_HEADER_SIZE 4 + +/* Calculate size of the string. */ +static inline u32 shfl_string_buf_size(const struct shfl_string *string) +{ + return string ? SHFLSTRING_HEADER_SIZE + string->size : 0; +} + +/* Set user id on execution (S_ISUID). */ +#define SHFL_UNIX_ISUID 0004000U +/* Set group id on execution (S_ISGID). */ +#define SHFL_UNIX_ISGID 0002000U +/* Sticky bit (S_ISVTX / S_ISTXT). */ +#define SHFL_UNIX_ISTXT 0001000U + +/* Owner readable (S_IRUSR). */ +#define SHFL_UNIX_IRUSR 0000400U +/* Owner writable (S_IWUSR). */ +#define SHFL_UNIX_IWUSR 0000200U +/* Owner executable (S_IXUSR). */ +#define SHFL_UNIX_IXUSR 0000100U + +/* Group readable (S_IRGRP). */ +#define SHFL_UNIX_IRGRP 0000040U +/* Group writable (S_IWGRP). */ +#define SHFL_UNIX_IWGRP 0000020U +/* Group executable (S_IXGRP). */ +#define SHFL_UNIX_IXGRP 0000010U + +/* Other readable (S_IROTH). */ +#define SHFL_UNIX_IROTH 0000004U +/* Other writable (S_IWOTH). */ +#define SHFL_UNIX_IWOTH 0000002U +/* Other executable (S_IXOTH). */ +#define SHFL_UNIX_IXOTH 0000001U + +/* Named pipe (fifo) (S_IFIFO). */ +#define SHFL_TYPE_FIFO 0010000U +/* Character device (S_IFCHR). */ +#define SHFL_TYPE_DEV_CHAR 0020000U +/* Directory (S_IFDIR). */ +#define SHFL_TYPE_DIRECTORY 0040000U +/* Block device (S_IFBLK). */ +#define SHFL_TYPE_DEV_BLOCK 0060000U +/* Regular file (S_IFREG). */ +#define SHFL_TYPE_FILE 0100000U +/* Symbolic link (S_IFLNK). */ +#define SHFL_TYPE_SYMLINK 0120000U +/* Socket (S_IFSOCK). */ +#define SHFL_TYPE_SOCKET 0140000U +/* Whiteout (S_IFWHT). */ +#define SHFL_TYPE_WHITEOUT 0160000U +/* Type mask (S_IFMT). */ +#define SHFL_TYPE_MASK 0170000U + +/* Checks the mode flags indicate a directory (S_ISDIR). */ +#define SHFL_IS_DIRECTORY(m) (((m) & SHFL_TYPE_MASK) == SHFL_TYPE_DIRECTORY) +/* Checks the mode flags indicate a symbolic link (S_ISLNK). */ +#define SHFL_IS_SYMLINK(m) (((m) & SHFL_TYPE_MASK) == SHFL_TYPE_SYMLINK) + +/** The available additional information in a shfl_fsobjattr object. */ +enum shfl_fsobjattr_add { + /** No additional information is available / requested. */ + SHFLFSOBJATTRADD_NOTHING = 1, + /** + * The additional unix attributes (shfl_fsobjattr::u::unix_attr) are + * available / requested. + */ + SHFLFSOBJATTRADD_UNIX, + /** + * The additional extended attribute size (shfl_fsobjattr::u::size) is + * available / requested. + */ + SHFLFSOBJATTRADD_EASIZE, + /** + * The last valid item (inclusive). + * The valid range is SHFLFSOBJATTRADD_NOTHING thru + * SHFLFSOBJATTRADD_LAST. + */ + SHFLFSOBJATTRADD_LAST = SHFLFSOBJATTRADD_EASIZE, + + /** The usual 32-bit hack. */ + SHFLFSOBJATTRADD_32BIT_SIZE_HACK = 0x7fffffff +}; + +/** + * Additional unix Attributes, these are available when + * shfl_fsobjattr.additional == SHFLFSOBJATTRADD_UNIX. + */ +struct shfl_fsobjattr_unix { + /** + * The user owning the filesystem object (st_uid). + * This field is ~0U if not supported. + */ + u32 uid; + + /** + * The group the filesystem object is assigned (st_gid). + * This field is ~0U if not supported. + */ + u32 gid; + + /** + * Number of hard links to this filesystem object (st_nlink). 
+ * This field is 1 if the filesystem doesn't support hardlinking or + * the information isn't available. + */ + u32 hardlinks; + + /** + * The device number of the device which this filesystem object resides + * on (st_dev). This field is 0 if this information is not available. + */ + u32 inode_id_device; + + /** + * The unique identifier (within the filesystem) of this filesystem + * object (st_ino). Together with inode_id_device, this field can be + * used as a OS wide unique id, when both their values are not 0. + * This field is 0 if the information is not available. + */ + u64 inode_id; + + /** + * User flags (st_flags). + * This field is 0 if this information is not available. + */ + u32 flags; + + /** + * The current generation number (st_gen). + * This field is 0 if this information is not available. + */ + u32 generation_id; + + /** + * The device number of a char. or block device type object (st_rdev). + * This field is 0 if the file isn't a char. or block device or when + * the OS doesn't use the major+minor device idenfication scheme. + */ + u32 device; +} __packed; + +/** Extended attribute size. */ +struct shfl_fsobjattr_easize { + /** Size of EAs. */ + s64 cb; +} __packed; + +/** Shared folder filesystem object attributes. */ +struct shfl_fsobjattr { + /** Mode flags (st_mode). SHFL_UNIX_*, SHFL_TYPE_*, and SHFL_DOS_*. */ + u32 mode; + + /** The additional attributes available. */ + enum shfl_fsobjattr_add additional; + + /** + * Additional attributes. + * + * Unless explicitly specified to an API, the API can provide additional + * data as it is provided by the underlying OS. + */ + union { + struct shfl_fsobjattr_unix unix_attr; + struct shfl_fsobjattr_easize size; + } __packed u; +} __packed; +VMMDEV_ASSERT_SIZE(shfl_fsobjattr, 44); + +struct shfl_timespec { + s64 ns_relative_to_unix_epoch; +}; + +/** Filesystem object information structure. */ +struct shfl_fsobjinfo { + /** + * Logical size (st_size). + * For normal files this is the size of the file. + * For symbolic links, this is the length of the path name contained + * in the symbolic link. + * For other objects this fields needs to be specified. + */ + s64 size; + + /** Disk allocation size (st_blocks * DEV_BSIZE). */ + s64 allocated; + + /** Time of last access (st_atime). */ + struct shfl_timespec access_time; + + /** Time of last data modification (st_mtime). */ + struct shfl_timespec modification_time; + + /** + * Time of last status change (st_ctime). + * If not available this is set to modification_time. + */ + struct shfl_timespec change_time; + + /** + * Time of file birth (st_birthtime). + * If not available this is set to change_time. + */ + struct shfl_timespec birth_time; + + /** Attributes. */ + struct shfl_fsobjattr attr; + +} __packed; +VMMDEV_ASSERT_SIZE(shfl_fsobjinfo, 92); + +/** + * result of an open/create request. + * Along with handle value the result code + * identifies what has happened while + * trying to open the object. + */ +enum shfl_create_result { + SHFL_NO_RESULT, + /** Specified path does not exist. */ + SHFL_PATH_NOT_FOUND, + /** Path to file exists, but the last component does not. */ + SHFL_FILE_NOT_FOUND, + /** File already exists and either has been opened or not. */ + SHFL_FILE_EXISTS, + /** New file was created. */ + SHFL_FILE_CREATED, + /** Existing file was replaced or overwritten. */ + SHFL_FILE_REPLACED +}; + +/* No flags. Initialization value. */ +#define SHFL_CF_NONE (0x00000000) + +/* + * Only lookup the object, do not return a handle. 
When this is set all other + * flags are ignored. + */ +#define SHFL_CF_LOOKUP (0x00000001) + +/* + * Open parent directory of specified object. + * Useful for the corresponding Windows FSD flag + * and for opening paths like \\dir\\*.* to search the 'dir'. + */ +#define SHFL_CF_OPEN_TARGET_DIRECTORY (0x00000002) + +/* Create/open a directory. */ +#define SHFL_CF_DIRECTORY (0x00000004) + +/* + * Open/create action to do if object exists + * and if the object does not exists. + * REPLACE file means atomically DELETE and CREATE. + * OVERWRITE file means truncating the file to 0 and + * setting new size. + * When opening an existing directory REPLACE and OVERWRITE + * actions are considered invalid, and cause returning + * FILE_EXISTS with NIL handle. + */ +#define SHFL_CF_ACT_MASK_IF_EXISTS (0x000000f0) +#define SHFL_CF_ACT_MASK_IF_NEW (0x00000f00) + +/* What to do if object exists. */ +#define SHFL_CF_ACT_OPEN_IF_EXISTS (0x00000000) +#define SHFL_CF_ACT_FAIL_IF_EXISTS (0x00000010) +#define SHFL_CF_ACT_REPLACE_IF_EXISTS (0x00000020) +#define SHFL_CF_ACT_OVERWRITE_IF_EXISTS (0x00000030) + +/* What to do if object does not exist. */ +#define SHFL_CF_ACT_CREATE_IF_NEW (0x00000000) +#define SHFL_CF_ACT_FAIL_IF_NEW (0x00000100) + +/* Read/write requested access for the object. */ +#define SHFL_CF_ACCESS_MASK_RW (0x00003000) + +/* No access requested. */ +#define SHFL_CF_ACCESS_NONE (0x00000000) +/* Read access requested. */ +#define SHFL_CF_ACCESS_READ (0x00001000) +/* Write access requested. */ +#define SHFL_CF_ACCESS_WRITE (0x00002000) +/* Read/Write access requested. */ +#define SHFL_CF_ACCESS_READWRITE (0x00003000) + +/* Requested share access for the object. */ +#define SHFL_CF_ACCESS_MASK_DENY (0x0000c000) + +/* Allow any access. */ +#define SHFL_CF_ACCESS_DENYNONE (0x00000000) +/* Do not allow read. */ +#define SHFL_CF_ACCESS_DENYREAD (0x00004000) +/* Do not allow write. */ +#define SHFL_CF_ACCESS_DENYWRITE (0x00008000) +/* Do not allow access. */ +#define SHFL_CF_ACCESS_DENYALL (0x0000c000) + +/* Requested access to attributes of the object. */ +#define SHFL_CF_ACCESS_MASK_ATTR (0x00030000) + +/* No access requested. */ +#define SHFL_CF_ACCESS_ATTR_NONE (0x00000000) +/* Read access requested. */ +#define SHFL_CF_ACCESS_ATTR_READ (0x00010000) +/* Write access requested. */ +#define SHFL_CF_ACCESS_ATTR_WRITE (0x00020000) +/* Read/Write access requested. */ +#define SHFL_CF_ACCESS_ATTR_READWRITE (0x00030000) + +/* + * The file is opened in append mode. + * Ignored if SHFL_CF_ACCESS_WRITE is not set. + */ +#define SHFL_CF_ACCESS_APPEND (0x00040000) + +/** Create parameters buffer struct for SHFL_FN_CREATE call */ +struct shfl_createparms { + /** Returned handle of opened object. */ + u64 handle; + + /** Returned result of the operation */ + enum shfl_create_result result; + + /** SHFL_CF_* */ + u32 create_flags; + + /** + * Attributes of object to create and + * returned actual attributes of opened/created object. + */ + struct shfl_fsobjinfo info; +} __packed; + +/** Shared Folder directory information */ +struct shfl_dirinfo { + /** Full information about the object. */ + struct shfl_fsobjinfo info; + /** + * The length of the short field (number of UTF16 chars). + * It is 16-bit for reasons of alignment. + */ + u16 short_name_len; + /** + * The short name for 8.3 compatibility. + * Empty string if not available. + */ + u16 short_name[14]; + struct shfl_string name; +}; + +/** Shared folder filesystem properties. 
*/ +struct shfl_fsproperties { + /** + * The maximum size of a filesystem object name. + * This does not include the '\\0'. + */ + u32 max_component_len; + + /** + * True if the filesystem is remote. + * False if the filesystem is local. + */ + bool remote; + + /** + * True if the filesystem is case sensitive. + * False if the filesystem is case insensitive. + */ + bool case_sensitive; + + /** + * True if the filesystem is mounted read only. + * False if the filesystem is mounted read write. + */ + bool read_only; + + /** + * True if the filesystem can encode unicode object names. + * False if it can't. + */ + bool supports_unicode; + + /** + * True if the filesystem is compresses. + * False if it isn't or we don't know. + */ + bool compressed; + + /** + * True if the filesystem compresses of individual files. + * False if it doesn't or we don't know. + */ + bool file_compression; +}; +VMMDEV_ASSERT_SIZE(shfl_fsproperties, 12); + +struct shfl_volinfo { + s64 total_allocation_bytes; + s64 available_allocation_bytes; + u32 bytes_per_allocation_unit; + u32 bytes_per_sector; + u32 serial; + struct shfl_fsproperties properties; +}; + + +/** SHFL_FN_MAP_FOLDER Parameters structure. */ +struct shfl_map_folder { + /** + * pointer, in: + * Points to struct shfl_string buffer. + */ + struct vmmdev_hgcm_function_parameter path; + + /** + * pointer, out: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * pointer, in: UTF16 + * Path delimiter + */ + struct vmmdev_hgcm_function_parameter delimiter; + + /** + * pointer, in: SHFLROOT (u32) + * Case senstive flag + */ + struct vmmdev_hgcm_function_parameter case_sensitive; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_MAP_FOLDER (4) + + +/** SHFL_FN_UNMAP_FOLDER Parameters structure. */ +struct shfl_unmap_folder { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_UNMAP_FOLDER (1) + + +/** SHFL_FN_CREATE Parameters structure. */ +struct shfl_create { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * pointer, in: + * Points to struct shfl_string buffer. + */ + struct vmmdev_hgcm_function_parameter path; + + /** + * pointer, in/out: + * Points to struct shfl_createparms buffer. + */ + struct vmmdev_hgcm_function_parameter parms; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_CREATE (3) + + +/** SHFL_FN_CLOSE Parameters structure. */ +struct shfl_close { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * value64, in: + * SHFLHANDLE (u64) of object to close. + */ + struct vmmdev_hgcm_function_parameter handle; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_CLOSE (2) + + +/** SHFL_FN_READ Parameters structure. */ +struct shfl_read { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * value64, in: + * SHFLHANDLE (u64) of object to read from. + */ + struct vmmdev_hgcm_function_parameter handle; + + /** + * value64, in: + * Offset to read from. + */ + struct vmmdev_hgcm_function_parameter offset; + + /** + * value64, in/out: + * Bytes to read/How many were read. 
+ */ + struct vmmdev_hgcm_function_parameter cb; + + /** + * pointer, out: + * Buffer to place data to. + */ + struct vmmdev_hgcm_function_parameter buffer; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_READ (5) + + +/** SHFL_FN_WRITE Parameters structure. */ +struct shfl_write { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * value64, in: + * SHFLHANDLE (u64) of object to write to. + */ + struct vmmdev_hgcm_function_parameter handle; + + /** + * value64, in: + * Offset to write to. + */ + struct vmmdev_hgcm_function_parameter offset; + + /** + * value64, in/out: + * Bytes to write/How many were written. + */ + struct vmmdev_hgcm_function_parameter cb; + + /** + * pointer, in: + * Data to write. + */ + struct vmmdev_hgcm_function_parameter buffer; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_WRITE (5) + + +/* + * SHFL_FN_LIST + * Listing information includes variable length RTDIRENTRY[EX] structures. + */ + +#define SHFL_LIST_NONE 0 +#define SHFL_LIST_RETURN_ONE 1 + +/** SHFL_FN_LIST Parameters structure. */ +struct shfl_list { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * value64, in: + * SHFLHANDLE (u64) of object to be listed. + */ + struct vmmdev_hgcm_function_parameter handle; + + /** + * value32, in: + * List flags SHFL_LIST_*. + */ + struct vmmdev_hgcm_function_parameter flags; + + /** + * value32, in/out: + * Bytes to be used for listing information/How many bytes were used. + */ + struct vmmdev_hgcm_function_parameter cb; + + /** + * pointer, in/optional + * Points to struct shfl_string buffer that specifies a search path. + */ + struct vmmdev_hgcm_function_parameter path; + + /** + * pointer, out: + * Buffer to place listing information to. (struct shfl_dirinfo) + */ + struct vmmdev_hgcm_function_parameter buffer; + + /** + * value32, in/out: + * Indicates a key where the listing must be resumed. + * in: 0 means start from begin of object. + * out: 0 means listing completed. + */ + struct vmmdev_hgcm_function_parameter resume_point; + + /** + * pointer, out: + * Number of files returned + */ + struct vmmdev_hgcm_function_parameter file_count; +}; + +/* Number of parameters */ +#define SHFL_CPARMS_LIST (8) + + +/** SHFL_FN_READLINK Parameters structure. */ +struct shfl_readLink { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * pointer, in: + * Points to struct shfl_string buffer. + */ + struct vmmdev_hgcm_function_parameter path; + + /** + * pointer, out: + * Buffer to place data to. + */ + struct vmmdev_hgcm_function_parameter buffer; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_READLINK (3) + + +/* SHFL_FN_INFORMATION */ + +/* Mask of Set/Get bit. */ +#define SHFL_INFO_MODE_MASK (0x1) +/* Get information */ +#define SHFL_INFO_GET (0x0) +/* Set information */ +#define SHFL_INFO_SET (0x1) + +/* Get name of the object. */ +#define SHFL_INFO_NAME (0x2) +/* Set size of object (extend/trucate); only applies to file objects */ +#define SHFL_INFO_SIZE (0x4) +/* Get/Set file object info. */ +#define SHFL_INFO_FILE (0x8) +/* Get volume information. */ +#define SHFL_INFO_VOLUME (0x10) + +/** SHFL_FN_INFORMATION Parameters structure. 
*/ +struct shfl_information { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * value64, in: + * SHFLHANDLE (u64) of object to be listed. + */ + struct vmmdev_hgcm_function_parameter handle; + + /** + * value32, in: + * SHFL_INFO_* + */ + struct vmmdev_hgcm_function_parameter flags; + + /** + * value32, in/out: + * Bytes to be used for information/How many bytes were used. + */ + struct vmmdev_hgcm_function_parameter cb; + + /** + * pointer, in/out: + * Information to be set/get (shfl_fsobjinfo or shfl_string). Do not + * forget to set the shfl_fsobjinfo::attr::additional for a get + * operation as well. + */ + struct vmmdev_hgcm_function_parameter info; + +}; + +/* Number of parameters */ +#define SHFL_CPARMS_INFORMATION (5) + + +/* SHFL_FN_REMOVE */ + +#define SHFL_REMOVE_FILE (0x1) +#define SHFL_REMOVE_DIR (0x2) +#define SHFL_REMOVE_SYMLINK (0x4) + +/** SHFL_FN_REMOVE Parameters structure. */ +struct shfl_remove { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * pointer, in: + * Points to struct shfl_string buffer. + */ + struct vmmdev_hgcm_function_parameter path; + + /** + * value32, in: + * remove flags (file/directory) + */ + struct vmmdev_hgcm_function_parameter flags; + +}; + +#define SHFL_CPARMS_REMOVE (3) + + +/* SHFL_FN_RENAME */ + +#define SHFL_RENAME_FILE (0x1) +#define SHFL_RENAME_DIR (0x2) +#define SHFL_RENAME_REPLACE_IF_EXISTS (0x4) + +/** SHFL_FN_RENAME Parameters structure. */ +struct shfl_rename { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * pointer, in: + * Points to struct shfl_string src. + */ + struct vmmdev_hgcm_function_parameter src; + + /** + * pointer, in: + * Points to struct shfl_string dest. + */ + struct vmmdev_hgcm_function_parameter dest; + + /** + * value32, in: + * rename flags (file/directory) + */ + struct vmmdev_hgcm_function_parameter flags; + +}; + +#define SHFL_CPARMS_RENAME (4) + + +/** SHFL_FN_SYMLINK Parameters structure. */ +struct shfl_symlink { + /** + * pointer, in: SHFLROOT (u32) + * Root handle of the mapping which name is queried. + */ + struct vmmdev_hgcm_function_parameter root; + + /** + * pointer, in: + * Points to struct shfl_string of path for the new symlink. + */ + struct vmmdev_hgcm_function_parameter new_path; + + /** + * pointer, in: + * Points to struct shfl_string of destination for symlink. + */ + struct vmmdev_hgcm_function_parameter old_path; + + /** + * pointer, out: + * Information about created symlink. + */ + struct vmmdev_hgcm_function_parameter info; + +}; + +#define SHFL_CPARMS_SYMLINK (4) + +#endif diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c new file mode 100644 index 000000000000..675e26989376 --- /dev/null +++ b/fs/vboxsf/super.c @@ -0,0 +1,491 @@ +// SPDX-License-Identifier: MIT +/* + * VirtualBox Guest Shared Folders support: Virtual File System. 
+ * + * Module initialization/finalization + * File system registration/deregistration + * Superblock reading + * Few utility functions + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include "vfsmod.h" + +#define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ + +#define VBSF_MOUNT_SIGNATURE_BYTE_0 ('\000') +#define VBSF_MOUNT_SIGNATURE_BYTE_1 ('\377') +#define VBSF_MOUNT_SIGNATURE_BYTE_2 ('\376') +#define VBSF_MOUNT_SIGNATURE_BYTE_3 ('\375') + +static int follow_symlinks; +module_param(follow_symlinks, int, 0444); +MODULE_PARM_DESC(follow_symlinks, + "Let host resolve symlinks rather than showing them"); + +static DEFINE_IDA(vboxsf_bdi_ida); +static DEFINE_MUTEX(vboxsf_setup_mutex); +static bool vboxsf_setup_done; +static struct super_operations vboxsf_super_ops; /* forward declaration */ +static struct kmem_cache *vboxsf_inode_cachep; + +static char * const vboxsf_default_nls = CONFIG_NLS_DEFAULT; + +enum { opt_nls, opt_uid, opt_gid, opt_ttl, opt_dmode, opt_fmode, + opt_dmask, opt_fmask }; + +static const struct fs_parameter_spec vboxsf_fs_parameters[] = { + fsparam_string ("nls", opt_nls), + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32 ("ttl", opt_ttl), + fsparam_u32oct ("dmode", opt_dmode), + fsparam_u32oct ("fmode", opt_fmode), + fsparam_u32oct ("dmask", opt_dmask), + fsparam_u32oct ("fmask", opt_fmask), + {} +}; + +static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct vboxsf_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; + + opt = fs_parse(fc, vboxsf_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_nls: + if (ctx->nls_name || fc->purpose != FS_CONTEXT_FOR_MOUNT) { + vbg_err("vboxsf: Cannot reconfigure nls option\n"); + return -EINVAL; + } + ctx->nls_name = param->string; + param->string = NULL; + break; + case opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return -EINVAL; + ctx->o.uid = uid; + break; + case opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return -EINVAL; + ctx->o.gid = gid; + break; + case opt_ttl: + ctx->o.ttl = msecs_to_jiffies(result.uint_32); + break; + case opt_dmode: + if (result.uint_32 & ~0777) + return -EINVAL; + ctx->o.dmode = result.uint_32; + ctx->o.dmode_set = true; + break; + case opt_fmode: + if (result.uint_32 & ~0777) + return -EINVAL; + ctx->o.fmode = result.uint_32; + ctx->o.fmode_set = true; + break; + case opt_dmask: + if (result.uint_32 & ~07777) + return -EINVAL; + ctx->o.dmask = result.uint_32; + break; + case opt_fmask: + if (result.uint_32 & ~07777) + return -EINVAL; + ctx->o.fmask = result.uint_32; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct vboxsf_fs_context *ctx = fc->fs_private; + struct shfl_string *folder_name, root_path; + struct vboxsf_sbi *sbi; + struct dentry *droot; + struct inode *iroot; + char *nls_name; + size_t size; + int err; + + if (!fc->source) + return -EINVAL; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->o = ctx->o; + idr_init(&sbi->ino_idr); + spin_lock_init(&sbi->ino_idr_lock); + sbi->next_generation = 1; + sbi->bdi_id = -1; + + /* Load nls if not utf8 */ + nls_name = ctx->nls_name ? 
ctx->nls_name : vboxsf_default_nls; + if (strcmp(nls_name, "utf8") != 0) { + if (nls_name == vboxsf_default_nls) + sbi->nls = load_nls_default(); + else + sbi->nls = load_nls(nls_name); + + if (!sbi->nls) { + vbg_err("vboxsf: Count not load '%s' nls\n", nls_name); + err = -EINVAL; + goto fail_free; + } + } + + sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL); + if (sbi->bdi_id < 0) { + err = sbi->bdi_id; + goto fail_free; + } + + err = super_setup_bdi_name(sb, "vboxsf-%s.%d", fc->source, sbi->bdi_id); + if (err) + goto fail_free; + + /* Turn source into a shfl_string and map the folder */ + size = strlen(fc->source) + 1; + folder_name = kmalloc(SHFLSTRING_HEADER_SIZE + size, GFP_KERNEL); + if (!folder_name) { + err = -ENOMEM; + goto fail_free; + } + folder_name->size = size; + folder_name->length = size - 1; + strlcpy(folder_name->string.utf8, fc->source, size); + err = vboxsf_map_folder(folder_name, &sbi->root); + kfree(folder_name); + if (err) { + vbg_err("vboxsf: Host rejected mount of '%s' with error %d\n", + fc->source, err); + goto fail_free; + } + + root_path.length = 1; + root_path.size = 2; + root_path.string.utf8[0] = '/'; + root_path.string.utf8[1] = 0; + err = vboxsf_stat(sbi, &root_path, &sbi->root_info); + if (err) + goto fail_unmap; + + sb->s_magic = VBOXSF_SUPER_MAGIC; + sb->s_blocksize = 1024; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &vboxsf_super_ops; + sb->s_d_op = &vboxsf_dentry_ops; + + iroot = iget_locked(sb, 0); + if (!iroot) { + err = -ENOMEM; + goto fail_unmap; + } + vboxsf_init_inode(sbi, iroot, &sbi->root_info); + unlock_new_inode(iroot); + + droot = d_make_root(iroot); + if (!droot) { + err = -ENOMEM; + goto fail_unmap; + } + + sb->s_root = droot; + sb->s_fs_info = sbi; + return 0; + +fail_unmap: + vboxsf_unmap_folder(sbi->root); +fail_free: + if (sbi->bdi_id >= 0) + ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); + if (sbi->nls) + unload_nls(sbi->nls); + idr_destroy(&sbi->ino_idr); + kfree(sbi); + return err; +} + +static void vboxsf_inode_init_once(void *data) +{ + struct vboxsf_inode *sf_i = data; + + mutex_init(&sf_i->handle_list_mutex); + inode_init_once(&sf_i->vfs_inode); +} + +static struct inode *vboxsf_alloc_inode(struct super_block *sb) +{ + struct vboxsf_inode *sf_i; + + sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS); + if (!sf_i) + return NULL; + + sf_i->force_restat = 0; + INIT_LIST_HEAD(&sf_i->handle_list); + + return &sf_i->vfs_inode; +} + +static void vboxsf_free_inode(struct inode *inode) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&sbi->ino_idr_lock, flags); + idr_remove(&sbi->ino_idr, inode->i_ino); + spin_unlock_irqrestore(&sbi->ino_idr_lock, flags); + kmem_cache_free(vboxsf_inode_cachep, VBOXSF_I(inode)); +} + +static void vboxsf_put_super(struct super_block *sb) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(sb); + + vboxsf_unmap_folder(sbi->root); + if (sbi->bdi_id >= 0) + ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id); + if (sbi->nls) + unload_nls(sbi->nls); + + /* + * vboxsf_free_inode uses the idr, make sure all delayed rcu free + * inodes are flushed. 
+ */ + rcu_barrier(); + idr_destroy(&sbi->ino_idr); + kfree(sbi); +} + +static int vboxsf_statfs(struct dentry *dentry, struct kstatfs *stat) +{ + struct super_block *sb = dentry->d_sb; + struct shfl_volinfo shfl_volinfo; + struct vboxsf_sbi *sbi; + u32 buf_len; + int err; + + sbi = VBOXSF_SBI(sb); + buf_len = sizeof(shfl_volinfo); + err = vboxsf_fsinfo(sbi->root, 0, SHFL_INFO_GET | SHFL_INFO_VOLUME, + &buf_len, &shfl_volinfo); + if (err) + return err; + + stat->f_type = VBOXSF_SUPER_MAGIC; + stat->f_bsize = shfl_volinfo.bytes_per_allocation_unit; + + do_div(shfl_volinfo.total_allocation_bytes, + shfl_volinfo.bytes_per_allocation_unit); + stat->f_blocks = shfl_volinfo.total_allocation_bytes; + + do_div(shfl_volinfo.available_allocation_bytes, + shfl_volinfo.bytes_per_allocation_unit); + stat->f_bfree = shfl_volinfo.available_allocation_bytes; + stat->f_bavail = shfl_volinfo.available_allocation_bytes; + + stat->f_files = 1000; + /* + * Don't return 0 here since the guest may then think that it is not + * possible to create any more files. + */ + stat->f_ffree = 1000000; + stat->f_fsid.val[0] = 0; + stat->f_fsid.val[1] = 0; + stat->f_namelen = 255; + return 0; +} + +static struct super_operations vboxsf_super_ops = { + .alloc_inode = vboxsf_alloc_inode, + .free_inode = vboxsf_free_inode, + .put_super = vboxsf_put_super, + .statfs = vboxsf_statfs, +}; + +static int vboxsf_setup(void) +{ + int err; + + mutex_lock(&vboxsf_setup_mutex); + + if (vboxsf_setup_done) + goto success; + + vboxsf_inode_cachep = + kmem_cache_create("vboxsf_inode_cache", + sizeof(struct vboxsf_inode), 0, + (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT), + vboxsf_inode_init_once); + if (!vboxsf_inode_cachep) { + err = -ENOMEM; + goto fail_nomem; + } + + err = vboxsf_connect(); + if (err) { + vbg_err("vboxsf: err %d connecting to guest PCI-device\n", err); + vbg_err("vboxsf: make sure you are inside a VirtualBox VM\n"); + vbg_err("vboxsf: and check dmesg for vboxguest errors\n"); + goto fail_free_cache; + } + + err = vboxsf_set_utf8(); + if (err) { + vbg_err("vboxsf_setutf8 error %d\n", err); + goto fail_disconnect; + } + + if (!follow_symlinks) { + err = vboxsf_set_symlinks(); + if (err) + vbg_warn("vboxsf: Unable to show symlinks: %d\n", err); + } + + vboxsf_setup_done = true; +success: + mutex_unlock(&vboxsf_setup_mutex); + return 0; + +fail_disconnect: + vboxsf_disconnect(); +fail_free_cache: + kmem_cache_destroy(vboxsf_inode_cachep); +fail_nomem: + mutex_unlock(&vboxsf_setup_mutex); + return err; +} + +static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) +{ + char *options = data; + + if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && + options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && + options[2] == VBSF_MOUNT_SIGNATURE_BYTE_2 && + options[3] == VBSF_MOUNT_SIGNATURE_BYTE_3) { + vbg_err("vboxsf: Old binary mount data not supported, remove obsolete mount.vboxsf and/or update your VBoxService.\n"); + return -EINVAL; + } + + return generic_parse_monolithic(fc, data); +} + +static int vboxsf_get_tree(struct fs_context *fc) +{ + int err; + + err = vboxsf_setup(); + if (err) + return err; + + return get_tree_nodev(fc, vboxsf_fill_super); +} + +static int vboxsf_reconfigure(struct fs_context *fc) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(fc->root->d_sb); + struct vboxsf_fs_context *ctx = fc->fs_private; + struct inode *iroot = fc->root->d_sb->s_root->d_inode; + + /* Apply changed options to the root inode */ + sbi->o = ctx->o; + vboxsf_init_inode(sbi, iroot, &sbi->root_info); + + return 0; +} 
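
As an aside on the option handling above: dmode/fmode, when given, replace the host-reported permission bits, and dmask/fmask are then cleared on top of the result (see vboxsf_init_inode() in utils.c further down). A minimal user-space sketch of that combination follows; the helper name and the example option values are invented purely for illustration and are not part of the patch.

#include <stdio.h>
#include <sys/stat.h>

/* Illustration only: mirrors the dmode/dmask handling in vboxsf_init_inode(). */
static unsigned int effective_dir_mode(unsigned int host_mode, int dmode_set,
                                       unsigned int dmode, unsigned int dmask)
{
        unsigned int mode = dmode_set ? dmode : host_mode;

        mode &= ~dmask;         /* mask bits are always cleared */
        return mode | S_IFDIR;  /* directory inodes also get S_IFDIR */
}

int main(void)
{
        /* host reports 0770, mounted with dmode=0555,dmask=0022 (hypothetical) */
        printf("%o\n", effective_dir_mode(0770, 1, 0555, 0022));
        return 0;
}

With these values the printed mode is 40555, i.e. the host bits are ignored in favour of dmode and any bits in dmask stay cleared.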
+ +static void vboxsf_free_fc(struct fs_context *fc) +{ + struct vboxsf_fs_context *ctx = fc->fs_private; + + kfree(ctx->nls_name); + kfree(ctx); +} + +static const struct fs_context_operations vboxsf_context_ops = { + .free = vboxsf_free_fc, + .parse_param = vboxsf_parse_param, + .parse_monolithic = vboxsf_parse_monolithic, + .get_tree = vboxsf_get_tree, + .reconfigure = vboxsf_reconfigure, +}; + +static int vboxsf_init_fs_context(struct fs_context *fc) +{ + struct vboxsf_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + current_uid_gid(&ctx->o.uid, &ctx->o.gid); + + fc->fs_private = ctx; + fc->ops = &vboxsf_context_ops; + return 0; +} + +static struct file_system_type vboxsf_fs_type = { + .owner = THIS_MODULE, + .name = "vboxsf", + .init_fs_context = vboxsf_init_fs_context, + .kill_sb = kill_anon_super +}; + +/* Module initialization/finalization handlers */ +static int __init vboxsf_init(void) +{ + return register_filesystem(&vboxsf_fs_type); +} + +static void __exit vboxsf_fini(void) +{ + unregister_filesystem(&vboxsf_fs_type); + + mutex_lock(&vboxsf_setup_mutex); + if (vboxsf_setup_done) { + vboxsf_disconnect(); + /* + * Make sure all delayed rcu free inodes are flushed + * before we destroy the cache. + */ + rcu_barrier(); + kmem_cache_destroy(vboxsf_inode_cachep); + } + mutex_unlock(&vboxsf_setup_mutex); +} + +module_init(vboxsf_init); +module_exit(vboxsf_fini); + +MODULE_DESCRIPTION("Oracle VM VirtualBox Module for Host File System Access"); +MODULE_AUTHOR("Oracle Corporation"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_FS("vboxsf"); diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c new file mode 100644 index 000000000000..96bd160da48b --- /dev/null +++ b/fs/vboxsf/utils.c @@ -0,0 +1,551 @@ +// SPDX-License-Identifier: MIT +/* + * VirtualBox Guest Shared Folders support: Utility functions. + * Mainly conversion from/to VirtualBox/Linux data structures. + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#include +#include +#include +#include +#include "vfsmod.h" + +struct inode *vboxsf_new_inode(struct super_block *sb) +{ + struct vboxsf_sbi *sbi = VBOXSF_SBI(sb); + struct inode *inode; + unsigned long flags; + int cursor, ret; + u32 gen; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&sbi->ino_idr_lock, flags); + cursor = idr_get_cursor(&sbi->ino_idr); + ret = idr_alloc_cyclic(&sbi->ino_idr, inode, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < cursor) + sbi->next_generation++; + gen = sbi->next_generation; + spin_unlock_irqrestore(&sbi->ino_idr_lock, flags); + idr_preload_end(); + + if (ret < 0) { + iput(inode); + return ERR_PTR(ret); + } + + inode->i_ino = ret; + inode->i_generation = gen; + return inode; +} + +/* set [inode] attributes based on [info], uid/gid based on [sbi] */ +void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, + const struct shfl_fsobjinfo *info) +{ + const struct shfl_fsobjattr *attr; + s64 allocated; + int mode; + + attr = &info->attr; + +#define mode_set(r) ((attr->mode & (SHFL_UNIX_##r)) ? 
(S_##r) : 0)
+
+ mode = mode_set(IRUSR);
+ mode |= mode_set(IWUSR);
+ mode |= mode_set(IXUSR);
+
+ mode |= mode_set(IRGRP);
+ mode |= mode_set(IWGRP);
+ mode |= mode_set(IXGRP);
+
+ mode |= mode_set(IROTH);
+ mode |= mode_set(IWOTH);
+ mode |= mode_set(IXOTH);
+
+#undef mode_set
+
+ /* We use the host-side values for these */
+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
+ inode->i_mapping->a_ops = &vboxsf_reg_aops;
+
+ if (SHFL_IS_DIRECTORY(attr->mode)) {
+ inode->i_mode = sbi->o.dmode_set ? sbi->o.dmode : mode;
+ inode->i_mode &= ~sbi->o.dmask;
+ inode->i_mode |= S_IFDIR;
+ inode->i_op = &vboxsf_dir_iops;
+ inode->i_fop = &vboxsf_dir_fops;
+ /*
+ * XXX: this probably should be set to the number of entries
+ * in the directory plus two (. ..)
+ */
+ set_nlink(inode, 1);
+ } else if (SHFL_IS_SYMLINK(attr->mode)) {
+ inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
+ inode->i_mode &= ~sbi->o.fmask;
+ inode->i_mode |= S_IFLNK;
+ inode->i_op = &vboxsf_lnk_iops;
+ set_nlink(inode, 1);
+ } else {
+ inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
+ inode->i_mode &= ~sbi->o.fmask;
+ inode->i_mode |= S_IFREG;
+ inode->i_op = &vboxsf_reg_iops;
+ inode->i_fop = &vboxsf_reg_fops;
+ set_nlink(inode, 1);
+ }
+
+ inode->i_uid = sbi->o.uid;
+ inode->i_gid = sbi->o.gid;
+
+ inode->i_size = info->size;
+ inode->i_blkbits = 12;
+ /* i_blocks always in units of 512 bytes! */
+ allocated = info->allocated + 511;
+ do_div(allocated, 512);
+ inode->i_blocks = allocated;
+
+ inode->i_atime = ns_to_timespec64(
+ info->access_time.ns_relative_to_unix_epoch);
+ inode->i_ctime = ns_to_timespec64(
+ info->change_time.ns_relative_to_unix_epoch);
+ inode->i_mtime = ns_to_timespec64(
+ info->modification_time.ns_relative_to_unix_epoch);
+}
+
+int vboxsf_create_at_dentry(struct dentry *dentry,
+ struct shfl_createparms *params)
+{
+ struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
+ struct shfl_string *path;
+ int err;
+
+ path = vboxsf_path_from_dentry(sbi, dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ err = vboxsf_create(sbi->root, path, params);
+ __putname(path);
+
+ return err;
+}
+
+int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
+ struct shfl_fsobjinfo *info)
+{
+ struct shfl_createparms params = {};
+ int err;
+
+ params.handle = SHFL_HANDLE_NIL;
+ params.create_flags = SHFL_CF_LOOKUP | SHFL_CF_ACT_FAIL_IF_NEW;
+
+ err = vboxsf_create(sbi->root, path, &params);
+ if (err)
+ return err;
+
+ if (params.result != SHFL_FILE_EXISTS)
+ return -ENOENT;
+
+ if (info)
+ *info = params.info;
+
+ return 0;
+}
+
+int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info)
+{
+ struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
+ struct shfl_string *path;
+ int err;
+
+ path = vboxsf_path_from_dentry(sbi, dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ err = vboxsf_stat(sbi, path, info);
+ __putname(path);
+ return err;
+}
+
+int vboxsf_inode_revalidate(struct dentry *dentry)
+{
+ struct vboxsf_sbi *sbi;
+ struct vboxsf_inode *sf_i;
+ struct shfl_fsobjinfo info;
+ struct timespec64 prev_mtime;
+ struct inode *inode;
+ int err;
+
+ if (!dentry || !d_really_is_positive(dentry))
+ return -EINVAL;
+
+ inode = d_inode(dentry);
+ prev_mtime = inode->i_mtime;
+ sf_i = VBOXSF_I(inode);
+ sbi = VBOXSF_SBI(dentry->d_sb);
+ if (!sf_i->force_restat) {
+ if (time_before(jiffies, dentry->d_time + sbi->o.ttl))
+ return 0;
+ }
+
+ err = vboxsf_stat_dentry(dentry, &info);
+ if (err)
+ return err;
+
+ dentry->d_time = jiffies;
+ sf_i->force_restat = 0;
+ vboxsf_init_inode(sbi, inode, &info);
+
+ /*
+ * If the file was changed on the host side we need to invalidate the
+ * page-cache for it. Note this also gets triggered by our own writes,
+ * this is unavoidable.
+ */
+ if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0)
+ invalidate_inode_pages2(inode->i_mapping);
+
+ return 0;
+}
+
+int vboxsf_getattr(const struct path *path, struct kstat *kstat,
+ u32 request_mask, unsigned int flags)
+{
+ int err;
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
+ struct vboxsf_inode *sf_i = VBOXSF_I(inode);
+
+ switch (flags & AT_STATX_SYNC_TYPE) {
+ case AT_STATX_DONT_SYNC:
+ err = 0;
+ break;
+ case AT_STATX_FORCE_SYNC:
+ sf_i->force_restat = 1;
+ /* fall-through */
+ default:
+ err = vboxsf_inode_revalidate(dentry);
+ }
+ if (err)
+ return err;
+
+ generic_fillattr(d_inode(dentry), kstat);
+ return 0;
+}
+
+int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry));
+ struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
+ struct shfl_createparms params = {};
+ struct shfl_fsobjinfo info = {};
+ u32 buf_len;
+ int err;
+
+ params.handle = SHFL_HANDLE_NIL;
+ params.create_flags = SHFL_CF_ACT_OPEN_IF_EXISTS |
+ SHFL_CF_ACT_FAIL_IF_NEW |
+ SHFL_CF_ACCESS_ATTR_WRITE;
+
+ /* this is at least required for Posix hosts */
+ if (iattr->ia_valid & ATTR_SIZE)
+ params.create_flags |= SHFL_CF_ACCESS_WRITE;
+
+ err = vboxsf_create_at_dentry(dentry, &params);
+ if (err || params.result != SHFL_FILE_EXISTS)
+ return err ? err : -ENOENT;
+
+#define mode_set(r) ((iattr->ia_mode & (S_##r)) ? SHFL_UNIX_##r : 0)
+
+ /*
+ * Setting the file size and setting the other attributes has to
+ * be handled separately.
+ */
+ if (iattr->ia_valid & (ATTR_MODE | ATTR_ATIME | ATTR_MTIME)) {
+ if (iattr->ia_valid & ATTR_MODE) {
+ info.attr.mode = mode_set(IRUSR);
+ info.attr.mode |= mode_set(IWUSR);
+ info.attr.mode |= mode_set(IXUSR);
+ info.attr.mode |= mode_set(IRGRP);
+ info.attr.mode |= mode_set(IWGRP);
+ info.attr.mode |= mode_set(IXGRP);
+ info.attr.mode |= mode_set(IROTH);
+ info.attr.mode |= mode_set(IWOTH);
+ info.attr.mode |= mode_set(IXOTH);
+
+ if (iattr->ia_mode & S_IFDIR)
+ info.attr.mode |= SHFL_TYPE_DIRECTORY;
+ else
+ info.attr.mode |= SHFL_TYPE_FILE;
+ }
+
+ if (iattr->ia_valid & ATTR_ATIME)
+ info.access_time.ns_relative_to_unix_epoch =
+ timespec64_to_ns(&iattr->ia_atime);
+
+ if (iattr->ia_valid & ATTR_MTIME)
+ info.modification_time.ns_relative_to_unix_epoch =
+ timespec64_to_ns(&iattr->ia_mtime);
+
+ /*
+ * Ignore ctime (inode change time) as it can't be set
+ * from userland anyway.
+ */
+
+ buf_len = sizeof(info);
+ err = vboxsf_fsinfo(sbi->root, params.handle,
+ SHFL_INFO_SET | SHFL_INFO_FILE, &buf_len,
+ &info);
+ if (err) {
+ vboxsf_close(sbi->root, params.handle);
+ return err;
+ }
+
+ /* the host may have given us different attr than requested */
+ sf_i->force_restat = 1;
+ }
+
+#undef mode_set
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ memset(&info, 0, sizeof(info));
+ info.size = iattr->ia_size;
+ buf_len = sizeof(info);
+ err = vboxsf_fsinfo(sbi->root, params.handle,
+ SHFL_INFO_SET | SHFL_INFO_SIZE, &buf_len,
+ &info);
+ if (err) {
+ vboxsf_close(sbi->root, params.handle);
+ return err;
+ }
+
+ /* the host may have given us different attr than requested */
+ sf_i->force_restat = 1;
+ }
+
+ vboxsf_close(sbi->root, params.handle);
+
+ /* Update the inode with what the host has actually given us.
*/ + if (sf_i->force_restat) + vboxsf_inode_revalidate(dentry); + + return 0; +} + +/* + * [dentry] contains string encoded in coding system that corresponds + * to [sbi]->nls, we must convert it to UTF8 here. + * Returns a shfl_string allocated through __getname (must be freed using + * __putname), or an ERR_PTR on error. + */ +struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, + struct dentry *dentry) +{ + struct shfl_string *shfl_path; + int path_len, out_len, nb; + char *buf, *path; + wchar_t uni; + u8 *out; + + buf = __getname(); + if (!buf) + return ERR_PTR(-ENOMEM); + + path = dentry_path_raw(dentry, buf, PATH_MAX); + if (IS_ERR(path)) { + __putname(buf); + return ERR_CAST(path); + } + path_len = strlen(path); + + if (sbi->nls) { + shfl_path = __getname(); + if (!shfl_path) { + __putname(buf); + return ERR_PTR(-ENOMEM); + } + + out = shfl_path->string.utf8; + out_len = PATH_MAX - SHFLSTRING_HEADER_SIZE - 1; + + while (path_len) { + nb = sbi->nls->char2uni(path, path_len, &uni); + if (nb < 0) { + __putname(shfl_path); + __putname(buf); + return ERR_PTR(-EINVAL); + } + path += nb; + path_len -= nb; + + nb = utf32_to_utf8(uni, out, out_len); + if (nb < 0) { + __putname(shfl_path); + __putname(buf); + return ERR_PTR(-ENAMETOOLONG); + } + out += nb; + out_len -= nb; + } + *out = 0; + shfl_path->length = out - shfl_path->string.utf8; + shfl_path->size = shfl_path->length + 1; + __putname(buf); + } else { + if ((SHFLSTRING_HEADER_SIZE + path_len + 1) > PATH_MAX) { + __putname(buf); + return ERR_PTR(-ENAMETOOLONG); + } + /* + * dentry_path stores the name at the end of buf, but the + * shfl_string string we return must be properly aligned. + */ + shfl_path = (struct shfl_string *)buf; + memmove(shfl_path->string.utf8, path, path_len); + shfl_path->string.utf8[path_len] = 0; + shfl_path->length = path_len; + shfl_path->size = path_len + 1; + } + + return shfl_path; +} + +int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, + const unsigned char *utf8_name, size_t utf8_len) +{ + const char *in; + char *out; + size_t out_len; + size_t out_bound_len; + size_t in_bound_len; + + in = utf8_name; + in_bound_len = utf8_len; + + out = name; + out_len = 0; + /* Reserve space for terminating 0 */ + out_bound_len = name_bound_len - 1; + + while (in_bound_len) { + int nb; + unicode_t uni; + + nb = utf8_to_utf32(in, in_bound_len, &uni); + if (nb < 0) + return -EINVAL; + + in += nb; + in_bound_len -= nb; + + nb = sbi->nls->uni2char(uni, out, out_bound_len); + if (nb < 0) + return nb; + + out += nb; + out_bound_len -= nb; + out_len += nb; + } + + *out = 0; + + return 0; +} + +static struct vboxsf_dir_buf *vboxsf_dir_buf_alloc(struct list_head *list) +{ + struct vboxsf_dir_buf *b; + + b = kmalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return NULL; + + b->buf = kmalloc(DIR_BUFFER_SIZE, GFP_KERNEL); + if (!b->buf) { + kfree(b); + return NULL; + } + + b->entries = 0; + b->used = 0; + b->free = DIR_BUFFER_SIZE; + list_add(&b->head, list); + + return b; +} + +static void vboxsf_dir_buf_free(struct vboxsf_dir_buf *b) +{ + list_del(&b->head); + kfree(b->buf); + kfree(b); +} + +struct vboxsf_dir_info *vboxsf_dir_info_alloc(void) +{ + struct vboxsf_dir_info *p; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return NULL; + + INIT_LIST_HEAD(&p->info_list); + return p; +} + +void vboxsf_dir_info_free(struct vboxsf_dir_info *p) +{ + struct list_head *list, *pos, *tmp; + + list = &p->info_list; + list_for_each_safe(pos, tmp, list) { + struct vboxsf_dir_buf *b; + + b = 
list_entry(pos, struct vboxsf_dir_buf, head); + vboxsf_dir_buf_free(b); + } + kfree(p); +} + +int vboxsf_dir_read_all(struct vboxsf_sbi *sbi, struct vboxsf_dir_info *sf_d, + u64 handle) +{ + struct vboxsf_dir_buf *b; + u32 entries, size; + int err = 0; + void *buf; + + /* vboxsf_dirinfo returns 1 on end of dir */ + while (err == 0) { + b = vboxsf_dir_buf_alloc(&sf_d->info_list); + if (!b) { + err = -ENOMEM; + break; + } + + buf = b->buf; + size = b->free; + + err = vboxsf_dirinfo(sbi->root, handle, NULL, 0, 0, + &size, buf, &entries); + if (err < 0) + break; + + b->entries += entries; + b->free -= size; + b->used += size; + } + + if (b && b->used == 0) + vboxsf_dir_buf_free(b); + + /* -EILSEQ means the host could not translate a filename, ignore */ + if (err > 0 || err == -EILSEQ) + err = 0; + + return err; +} diff --git a/fs/vboxsf/vboxsf_wrappers.c b/fs/vboxsf/vboxsf_wrappers.c new file mode 100644 index 000000000000..bfc78a097dae --- /dev/null +++ b/fs/vboxsf/vboxsf_wrappers.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: MIT +/* + * Wrapper functions for the shfl host calls. + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#include +#include +#include +#include +#include "vfsmod.h" + +#define SHFL_REQUEST \ + (VMMDEV_REQUESTOR_KERNEL | VMMDEV_REQUESTOR_USR_DRV_OTHER | \ + VMMDEV_REQUESTOR_CON_DONT_KNOW | VMMDEV_REQUESTOR_TRUST_NOT_GIVEN) + +static u32 vboxsf_client_id; + +int vboxsf_connect(void) +{ + struct vbg_dev *gdev; + struct vmmdev_hgcm_service_location loc; + int err, vbox_status; + + loc.type = VMMDEV_HGCM_LOC_LOCALHOST_EXISTING; + strcpy(loc.u.localhost.service_name, "VBoxSharedFolders"); + + gdev = vbg_get_gdev(); + if (IS_ERR(gdev)) + return -ENODEV; /* No guest-device */ + + err = vbg_hgcm_connect(gdev, SHFL_REQUEST, &loc, + &vboxsf_client_id, &vbox_status); + vbg_put_gdev(gdev); + + return err ? 
err : vbg_status_code_to_errno(vbox_status); +} + +void vboxsf_disconnect(void) +{ + struct vbg_dev *gdev; + int vbox_status; + + gdev = vbg_get_gdev(); + if (IS_ERR(gdev)) + return; /* guest-device is gone, already disconnected */ + + vbg_hgcm_disconnect(gdev, SHFL_REQUEST, vboxsf_client_id, &vbox_status); + vbg_put_gdev(gdev); +} + +static int vboxsf_call(u32 function, void *parms, u32 parm_count, int *status) +{ + struct vbg_dev *gdev; + int err, vbox_status; + + gdev = vbg_get_gdev(); + if (IS_ERR(gdev)) + return -ESHUTDOWN; /* guest-dev removed underneath us */ + + err = vbg_hgcm_call(gdev, SHFL_REQUEST, vboxsf_client_id, function, + U32_MAX, parms, parm_count, &vbox_status); + vbg_put_gdev(gdev); + + if (err < 0) + return err; + + if (status) + *status = vbox_status; + + return vbg_status_code_to_errno(vbox_status); +} + +int vboxsf_map_folder(struct shfl_string *folder_name, u32 *root) +{ + struct shfl_map_folder parms; + int err, status; + + parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL; + parms.path.u.pointer.size = shfl_string_buf_size(folder_name); + parms.path.u.pointer.u.linear_addr = (uintptr_t)folder_name; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = 0; + + parms.delimiter.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.delimiter.u.value32 = '/'; + + parms.case_sensitive.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.case_sensitive.u.value32 = 1; + + err = vboxsf_call(SHFL_FN_MAP_FOLDER, &parms, SHFL_CPARMS_MAP_FOLDER, + &status); + if (err == -ENOSYS && status == VERR_NOT_IMPLEMENTED) + vbg_err("%s: Error host is too old\n", __func__); + + *root = parms.root.u.value32; + return err; +} + +int vboxsf_unmap_folder(u32 root) +{ + struct shfl_unmap_folder parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + return vboxsf_call(SHFL_FN_UNMAP_FOLDER, &parms, + SHFL_CPARMS_UNMAP_FOLDER, NULL); +} + +/** + * vboxsf_create - Create a new file or folder + * @root: Root of the shared folder in which to create the file + * @parsed_path: The path of the file or folder relative to the shared folder + * @param: create_parms Parameters for file/folder creation. + * + * Create a new file or folder or open an existing one in a shared folder. + * Note this function always returns 0 / success unless an exceptional condition + * occurs - out of memory, invalid arguments, etc. If the file or folder could + * not be opened or created, create_parms->handle will be set to + * SHFL_HANDLE_NIL on return. In this case the value in create_parms->result + * provides information as to why (e.g. SHFL_FILE_EXISTS), create_parms->result + * is also set on success as additional information. + * + * Returns: + * 0 or negative errno value. 
+ */ +int vboxsf_create(u32 root, struct shfl_string *parsed_path, + struct shfl_createparms *create_parms) +{ + struct shfl_create parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL; + parms.path.u.pointer.size = shfl_string_buf_size(parsed_path); + parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path; + + parms.parms.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL; + parms.parms.u.pointer.size = sizeof(struct shfl_createparms); + parms.parms.u.pointer.u.linear_addr = (uintptr_t)create_parms; + + return vboxsf_call(SHFL_FN_CREATE, &parms, SHFL_CPARMS_CREATE, NULL); +} + +int vboxsf_close(u32 root, u64 handle) +{ + struct shfl_close parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.handle.u.value64 = handle; + + return vboxsf_call(SHFL_FN_CLOSE, &parms, SHFL_CPARMS_CLOSE, NULL); +} + +int vboxsf_remove(u32 root, struct shfl_string *parsed_path, u32 flags) +{ + struct shfl_remove parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.path.u.pointer.size = shfl_string_buf_size(parsed_path); + parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path; + + parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.flags.u.value32 = flags; + + return vboxsf_call(SHFL_FN_REMOVE, &parms, SHFL_CPARMS_REMOVE, NULL); +} + +int vboxsf_rename(u32 root, struct shfl_string *src_path, + struct shfl_string *dest_path, u32 flags) +{ + struct shfl_rename parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.src.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.src.u.pointer.size = shfl_string_buf_size(src_path); + parms.src.u.pointer.u.linear_addr = (uintptr_t)src_path; + + parms.dest.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.dest.u.pointer.size = shfl_string_buf_size(dest_path); + parms.dest.u.pointer.u.linear_addr = (uintptr_t)dest_path; + + parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.flags.u.value32 = flags; + + return vboxsf_call(SHFL_FN_RENAME, &parms, SHFL_CPARMS_RENAME, NULL); +} + +int vboxsf_read(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf) +{ + struct shfl_read parms; + int err; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.handle.u.value64 = handle; + parms.offset.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.offset.u.value64 = offset; + parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.cb.u.value32 = *buf_len; + parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT; + parms.buffer.u.pointer.size = *buf_len; + parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf; + + err = vboxsf_call(SHFL_FN_READ, &parms, SHFL_CPARMS_READ, NULL); + + *buf_len = parms.cb.u.value32; + return err; +} + +int vboxsf_write(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf) +{ + struct shfl_write parms; + int err; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.handle.u.value64 = handle; + parms.offset.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.offset.u.value64 = offset; + parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.cb.u.value32 = *buf_len; + parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + 
parms.buffer.u.pointer.size = *buf_len; + parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf; + + err = vboxsf_call(SHFL_FN_WRITE, &parms, SHFL_CPARMS_WRITE, NULL); + + *buf_len = parms.cb.u.value32; + return err; +} + +/* Returns 0 on success, 1 on end-of-dir, negative errno otherwise */ +int vboxsf_dirinfo(u32 root, u64 handle, + struct shfl_string *parsed_path, u32 flags, u32 index, + u32 *buf_len, struct shfl_dirinfo *buf, u32 *file_count) +{ + struct shfl_list parms; + int err, status; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.handle.u.value64 = handle; + parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.flags.u.value32 = flags; + parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.cb.u.value32 = *buf_len; + if (parsed_path) { + parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.path.u.pointer.size = shfl_string_buf_size(parsed_path); + parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path; + } else { + parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_IN; + parms.path.u.pointer.size = 0; + parms.path.u.pointer.u.linear_addr = 0; + } + + parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT; + parms.buffer.u.pointer.size = *buf_len; + parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf; + + parms.resume_point.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.resume_point.u.value32 = index; + parms.file_count.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.file_count.u.value32 = 0; /* out parameter only */ + + err = vboxsf_call(SHFL_FN_LIST, &parms, SHFL_CPARMS_LIST, &status); + if (err == -ENODATA && status == VERR_NO_MORE_FILES) + err = 1; + + *buf_len = parms.cb.u.value32; + *file_count = parms.file_count.u.value32; + return err; +} + +int vboxsf_fsinfo(u32 root, u64 handle, u32 flags, + u32 *buf_len, void *buf) +{ + struct shfl_information parms; + int err; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT; + parms.handle.u.value64 = handle; + parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.flags.u.value32 = flags; + parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.cb.u.value32 = *buf_len; + parms.info.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL; + parms.info.u.pointer.size = *buf_len; + parms.info.u.pointer.u.linear_addr = (uintptr_t)buf; + + err = vboxsf_call(SHFL_FN_INFORMATION, &parms, SHFL_CPARMS_INFORMATION, + NULL); + + *buf_len = parms.cb.u.value32; + return err; +} + +int vboxsf_readlink(u32 root, struct shfl_string *parsed_path, + u32 buf_len, u8 *buf) +{ + struct shfl_readLink parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.path.u.pointer.size = shfl_string_buf_size(parsed_path); + parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path; + + parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT; + parms.buffer.u.pointer.size = buf_len; + parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf; + + return vboxsf_call(SHFL_FN_READLINK, &parms, SHFL_CPARMS_READLINK, + NULL); +} + +int vboxsf_symlink(u32 root, struct shfl_string *new_path, + struct shfl_string *old_path, struct shfl_fsobjinfo *buf) +{ + struct shfl_symlink parms; + + parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT; + parms.root.u.value32 = root; + + parms.new_path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.new_path.u.pointer.size = 
shfl_string_buf_size(new_path); + parms.new_path.u.pointer.u.linear_addr = (uintptr_t)new_path; + + parms.old_path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN; + parms.old_path.u.pointer.size = shfl_string_buf_size(old_path); + parms.old_path.u.pointer.u.linear_addr = (uintptr_t)old_path; + + parms.info.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT; + parms.info.u.pointer.size = sizeof(struct shfl_fsobjinfo); + parms.info.u.pointer.u.linear_addr = (uintptr_t)buf; + + return vboxsf_call(SHFL_FN_SYMLINK, &parms, SHFL_CPARMS_SYMLINK, NULL); +} + +int vboxsf_set_utf8(void) +{ + return vboxsf_call(SHFL_FN_SET_UTF8, NULL, 0, NULL); +} + +int vboxsf_set_symlinks(void) +{ + return vboxsf_call(SHFL_FN_SET_SYMLINKS, NULL, 0, NULL); +} diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h new file mode 100644 index 000000000000..18f95b00fc33 --- /dev/null +++ b/fs/vboxsf/vfsmod.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: MIT */ +/* + * VirtualBox Guest Shared Folders support: module header. + * + * Copyright (C) 2006-2018 Oracle Corporation + */ + +#ifndef VFSMOD_H +#define VFSMOD_H + +#include +#include +#include "shfl_hostintf.h" + +#define DIR_BUFFER_SIZE SZ_16K + +/* The cast is to prevent assignment of void * to pointers of arbitrary type */ +#define VBOXSF_SBI(sb) ((struct vboxsf_sbi *)(sb)->s_fs_info) +#define VBOXSF_I(i) container_of(i, struct vboxsf_inode, vfs_inode) + +struct vboxsf_options { + unsigned long ttl; + kuid_t uid; + kgid_t gid; + bool dmode_set; + bool fmode_set; + umode_t dmode; + umode_t fmode; + umode_t dmask; + umode_t fmask; +}; + +struct vboxsf_fs_context { + struct vboxsf_options o; + char *nls_name; +}; + +/* per-shared folder information */ +struct vboxsf_sbi { + struct vboxsf_options o; + struct shfl_fsobjinfo root_info; + struct idr ino_idr; + spinlock_t ino_idr_lock; /* This protects ino_idr */ + struct nls_table *nls; + u32 next_generation; + u32 root; + int bdi_id; +}; + +/* per-inode information */ +struct vboxsf_inode { + /* some information was changed, update data on next revalidate */ + int force_restat; + /* list of open handles for this inode + lock protecting it */ + struct list_head handle_list; + /* This mutex protects handle_list accesses */ + struct mutex handle_list_mutex; + /* The VFS inode struct */ + struct inode vfs_inode; +}; + +struct vboxsf_dir_info { + struct list_head info_list; +}; + +struct vboxsf_dir_buf { + size_t entries; + size_t free; + size_t used; + void *buf; + struct list_head head; +}; + +/* globals */ +extern const struct inode_operations vboxsf_dir_iops; +extern const struct inode_operations vboxsf_lnk_iops; +extern const struct inode_operations vboxsf_reg_iops; +extern const struct file_operations vboxsf_dir_fops; +extern const struct file_operations vboxsf_reg_fops; +extern const struct address_space_operations vboxsf_reg_aops; +extern const struct dentry_operations vboxsf_dentry_ops; + +/* from utils.c */ +struct inode *vboxsf_new_inode(struct super_block *sb); +void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, + const struct shfl_fsobjinfo *info); +int vboxsf_create_at_dentry(struct dentry *dentry, + struct shfl_createparms *params); +int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, + struct shfl_fsobjinfo *info); +int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info); +int vboxsf_inode_revalidate(struct dentry *dentry); +int vboxsf_getattr(const struct path *path, struct kstat *kstat, + u32 request_mask, unsigned int query_flags); +int vboxsf_setattr(struct 
dentry *dentry, struct iattr *iattr); +struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, + struct dentry *dentry); +int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, + const unsigned char *utf8_name, size_t utf8_len); +struct vboxsf_dir_info *vboxsf_dir_info_alloc(void); +void vboxsf_dir_info_free(struct vboxsf_dir_info *p); +int vboxsf_dir_read_all(struct vboxsf_sbi *sbi, struct vboxsf_dir_info *sf_d, + u64 handle); + +/* from vboxsf_wrappers.c */ +int vboxsf_connect(void); +void vboxsf_disconnect(void); + +int vboxsf_create(u32 root, struct shfl_string *parsed_path, + struct shfl_createparms *create_parms); + +int vboxsf_close(u32 root, u64 handle); +int vboxsf_remove(u32 root, struct shfl_string *parsed_path, u32 flags); +int vboxsf_rename(u32 root, struct shfl_string *src_path, + struct shfl_string *dest_path, u32 flags); + +int vboxsf_read(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf); +int vboxsf_write(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf); + +int vboxsf_dirinfo(u32 root, u64 handle, + struct shfl_string *parsed_path, u32 flags, u32 index, + u32 *buf_len, struct shfl_dirinfo *buf, u32 *file_count); +int vboxsf_fsinfo(u32 root, u64 handle, u32 flags, + u32 *buf_len, void *buf); + +int vboxsf_map_folder(struct shfl_string *folder_name, u32 *root); +int vboxsf_unmap_folder(u32 root); + +int vboxsf_readlink(u32 root, struct shfl_string *parsed_path, + u32 buf_len, u8 *buf); +int vboxsf_symlink(u32 root, struct shfl_string *new_path, + struct shfl_string *old_path, struct shfl_fsobjinfo *buf); + +int vboxsf_set_utf8(void); +int vboxsf_set_symlinks(void); + +#endif From 490d332ea42780577f679f5d13598b195bff360c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 9 Feb 2020 22:48:50 +0000 Subject: [PATCH 146/147] irqchip/gic-v4.1: Avoid 64bit division for the sake of 32bit ARM In order to allow the GICv4 code to link properly on 32bit ARM, make sure we don't use 64bit divisions when it isn't strictly necessary. Fixes: 4e6437f12d6e ("irqchip/gic-v4.1: Ensure L2 vPE table is allocated at RD level") Reported-by: Stephen Rothwell Cc: Zenghui Yu Signed-off-by: Marc Zyngier Signed-off-by: Linus Torvalds --- drivers/irqchip/irq-gic-v3-its.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 1ee95f546cb0..83b1186ffcad 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2444,8 +2444,8 @@ static u64 inherit_vpe_l1_table_from_rd(cpumask_t **mask) static bool allocate_vpe_l2_table(int cpu, u32 id) { void __iomem *base = gic_data_rdist_cpu(cpu)->rd_base; - u64 val, gpsz, npg; - unsigned int psz, esz, idx; + unsigned int psz, esz, idx, npg, gpsz; + u64 val; struct page *page; __le64 *table; From bb6d3fb354c5ee8d6bde2d576eb7220ea09862b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Feb 2020 16:08:48 -0800 Subject: [PATCH 147/147] Linux 5.6-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ef8913a8eb2a..84b71845c43f 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 5 +PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Kleptomaniac Octopus # *DOCUMENTATION*
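
A note on the 32-bit ARM issue addressed by the irqchip/gic-v4.1 patch above: on a 32-bit kernel the C '/' operator applied to 64-bit operands is compiled into a libgcc helper (such as __aeabi_uldivmod) that the kernel does not link against, so the build fails at link time. When a 64-bit division genuinely cannot be avoided, the helpers from <linux/math64.h> are used instead; do_div(), which vboxsf_statfs() uses earlier in this series, is the in-place variant that also returns the 32-bit remainder. The sketch below is illustrative only, is not part of any patch in this series, and uses an invented function name.

#include <linux/math64.h>
#include <linux/types.h>

/*
 * Hypothetical example: count fixed-size entries in a 64-bit region size,
 * written so that it also links on 32-bit ARM (no '/' operator on a u64).
 */
static inline u32 example_entries_in_region(u64 region_size, u32 entry_size)
{
	/* div_u64() does the u64-by-u32 division without libgcc helpers */
	u64 entries = div_u64(region_size, entry_size);

	return (u32)entries;	/* assumes the caller knows the result fits */
}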