From 290c4a902b79246ec55e477fc313f27f98393dee Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Sun, 10 Apr 2022 22:06:48 -0500 Subject: [PATCH 001/179] RDMA/rxe: Fix "Replace mr by rkey in responder resources" The referenced commit generates a reference counting error if the rkey has the same index but the wrong key. In this case the reference taken by rxe_pool_get_index() is not dropped. Drop the reference if the keys don't match in rxe_recheck_mr(). Check that the mw and mr are still valid. Fixes: 8a1a0be894da ("RDMA/rxe: Replace mr by rkey in responder resources") Link: https://lore.kernel.org/r/20220411030647.20011-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 16fc7ea1298d..1d95fab606da 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -680,6 +680,11 @@ static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp, * It is assumed that the access permissions if originally good * are OK and the mappings to be unchanged. * + * TODO: If someone reregisters an MR to change its size or + * access permissions during the processing of an RDMA read + * we should kill the responder resource and complete the + * operation with an error. 
+ * * Return: mr on success else NULL */ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) @@ -690,23 +695,27 @@ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) if (rkey_is_mw(rkey)) { mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8); - if (!mw || mw->rkey != rkey) + if (!mw) return NULL; - if (mw->state != RXE_MW_STATE_VALID) { + mr = mw->mr; + if (mw->rkey != rkey || mw->state != RXE_MW_STATE_VALID || + !mr || mr->state != RXE_MR_STATE_VALID) { rxe_put(mw); return NULL; } - mr = mw->mr; + rxe_get(mr); rxe_put(mw); - } else { - mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); - if (!mr || mr->rkey != rkey) - return NULL; + + return mr; } - if (mr->state != RXE_MR_STATE_VALID) { + mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); + if (!mr) + return NULL; + + if (mr->rkey != rkey || mr->state != RXE_MR_STATE_VALID) { rxe_put(mr); return NULL; } From a063f2fba3fa633a599253b62561051ac185fa99 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 16 Apr 2022 13:51:10 +0200 Subject: [PATCH 002/179] batman-adv: Don't skb_split skbuffs with frag_list The receiving interface might have used GRO to receive more fragments than MAX_SKB_FRAGS fragments. In this case, these will not be stored in skb_shinfo(skb)->frags but merged into the frag list. batman-adv relies on the function skb_split to split packets up into multiple smaller packets which are not larger than the MTU on the outgoing interface. But this function cannot handle frag_list entries and is only operating on skb_shinfo(skb)->frags. If it is still trying to split such an skb and xmit'ing it on an interface without support for NETIF_F_FRAGLIST, then validate_xmit_skb() will try to linearize it. But this fails due to inconsistent information. And __pskb_pull_tail will trigger a BUG_ON after skb_copy_bits() returns an error. In case of entries in frag_list, just linearize the skb before operating on it with skb_split(). 
Reported-by: Felix Kaechele Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Sven Eckelmann Tested-by: Felix Kaechele Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0899a729a23f..c120c7c6d25f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -475,6 +475,17 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto free_skb; } + /* GRO might have added fragments to the fragment list instead of + * frags[]. But this is not handled by skb_split and must be + * linearized to avoid incorrect length information after all + * batman-adv fragments were created and submitted to the + * hard-interface + */ + if (skb_has_frag_list(skb) && __skb_linearize(skb)) { + ret = -ENOMEM; + goto free_skb; + } + /* Create one header to be copied to all fragments */ frag_header.packet_type = BATADV_UNICAST_FRAG; frag_header.version = BATADV_COMPAT_VERSION; From b4f5c6b2e52b27462c0599e64e96e53b58438de1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 16 Apr 2022 13:54:08 +0100 Subject: [PATCH 003/179] ASoC: wm8958: Fix change notifications for DSP controls The WM8958 DSP controls all return 0 on successful write, not a boolean value indicating if the write changed the value of the control. Fix this by returning 1 after a change, there is already a check at the start of each put() that skips the function in the case that there is no change. 
Signed-off-by: Mark Brown Acked-by: Charles Keepax Link: https://lore.kernel.org/r/20220416125408.197440-1-broonie@kernel.org Cc: stable@vger.kernel.org --- sound/soc/codecs/wm8958-dsp2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/wm8958-dsp2.c b/sound/soc/codecs/wm8958-dsp2.c index e4018ba3b19a..7878c7a58ff1 100644 --- a/sound/soc/codecs/wm8958-dsp2.c +++ b/sound/soc/codecs/wm8958-dsp2.c @@ -530,7 +530,7 @@ static int wm8958_mbc_put(struct snd_kcontrol *kcontrol, wm8958_dsp_apply(component, mbc, wm8994->mbc_ena[mbc]); - return 0; + return 1; } #define WM8958_MBC_SWITCH(xname, xval) {\ @@ -656,7 +656,7 @@ static int wm8958_vss_put(struct snd_kcontrol *kcontrol, wm8958_dsp_apply(component, vss, wm8994->vss_ena[vss]); - return 0; + return 1; } @@ -730,7 +730,7 @@ static int wm8958_hpf_put(struct snd_kcontrol *kcontrol, wm8958_dsp_apply(component, hpf % 3, ucontrol->value.integer.value[0]); - return 0; + return 1; } #define WM8958_HPF_SWITCH(xname, xval) {\ @@ -824,7 +824,7 @@ static int wm8958_enh_eq_put(struct snd_kcontrol *kcontrol, wm8958_dsp_apply(component, eq, ucontrol->value.integer.value[0]); - return 0; + return 1; } #define WM8958_ENH_EQ_SWITCH(xname, xval) {\ From 679ab61bf5f5f519377d812afb4fb93634782c74 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Mon, 18 Apr 2022 23:33:22 +0800 Subject: [PATCH 004/179] RDMA/irdma: Fix deadlock in irdma_cleanup_cm_core() There is a deadlock in irdma_cleanup_cm_core(), which is shown below: (Thread 1) | (Thread 2) | irdma_schedule_cm_timer() irdma_cleanup_cm_core() | add_timer() spin_lock_irqsave() //(1) | (wait a time) ... | irdma_cm_timer_tick() del_timer_sync() | spin_lock_irqsave() //(2) (wait timer to stop) | ... We hold cm_core->ht_lock in position (1) of thread 1 and use del_timer_sync() to wait timer to stop, but timer handler also need cm_core->ht_lock in position (2) of thread 2. As a result, irdma_cleanup_cm_core() will block forever. 
This patch removes the check of timer_pending() in irdma_cleanup_cm_core(), because the del_timer_sync() function will just return directly if there isn't a pending timer. As a result, the lock is redundant, because there is no resource it could protect. Link: https://lore.kernel.org/r/20220418153322.42524-1-duoming@zju.edu.cn Signed-off-by: Duoming Zhou Reviewed-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index dedb3b7edd8d..a98d962e5efb 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3246,15 +3246,10 @@ int irdma_setup_cm_core(struct irdma_device *iwdev, u8 rdma_ver) */ void irdma_cleanup_cm_core(struct irdma_cm_core *cm_core) { - unsigned long flags; - if (!cm_core) return; - spin_lock_irqsave(&cm_core->ht_lock, flags); - if (timer_pending(&cm_core->tcp_timer)) - del_timer_sync(&cm_core->tcp_timer); - spin_unlock_irqrestore(&cm_core->ht_lock, flags); + del_timer_sync(&cm_core->tcp_timer); destroy_workqueue(cm_core->event_wq); cm_core->dev->ws_reset(&cm_core->iwdev->vsi); From 3756aa16fadaef2873cfbd2659dfa1978a7e1859 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Tue, 12 Apr 2022 13:16:58 +0200 Subject: [PATCH 005/179] ASoC: simple-card-utils: fix sysclk shutdown In asoc_simple_shutdown() the snd_soc_dai_set_sysclk() function is called twice with input direction SND_SOC_CLOCK_IN. Restore one call with output direction SND_SOC_CLOCK_OUT. 
Fixes: 5ca2ab459817 ("ASoC: simple-card-utils: Add new system-clock-fixed flag") Signed-off-by: Olivier Moysan Link: https://lore.kernel.org/r/20220412111658.11015-1-olivier.moysan@foss.st.com Signed-off-by: Mark Brown --- sound/soc/generic/simple-card-utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c index f2157944247f..da0c27828ce6 100644 --- a/sound/soc/generic/simple-card-utils.c +++ b/sound/soc/generic/simple-card-utils.c @@ -322,7 +322,7 @@ void asoc_simple_shutdown(struct snd_pcm_substream *substream) if (props->mclk_fs && !dai->clk_fixed && !snd_soc_dai_active(cpu_dai)) snd_soc_dai_set_sysclk(cpu_dai, - 0, 0, SND_SOC_CLOCK_IN); + 0, 0, SND_SOC_CLOCK_OUT); asoc_simple_clk_disable(dai); } From 570a4bf7440e9fb2a4164244a6bf60a46362b627 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 18 Apr 2022 12:41:04 -0500 Subject: [PATCH 006/179] RDMA/rxe: Recheck the MR when generating a READ reply The rping benchmark fails on long runs. The root cause of this failure has been traced to a failure to compute a nonzero value of mr in rare situations. Fix this failure by correctly handling the computation of mr in read_reply() in rxe_resp.c in the replay flow. 
Fixes: 8a1a0be894da ("RDMA/rxe: Replace mr by rkey in responder resources") Link: https://lore.kernel.org/r/20220418174103.3040-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 1d95fab606da..9cd0eaff98de 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -745,8 +745,14 @@ static enum resp_states read_reply(struct rxe_qp *qp, } if (res->state == rdatm_res_state_new) { - mr = qp->resp.mr; - qp->resp.mr = NULL; + if (!res->replay) { + mr = qp->resp.mr; + qp->resp.mr = NULL; + } else { + mr = rxe_recheck_mr(qp, res->read.rkey); + if (!mr) + return RESPST_ERR_RKEY_VIOLATION; + } if (res->read.resid <= mtu) opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; From 08ef48404965cfef99343d6bbbcf75b88c74aa0e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Apr 2022 14:34:37 +0100 Subject: [PATCH 007/179] ASoC: da7219: Fix change notifications for tone generator frequency The tone generator frequency control just returns 0 on successful write, not a boolean value indicating if there was a change or not. Compare what was written with the value that was there previously so that notifications are generated appropriately when the value changes. 
Signed-off-by: Mark Brown Reviewed-by: Adam Thomson Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220420133437.569229-1-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/codecs/da7219.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/da7219.c b/sound/soc/codecs/da7219.c index 13009d08b09a..c7493549a9a5 100644 --- a/sound/soc/codecs/da7219.c +++ b/sound/soc/codecs/da7219.c @@ -446,7 +446,7 @@ static int da7219_tonegen_freq_put(struct snd_kcontrol *kcontrol, struct soc_mixer_control *mixer_ctrl = (struct soc_mixer_control *) kcontrol->private_value; unsigned int reg = mixer_ctrl->reg; - __le16 val; + __le16 val_new, val_old; int ret; /* @@ -454,13 +454,19 @@ static int da7219_tonegen_freq_put(struct snd_kcontrol *kcontrol, * Therefore we need to convert to little endian here to align with * HW registers. */ - val = cpu_to_le16(ucontrol->value.integer.value[0]); + val_new = cpu_to_le16(ucontrol->value.integer.value[0]); mutex_lock(&da7219->ctrl_lock); - ret = regmap_raw_write(da7219->regmap, reg, &val, sizeof(val)); + ret = regmap_raw_read(da7219->regmap, reg, &val_old, sizeof(val_old)); + if (ret == 0 && (val_old != val_new)) + ret = regmap_raw_write(da7219->regmap, reg, + &val_new, sizeof(val_new)); mutex_unlock(&da7219->ctrl_lock); - return ret; + if (ret < 0) + return ret; + + return val_old != val_new; } From 2e3a0d1bfa95b54333f7add3e50e288769373873 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 21 Apr 2022 13:38:01 +0100 Subject: [PATCH 008/179] ASoC: meson: Fix event generation for AIU ACODEC mux The AIU ACODEC has a custom put() operation which returns 0 when the value of the mux changes, meaning that events are not generated for userspace. Change to return 1 in this case; the function returns early in the case where there is no change. 
Signed-off-by: Mark Brown Reviewed-by: Jerome Brunet Link: https://lore.kernel.org/r/20220421123803.292063-2-broonie@kernel.org Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- sound/soc/meson/aiu-acodec-ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/meson/aiu-acodec-ctrl.c b/sound/soc/meson/aiu-acodec-ctrl.c index 22e181646bc3..3776b073a3db 100644 --- a/sound/soc/meson/aiu-acodec-ctrl.c +++ b/sound/soc/meson/aiu-acodec-ctrl.c @@ -58,7 +58,7 @@ static int aiu_acodec_ctrl_mux_put_enum(struct snd_kcontrol *kcontrol, snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); - return 0; + return 1; } static SOC_ENUM_SINGLE_DECL(aiu_acodec_ctrl_mux_enum, AIU_ACODEC_CTRL, From fce49921a22262736cdc3cc74fa67915b75e9363 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 21 Apr 2022 13:38:02 +0100 Subject: [PATCH 009/179] ASoC: meson: Fix event generation for AIU CODEC mux The AIU CODEC has a custom put() operation which returns 0 when the value of the mux changes, meaning that events are not generated for userspace. Change to return 1 in this case; the function returns early in the case where there is no change. 
Signed-off-by: Mark Brown Reviewed-by: Jerome Brunet Link: https://lore.kernel.org/r/20220421123803.292063-3-broonie@kernel.org Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- sound/soc/meson/aiu-codec-ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/meson/aiu-codec-ctrl.c b/sound/soc/meson/aiu-codec-ctrl.c index 59ee66fc2bcd..286ac4983d40 100644 --- a/sound/soc/meson/aiu-codec-ctrl.c +++ b/sound/soc/meson/aiu-codec-ctrl.c @@ -57,7 +57,7 @@ static int aiu_codec_ctrl_mux_put_enum(struct snd_kcontrol *kcontrol, snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); - return 0; + return 1; } static SOC_ENUM_SINGLE_DECL(aiu_hdmi_ctrl_mux_enum, AIU_HDMI_CLK_DATA_CTRL, From 12131008fc13ff7f7690d170b7a8f72d24fd7d1e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 21 Apr 2022 13:38:03 +0100 Subject: [PATCH 010/179] ASoC: meson: Fix event generation for G12A tohdmi mux The G12A tohdmi has a custom put() operation which returns 0 when the value of the mux changes, meaning that events are not generated for userspace. Change to return 1 in this case, the function returns early in the case where there is no change. 
Signed-off-by: Mark Brown Reviewed-by: Jerome Brunet Link: https://lore.kernel.org/r/20220421123803.292063-4-broonie@kernel.org Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- sound/soc/meson/g12a-tohdmitx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c index 9b2b59536ced..6c99052feafd 100644 --- a/sound/soc/meson/g12a-tohdmitx.c +++ b/sound/soc/meson/g12a-tohdmitx.c @@ -67,7 +67,7 @@ static int g12a_tohdmitx_i2s_mux_put_enum(struct snd_kcontrol *kcontrol, snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); - return 0; + return 1; } static SOC_ENUM_SINGLE_DECL(g12a_tohdmitx_i2s_mux_enum, TOHDMITX_CTRL0, From eb5773201b1c5d603424bd21f161c8c2d1075b42 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 21 Apr 2022 11:23:28 -0500 Subject: [PATCH 011/179] ASoC: soc-ops: fix error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cppcheck throws the following warning: sound/soc/soc-ops.c:461:8: style: Variable 'ret' is assigned a value that is never used. [unreadVariable] ret = err; ^ This seems to be a missing change in the return value. 
Fixes: 7f3d90a351968 ("ASoC: ops: Fix stereo change notifications in snd_soc_put_volsw_sx()") Signed-off-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Rander Wang Reviewed-by: Péter Ujfalusi Link: https://lore.kernel.org/r/20220421162328.302017-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index a0ca58ba1627..58347eadd219 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -461,7 +461,7 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol, ret = err; } } - return err; + return ret; } EXPORT_SYMBOL_GPL(snd_soc_put_volsw_sx); From c26830b6c5c534d273ce007eb33d5a2d2ad4e969 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 21 Apr 2022 17:57:24 +0200 Subject: [PATCH 012/179] ASoC: meson: axg-tdm-interface: Fix formatters in trigger This reverts commit bf5e4887eeddb48480568466536aa08ec7f179a5 because the following and required commit e138233e56e9829e65b6293887063a1a3ccb2d68 causes the following system crash when using audio: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:282 Fixes: bf5e4887eeddb4848056846 ("ASoC: meson: axg-tdm-interface: manage formatters in trigger") Reported-by: Dmitry Shmidt Signed-off-by: Neil Armstrong Acked-by: Jerome Brunet Link: https://lore.kernel.org/r/20220421155725.2589089-1-narmstrong@baylibre.com Signed-off-by: Mark Brown --- sound/soc/meson/axg-tdm-interface.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/sound/soc/meson/axg-tdm-interface.c b/sound/soc/meson/axg-tdm-interface.c index 0c31934a9630..e076ced30025 100644 --- a/sound/soc/meson/axg-tdm-interface.c +++ b/sound/soc/meson/axg-tdm-interface.c @@ -351,29 +351,13 @@ static int axg_tdm_iface_hw_free(struct snd_pcm_substream *substream, return 0; } -static int axg_tdm_iface_trigger(struct snd_pcm_substream *substream, - int 
cmd, +static int axg_tdm_iface_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { - struct axg_tdm_stream *ts = - snd_soc_dai_get_dma_data(dai, substream); + struct axg_tdm_stream *ts = snd_soc_dai_get_dma_data(dai, substream); - switch (cmd) { - case SNDRV_PCM_TRIGGER_START: - case SNDRV_PCM_TRIGGER_RESUME: - case SNDRV_PCM_TRIGGER_PAUSE_RELEASE: - axg_tdm_stream_start(ts); - break; - case SNDRV_PCM_TRIGGER_SUSPEND: - case SNDRV_PCM_TRIGGER_PAUSE_PUSH: - case SNDRV_PCM_TRIGGER_STOP: - axg_tdm_stream_stop(ts); - break; - default: - return -EINVAL; - } - - return 0; + /* Force all attached formatters to update */ + return axg_tdm_stream_reset(ts); } static int axg_tdm_iface_remove_dai(struct snd_soc_dai *dai) @@ -413,8 +397,8 @@ static const struct snd_soc_dai_ops axg_tdm_iface_ops = { .set_fmt = axg_tdm_iface_set_fmt, .startup = axg_tdm_iface_startup, .hw_params = axg_tdm_iface_hw_params, + .prepare = axg_tdm_iface_prepare, .hw_free = axg_tdm_iface_hw_free, - .trigger = axg_tdm_iface_trigger, }; /* TDM Backend DAIs */ From 0c9b152c72e53016e96593bdbb8cffe2176694b9 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 21 Apr 2022 17:57:25 +0200 Subject: [PATCH 013/179] ASoC: meson: axg-card: Fix nonatomic links This commit e138233e56e9829e65b6293887063a1a3ccb2d68 causes the following system crash when using audio on G12A/G12B & SM1 systems: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:282 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 0, name: swapper/0 preempt_count: 10001, expected: 0 RCU nest depth: 0, expected: 0 Preemption disabled at: schedule_preempt_disabled+0x20/0x2c mutex_lock+0x24/0x60 _snd_pcm_stream_lock_irqsave+0x20/0x3c snd_pcm_period_elapsed+0x24/0xa4 axg_fifo_pcm_irq_block+0x64/0xdc __handle_irq_event_percpu+0x104/0x264 handle_irq_event+0x48/0xb4 ... start_kernel+0x3f0/0x484 __primary_switched+0xc0/0xc8 Revert this commit until the crash is fixed. 
Fixes: e138233e56e9829e65b6 ("ASoC: meson: axg-card: make links nonatomic") Reported-by: Dmitry Shmidt Signed-off-by: Neil Armstrong Acked-by: Jerome Brunet Link: https://lore.kernel.org/r/20220421155725.2589089-2-narmstrong@baylibre.com Signed-off-by: Mark Brown --- sound/soc/meson/axg-card.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/meson/axg-card.c b/sound/soc/meson/axg-card.c index cbbaa55d92a6..2b77010c2c5c 100644 --- a/sound/soc/meson/axg-card.c +++ b/sound/soc/meson/axg-card.c @@ -320,7 +320,6 @@ static int axg_card_add_link(struct snd_soc_card *card, struct device_node *np, dai_link->cpus = cpu; dai_link->num_cpus = 1; - dai_link->nonatomic = true; ret = meson_card_parse_dai(card, np, &dai_link->cpus->of_node, &dai_link->cpus->dai_name); From 60cc5468daaefc18ffc081dc484bdaa1bd270561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 21 Apr 2022 14:32:54 -0300 Subject: [PATCH 014/179] =?UTF-8?q?futex:=20MAINTAINERS,=20.mailmap:=20Upd?= =?UTF-8?q?ate=20Andr=C3=A9's=20email=20address?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update futex entry to use my new professional email address. 
Signed-off-by: André Almeida Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220421173254.29855-1-andrealmeid@igalia.com --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 93458154ce7d..ea1ba4a9a77e 100644 --- a/.mailmap +++ b/.mailmap @@ -45,6 +45,7 @@ Andrey Konovalov Andrey Ryabinin Andrey Ryabinin Andrzej Hajda +André Almeida Andy Adamson Antoine Tenart Antoine Tenart diff --git a/MAINTAINERS b/MAINTAINERS index 40fa1955ca3f..35dea3d12981 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8109,7 +8109,7 @@ M: Ingo Molnar R: Peter Zijlstra R: Darren Hart R: Davidlohr Bueso -R: André Almeida +R: André Almeida L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core From 214cab6f8020a9ad4a5e9862a4e68088d5a79f08 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 18 Apr 2022 21:20:16 +0000 Subject: [PATCH 015/179] MAINTAINERS: Update email address for John Stultz I've switched jobs, so update my email address in MAINTAINERS Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Acked-by: Sumit Semwal Link: https://lore.kernel.org/r/20220418212016.2669086-1-jstultz@google.com --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 40fa1955ca3f..c2e1b7202449 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5914,7 +5914,7 @@ R: Benjamin Gaignard R: Liam Mark R: Laura Abbott R: Brian Starkey -R: John Stultz +R: John Stultz L: linux-media@vger.kernel.org L: dri-devel@lists.freedesktop.org L: linaro-mm-sig@lists.linaro.org (moderated for non-subscribers) @@ -6584,7 +6584,7 @@ F: drivers/gpu/drm/gma500/ DRM DRIVERS FOR HISILICON M: Xinliang Liu M: Tian Tao -R: John Stultz +R: John Stultz R: Xinwei Kong R: Chen Feng L: dri-devel@lists.freedesktop.org @@ -8845,7 +8845,7 @@ F: Documentation/devicetree/bindings/net/hisilicon*.txt F: 
drivers/net/ethernet/hisilicon/ HIKEY960 ONBOARD USB GPIO HUB DRIVER -M: John Stultz +M: John Stultz L: linux-kernel@vger.kernel.org S: Maintained F: drivers/misc/hisi_hikey_usb.c @@ -19784,7 +19784,7 @@ F: drivers/net/wireless/ti/ F: include/linux/wl12xx.h TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER -M: John Stultz +M: John Stultz M: Thomas Gleixner R: Stephen Boyd L: linux-kernel@vger.kernel.org From a6ac60b36dade525c13c5bb0838589619533efb7 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Fri, 22 Apr 2022 15:39:37 +0800 Subject: [PATCH 016/179] ALSA: hda/realtek: Fix mute led issue on thinkpad with cs35l41 s-codec The quirk ALC287_FIXUP_CS35L41_I2C_2 needs to chain the quirk ALC269_FIXUP_THINKPAD_ACPI, otherwise the mute led will not work if a thinkpad machine applies that quirk. And it will be safe if non-thinkpad machines apply that quirk since hda_fixup_thinkpad_acpi() will check and return in this case. Fixes: ae7abe36e352e ("ALSA: hda/realtek: Add CS35L41 support for Thinkpad laptops") Signed-off-by: Hui Wang Link: https://lore.kernel.org/r/20220422073937.10073-1-hui.wang@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4c0c593f3c0a..f9c3b2c9ca12 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8769,6 +8769,8 @@ static const struct hda_fixup alc269_fixups[] = { [ALC287_FIXUP_CS35L41_I2C_2] = { .type = HDA_FIXUP_FUNC, .v.func = cs35l41_fixup_i2c_two, + .chained = true, + .chain_id = ALC269_FIXUP_THINKPAD_ACPI, }, [ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED] = { .type = HDA_FIXUP_FUNC, From 5f5d8890789c90470d9571a283f0b789acd594af Mon Sep 17 00:00:00 2001 From: Andy Chi Date: Fri, 22 Apr 2022 17:08:43 +0800 Subject: [PATCH 017/179] ALSA: hda/realtek: Enable mute/micmute LEDs support for HP Laptops On HP Laptops, requires the same ALC285_FIXUP_HP_GPIO_LED quirk to make its audio LEDs 
work. So apply the quirk, and make it the last one since it's an LED quirk. Signed-off-by: Andy Chi Fixes: 07bcab93946c ("ALSA: hda/realtek: Add support for HP Laptops") Link: https://lore.kernel.org/r/20220422090845.230071-1-andy.chi@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f9c3b2c9ca12..c65d3dbc6cc9 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9025,12 +9025,12 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8896, "HP EliteBook 855 G8 Notebook PC", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED), - SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x103c, 0x8971, "HP EliteBook 830 G9", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x103c, 0x8972, "HP EliteBook 840 G9", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x103c, 0x8973, "HP EliteBook 860 G9", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x103c, 0x8974, "HP EliteBook 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2), - SND_PCI_QUIRK(0x103c, 0x8975, "HP EliteBook x360 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8971, "HP EliteBook 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8972, "HP EliteBook 840 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8973, "HP EliteBook 860 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8974, "HP EliteBook 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8975, "HP EliteBook 
x360 840 Aero G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8981, "HP Elite Dragonfly G3", ALC245_FIXUP_CS35L41_SPI_4), SND_PCI_QUIRK(0x103c, 0x898e, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x898f, "HP EliteBook 835 G9", ALC287_FIXUP_CS35L41_I2C_2), From 87c18514bb8477563a61f50b4285da156296edc4 Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Fri, 22 Apr 2022 14:26:50 +0800 Subject: [PATCH 018/179] ASoC: rt9120: Correct the reg 0x09 size to one byte Correct the reg 0x09 size to one byte. Signed-off-by: ChiYuan Huang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1650608810-3829-1-git-send-email-u0084500@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt9120.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/codecs/rt9120.c b/sound/soc/codecs/rt9120.c index 7aa1772a915f..6e0d7cf0c8c9 100644 --- a/sound/soc/codecs/rt9120.c +++ b/sound/soc/codecs/rt9120.c @@ -341,7 +341,6 @@ static int rt9120_get_reg_size(unsigned int reg) { switch (reg) { case 0x00: - case 0x09: case 0x20 ... 0x27: return 2; case 0x30 ... 0x3D: From e13433b4416fa31a24e621cbbbb39227a3d651dd Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 21 Apr 2022 10:32:34 -0400 Subject: [PATCH 019/179] SUNRPC release the transport of a relocated task with an assigned transport A relocated task must release its previous transport. 
Fixes: 82ee41b85cef1 ("SUNRPC don't resend a task on an offlined transport") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index af0174d7ce5a..98133aa54f19 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1065,10 +1065,13 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt) static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { - if (task->tk_xprt && - !(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && - (task->tk_flags & RPC_TASK_MOVEABLE))) - return; + if (task->tk_xprt) { + if (!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && + (task->tk_flags & RPC_TASK_MOVEABLE))) + return; + xprt_release(task); + xprt_put(task->tk_xprt); + } if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else From 7635a1ad8d92dcc8247b53f949e37795154b5b6f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 11 Apr 2022 08:42:10 -0700 Subject: [PATCH 020/179] iwlwifi: iwl-dbg: Use del_timer_sync() before freeing In Chrome OS, a large number of crashes is observed due to corrupted timer lists. Steven Rostedt pointed out that this usually happens when a timer is freed while still active, and that the problem is often triggered by code calling del_timer() instead of del_timer_sync() just before freeing. Steven also identified the iwlwifi driver as one of the possible culprits since it does exactly that. 
Reported-by: Steven Rostedt Cc: Steven Rostedt Cc: Johannes Berg Cc: Gregory Greenman Fixes: 60e8abd9d3e91 ("iwlwifi: dbg_ini: add periodic trigger new API support") Signed-off-by: Guenter Roeck Acked-by: Gregory Greenman Tested-by: Sedat Dilek # Linux v5.17.3-rc1 and Debian LLVM-14 Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220411154210.1870008-1-linux@roeck-us.net --- drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c index 866a33f49915..3237d4b528b5 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-dbg-tlv.c @@ -371,7 +371,7 @@ void iwl_dbg_tlv_del_timers(struct iwl_trans *trans) struct iwl_dbg_tlv_timer_node *node, *tmp; list_for_each_entry_safe(node, tmp, timer_list, list) { - del_timer(&node->timer); + del_timer_sync(&node->timer); list_del(&node->list); kfree(node); } From 4dd4e6f659850f2df20b9612593f5a0f040549e1 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Tue, 12 Apr 2022 22:01:41 +0300 Subject: [PATCH 021/179] MAINTAINERS: update iwlwifi driver maintainer Set myself as a maintainer of iwlwifi driver as Luca is moving to a new role. 
Signed-off-by: Gregory Greenman Acked-by: Luca Coelho Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220412190141.4543-1-gregory.greenman@intel.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c8fe97a8d60e..6a7a839acfe3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10129,7 +10129,7 @@ S: Supported F: drivers/net/wireless/intel/iwlegacy/ INTEL WIRELESS WIFI LINK (iwlwifi) -M: Luca Coelho +M: Gregory Greenman L: linux-wireless@vger.kernel.org S: Supported W: https://wireless.wiki.kernel.org/en/users/drivers/iwlwifi From b7c81f80246fac44077166f3e07103affe6db8ff Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Sat, 9 Apr 2022 13:12:41 +0900 Subject: [PATCH 022/179] firewire: fix potential uaf in outbound_phy_packet_callback() &e->event and e point to the same address, and &e->event could be freed in queue_event. So there is a potential uaf issue if we dereference e after calling queue_event(). Fix this by adding a temporary variable to maintain e->client in advance, this can avoid the potential uaf issue. 
Cc: Signed-off-by: Chengfeng Ye Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20220409041243.603210-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- drivers/firewire/core-cdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 9f89c17730b1..708e417200f4 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -1500,6 +1500,7 @@ static void outbound_phy_packet_callback(struct fw_packet *packet, { struct outbound_phy_packet_event *e = container_of(packet, struct outbound_phy_packet_event, p); + struct client *e_client; switch (status) { /* expected: */ @@ -1516,9 +1517,10 @@ static void outbound_phy_packet_callback(struct fw_packet *packet, } e->phy_packet.data[0] = packet->timestamp; + e_client = e->client; queue_event(e->client, &e->event, &e->phy_packet, sizeof(e->phy_packet) + e->phy_packet.length, NULL, 0); - client_put(e->client); + client_put(e_client); } static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) From 9423973869bd4632ffe669f950510c49296656e0 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Sat, 9 Apr 2022 13:12:42 +0900 Subject: [PATCH 023/179] firewire: remove check of list iterator against head past the loop body When list_for_each_entry() completes the iteration over the whole list without breaking the loop, the iterator value will be a bogus pointer computed based on the head element. While it is safe to use the pointer to determine if it was computed based on the head element, either with list_entry_is_head() or &pos->member == head, using the iterator variable after the loop should be avoided. In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to point to the found element [1]. 
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Cc: Signed-off-by: Jakob Koschel Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20220409041243.603210-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- drivers/firewire/core-transaction.c | 30 +++++++++++++++-------------- drivers/firewire/sbp2.c | 13 +++++++------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index ac487c96bb71..6c20815cc8d1 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -73,24 +73,25 @@ static int try_cancel_split_timeout(struct fw_transaction *t) static int close_transaction(struct fw_transaction *transaction, struct fw_card *card, int rcode) { - struct fw_transaction *t; + struct fw_transaction *t = NULL, *iter; unsigned long flags; spin_lock_irqsave(&card->lock, flags); - list_for_each_entry(t, &card->transaction_list, link) { - if (t == transaction) { - if (!try_cancel_split_timeout(t)) { + list_for_each_entry(iter, &card->transaction_list, link) { + if (iter == transaction) { + if (!try_cancel_split_timeout(iter)) { spin_unlock_irqrestore(&card->lock, flags); goto timed_out; } - list_del_init(&t->link); - card->tlabel_mask &= ~(1ULL << t->tlabel); + list_del_init(&iter->link); + card->tlabel_mask &= ~(1ULL << iter->tlabel); + t = iter; break; } } spin_unlock_irqrestore(&card->lock, flags); - if (&t->link != &card->transaction_list) { + if (t) { t->callback(card, rcode, NULL, 0, t->callback_data); return 0; } @@ -935,7 +936,7 @@ EXPORT_SYMBOL(fw_core_handle_request); void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) { - struct fw_transaction *t; + struct fw_transaction *t = NULL, *iter; unsigned long flags; u32 *data; size_t data_length; @@ -947,20 +948,21 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) rcode = 
HEADER_GET_RCODE(p->header[1]); spin_lock_irqsave(&card->lock, flags); - list_for_each_entry(t, &card->transaction_list, link) { - if (t->node_id == source && t->tlabel == tlabel) { - if (!try_cancel_split_timeout(t)) { + list_for_each_entry(iter, &card->transaction_list, link) { + if (iter->node_id == source && iter->tlabel == tlabel) { + if (!try_cancel_split_timeout(iter)) { spin_unlock_irqrestore(&card->lock, flags); goto timed_out; } - list_del_init(&t->link); - card->tlabel_mask &= ~(1ULL << t->tlabel); + list_del_init(&iter->link); + card->tlabel_mask &= ~(1ULL << iter->tlabel); + t = iter; break; } } spin_unlock_irqrestore(&card->lock, flags); - if (&t->link == &card->transaction_list) { + if (!t) { timed_out: fw_notice(card, "unsolicited response (source %x, tlabel %x)\n", source, tlabel); diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index 85cd379fd383..60051c0cabea 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -408,7 +408,7 @@ static void sbp2_status_write(struct fw_card *card, struct fw_request *request, void *payload, size_t length, void *callback_data) { struct sbp2_logical_unit *lu = callback_data; - struct sbp2_orb *orb; + struct sbp2_orb *orb = NULL, *iter; struct sbp2_status status; unsigned long flags; @@ -433,17 +433,18 @@ static void sbp2_status_write(struct fw_card *card, struct fw_request *request, /* Lookup the orb corresponding to this status write. 
*/ spin_lock_irqsave(&lu->tgt->lock, flags); - list_for_each_entry(orb, &lu->orb_list, link) { + list_for_each_entry(iter, &lu->orb_list, link) { if (STATUS_GET_ORB_HIGH(status) == 0 && - STATUS_GET_ORB_LOW(status) == orb->request_bus) { - orb->rcode = RCODE_COMPLETE; - list_del(&orb->link); + STATUS_GET_ORB_LOW(status) == iter->request_bus) { + iter->rcode = RCODE_COMPLETE; + list_del(&iter->link); + orb = iter; break; } } spin_unlock_irqrestore(&lu->tgt->lock, flags); - if (&orb->link != &lu->orb_list) { + if (orb) { orb->callback(orb, &status); kref_put(&orb->kref, free_orb); /* orb callback reference */ } else { From a7ecbe92b9243edbe94772f6f2c854e4142a3345 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Sat, 9 Apr 2022 13:12:43 +0900 Subject: [PATCH 024/179] firewire: core: extend card->lock in fw_core_handle_bus_reset card->local_node and card->bm_retries are both always accessed under card->lock. fw_core_handle_bus_reset has a check whose condition depends on card->local_node and whose body writes to card->bm_retries. Both of these accesses are not under card->lock. Move the lock acquiring of card->lock to before this check such that these accesses do happen when card->lock is held. fw_destroy_nodes is called inside the check. Since fw_destroy_nodes already acquires card->lock inside its function body, move this out to the callsites of fw_destroy_nodes. Also add a comment to indicate which locking is necessary when calling fw_destroy_nodes. 
Cc: Signed-off-by: Niels Dossche Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20220409041243.603210-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- drivers/firewire/core-card.c | 3 +++ drivers/firewire/core-topology.c | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 54be88167c60..f3b3953cac83 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -668,6 +668,7 @@ EXPORT_SYMBOL_GPL(fw_card_release); void fw_core_remove_card(struct fw_card *card) { struct fw_card_driver dummy_driver = dummy_driver_template; + unsigned long flags; card->driver->update_phy_reg(card, 4, PHY_LINK_ACTIVE | PHY_CONTENDER, 0); @@ -682,7 +683,9 @@ void fw_core_remove_card(struct fw_card *card) dummy_driver.stop_iso = card->driver->stop_iso; card->driver = &dummy_driver; + spin_lock_irqsave(&card->lock, flags); fw_destroy_nodes(card); + spin_unlock_irqrestore(&card->lock, flags); /* Wait for all users, especially device workqueue jobs, to finish. 
*/ fw_card_put(card); diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index b63d55f5ebd3..f40c81534381 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -375,16 +375,13 @@ static void report_found_node(struct fw_card *card, card->bm_retries = 0; } +/* Must be called with card->lock held */ void fw_destroy_nodes(struct fw_card *card) { - unsigned long flags; - - spin_lock_irqsave(&card->lock, flags); card->color++; if (card->local_node != NULL) for_each_fw_node(card, card->local_node, report_lost_node); card->local_node = NULL; - spin_unlock_irqrestore(&card->lock, flags); } static void move_tree(struct fw_node *node0, struct fw_node *node1, int port) @@ -510,6 +507,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, struct fw_node *local_node; unsigned long flags; + spin_lock_irqsave(&card->lock, flags); + /* * If the selfID buffer is not the immediate successor of the * previously processed one, we cannot reliably compare the @@ -521,8 +520,6 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, card->bm_retries = 0; } - spin_lock_irqsave(&card->lock, flags); - card->broadcast_channel_allocated = card->broadcast_channel_auto_allocated; card->node_id = node_id; /* From 3b79954fd00d540677c97a560622b73f3a1f4e28 Mon Sep 17 00:00:00 2001 From: Zihao Wang Date: Sun, 24 Apr 2022 16:41:20 +0800 Subject: [PATCH 025/179] ALSA: hda/realtek: Add quirk for Yoga Duet 7 13ITL6 speakers Lenovo Yoga Duet 7 13ITL6 has Realtek ALC287 and built-in speakers do not work out of the box. The fix developed for Yoga 7i 14ITL5 also enables speaker output for this model. 
Signed-off-by: Zihao Wang Cc: Link: https://lore.kernel.org/r/20220424084120.74125-1-wzhd@ustc.edu Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c65d3dbc6cc9..cf531c1efa13 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9247,6 +9247,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940", ALC298_FIXUP_LENOVO_SPK_VOLUME), SND_PCI_QUIRK(0x17aa, 0x3819, "Lenovo 13s Gen2 ITL", ALC287_FIXUP_13S_GEN2_SPEAKERS), + SND_PCI_QUIRK(0x17aa, 0x3820, "Yoga Duet 7 13ITL6", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3824, "Legion Y9000X 2020", ALC285_FIXUP_LEGION_Y9000X_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3827, "Ideapad S740", ALC285_FIXUP_IDEAPAD_S740_COEF), SND_PCI_QUIRK(0x17aa, 0x3834, "Lenovo IdeaPad Slim 9i 14ITL5", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), From eb9d84b0ffe39893cb23b0b6712bbe3637fa25fa Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 24 Apr 2022 19:24:28 +0900 Subject: [PATCH 026/179] ALSA: fireworks: fix wrong return count shorter than expected by 4 bytes ALSA fireworks driver has a bug in its initial state to return count shorter than expected by 4 bytes to userspace applications when handling response frame for Echo Audio Fireworks transaction. It's due to missing addition of the size for the type of event in ALSA firewire stack. 
Fixes: 555e8a8f7f14 ("ALSA: fireworks: Add command/response functionality into hwdep interface") Cc: Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20220424102428.21109-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- sound/firewire/fireworks/fireworks_hwdep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/firewire/fireworks/fireworks_hwdep.c b/sound/firewire/fireworks/fireworks_hwdep.c index 626c0c34b0b6..3a53914277d3 100644 --- a/sound/firewire/fireworks/fireworks_hwdep.c +++ b/sound/firewire/fireworks/fireworks_hwdep.c @@ -34,6 +34,7 @@ hwdep_read_resp_buf(struct snd_efw *efw, char __user *buf, long remained, type = SNDRV_FIREWIRE_EVENT_EFW_RESPONSE; if (copy_to_user(buf, &type, sizeof(type))) return -EFAULT; + count += sizeof(type); remained -= sizeof(type); buf += sizeof(type); From 2fbe467bcbfc760a08f08475eea6bbd4c2874319 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Apr 2022 20:34:53 +0100 Subject: [PATCH 027/179] ASoC: max98090: Reject invalid values in custom control put() The max98090 driver has a custom put function for some controls which can only be updated in certain circumstances which makes no effort to validate that input is suitable for the control, allowing out of spec values to be written to the hardware and presented to userspace. Fix this by returning an error when invalid values are written. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220420193454.2647908-1-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/codecs/max98090.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/codecs/max98090.c b/sound/soc/codecs/max98090.c index b45ec35cd63c..6d9261346842 100644 --- a/sound/soc/codecs/max98090.c +++ b/sound/soc/codecs/max98090.c @@ -413,6 +413,9 @@ static int max98090_put_enab_tlv(struct snd_kcontrol *kcontrol, val = (val >> mc->shift) & mask; + if (sel < 0 || sel > mc->max) + return -EINVAL; + *select = sel; /* Setting a volume is only valid if it is already On */ From 13fcf676d9e102594effc686d98521ff5c90b925 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Apr 2022 20:34:54 +0100 Subject: [PATCH 028/179] ASoC: max98090: Generate notifications on changes for custom control The max98090 driver has some custom controls which share a put() function which returns 0 unconditionally, meaning that events are not generated when the value changes. Fix that. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220420193454.2647908-2-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/codecs/max98090.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/max98090.c b/sound/soc/codecs/max98090.c index 6d9261346842..62b41ca050a2 100644 --- a/sound/soc/codecs/max98090.c +++ b/sound/soc/codecs/max98090.c @@ -430,7 +430,7 @@ static int max98090_put_enab_tlv(struct snd_kcontrol *kcontrol, mask << mc->shift, sel << mc->shift); - return 0; + return *select != val; } static const char *max98090_perf_pwr_text[] = From 2bde1985e39173d8cb64005dad6f34e9bee4c750 Mon Sep 17 00:00:00 2001 From: Codrin Ciubotariu Date: Thu, 21 Apr 2022 15:54:03 +0300 Subject: [PATCH 029/179] ASoC: atmel: mchp-pdmc: set prepare_slave_config Since a pointer to struct snd_dmaengine_pcm_config is passed, snd_dmaengine_pcm_prepare_slave_config() is no longer called unless it's explicitly set in prepare_slave_config. 
Fixes: 50291652af52 ("ASoC: atmel: mchp-pdmc: add PDMC driver") Suggested-by: Sascha Hauer Signed-off-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20220421125403.2180824-2-codrin.ciubotariu@microchip.com Signed-off-by: Mark Brown --- sound/soc/atmel/mchp-pdmc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/atmel/mchp-pdmc.c b/sound/soc/atmel/mchp-pdmc.c index 1a7802fbf23c..a3856c73e221 100644 --- a/sound/soc/atmel/mchp-pdmc.c +++ b/sound/soc/atmel/mchp-pdmc.c @@ -966,6 +966,7 @@ static int mchp_pdmc_process(struct snd_pcm_substream *substream, static struct snd_dmaengine_pcm_config mchp_pdmc_config = { .process = mchp_pdmc_process, + .prepare_slave_config = snd_dmaengine_pcm_prepare_slave_config, }; static int mchp_pdmc_probe(struct platform_device *pdev) From 660564fc9a92a893a14f255be434f7ea0b967901 Mon Sep 17 00:00:00 2001 From: Codrin Ciubotariu Date: Thu, 21 Apr 2022 15:54:02 +0300 Subject: [PATCH 030/179] ASoC: dmaengine: Restore NULL prepare_slave_config() callback As pointed out by Sascha Hauer, this patch changes: if (pmc->config && !pcm->config->prepare_slave_config) to: if (pmc->config && !pcm->config->prepare_slave_config) snd_dmaengine_pcm_prepare_slave_config() This breaks the drivers that do not need a call to dmaengine_slave_config(). Drivers that still need to call snd_dmaengine_pcm_prepare_slave_config(), but have a NULL pcm->config->prepare_slave_config should use snd_dmaengine_pcm_prepare_slave_config() as their prepare_slave_config callback. 
Fixes: 9a1e13440a4f ("ASoC: dmaengine: do not use a NULL prepare_slave_config() callback") Reported-by: Sascha Hauer Signed-off-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20220421125403.2180824-1-codrin.ciubotariu@microchip.com Signed-off-by: Mark Brown --- sound/soc/soc-generic-dmaengine-pcm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c index 2ab2ddc1294d..285441d6aeed 100644 --- a/sound/soc/soc-generic-dmaengine-pcm.c +++ b/sound/soc/soc-generic-dmaengine-pcm.c @@ -86,10 +86,10 @@ static int dmaengine_pcm_hw_params(struct snd_soc_component *component, memset(&slave_config, 0, sizeof(slave_config)); - if (pcm->config && pcm->config->prepare_slave_config) - prepare_slave_config = pcm->config->prepare_slave_config; - else + if (!pcm->config) prepare_slave_config = snd_dmaengine_pcm_prepare_slave_config; + else + prepare_slave_config = pcm->config->prepare_slave_config; if (prepare_slave_config) { int ret = prepare_slave_config(substream, params, &slave_config); From 00c94ebec5925593c0377b941289224469e72ac7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Apr 2022 18:04:27 -0400 Subject: [PATCH 031/179] NFSv4: Don't invalidate inode attributes on delegation return There is no need to declare attributes such as the ctime, mtime and block size invalid when we're just returning a delegation, so it is inappropriate to call nfs_post_op_update_inode_force_wcc(). Instead, just call nfs_refresh_inode() after faking up the change attribute. We know that the GETATTR op occurs before the DELEGRETURN, so we are safe when doing this. 
Fixes: 0bc2c9b4dca9 ("NFSv4: Don't discard the attributes returned by asynchronous DELEGRETURN") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 16106f805ffa..a79f66432bd3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) { + fattr->pre_change_attr = version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } +} + static void nfs4_test_and_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, const struct cred *cred) @@ -6553,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata) pnfs_roc_release(&data->lr.arg, &data->lr.res, data->res.lr_ret); if (inode) { - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + nfs4_fattr_set_prechange(&data->fattr, + inode_peek_iversion_raw(inode)); + nfs_refresh_inode(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); From 4bc31edebde51fcf8ad0794763b8679a7ecb5ec0 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 22 Apr 2022 10:08:53 -0700 Subject: [PATCH 032/179] mmc: core: Set HS clock speed before sending HS CMD13 Way back in commit 4f25580fb84d ("mmc: core: changes frequency to hs_max_dtr when selecting hs400es"), Rockchip engineers noticed that some eMMC don't respond to SEND_STATUS commands very reliably if they're still running at a low initial frequency. As mentioned in that commit, JESD84-B51 P49 suggests a sequence in which the host: 1. sets HS_TIMING 2. bumps the clock ("<= 52 MHz") 3. sends further commands It doesn't exactly require that we don't use a lower-than-52MHz frequency, but in practice, these eMMC don't like it. 
The aforementioned commit tried to get that right for HS400ES, although it's unclear whether this ever truly worked as committed into mainline, as other changes/refactoring adjusted the sequence in conflicting ways: 08573eaf1a70 ("mmc: mmc: do not use CMD13 to get status after speed mode switch") 53e60650f74e ("mmc: core: Allow CMD13 polling when switching to HS mode for mmc") In any case, today we do step 3 before step 2. Let's fix that, and also apply the same logic to HS200/400, where this eMMC has problems too. Resolves errors like this seen when booting some RK3399 Gru/Scarlet systems: [ 2.058881] mmc1: CQHCI version 5.10 [ 2.097545] mmc1: SDHCI controller on fe330000.mmc [fe330000.mmc] using ADMA [ 2.209804] mmc1: mmc_select_hs400es failed, error -84 [ 2.215597] mmc1: error -84 whilst initialising MMC card [ 2.417514] mmc1: mmc_select_hs400es failed, error -110 [ 2.423373] mmc1: error -110 whilst initialising MMC card [ 2.605052] mmc1: mmc_select_hs400es failed, error -110 [ 2.617944] mmc1: error -110 whilst initialising MMC card [ 2.835884] mmc1: mmc_select_hs400es failed, error -110 [ 2.841751] mmc1: error -110 whilst initialising MMC card Earlier versions of this patch bumped to 200MHz/HS200 speeds too early, which caused issues on, e.g., qcom-msm8974-fairphone-fp2. (Thanks for the report Luca!) After a second look, it appears that aligns with JESD84 / page 45 / table 28, so we need to keep to lower (HS / 52 MHz) rates first.
Fixes: 08573eaf1a70 ("mmc: mmc: do not use CMD13 to get status after speed mode switch") Fixes: 53e60650f74e ("mmc: core: Allow CMD13 polling when switching to HS mode for mmc") Fixes: 4f25580fb84d ("mmc: core: changes frequency to hs_max_dtr when selecting hs400es") Cc: Shawn Lin Link: https://lore.kernel.org/linux-mmc/11962455.O9o76ZdvQC@g550jk/ Reported-by: Luca Weiss Signed-off-by: Brian Norris Tested-by: Luca Weiss Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220422100824.v4.1.I484f4ee35609f78b932bd50feed639c29e64997e@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index e7ea45386c22..efa95dc4fc4e 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1384,13 +1384,17 @@ static int mmc_select_hs400es(struct mmc_card *card) goto out_err; } + /* + * Bump to HS timing and frequency. Some cards don't handle + * SEND_STATUS reliably at the initial frequency. + */ mmc_set_timing(host, MMC_TIMING_MMC_HS); + mmc_set_bus_speed(card); + err = mmc_switch_status(card, true); if (err) goto out_err; - mmc_set_clock(host, card->ext_csd.hs_max_dtr); - /* Switch card to DDR with strobe bit */ val = EXT_CSD_DDR_BUS_WIDTH_8 | EXT_CSD_BUS_WIDTH_STROBE; err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, @@ -1448,7 +1452,7 @@ out_err: static int mmc_select_hs200(struct mmc_card *card) { struct mmc_host *host = card->host; - unsigned int old_timing, old_signal_voltage; + unsigned int old_timing, old_signal_voltage, old_clock; int err = -EINVAL; u8 val; @@ -1479,8 +1483,17 @@ static int mmc_select_hs200(struct mmc_card *card) false, true, MMC_CMD_RETRIES); if (err) goto err; + + /* + * Bump to HS timing and frequency. Some cards don't handle + * SEND_STATUS reliably at the initial frequency. + * NB: We can't move to full (HS200) speeds until after we've + * successfully switched over. 
+ */ old_timing = host->ios.timing; + old_clock = host->ios.clock; mmc_set_timing(host, MMC_TIMING_MMC_HS200); + mmc_set_clock(card->host, card->ext_csd.hs_max_dtr); /* * For HS200, CRC errors are not a reliable way to know the @@ -1493,8 +1506,10 @@ static int mmc_select_hs200(struct mmc_card *card) * mmc_select_timing() assumes timing has not changed if * it is a switch error. */ - if (err == -EBADMSG) + if (err == -EBADMSG) { + mmc_set_clock(host, old_clock); mmc_set_timing(host, old_timing); + } } err: if (err) { From aa22125c57f9e577f0a667e4fa07fc3fa8ca1e60 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 23 Apr 2022 14:12:39 +0100 Subject: [PATCH 033/179] ASoC: ops: Validate input values in snd_soc_put_volsw_range() Check that values written via snd_soc_put_volsw_range() are within the range advertised by the control, ensuring that we don't write out of spec values to the hardware. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20220423131239.3375261-1-broonie@kernel.org Signed-off-by: Mark Brown --- sound/soc/soc-ops.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 58347eadd219..e693070f51fe 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -519,7 +519,15 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, unsigned int mask = (1 << fls(max)) - 1; unsigned int invert = mc->invert; unsigned int val, val_mask; - int err, ret; + int err, ret, tmp; + + tmp = ucontrol->value.integer.value[0]; + if (tmp < 0) + return -EINVAL; + if (mc->platform_max && tmp > mc->platform_max) + return -EINVAL; + if (tmp > mc->max - mc->min + 1) + return -EINVAL; if (invert) val = (max - ucontrol->value.integer.value[0]) & mask; @@ -534,6 +542,14 @@ int snd_soc_put_volsw_range(struct snd_kcontrol *kcontrol, ret = err; if (snd_soc_volsw_is_stereo(mc)) { + tmp = ucontrol->value.integer.value[1]; + if (tmp < 0) + return -EINVAL; + if (mc->platform_max && tmp > 
mc->platform_max) + return -EINVAL; + if (tmp > mc->max - mc->min + 1) + return -EINVAL; + if (invert) val = (max - ucontrol->value.integer.value[1]) & mask; else From 8b202ee218395319aec1ef44f72043e1fbaccdd6 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 25 Apr 2022 14:17:42 +0200 Subject: [PATCH 034/179] s390: disable -Warray-bounds gcc-12 shows a lot of array bound warnings on s390. This is caused by the S390_lowcore macro which uses a hardcoded address of 0. Wrapping that with absolute_pointer() works, but gcc no longer knows that a 12 bit displacement is sufficient to access lowcore. So it emits instructions like 'lghi %r1,0; l %rx,xxx(%r1)' instead of a single load/store instruction. As s390 stores variables often read/written in lowcore, this is considered problematic. Therefore disable -Warray-bounds on s390 for gcc-12 for the time being, until there is a better solution. Signed-off-by: Sven Schnelle Link: https://lore.kernel.org/r/yt9dzgkelelc.fsf@linux.ibm.com Link: https://lore.kernel.org/r/20220422134308.1613610-1-svens@linux.ibm.com Link: https://lore.kernel.org/r/20220425121742.3222133-1-svens@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/Makefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e441b60b1812..df325eacf62d 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -30,6 +30,16 @@ KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) + +ifdef CONFIG_CC_IS_GCC + ifeq ($(call cc-ifversion, -ge, 1200, y), y) + ifeq ($(call cc-ifversion, -lt, 1300, y), y) + KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) + KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, array-bounds) + endif + endif +endif + UTS_MACHINE := s390x 
STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) CHECKFLAGS += -D__s390__ -D__s390x__ From c61711c1c95791850be48dd65a1d72eb34ba719f Mon Sep 17 00:00:00 2001 From: Ajit Kumar Pandey Date: Tue, 26 Apr 2022 13:33:57 -0500 Subject: [PATCH 035/179] ASoC: SOF: Fix NULL pointer exception in sof_pci_probe callback We are accessing "desc->ops" in sof_pci_probe without checking "desc" pointer. This results in NULL pointer exception if pci_id->driver_data i.e desc pointer isn't defined in sof device probe: BUG: kernel NULL pointer dereference, address: 0000000000000060 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:sof_pci_probe+0x1e/0x17f [snd_sof_pci] Code: Unable to access opcode bytes at RIP 0xffffffffc043dff4. RSP: 0018:ffffac4b03b9b8d8 EFLAGS: 00010246 Add NULL pointer check for sof_dev_desc pointer to avoid such exception. Reviewed-by: Ranjani Sridharan Signed-off-by: Ajit Kumar Pandey Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20220426183357.102155-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/sof-pci-dev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/sof/sof-pci-dev.c b/sound/soc/sof/sof-pci-dev.c index 12f5cff22448..7fa2649e56e5 100644 --- a/sound/soc/sof/sof-pci-dev.c +++ b/sound/soc/sof/sof-pci-dev.c @@ -153,6 +153,11 @@ int sof_pci_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) dev_dbg(&pci->dev, "PCI DSP detected"); + if (!desc) { + dev_err(dev, "error: no matching PCI descriptor\n"); + return -ENODEV; + } + if (!desc->ops) { dev_err(dev, "error: no matching PCI descriptor ops\n"); return -ENODEV; From 3f65b1e2f424f44585bd701024a3bfd0b1e0ade2 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Tue, 26 Apr 2022 14:12:14 -0700 Subject: [PATCH 036/179] drm/msm/dp: remove fail safe mode related code Current DP driver implementation has adding safe mode done at dp_hpd_plug_handle() which is expected to be executed under event thread context. 
However, a circular locking dependency is possible (see the stack trace below) after the eDP driver calls dp_hpd_plug_handle() from dp_bridge_enable(), which is executed under drm_thread context. After reviewing all possible methods, and as discussed on https://patchwork.freedesktop.org/patch/483155/, supporting EDID compliance tests in the driver is quite hacky. As seen with other vendor drivers, supporting these will be much easier with IGT. Hence remove all the related fail safe code so that no circular locking can happen. Reviewed-by: Stephen Boyd Reviewed-by: Douglas Anderson Reviewed-by: Dmitry Baryshkov ====================================================== WARNING: possible circular locking dependency detected 5.15.35-lockdep #6 Tainted: G W ------------------------------------------------------ frecon/429 is trying to acquire lock: ffffff808dc3c4e8 (&dev->mode_config.mutex){+.+.}-{3:3}, at: dp_panel_add_fail_safe_mode+0x4c/0xa0 but task is already holding lock: ffffff808dc441e0 (&kms->commit_lock[i]){+.+.}-{3:3}, at: lock_crtcs+0xb4/0x124 which lock already depends on the new lock.
the existing dependency chain (in reverse order) is: -> #3 (&kms->commit_lock[i]){+.+.}-{3:3}: __mutex_lock_common+0x174/0x1a64 mutex_lock_nested+0x98/0xac lock_crtcs+0xb4/0x124 msm_atomic_commit_tail+0x330/0x748 commit_tail+0x19c/0x278 drm_atomic_helper_commit+0x1dc/0x1f0 drm_atomic_commit+0xc0/0xd8 drm_atomic_helper_set_config+0xb4/0x134 drm_mode_setcrtc+0x688/0x1248 drm_ioctl_kernel+0x1e4/0x338 drm_ioctl+0x3a4/0x684 __arm64_sys_ioctl+0x118/0x154 invoke_syscall+0x78/0x224 el0_svc_common+0x178/0x200 do_el0_svc+0x94/0x13c el0_svc+0x5c/0xec el0t_64_sync_handler+0x78/0x108 el0t_64_sync+0x1a4/0x1a8 -> #2 (crtc_ww_class_mutex){+.+.}-{3:3}: __mutex_lock_common+0x174/0x1a64 ww_mutex_lock+0xb8/0x278 modeset_lock+0x304/0x4ac drm_modeset_lock+0x4c/0x7c drmm_mode_config_init+0x4a8/0xc50 msm_drm_init+0x274/0xac0 msm_drm_bind+0x20/0x2c try_to_bring_up_master+0x3dc/0x470 __component_add+0x18c/0x3c0 component_add+0x1c/0x28 dp_display_probe+0x954/0xa98 platform_probe+0x124/0x15c really_probe+0x1b0/0x5f8 __driver_probe_device+0x174/0x20c driver_probe_device+0x70/0x134 __device_attach_driver+0x130/0x1d0 bus_for_each_drv+0xfc/0x14c __device_attach+0x1bc/0x2bc device_initial_probe+0x1c/0x28 bus_probe_device+0x94/0x178 deferred_probe_work_func+0x1a4/0x1f0 process_one_work+0x5d4/0x9dc worker_thread+0x898/0xccc kthread+0x2d4/0x3d4 ret_from_fork+0x10/0x20 -> #1 (crtc_ww_class_acquire){+.+.}-{0:0}: ww_acquire_init+0x1c4/0x2c8 drm_modeset_acquire_init+0x44/0xc8 drm_helper_probe_single_connector_modes+0xb0/0x12dc drm_mode_getconnector+0x5dc/0xfe8 drm_ioctl_kernel+0x1e4/0x338 drm_ioctl+0x3a4/0x684 __arm64_sys_ioctl+0x118/0x154 invoke_syscall+0x78/0x224 el0_svc_common+0x178/0x200 do_el0_svc+0x94/0x13c el0_svc+0x5c/0xec el0t_64_sync_handler+0x78/0x108 el0t_64_sync+0x1a4/0x1a8 -> #0 (&dev->mode_config.mutex){+.+.}-{3:3}: __lock_acquire+0x2650/0x672c lock_acquire+0x1b4/0x4ac __mutex_lock_common+0x174/0x1a64 mutex_lock_nested+0x98/0xac dp_panel_add_fail_safe_mode+0x4c/0xa0 
dp_hpd_plug_handle+0x1f0/0x280 dp_bridge_enable+0x94/0x2b8 drm_atomic_bridge_chain_enable+0x11c/0x168 drm_atomic_helper_commit_modeset_enables+0x500/0x740 msm_atomic_commit_tail+0x3e4/0x748 commit_tail+0x19c/0x278 drm_atomic_helper_commit+0x1dc/0x1f0 drm_atomic_commit+0xc0/0xd8 drm_atomic_helper_set_config+0xb4/0x134 drm_mode_setcrtc+0x688/0x1248 drm_ioctl_kernel+0x1e4/0x338 drm_ioctl+0x3a4/0x684 __arm64_sys_ioctl+0x118/0x154 invoke_syscall+0x78/0x224 el0_svc_common+0x178/0x200 do_el0_svc+0x94/0x13c el0_svc+0x5c/0xec el0t_64_sync_handler+0x78/0x108 el0t_64_sync+0x1a4/0x1a8 Changes in v2: -- re text commit title -- remove all fail safe mode Changes in v3: -- remove dp_panel_add_fail_safe_mode() from dp_panel.h -- add Fixes Changes in v5: -- to=dianders@chromium.org Changes in v6: -- fix Fixes commit ID Fixes: 8b2c181e3dcf ("drm/msm/dp: add fail safe mode outside of event_mutex context") Reported-by: Douglas Anderson Signed-off-by: Kuogee Hsieh Link: https://lore.kernel.org/r/1651007534-31842-1-git-send-email-quic_khsieh@quicinc.com Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_display.c | 6 ------ drivers/gpu/drm/msm/dp/dp_panel.c | 11 ----------- drivers/gpu/drm/msm/dp/dp_panel.h | 1 - 3 files changed, 18 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index a42732b67349..178b774a5fbd 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -580,12 +580,6 @@ static int dp_hpd_plug_handle(struct dp_display_private *dp, u32 data) dp->dp_display.connector_type, state); mutex_unlock(&dp->event_mutex); - /* - * add fail safe mode outside event_mutex scope - * to avoid potiential circular lock with drm thread - */ - dp_panel_add_fail_safe_mode(dp->dp_display.connector); - /* uevent will complete connection part */ return 0; }; diff --git a/drivers/gpu/drm/msm/dp/dp_panel.c b/drivers/gpu/drm/msm/dp/dp_panel.c index 26c3653c99ec..26f4b6959c31 100644 --- 
a/drivers/gpu/drm/msm/dp/dp_panel.c +++ b/drivers/gpu/drm/msm/dp/dp_panel.c @@ -151,15 +151,6 @@ static int dp_panel_update_modes(struct drm_connector *connector, return rc; } -void dp_panel_add_fail_safe_mode(struct drm_connector *connector) -{ - /* fail safe edid */ - mutex_lock(&connector->dev->mode_config.mutex); - if (drm_add_modes_noedid(connector, 640, 480)) - drm_set_preferred_mode(connector, 640, 480); - mutex_unlock(&connector->dev->mode_config.mutex); -} - int dp_panel_read_sink_caps(struct dp_panel *dp_panel, struct drm_connector *connector) { @@ -215,8 +206,6 @@ int dp_panel_read_sink_caps(struct dp_panel *dp_panel, rc = -ETIMEDOUT; goto end; } - - dp_panel_add_fail_safe_mode(connector); } if (panel->aux_cfg_update_done) { diff --git a/drivers/gpu/drm/msm/dp/dp_panel.h b/drivers/gpu/drm/msm/dp/dp_panel.h index 99739ea679a7..9023e5bb4b8b 100644 --- a/drivers/gpu/drm/msm/dp/dp_panel.h +++ b/drivers/gpu/drm/msm/dp/dp_panel.h @@ -59,7 +59,6 @@ int dp_panel_init_panel_info(struct dp_panel *dp_panel); int dp_panel_deinit(struct dp_panel *dp_panel); int dp_panel_timing_cfg(struct dp_panel *dp_panel); void dp_panel_dump_regs(struct dp_panel *dp_panel); -void dp_panel_add_fail_safe_mode(struct drm_connector *connector); int dp_panel_read_sink_caps(struct dp_panel *dp_panel, struct drm_connector *connector); u32 dp_panel_get_mode_bpp(struct dp_panel *dp_panel, u32 mode_max_bpp, From f9095ac1ba1ce407cecc1df93c05ad4ac504661c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Apr 2022 08:58:02 +0200 Subject: [PATCH 037/179] dt-bindings: ufs: cdns,ufshc: Add power-domains The Cadence UFS controller can be part of power domain (as it is in example DTS of TI J721e UFS Host Controller Glue), so allow such property. 
Reported-by: Rob Herring Signed-off-by: Krzysztof Kozlowski Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220427065802.110402-1-krzysztof.kozlowski@linaro.org --- Documentation/devicetree/bindings/ufs/cdns,ufshc.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/cdns,ufshc.yaml b/Documentation/devicetree/bindings/ufs/cdns,ufshc.yaml index d227dea368be..fb45f66d6454 100644 --- a/Documentation/devicetree/bindings/ufs/cdns,ufshc.yaml +++ b/Documentation/devicetree/bindings/ufs/cdns,ufshc.yaml @@ -43,6 +43,9 @@ properties: - const: phy_clk - const: ref_clk + power-domains: + maxItems: 1 + reg: maxItems: 1 From e17fd4bf54fb579d03566ab22a02dc03b36f4d06 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 26 Apr 2022 08:35:08 -0500 Subject: [PATCH 038/179] dt-bindings: leds-mt6360: Drop redundant 'unevaluatedProperties' The binding has both 'unevaluatedProperties: false' and 'additionalProperties: false' which is redundant. 'additionalProperties' is the stricter of the two, so drop 'unevaluatedProperties'. 
Fixes: e05cab34e417 ("dt-bindings: leds: Add bindings for MT6360 LED") Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220426133508.1849580-1-robh@kernel.org --- Documentation/devicetree/bindings/leds/leds-mt6360.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/leds-mt6360.yaml b/Documentation/devicetree/bindings/leds/leds-mt6360.yaml index b2fe6eb89389..10f95bf1d666 100644 --- a/Documentation/devicetree/bindings/leds/leds-mt6360.yaml +++ b/Documentation/devicetree/bindings/leds/leds-mt6360.yaml @@ -43,8 +43,6 @@ patternProperties: - 4 # LED output FLASH1 - 5 # LED output FLASH2 -unevaluatedProperties: false - required: - compatible - "#address-cells" From aad41a7d7cf6c6fa804c872a2480f8e541da37cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 28 Apr 2022 11:08:13 -0400 Subject: [PATCH 039/179] SUNRPC: Don't leak sockets in xs_local_connect() If there is still a closed socket associated with the transport, then we need to trigger an autoclose before we can set up a new connection. 
Reported-by: wanghai (M) Fixes: f00432063db1 ("SUNRPC: Ensure we flush any closed sockets before xs_xprt_free()") Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8ab64ea46870..f9849b297ea3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1950,6 +1950,9 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; + if (transport->file) + goto force_disconnect; + if (RPC_IS_ASYNC(task)) { /* * We want the AF_LOCAL connect to be resolved in the @@ -1962,11 +1965,17 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) */ task->tk_rpc_status = -ENOTCONN; rpc_exit(task, -ENOTCONN); - return; + goto out_wake; } ret = xs_local_setup_socket(transport); if (ret && !RPC_IS_SOFTCONN(task)) msleep_interruptible(15000); + return; +force_disconnect: + xprt_force_disconnect(xprt); +out_wake: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } #if IS_ENABLED(CONFIG_SUNRPC_SWAP) From 2c33d775ef4c25c0e1e1cc0fd5496d02f76bfa20 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Thu, 28 Apr 2022 08:24:32 +0200 Subject: [PATCH 040/179] timekeeping: Mark NMI safe time accessors as notrace Mark the CLOCK_MONOTONIC fast time accessors as notrace. These functions are used in tracing to retrieve timestamps, so they should not recurse. 
Fixes: 4498e7467e9e ("time: Parametrize all tk_fast_mono users") Fixes: f09cb9a1808e ("time: Introduce tk_fast_raw") Reported-by: Steven Rostedt Signed-off-by: Kurt Kanzenbach Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220426175338.3807ca4f@gandalf.local.home/ Link: https://lore.kernel.org/r/20220428062432.61063-1-kurt@linutronix.de --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dcdcb85121e4..3b1398fbddaf 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -482,7 +482,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) * of the following timestamps. Callers need to be aware of that and * deal with it. */ -u64 ktime_get_mono_fast_ns(void) +u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } @@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ -u64 ktime_get_raw_fast_ns(void) +u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } From bb300130e47fcefbe938f06dbacaef0312e28416 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Wed, 27 Apr 2022 14:16:19 +0300 Subject: [PATCH 041/179] ath11k: reduce the wait time of 11d scan and hw scan while add interface (cherry picked from commit 1f682dc9fb3790aa7ec27d3d122ff32b1eda1365 in wireless-next) Currently ath11k will wait 11d scan complete while add interface in ath11k_mac_op_add_interface(), when system resume without enable wowlan, ath11k_mac_op_add_interface() is called for each resume, thus it increase the resume time of system. 
In addition, ath11k_mac_op_hw_scan() called after ath11k_mac_op_add_interface() also takes extra time, because the preceding 11d scan needs more than 5 seconds when 6 GHz is enabled, and the scan started event is only indicated to ath11k after the 11d scan has completed. While an 11d scan/hw scan is running in firmware, if ath11k updates the channel list to firmware with WMI_SCAN_CHAN_LIST_CMDID, the firmware cancels the currently running scan, which causes the scan to fail. Commit 9dcf6808b253 ("ath11k: add 11d scan offload support") used finish_11d_scan/finish_11d_ch_list/pending_11d to synchronize the 11d scan/hw scan/channel list between ath11k/firmware/mac80211 and to avoid the scan failure. Add a wait operation before ath11k updates the channel list: ath11k_reg_update_chan_list() now waits until the current 11d scan/hw scan has completed. Also remove the wait when starting the 11d scan and the wait for the channel list to complete in hw scan. After these changes, resume time is reduced by about 5 seconds, hw scan time is also noticeably reduced, and scan failures are no longer seen. The 11d scan is sent to firmware only once for each interface added in mac.c, and it is moved after the first hw scan, because the 11d scan takes some time and would otherwise delay the update of AP scan results in the UI. Currently the priority of ath11k's hw scan is WMI_SCAN_PRIORITY_LOW, and the priority of the 11d scan in firmware is WMI_SCAN_PRIORITY_MEDIUM, so an 11d scan sent after a hw scan cancels the hw scan in firmware. Therefore change the priority to WMI_SCAN_PRIORITY_MEDIUM for the hw scan that precedes the 11d scan, so that the firmware does not cancel it. 
Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3 Fixes: 9dcf6808b253 ("ath11k: add 11d scan offload support") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215777 Cc: Signed-off-by: Wen Gong Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220328035832.14122-1-quic_wgong@quicinc.com Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220427111619.9758-1-kvalo@kernel.org --- drivers/net/wireless/ath/ath11k/core.c | 1 + drivers/net/wireless/ath/ath11k/core.h | 13 +++-- drivers/net/wireless/ath/ath11k/mac.c | 71 +++++++++++--------------- drivers/net/wireless/ath/ath11k/mac.h | 2 +- drivers/net/wireless/ath/ath11k/reg.c | 43 ++++++++++------ drivers/net/wireless/ath/ath11k/reg.h | 2 +- drivers/net/wireless/ath/ath11k/wmi.c | 16 +++++- 7 files changed, 84 insertions(+), 64 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index 71eb7d04c3bf..90a5df1fbdbd 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -1288,6 +1288,7 @@ static void ath11k_core_restart(struct work_struct *work) ieee80211_stop_queues(ar->hw); ath11k_mac_drain_tx(ar); + complete(&ar->completed_11d_scan); complete(&ar->scan.started); complete(&ar->scan.completed); complete(&ar->peer_assoc_done); diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h index c0228e91a596..b8634eddf49a 100644 --- a/drivers/net/wireless/ath/ath11k/core.h +++ b/drivers/net/wireless/ath/ath11k/core.h @@ -38,6 +38,8 @@ extern unsigned int ath11k_frame_mode; +#define ATH11K_SCAN_TIMEOUT_HZ (20 * HZ) + #define ATH11K_MON_TIMER_INTERVAL 10 enum ath11k_supported_bw { @@ -189,6 +191,12 @@ enum ath11k_scan_state { ATH11K_SCAN_ABORTING, }; +enum ath11k_11d_state { + ATH11K_11D_IDLE, + ATH11K_11D_PREPARING, + ATH11K_11D_RUNNING, +}; + enum ath11k_dev_flags { ATH11K_CAC_RUNNING, ATH11K_FLAG_CORE_REGISTERED, @@ -607,9 +615,8 @@ struct ath11k { 
bool dfs_block_radar_events; struct ath11k_thermal thermal; u32 vdev_id_11d_scan; - struct completion finish_11d_scan; - struct completion finish_11d_ch_list; - bool pending_11d; + struct completion completed_11d_scan; + enum ath11k_11d_state state_11d; bool regdom_set_by_user; int hw_rate_code; u8 twt_enabled; diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index e6b34b0d61bd..58ff761393db 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3601,26 +3601,6 @@ static int ath11k_mac_op_hw_scan(struct ieee80211_hw *hw, if (ret) goto exit; - /* Currently the pending_11d=true only happened 1 time while - * wlan interface up in ath11k_mac_11d_scan_start(), it is called by - * ath11k_mac_op_add_interface(), after wlan interface up, - * pending_11d=false always. - * If remove below wait, it always happened scan fail and lead connect - * fail while wlan interface up, because it has a 11d scan which is running - * in firmware, and lead this scan failed. - */ - if (ar->pending_11d) { - long time_left; - unsigned long timeout = 5 * HZ; - - if (ar->supports_6ghz) - timeout += 5 * HZ; - - time_left = wait_for_completion_timeout(&ar->finish_11d_ch_list, timeout); - ath11k_dbg(ar->ab, ATH11K_DBG_MAC, - "mac wait 11d channel list time left %ld\n", time_left); - } - memset(&arg, 0, sizeof(arg)); ath11k_wmi_start_scan_init(ar, &arg); arg.vdev_id = arvif->vdev_id; @@ -3686,6 +3666,10 @@ exit: kfree(arg.extraie.ptr); mutex_unlock(&ar->conf_mutex); + + if (ar->state_11d == ATH11K_11D_PREPARING) + ath11k_mac_11d_scan_start(ar, arvif->vdev_id); + return ret; } @@ -5814,7 +5798,7 @@ static int ath11k_mac_op_start(struct ieee80211_hw *hw) /* TODO: Do we need to enable ANI? 
*/ - ath11k_reg_update_chan_list(ar); + ath11k_reg_update_chan_list(ar, false); ar->num_started_vdevs = 0; ar->num_created_vdevs = 0; @@ -5881,6 +5865,11 @@ static void ath11k_mac_op_stop(struct ieee80211_hw *hw) cancel_work_sync(&ar->ab->update_11d_work); cancel_work_sync(&ar->ab->rfkill_work); + if (ar->state_11d == ATH11K_11D_PREPARING) { + ar->state_11d = ATH11K_11D_IDLE; + complete(&ar->completed_11d_scan); + } + spin_lock_bh(&ar->data_lock); list_for_each_entry_safe(ppdu_stats, tmp, &ar->ppdu_stats_info, list) { list_del(&ppdu_stats->list); @@ -6051,7 +6040,7 @@ static bool ath11k_mac_vif_ap_active_any(struct ath11k_base *ab) return false; } -void ath11k_mac_11d_scan_start(struct ath11k *ar, u32 vdev_id, bool wait) +void ath11k_mac_11d_scan_start(struct ath11k *ar, u32 vdev_id) { struct wmi_11d_scan_start_params param; int ret; @@ -6079,28 +6068,22 @@ void ath11k_mac_11d_scan_start(struct ath11k *ar, u32 vdev_id, bool wait) ath11k_dbg(ar->ab, ATH11K_DBG_MAC, "mac start 11d scan\n"); - if (wait) - reinit_completion(&ar->finish_11d_scan); - ret = ath11k_wmi_send_11d_scan_start_cmd(ar, ¶m); if (ret) { ath11k_warn(ar->ab, "failed to start 11d scan vdev %d ret: %d\n", vdev_id, ret); } else { ar->vdev_id_11d_scan = vdev_id; - if (wait) { - ar->pending_11d = true; - ret = wait_for_completion_timeout(&ar->finish_11d_scan, - 5 * HZ); - ath11k_dbg(ar->ab, ATH11K_DBG_MAC, - "mac 11d scan left time %d\n", ret); - - if (!ret) - ar->pending_11d = false; - } + if (ar->state_11d == ATH11K_11D_PREPARING) + ar->state_11d = ATH11K_11D_RUNNING; } fin: + if (ar->state_11d == ATH11K_11D_PREPARING) { + ar->state_11d = ATH11K_11D_IDLE; + complete(&ar->completed_11d_scan); + } + mutex_unlock(&ar->ab->vdev_id_11d_lock); } @@ -6123,12 +6106,15 @@ void ath11k_mac_11d_scan_stop(struct ath11k *ar) vdev_id = ar->vdev_id_11d_scan; ret = ath11k_wmi_send_11d_scan_stop_cmd(ar, vdev_id); - if (ret) + if (ret) { ath11k_warn(ar->ab, "failed to stopt 11d scan vdev %d ret: %d\n", vdev_id, ret); - 
else + } else { ar->vdev_id_11d_scan = ATH11K_11D_INVALID_VDEV_ID; + ar->state_11d = ATH11K_11D_IDLE; + complete(&ar->completed_11d_scan); + } } mutex_unlock(&ar->ab->vdev_id_11d_lock); } @@ -6324,8 +6310,10 @@ static int ath11k_mac_op_add_interface(struct ieee80211_hw *hw, goto err_peer_del; } - ath11k_mac_11d_scan_start(ar, arvif->vdev_id, true); - + if (test_bit(WMI_TLV_SERVICE_11D_OFFLOAD, ab->wmi_ab.svc_map)) { + reinit_completion(&ar->completed_11d_scan); + ar->state_11d = ATH11K_11D_PREPARING; + } break; case WMI_VDEV_TYPE_MONITOR: set_bit(ATH11K_FLAG_MONITOR_VDEV_CREATED, &ar->monitor_flags); @@ -7190,7 +7178,7 @@ ath11k_mac_op_unassign_vif_chanctx(struct ieee80211_hw *hw, } if (arvif->vdev_type == WMI_VDEV_TYPE_STA) - ath11k_mac_11d_scan_start(ar, arvif->vdev_id, false); + ath11k_mac_11d_scan_start(ar, arvif->vdev_id); mutex_unlock(&ar->conf_mutex); } @@ -8671,8 +8659,7 @@ int ath11k_mac_allocate(struct ath11k_base *ab) ar->monitor_vdev_id = -1; clear_bit(ATH11K_FLAG_MONITOR_VDEV_CREATED, &ar->monitor_flags); ar->vdev_id_11d_scan = ATH11K_11D_INVALID_VDEV_ID; - init_completion(&ar->finish_11d_scan); - init_completion(&ar->finish_11d_ch_list); + init_completion(&ar->completed_11d_scan); } return 0; diff --git a/drivers/net/wireless/ath/ath11k/mac.h b/drivers/net/wireless/ath/ath11k/mac.h index 0e6c870b09c8..29b523af66dd 100644 --- a/drivers/net/wireless/ath/ath11k/mac.h +++ b/drivers/net/wireless/ath/ath11k/mac.h @@ -130,7 +130,7 @@ extern const struct htt_rx_ring_tlv_filter ath11k_mac_mon_status_filter_default; #define ATH11K_SCAN_11D_INTERVAL 600000 #define ATH11K_11D_INVALID_VDEV_ID 0xFFFF -void ath11k_mac_11d_scan_start(struct ath11k *ar, u32 vdev_id, bool wait); +void ath11k_mac_11d_scan_start(struct ath11k *ar, u32 vdev_id); void ath11k_mac_11d_scan_stop(struct ath11k *ar); void ath11k_mac_11d_scan_stop_all(struct ath11k_base *ab); diff --git a/drivers/net/wireless/ath/ath11k/reg.c b/drivers/net/wireless/ath/ath11k/reg.c index 
81e11cde31d7..80a697771393 100644 --- a/drivers/net/wireless/ath/ath11k/reg.c +++ b/drivers/net/wireless/ath/ath11k/reg.c @@ -102,7 +102,7 @@ ath11k_reg_notifier(struct wiphy *wiphy, struct regulatory_request *request) ar->regdom_set_by_user = true; } -int ath11k_reg_update_chan_list(struct ath11k *ar) +int ath11k_reg_update_chan_list(struct ath11k *ar, bool wait) { struct ieee80211_supported_band **bands; struct scan_chan_list_params *params; @@ -111,7 +111,32 @@ int ath11k_reg_update_chan_list(struct ath11k *ar) struct channel_param *ch; enum nl80211_band band; int num_channels = 0; - int i, ret; + int i, ret, left; + + if (wait && ar->state_11d != ATH11K_11D_IDLE) { + left = wait_for_completion_timeout(&ar->completed_11d_scan, + ATH11K_SCAN_TIMEOUT_HZ); + if (!left) { + ath11k_dbg(ar->ab, ATH11K_DBG_REG, + "failed to receive 11d scan complete: timed out\n"); + ar->state_11d = ATH11K_11D_IDLE; + } + ath11k_dbg(ar->ab, ATH11K_DBG_REG, + "reg 11d scan wait left time %d\n", left); + } + + if (wait && + (ar->scan.state == ATH11K_SCAN_STARTING || + ar->scan.state == ATH11K_SCAN_RUNNING)) { + left = wait_for_completion_timeout(&ar->scan.completed, + ATH11K_SCAN_TIMEOUT_HZ); + if (!left) + ath11k_dbg(ar->ab, ATH11K_DBG_REG, + "failed to receive hw scan complete: timed out\n"); + + ath11k_dbg(ar->ab, ATH11K_DBG_REG, + "reg hw scan wait left time %d\n", left); + } bands = hw->wiphy->bands; for (band = 0; band < NUM_NL80211_BANDS; band++) { @@ -193,11 +218,6 @@ int ath11k_reg_update_chan_list(struct ath11k *ar) ret = ath11k_wmi_send_scan_chan_list_cmd(ar, params); kfree(params); - if (ar->pending_11d) { - complete(&ar->finish_11d_ch_list); - ar->pending_11d = false; - } - return ret; } @@ -263,15 +283,8 @@ int ath11k_regd_update(struct ath11k *ar) goto err; } - if (ar->pending_11d) - complete(&ar->finish_11d_scan); - rtnl_lock(); wiphy_lock(ar->hw->wiphy); - - if (ar->pending_11d) - reinit_completion(&ar->finish_11d_ch_list); - ret = 
regulatory_set_wiphy_regd_sync(ar->hw->wiphy, regd_copy); wiphy_unlock(ar->hw->wiphy); rtnl_unlock(); @@ -282,7 +295,7 @@ int ath11k_regd_update(struct ath11k *ar) goto err; if (ar->state == ATH11K_STATE_ON) { - ret = ath11k_reg_update_chan_list(ar); + ret = ath11k_reg_update_chan_list(ar, true); if (ret) goto err; } diff --git a/drivers/net/wireless/ath/ath11k/reg.h b/drivers/net/wireless/ath/ath11k/reg.h index 5fb9dc03a74e..2f284f26378d 100644 --- a/drivers/net/wireless/ath/ath11k/reg.h +++ b/drivers/net/wireless/ath/ath11k/reg.h @@ -32,5 +32,5 @@ struct ieee80211_regdomain * ath11k_reg_build_regd(struct ath11k_base *ab, struct cur_regulatory_info *reg_info, bool intersect); int ath11k_regd_update(struct ath11k *ar); -int ath11k_reg_update_chan_list(struct ath11k *ar); +int ath11k_reg_update_chan_list(struct ath11k *ar, bool wait); #endif diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index b4f86c45d81f..2751fe8814df 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -2015,7 +2015,10 @@ void ath11k_wmi_start_scan_init(struct ath11k *ar, { /* setup commonly used values */ arg->scan_req_id = 1; - arg->scan_priority = WMI_SCAN_PRIORITY_LOW; + if (ar->state_11d == ATH11K_11D_PREPARING) + arg->scan_priority = WMI_SCAN_PRIORITY_MEDIUM; + else + arg->scan_priority = WMI_SCAN_PRIORITY_LOW; arg->dwell_time_active = 50; arg->dwell_time_active_2g = 0; arg->dwell_time_passive = 150; @@ -6350,8 +6353,10 @@ static void ath11k_wmi_op_ep_tx_credits(struct ath11k_base *ab) static int ath11k_reg_11d_new_cc_event(struct ath11k_base *ab, struct sk_buff *skb) { const struct wmi_11d_new_cc_ev *ev; + struct ath11k *ar; + struct ath11k_pdev *pdev; const void **tb; - int ret; + int ret, i; tb = ath11k_wmi_tlv_parse_alloc(ab, skb->data, skb->len, GFP_ATOMIC); if (IS_ERR(tb)) { @@ -6377,6 +6382,13 @@ static int ath11k_reg_11d_new_cc_event(struct ath11k_base *ab, struct sk_buff *s kfree(tb); + for (i 
= 0; i < ab->num_radios; i++) { + pdev = &ab->pdevs[i]; + ar = pdev->ar; + ar->state_11d = ATH11K_11D_IDLE; + complete(&ar->completed_11d_scan); + } + queue_work(ab->workqueue, &ab->update_11d_work); return 0; From efce2d0ba6bf70994394a5a139347ced4d172771 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 28 Apr 2022 11:15:08 -0400 Subject: [PATCH 042/179] SUNRPC: Ensure timely close of disconnected AF_LOCAL sockets When the rpcbind server closes the socket, we need to ensure that the socket is closed by the kernel as soon as feasible, so add a sk_state_change callback to trigger this close. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f9849b297ea3..25b8a8ead56b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1418,6 +1418,26 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ +/** + * xs_local_state_change - callback to handle AF_LOCAL socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_local_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct sock_xprt *transport; + + if (!(xprt = xprt_from_sock(sk))) + return; + transport = container_of(xprt, struct sock_xprt, xprt); + if (sk->sk_shutdown & SHUTDOWN_MASK) { + clear_bit(XPRT_CONNECTED, &xprt->state); + /* Trigger the socket release */ + xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); + } +} + /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1866,6 +1886,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; xprt_clear_connected(xprt); From f0a6c68f69981214cb7858738dd2bc81475111f7 Mon Sep 
17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sun, 24 Apr 2022 12:46:23 +0100 Subject: [PATCH 043/179] MIPS: Fix CP0 counter erratum detection for R4k CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the discrepancy between the two places we check for the CP0 counter erratum in along with the incorrect comparison of the R4400 revision number against 0x30 which matches none and consistently consider all R4000 and R4400 processors affected, as documented in processor errata publications[1][2][3], following the mapping between CP0 PRId register values and processor models: PRId | Processor Model ---------+-------------------- 00000422 | R4000 Revision 2.2 00000430 | R4000 Revision 3.0 00000440 | R4400 Revision 1.0 00000450 | R4400 Revision 2.0 00000460 | R4400 Revision 3.0 No other revision of either processor has ever been spotted. Contrary to what has been stated in commit ce202cbb9e0b ("[MIPS] Assume R4000/R4400 newer than 3.0 don't have the mfc0 count bug") marking the CP0 counter as buggy does not preclude it from being used as either a clock event or a clock source device. It just cannot be used as both at a time, because in that case clock event interrupts will be occasionally lost, and the use as a clock event device takes precedence. Compare against 0x4ff in `can_use_mips_counter' so that a single machine instruction is produced. References: [1] "MIPS R4000PC/SC Errata, Processor Revision 2.2 and 3.0", MIPS Technologies Inc., May 10, 1994, Erratum 53, p.13 [2] "MIPS R4400PC/SC Errata, Processor Revision 1.0", MIPS Technologies Inc., February 9, 1994, Erratum 21, p.4 [3] "MIPS R4400PC/SC Errata, Processor Revision 2.0 & 3.0", MIPS Technologies Inc., January 24, 1995, Erratum 14, p.3 Signed-off-by: Maciej W. 
Rozycki Fixes: ce202cbb9e0b ("[MIPS] Assume R4000/R4400 newer than 3.0 don't have the mfc0 count bug") Cc: stable@vger.kernel.org # v2.6.24+ Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/timex.h | 8 ++++---- arch/mips/kernel/time.c | 11 +++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/mips/include/asm/timex.h b/arch/mips/include/asm/timex.h index b05bb70a2e46..8026baf46e72 100644 --- a/arch/mips/include/asm/timex.h +++ b/arch/mips/include/asm/timex.h @@ -40,9 +40,9 @@ typedef unsigned int cycles_t; /* - * On R4000/R4400 before version 5.0 an erratum exists such that if the - * cycle counter is read in the exact moment that it is matching the - * compare register, no interrupt will be generated. + * On R4000/R4400 an erratum exists such that if the cycle counter is + * read in the exact moment that it is matching the compare register, + * no interrupt will be generated. * * There is a suggested workaround and also the erratum can't strike if * the compare interrupt isn't being used as the clock source device. @@ -63,7 +63,7 @@ static inline int can_use_mips_counter(unsigned int prid) if (!__builtin_constant_p(cpu_has_counter)) asm volatile("" : "=m" (cpu_data[0].options)); if (likely(cpu_has_counter && - prid >= (PRID_IMP_R4000 | PRID_REV_ENCODE_44(5, 0)))) + prid > (PRID_IMP_R4000 | PRID_REV_ENCODE_44(15, 15)))) return 1; else return 0; diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index caa01457dce6..ed339d7979f3 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -141,15 +141,10 @@ static __init int cpu_has_mfc0_count_bug(void) case CPU_R4400MC: /* * The published errata for the R4400 up to 3.0 say the CPU - * has the mfc0 from count bug. + * has the mfc0 from count bug. This seems the last version + * produced. 
*/ - if ((current_cpu_data.processor_id & 0xff) <= 0x30) - return 1; - - /* - * we assume newer revisions are ok - */ - return 0; + return 1; } return 0; From c6fe81191bd74f7e6ae9ce96a4837df9485f3ab8 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Tue, 22 Mar 2022 15:28:39 +0200 Subject: [PATCH 044/179] RISC-V: relocate DTB if it's outside memory region In case the DTB provided by the bootloader/BootROM is before the kernel image or outside /memory, we won't be able to access it through the linear mapping, and get a segfault on setup_arch(). Currently OpenSBI relocates DTB but that's not always the case (e.g. if FW_JUMP_FDT_ADDR is not specified), and it's also not the most portable approach since the default FW_JUMP_FDT_ADDR of the generic platform relocates the DTB at a specific offset that may not be available. To avoid this situation copy DTB so that it's visible through the linear mapping. Signed-off-by: Nick Kossifidis Link: https://lore.kernel.org/r/20220322132839.3653682-1-mick@ics.forth.gr Tested-by: Conor Dooley Fixes: f105aa940e78 ("riscv: add BUILTIN_DTB support for MMU-enabled targets") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b0793dc0c291..05ed641a1134 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -208,8 +208,25 @@ static void __init setup_bootmem(void) * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { + /* + * In case the DTB is not located in a memory region we won't + * be able to locate it later on via the linear mapping and + * get a segfault when accessing it via __va(dtb_early_pa). + * To avoid this situation copy DTB to a memory region. 
+ * Note that memblock_phys_alloc will also reserve DTB region. + */ + if (!memblock_is_memory(dtb_early_pa)) { + size_t fdt_size = fdt_totalsize(dtb_early_va); + phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); + void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); + + memcpy(new_dtb_early_va, dtb_early_va, fdt_size); + early_memunmap(new_dtb_early_va, fdt_size); + _dtb_early_pa = new_dtb_early_pa; + } else + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + } early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); From 892de36fd4a98fab3298d417c051d9099af5448d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Apr 2022 12:22:10 -0400 Subject: [PATCH 045/179] SUNRPC: Ensure gss-proxy connects on setup For reasons best known to the author, gss-proxy does not implement a NULL procedure, and returns RPC_PROC_UNAVAIL. However we still want to ensure that we connect to the service at setup time. So add a quirk-flag specially for this case. 
Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +- net/sunrpc/clnt.c | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 267b7aeaf1a6..db5149567305 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,6 +160,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 61c276bddaf2..8ca1d809b78d 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -97,7 +97,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * timeout, which would result in reconnections being * done without the correct namespace: */ - .flags = RPC_CLNT_CREATE_NOPING | + .flags = RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 98133aa54f19..22c28cf43eba 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -479,6 +479,9 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt); + if ((args->flags & RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL) && + err == -EOPNOTSUPP) + err = 0; if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); From a3d0562d4dc039bca39445e1cddde7951662e17d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Apr 2022 12:27:30 -0400 Subject: [PATCH 046/179] Revert 
"SUNRPC: attempt AF_LOCAL connect on setup" This reverts commit 7073ea8799a8cf73db60270986f14e4aae20fa80. We must not try to connect the socket while the transport is under construction, because the mechanisms to safely tear it down are not in place. As the code stands, we end up leaking the sockets on a connection error. Reported-by: wanghai (M) Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 25b8a8ead56b..650102a9c86a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2875,9 +2875,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) } xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); - ret = ERR_PTR(xs_local_setup_socket(transport)); - if (ret) - goto out_err; break; default: ret = ERR_PTR(-EAFNOSUPPORT); From 2667ed10d9f01e250ba806276740782c89d77fda Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 28 Apr 2022 11:00:41 -0700 Subject: [PATCH 047/179] mm: Fix PASID use-after-free issue The PASID is being freed too early. It needs to stay around until after device drivers that might be using it have had a chance to clear it out of the hardware. The relevant refcounts are: mmget() /mmput() refcount the mm's address space mmgrab()/mmdrop() refcount the mm itself The PASID is currently tied to the life of the mm's address space and freed in __mmput(). This makes logical sense because the PASID can't be used once the address space is gone. But, this misses an important point: even after the address space is gone, the PASID will still be programmed into a device. Device drivers might, for instance, still need to flush operations that are outstanding and need to use that PASID. They do this at file->release() time. Device drivers call the IOMMU driver to hold a reference on the mm itself and drop it at file->release() time. 
But, the IOMMU driver holds a reference on the mm itself, not the address space. The address space (and the PASID) is long gone by the time the driver tries to clean up. This is effectively a use-after-free bug on the PASID. To fix this, move the PASID free operation from __mmput() to __mmdrop(). This ensures that the IOMMU driver's existing mmgrab() keeps the PASID allocated until it drops its mm reference. Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit") Reported-by: Zhangfei Gao Suggested-by: Jean-Philippe Brucker Suggested-by: Jacob Pan Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Tested-by: Zhangfei Gao Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20220428180041.806809-1-fenghua.yu@intel.com --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9796897560ab..35a3beff140b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm) mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); + mm_pasid_drop(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - mm_pasid_drop(mm); mmdrop(mm); } From e75f88efac05bf4e107e4171d8db6d8c3937252d Mon Sep 17 00:00:00 2001 From: Andrei Lalaev Date: Fri, 15 Apr 2022 10:07:11 +0300 Subject: [PATCH 048/179] gpiolib: of: fix bounds check for 'gpio-reserved-ranges' Gpiolib interprets the elements of "gpio-reserved-ranges" as "start,size" because it clears "size" bits starting from the "start" bit in the corresponding bitmap. So it has to use "greater" instead of "greater or equal" when performing the bounds check to make sure that GPIOs are in the available range. The previous implementation skipped ranges that include the last GPIO in the range. 
I wrote the mail to the maintainers (https://lore.kernel.org/linux-gpio/20220412115554.159435-1-andrei.lalaev@emlid.com/T/#u) of the questioned DTSes (because I couldn't understand how the maintainers interpreted this property), but I haven't received a response. Since the questioned DTSes use "gpio-reserved-ranges = <0 4>" (i.e., the beginning of the range), this patch doesn't affect these DTSes at all. TBH this patch doesn't break any existing DTSes because none of them reserve gpios at the end of range. Fixes: 726cb3ba4969 ("gpiolib: Support 'gpio-reserved-ranges' property") Signed-off-by: Andrei Lalaev Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Cc: stable@vger.kernel.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index ae1ce319cd78..7e5e51d49d09 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -910,7 +910,7 @@ static void of_gpiochip_init_valid_mask(struct gpio_chip *chip) i, &start); of_property_read_u32_index(np, "gpio-reserved-ranges", i + 1, &count); - if (start >= chip->ngpio || start + count >= chip->ngpio) + if (start >= chip->ngpio || start + count > chip->ngpio) continue; bitmap_clear(chip->valid_mask, start, count); From e5f6e5d554ac274f9c8ba60078103d0425b93c19 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 11 Apr 2022 09:23:40 +0300 Subject: [PATCH 049/179] gpio: mvebu: drop pwm base assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwmchip_add() unconditionally assigns the base ID dynamically. Commit f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically") dropped all base assignment from drivers under drivers/pwm/. It missed this driver. Fix that. 
Fixes: f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically") Signed-off-by: Baruch Siach Reviewed-by: Uwe Kleine-König Acked-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mvebu.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 4c1f9e1091b7..a2c8dd329b31 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -871,13 +871,6 @@ static int mvebu_pwm_probe(struct platform_device *pdev, mvpwm->chip.dev = dev; mvpwm->chip.ops = &mvebu_pwm_ops; mvpwm->chip.npwm = mvchip->chip.ngpio; - /* - * There may already be some PWM allocated, so we can't force - * mvpwm->chip.base to a fixed point like mvchip->chip.base. - * So, we let pwmchip_add() do the numbering and take the next free - * region. - */ - mvpwm->chip.base = -1; spin_lock_init(&mvpwm->lock); From a196c78b5443fc61af2c0490213b9d125482cbd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 May 2022 21:19:50 -0600 Subject: [PATCH 050/179] io_uring: assign non-fixed early for async work We defer file assignment to ensure that fixed files work with links between a direct accept/open and the links that follow it. But this has the side effect that normal file assignment is then not complete by the time that request submission has been done. For deferred execution, if the file is a regular file, assign it when we do the async prep anyway. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e01f595f5b7d..91de361ea9ab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6947,7 +6947,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_req_prep_async(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_async_setup) + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->fd); + if (!def->needs_async_setup) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; From 7b8943b821bafab492f43aafbd006b57c6b65845 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Mon, 25 Apr 2022 13:17:01 -0500 Subject: [PATCH 051/179] RDMA/irdma: Flush iWARP QP if modified to ERR from RTR state When connection establishment fails in iWARP mode, an app can drain the QPs and hang because flush isn't issued when the QP is modified from RTR state to error. Issue a flush in this case using function irdma_cm_disconn(). Update irdma_cm_disconn() to do flush when cm_id is NULL, which is the case when the QP is in RTR state and there is an error in the connection establishment. 
Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20220425181703.1634-2-shiraz.saleem@intel.com Signed-off-by: Tatyana Nikolova Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 16 +++++----------- drivers/infiniband/hw/irdma/verbs.c | 4 ++-- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index a98d962e5efb..90b4113e7071 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -3462,12 +3462,6 @@ static void irdma_cm_disconn_true(struct irdma_qp *iwqp) } cm_id = iwqp->cm_id; - /* make sure we havent already closed this connection */ - if (!cm_id) { - spin_unlock_irqrestore(&iwqp->lock, flags); - return; - } - original_hw_tcp_state = iwqp->hw_tcp_state; original_ibqp_state = iwqp->ibqp_state; last_ae = iwqp->last_aeq; @@ -3489,11 +3483,11 @@ static void irdma_cm_disconn_true(struct irdma_qp *iwqp) disconn_status = -ECONNRESET; } - if ((original_hw_tcp_state == IRDMA_TCP_STATE_CLOSED || - original_hw_tcp_state == IRDMA_TCP_STATE_TIME_WAIT || - last_ae == IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE || - last_ae == IRDMA_AE_BAD_CLOSE || - last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->rf->reset)) { + if (original_hw_tcp_state == IRDMA_TCP_STATE_CLOSED || + original_hw_tcp_state == IRDMA_TCP_STATE_TIME_WAIT || + last_ae == IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE || + last_ae == IRDMA_AE_BAD_CLOSE || + last_ae == IRDMA_AE_LLP_CONNECTION_RESET || iwdev->rf->reset || !cm_id) { issue_close = 1; iwqp->cm_id = NULL; qp->term_flags = 0; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 46f475394af5..52f3e88f8569 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1618,13 +1618,13 @@ int irdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, if (issue_modify_qp && 
iwqp->ibqp_state > IB_QPS_RTS) { if (dont_wait) { - if (iwqp->cm_id && iwqp->hw_tcp_state) { + if (iwqp->hw_tcp_state) { spin_lock_irqsave(&iwqp->lock, flags); iwqp->hw_tcp_state = IRDMA_TCP_STATE_CLOSED; iwqp->last_aeq = IRDMA_AE_RESET_SENT; spin_unlock_irqrestore(&iwqp->lock, flags); - irdma_cm_disconn(iwqp); } + irdma_cm_disconn(iwqp); } else { int close_timer_started; From 2df6d895907b2f5dfbc558cbff7801bba82cb3cc Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Mon, 25 Apr 2022 13:17:02 -0500 Subject: [PATCH 052/179] RDMA/irdma: Reduce iWARP QP destroy time QP destroy is synchronous and waits for its refcnt to be decremented in irdma_cm_node_free_cb (for iWARP) which fires after the RCU grace period elapses. Applications running a large number of connections are exposed to high wait times on destroy QP for events like SIGABORT. The long pole for this wait time is the firing of the call_rcu callback during a CM node destroy which can be slow. It holds the QP reference count and blocks the destroy QP from completing. call_rcu only needs to make sure that list walkers have a reference to the cm_node object before freeing it and thus need to wait for grace period elapse. The rest of the connection teardown in irdma_cm_node_free_cb is moved out of the grace period wait in irdma_destroy_connection. 
Also, replace call_rcu with a simple kfree_rcu as it just needs to do a kfree on the cm_node Fixes: 146b9756f14c ("RDMA/irdma: Add connection manager") Link: https://lore.kernel.org/r/20220425181703.1634-3-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 90b4113e7071..638bf4a1ed94 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -2308,10 +2308,8 @@ err: return NULL; } -static void irdma_cm_node_free_cb(struct rcu_head *rcu_head) +static void irdma_destroy_connection(struct irdma_cm_node *cm_node) { - struct irdma_cm_node *cm_node = - container_of(rcu_head, struct irdma_cm_node, rcu_head); struct irdma_cm_core *cm_core = cm_node->cm_core; struct irdma_qp *iwqp; struct irdma_cm_info nfo; @@ -2359,7 +2357,6 @@ static void irdma_cm_node_free_cb(struct rcu_head *rcu_head) } cm_core->cm_free_ah(cm_node); - kfree(cm_node); } /** @@ -2387,8 +2384,9 @@ void irdma_rem_ref_cm_node(struct irdma_cm_node *cm_node) spin_unlock_irqrestore(&cm_core->ht_lock, flags); - /* wait for all list walkers to exit their grace period */ - call_rcu(&cm_node->rcu_head, irdma_cm_node_free_cb); + irdma_destroy_connection(cm_node); + + kfree_rcu(cm_node, rcu_head); } /** From 1c9043ae0667a43bd87beeebbdd4bed674713629 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 25 Apr 2022 13:17:03 -0500 Subject: [PATCH 053/179] RDMA/irdma: Fix possible crash due to NULL netdev in notifier For some net events in irdma_net_event notifier, the netdev can be NULL which will cause a crash in rdma_vlan_dev_real_dev. Fix this by moving all processing to the NETEVENT_NEIGH_UPDATE case where the netdev is guaranteed to not be NULL. 
Fixes: 6702bc147448 ("RDMA/irdma: Fix netdev notifications for vlan's") Link: https://lore.kernel.org/r/20220425181703.1634-4-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/utils.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 346c2c5dabdf..81760415d66c 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -258,18 +258,16 @@ int irdma_net_event(struct notifier_block *notifier, unsigned long event, u32 local_ipaddr[4] = {}; bool ipv4 = true; - real_dev = rdma_vlan_dev_real_dev(netdev); - if (!real_dev) - real_dev = netdev; - - ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA); - if (!ibdev) - return NOTIFY_DONE; - - iwdev = to_iwdev(ibdev); - switch (event) { case NETEVENT_NEIGH_UPDATE: + real_dev = rdma_vlan_dev_real_dev(netdev); + if (!real_dev) + real_dev = netdev; + ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA); + if (!ibdev) + return NOTIFY_DONE; + + iwdev = to_iwdev(ibdev); p = (__be32 *)neigh->primary_key; if (neigh->tbl->family == AF_INET6) { ipv4 = false; @@ -290,13 +288,12 @@ int irdma_net_event(struct notifier_block *notifier, unsigned long event, irdma_manage_arp_cache(iwdev->rf, neigh->ha, local_ipaddr, ipv4, IRDMA_ARP_DELETE); + ib_device_put(ibdev); break; default: break; } - ib_device_put(ibdev); - return NOTIFY_DONE; } From 285d5731a0cb2dd3a12ddf34d67be4e4965e64da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 26 Apr 2022 10:49:36 +0800 Subject: [PATCH 054/179] Revert "block: release rq qos structures for queue without disk" This reverts commit daaca3522a8e67c46e39ef09c1d542e866f85f3b. Commit daaca3522a8e ("block: release rq qos structures for queue without disk") is only needed for v5.15~v5.17, and isn't needed for v5.18, so revert it. 
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220426024936.3321341-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 937bb6b86331..bc0506772152 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -50,7 +50,6 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" -#include "blk-rq-qos.h" struct dentry *blk_debugfs_root; @@ -315,9 +314,6 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); - /* cleanup rq qos structures for queue without disk */ - rq_qos_exit(q); - blk_queue_flag_set(QUEUE_FLAG_DEAD, q); blk_sync_queue(q); From b5d1274409d0eec6d826f65d6dafebf9d77a1b99 Mon Sep 17 00:00:00 2001 From: Janis Schoetterl-Glausch Date: Fri, 11 Mar 2022 18:00:40 +0100 Subject: [PATCH 055/179] KVM: s390: Fix lockdep issue in vm memop Issuing a memop on a protected vm does not make sense, neither is the memory readable/writable, nor does it make sense to check storage keys. This is why the ioctl will return -EINVAL when it detects the vm to be protected. However, in order to ensure that the vm cannot become protected during the memop, the kvm->lock would need to be taken for the duration of the ioctl. This is also required because kvm_s390_pv_is_protected asserts that the lock must be held. Instead, don't try to prevent this. If user space enables secure execution concurrently with a memop it must accept the possibility of the memop failing. Still check if the vm is currently protected, but without locking and consider it a heuristic. 
Fixes: ef11c9463ae0 ("KVM: s390: Add vm IOCTL for key checked guest absolute memory access") Signed-off-by: Janis Schoetterl-Glausch Reviewed-by: Janosch Frank Reviewed-by: Claudio Imbrenda Link: https://lore.kernel.org/r/20220322153204.2637400-1-scgl@linux.ibm.com Signed-off-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/kvm/kvm-s390.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index da3dabda1a12..76ad6408cb2c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2384,7 +2384,16 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - if (kvm_s390_pv_is_protected(kvm)) + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { if (access_key_invalid(mop->key)) From 706c9c55e5a32800605eb6a864ef6e1ca0c6c179 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:41 +0000 Subject: [PATCH 056/179] KVM: x86/mmu: Don't treat fully writable SPTEs as volatile (modulo A/D) Don't treat SPTEs that are truly writable, i.e. writable in hardware, as being volatile (unless they're volatile for other reasons, e.g. A/D bits). KVM _sets_ the WRITABLE bit out of mmu_lock, but never _clears_ the bit out of mmu_lock, so if the WRITABLE bit is set, it cannot magically get cleared just because the SPTE is MMU-writable. 
Rename the wrapper of MMU-writable to be more literal, the previous name of spte_can_locklessly_be_made_writable() is wrong and misleading. Fixes: c7ba5b48cc8d ("KVM: MMU: fast path of handling guest page fault") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 +++++++++-------- arch/x86/kvm/mmu/spte.h | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64a2a7e2be90..48dcb6a782f4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -484,13 +484,15 @@ static bool spte_has_volatile_bits(u64 spte) * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_can_locklessly_be_made_writable(spte) || - is_access_track_spte(spte)) + if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) + return true; + + if (is_access_track_spte(spte)) return true; if (spte_ad_enabled(spte)) { - if ((spte & shadow_accessed_mask) == 0 || - (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + if (!(spte & shadow_accessed_mask) || + (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) return true; } @@ -557,7 +559,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * we always atomically update it, see the comments in * spte_has_volatile_bits(). 
*/ - if (spte_can_locklessly_be_made_writable(old_spte) && + if (is_mmu_writable_spte(old_spte) && !is_writable_pte(new_spte)) flush = true; @@ -1187,7 +1189,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_can_locklessly_be_made_writable(spte))) + !(pt_protect && is_mmu_writable_spte(spte))) return false; rmap_printk("spte %p %llx\n", sptep, *sptep); @@ -3196,8 +3198,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if (fault->write && - spte_can_locklessly_be_made_writable(spte)) { + if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index e4abeb5df1b1..c571784cb567 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte) "kvm: Writable SPTE is not MMU-writable: %llx", spte); } -static inline bool spte_can_locklessly_be_made_writable(u64 spte) +static inline bool is_mmu_writable_spte(u64 spte) { return spte & shadow_mmu_writable_mask; } From 54eb3ef56f36827aad90915df33387d4c2b5df5a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:42 +0000 Subject: [PATCH 057/179] KVM: x86/mmu: Move shadow-present check out of spte_has_volatile_bits() Move the is_shadow_present_pte() check out of spte_has_volatile_bits() and into its callers. Well, caller, since only one of its two callers doesn't already do the shadow-present check. Opportunistically move the helper to spte.c/h so that it can be used by the TDP MMU, which is also the primary motivation for the shadow-present change. 
Unlike the legacy MMU, the TDP MMU uses a single path for clearing leaf and non-leaf SPTEs, and to avoid unnecessary atomic updates, the TDP MMU will need to check is_last_spte() prior to calling spte_has_volatile_bits(), and calling is_last_spte() without first calling is_shadow_present_pte() is at best odd, and at worst a violation of KVM's loosely defined SPTE rules. Note, mmu_spte_clear_track_bits() could likely skip the write entirely for SPTEs that are not shadow-present. Leave that cleanup for a future patch to avoid introducing a functional change, and because the shadow-present check can likely be moved further up the stack, e.g. drop_large_spte() appears to be the only path that doesn't already explicitly check for a shadow-present SPTE. No functional change intended. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 29 ++--------------------------- arch/x86/kvm/mmu/spte.c | 28 ++++++++++++++++++++++++++++ arch/x86/kvm/mmu/spte.h | 2 ++ 3 files changed, 32 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 48dcb6a782f4..311e4e1d7870 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -473,32 +473,6 @@ retry: } #endif -static bool spte_has_volatile_bits(u64 spte) -{ - if (!is_shadow_present_pte(spte)) - return false; - - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. 
- */ - if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) - return true; - - if (is_access_track_spte(spte)) - return true; - - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -593,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - if (!spte_has_volatile_bits(old_spte)) + if (!is_shadow_present_pte(old_spte) || + !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); else old_spte = __update_clear_spte_slow(sptep, 0ull); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4739b53c9734..e5c0b6db6f2c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } +/* + * Returns true if the SPTE has bits that may be set without holding mmu_lock. + * The caller is responsible for checking if the SPTE is shadow-present, and + * for determining whether or not the caller cares about non-leaf SPTEs. + */ +bool spte_has_volatile_bits(u64 spte) +{ + /* + * Always atomically update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. 
+ */ + if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) + return true; + + if (is_access_track_spte(spte)) + return true; + + if (spte_ad_enabled(spte)) { + if (!(spte & shadow_accessed_mask) || + (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) + return true; + } + + return false; +} + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index c571784cb567..80ab0f5cff01 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } +bool spte_has_volatile_bits(u64 spte); + bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, From ba3a6120a4e7efc13d19fe43eb6c5caf1da05b72 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 23 Apr 2022 03:47:43 +0000 Subject: [PATCH 058/179] KVM: x86/mmu: Use atomic XCHG to write TDP MMU SPTEs with volatile bits Use an atomic XCHG to write TDP MMU SPTEs that have volatile bits, even if mmu_lock is held for write, as volatile SPTEs can be written by other tasks/vCPUs outside of mmu_lock. If a vCPU uses the to-be-modified SPTE to write a page, the CPU can cache the translation as WRITABLE in the TLB despite it being seen by KVM as !WRITABLE, and/or KVM can clobber the Accessed/Dirty bits and not properly tag the backing page. Exempt non-leaf SPTEs from atomic updates as KVM itself doesn't modify non-leaf SPTEs without holding mmu_lock, they do not have Dirty bits, and KVM doesn't consume the Accessed bit of non-leaf SPTEs. Dropping the Dirty and/or Writable bits is most problematic for dirty logging, as doing so can result in a missed TLB flush and eventually a missed dirty page. 
In the unlikely event that the only dirty page(s) is a clobbered SPTE, clear_dirty_gfn_range() will see the SPTE as not dirty (based on the Dirty or Writable bit depending on the method) and so not update the SPTE and ultimately not flush. If the SPTE is cached in the TLB as writable before it is clobbered, the guest can continue writing the associated page without ever taking a write-protect fault. For most (all?) file-backed memory, dropping the Dirty bit is a non-issue. The primary MMU write-protects its PTEs on writeback, i.e. KVM's dirty bit is effectively ignored because the primary MMU will mark that page dirty when the write-protection is lifted, e.g. when KVM faults the page back in for write. The Accessed bit is a complete non-issue. Aside from being unused for non-leaf SPTEs, KVM doesn't do a TLB flush when aging SPTEs, i.e. the Accessed bit may be dropped anyways. Lastly, the Writable bit is also problematic as an extension of the Dirty bit, as KVM (correctly) treats the Dirty bit as volatile iff the SPTE is !DIRTY && WRITABLE. If KVM fixes an MMU-writable, but !WRITABLE, SPTE out of mmu_lock, then it can allow the CPU to set the Dirty bit despite the SPTE being !WRITABLE when it is checked by KVM. But that all depends on the Dirty bit being problematic in the first place. 
Fixes: 2f2fad0897cb ("kvm: x86/mmu: Add functions to handle changed TDP SPTEs") Cc: stable@vger.kernel.org Cc: Ben Gardon Cc: David Matlack Cc: Venkatesh Srinivas Signed-off-by: Sean Christopherson Message-Id: <20220423034752.1161007-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_iter.h | 34 ++++++++++++++- arch/x86/kvm/mmu/tdp_mmu.c | 82 ++++++++++++++++++++++++------------- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1eaf6ec0e0b..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -6,6 +6,7 @@ #include #include "mmu.h" +#include "spte.h" /* * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs) @@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) { return READ_ONCE(*rcu_dereference(sptep)); } -static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val) + +static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) { - WRITE_ONCE(*rcu_dereference(sptep), val); + return xchg(rcu_dereference(sptep), new_spte); +} + +static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) +{ + WRITE_ONCE(*rcu_dereference(sptep), new_spte); +} + +static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, + u64 new_spte, int level) +{ + /* + * Atomically write the SPTE if it is a shadow-present, leaf SPTE with + * volatile bits, i.e. has bits that can be set outside of mmu_lock. + * The Writable bit can be set by KVM's fast page fault handler, and + * Accessed and Dirty bits can be set by the CPU. + * + * Note, non-leaf SPTEs do have Accessed bits and those bits are + * technically volatile, but KVM doesn't consume the Accessed bit of + * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This + * logic needs to be reassessed if KVM were to use non-leaf Accessed + * bits, e.g. 
to skip stepping down into child SPTEs when aging SPTEs. + */ + if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && + spte_has_volatile_bits(old_spte)) + return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); + + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return old_spte; } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index edc68538819b..922b06bf4b94 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - u64 *sptep = rcu_dereference(pt) + i; + tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); - u64 old_child_spte; + u64 old_spte; if (shared) { /* @@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_child_spte = xchg(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_child_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); + if (!is_removed_spte(old_spte)) break; cpu_relax(); } @@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * are guarded by the memslots generation, not by being * unreachable. */ - old_child_spte = READ_ONCE(*sptep); - if (!is_shadow_present_pte(old_child_spte)) + old_spte = kvm_tdp_mmu_read_spte(sptep); + if (!is_shadow_present_pte(old_spte)) continue; /* - * Marking the SPTE as a removed SPTE is not - * strictly necessary here as the MMU lock will - * stop other threads from concurrently modifying - * this SPTE. Using the removed SPTE value keeps - * the two branches consistent and simplifies - * the function. + * Use the common helper instead of a raw WRITE_ONCE as + * the SPTE needs to be updated atomically if it can be + * modified by a different vCPU outside of mmu_lock. 
+ * Even though the parent SPTE is !PRESENT, the TLB + * hasn't yet been flushed, and both Intel and AMD + * document that A/D assists can use upper-level PxE + * entries that are cached in the TLB, i.e. the CPU can + * still access the page and mark it dirty. + * + * No retry is needed in the atomic update path as the + * sole concern is dropping a Dirty bit, i.e. no other + * task can zap/remove the SPTE as mmu_lock is held for + * write. Marking the SPTE as a removed SPTE is not + * strictly necessary for the same reason, but using + * the remove SPTE value keeps the shared/exclusive + * paths consistent and allows the handle_changed_spte() + * call below to hardcode the new value to REMOVED_SPTE. + * + * Note, even though dropping a Dirty bit is the only + * scenario where a non-atomic update could result in a + * functional bug, simply checking the Dirty bit isn't + * sufficient as a fast page fault could read the upper + * level SPTE before it is zapped, and then make this + * target SPTE writable, resume the guest, and set the + * Dirty bit between reading the SPTE above and writing + * it here. */ - WRITE_ONCE(*sptep, REMOVED_SPTE); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, + REMOVED_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level, - shared); + old_spte, REMOVED_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, KVM_PAGES_PER_HPAGE(iter->level)); /* - * No other thread can overwrite the removed SPTE as they - * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overwrite the - * special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present - * to non-present. 
+ * No other thread can overwrite the removed SPTE as they must either + * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not + * overwrite the special removed SPTE value. No bookkeeping is needed + * here since the SPTE is going from non-present to non-present. Use + * the raw write helper to avoid an unnecessary check on volatile bits. */ - kvm_tdp_mmu_write_spte(iter->sptep, 0); + __kvm_tdp_mmu_write_spte(iter->sptep, 0); return 0; } @@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * unless performing certain dirty logging operations. * Leaving record_dirty_log unset in that case prevents page * writes from being double counted. + * + * Returns the old SPTE value, which _may_ be different than @old_spte if the + * SPTE had volatile bits. */ -static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level, + bool record_acc_track, bool record_dirty_log) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, */ WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); - kvm_tdp_mmu_write_spte(sptep, new_spte); + old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); @@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, if (record_dirty_log) handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, new_spte, level); + return old_spte; } static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, @@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, { WARN_ON_ONCE(iter->yielded); - 
__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, - new_spte, iter->gfn, iter->level, - record_acc_track, record_dirty_log); + iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level, + record_acc_track, record_dirty_log); } static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, From 5eb849322d7f7ae9d5c587c7bc3b4f7c6872cd2f Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Mon, 2 May 2022 22:01:36 -0700 Subject: [PATCH 059/179] KVM: x86/svm: Account for family 17h event renumberings in amd_pmc_perf_hw_id Zen renumbered some of the performance counters that correspond to the well known events in perf_hw_id. This code in KVM was never updated for that, so guest that attempt to use counters on Zen that correspond to the pre-Zen perf_hw_id values will silently receive the wrong values. This has been observed in the wild with rr[0] when running in Zen 3 guests. rr uses the retired conditional branch counter 00d1 which is incorrectly recognized by KVM as PERF_COUNT_HW_STALLED_CYCLES_BACKEND. [0] https://rr-project.org/ Signed-off-by: Kyle Huey Message-Id: <20220503050136.86298-1-khuey@kylehuey.com> Cc: stable@vger.kernel.org [Check guest family, not host. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/pmu.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 24eb935b6f85..311cbaa0c3dd 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, }; +/* duplicated from amd_f17h_perfmon_event_map. 
*/ +static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { + [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, + [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + +/* amd_pmc_perf_hw_id depends on these being the same size */ +static_assert(ARRAY_SIZE(amd_event_mapping) == + ARRAY_SIZE(amd_f17h_event_mapping)); + static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) { + struct kvm_event_hw_type_mapping *event_mapping; u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; @@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) if (WARN_ON(pmc_is_fixed(pmc))) return PERF_COUNT_HW_MAX; + if (guest_cpuid_family(pmc->vcpu) >= 0x17) + event_mapping = amd_f17h_event_mapping; + else + event_mapping = amd_event_mapping; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (amd_event_mapping[i].eventsel == event_select - && amd_event_mapping[i].unit_mask == unit_mask) + if (event_mapping[i].eventsel == event_select + && event_mapping[i].unit_mask == unit_mask) break; if (i == ARRAY_SIZE(amd_event_mapping)) return PERF_COUNT_HW_MAX; - return amd_event_mapping[i].event_type; + return event_mapping[i].event_type; } /* check if a PMC is enabled by comparing it against global_ctrl bits. 
Because From 5a1bde46f98b893cda6122b00e94c0c40a6ead3c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Wed, 27 Apr 2022 17:01:49 +0530 Subject: [PATCH 060/179] kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural PMU On some x86 processors, CPUID leaf 0xA provides information on Architectural Performance Monitoring features. It advertises a PMU version which Qemu uses to determine the availability of additional MSRs to manage the PMCs. Upon receiving a KVM_GET_SUPPORTED_CPUID ioctl request for the same, the kernel constructs return values based on the x86_pmu_capability irrespective of the vendor. This leaf and the additional MSRs are not supported on AMD and Hygon processors. If AMD PerfMonV2 is detected, the PMU version is set to 2 and guest startup breaks because of an attempt to access a non-existent MSR. Return zeros to avoid this. Fixes: a6c06ed1a60a ("KVM: Expose the architectural performance monitoring CPUID leaf") Reported-by: Vasant Hegde Signed-off-by: Sandipan Das Message-Id: <3fef83d9c2b2f7516e8ff50d60851f29a4bcb716.1651058600.git.sandipan.das@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b24ca7f4ed7c..732724ea5b10 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -887,6 +887,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; + if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + perf_get_x86_pmu_capability(&cap); /* From a06afe8383080c630a7a528b8382fc6bb4925b61 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 29 Apr 2022 17:15:26 +0200 Subject: [PATCH 061/179] KVM: s390: vsie/gmap: reduce gmap_rmap overhead there are cases that trigger a 2nd shadow event for the same vmaddr/raddr combination. 
(prefix changes, reboots, some known races) This will increase memory usage and result in long latencies when cleaning up, e.g. on shutdown. To avoid cases with a list that has hundreds of identical raddrs we check existing entries at insert time. As this measurably reduces the list length this will be faster than traversing the list at shutdown time. In the long run several places will be optimized to create fewer entries and a shrinker might be necessary. Fixes: 4be130a08420 ("s390/mm: add shadow gmap support") Signed-off-by: Christian Borntraeger Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20220429151526.1560-1-borntraeger@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/mm/gmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index af03cacf34ec..1ac73917a8d3 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; From aafa025c76dcc7d1a8c8f0bdefcbe4eb480b2f6a Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Mon, 2 May 2022 15:50:14 +0200 Subject: [PATCH 062/179] fbdev: Make fb_release() return -ENODEV if fbdev was unregistered A reference to the framebuffer device struct fb_info is stored in the file private data, but this reference could no longer be valid and must not be accessed directly. 
Instead, the file_fb_info() accessor function must be used since it does sanity checking to make sure that the fb_info is valid. This can happen for example if the registered framebuffer device is for a driver that just uses a framebuffer provided by the system firmware. In that case, the fbdev core would unregister the framebuffer device when a real video driver is probed and ask to remove conflicting framebuffers. The bug has been present for a long time but commit 27599aacbaef ("fbdev: Hot-unplug firmware fb devices on forced removal") unmasked it since the fbdev core started unregistering the framebuffers' devices associated. Fixes: 27599aacbaef ("fbdev: Hot-unplug firmware fb devices on forced removal") Reported-by: Maxime Ripard Reported-by: Junxiao Chang Signed-off-by: Javier Martinez Canillas Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20220502135014.377945-1-javierm@redhat.com --- drivers/video/fbdev/core/fbmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index a6bb0e438216..97eb0dee411c 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1434,7 +1434,10 @@ fb_release(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { - struct fb_info * const info = file->private_data; + struct fb_info * const info = file_fb_info(file); + + if (!info) + return -ENODEV; lock_fb_info(info); if (info->fbops->fb_release) From 841e512ffb64898db6322c0619f6bbc41266d86f Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Tue, 26 Apr 2022 16:15:36 +0200 Subject: [PATCH 063/179] drm/bridge: ite-it6505: add missing Kconfig option select The IT6505 is using functions provided by the DRM_DP_HELPER driver. In order to avoid having the bridge enabled but the helper disabled, let's add a select in order to be sure that the DP helper functions are always available. 
Fixes: b5c84a9edcd4 ("drm/bridge: add it6505 driver") Signed-off-by: Fabien Parent Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20220426141536.274727-1-fparent@baylibre.com --- drivers/gpu/drm/bridge/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/bridge/Kconfig b/drivers/gpu/drm/bridge/Kconfig index 007e5a282f67..2145b08f9534 100644 --- a/drivers/gpu/drm/bridge/Kconfig +++ b/drivers/gpu/drm/bridge/Kconfig @@ -78,6 +78,7 @@ config DRM_ITE_IT6505 tristate "ITE IT6505 DisplayPort bridge" depends on OF select DRM_KMS_HELPER + select DRM_DP_HELPER select EXTCON help ITE IT6505 DisplayPort bridge chip driver. From 5e469ed9764d4722c59562da13120bd2dc6834c5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 20 Apr 2022 12:50:38 +0200 Subject: [PATCH 064/179] mac80211: fix rx reordering with non explicit / psmp ack policy When the QoS ack policy was set to non explicit / psmp ack, frames are treated as not being part of a BA session, which causes extra latency on reordering. 
Fix this by only bypassing reordering for packets with no-ack policy Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20220420105038.36443-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index beb6b92eb780..88d797fa82ff 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1405,8 +1405,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, goto dont_reorder; /* not part of a BA session */ - if (ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK && - ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL) + if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_NOACK) goto dont_reorder; /* new, potentially un-ordered, ampdu frame - process it */ From 5d087aa759eb82b8208411913f6c2158bd85abc0 Mon Sep 17 00:00:00 2001 From: Kieran Frewen Date: Wed, 20 Apr 2022 04:13:21 +0000 Subject: [PATCH 065/179] nl80211: validate S1G channel width Validate the S1G channel width input by user to ensure it matches that of the requested channel Signed-off-by: Kieran Frewen Signed-off-by: Bassem Dawood Link: https://lore.kernel.org/r/20220420041321.3788789-2-kieran.frewen@morsemicro.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 21e808fcb676..aa6094c3c9b0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3173,6 +3173,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); + if (chandef->chan->band == NL80211_BAND_S1GHZ) { + /* User input error for channel width doesn't match channel */ + if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { + NL_SET_ERR_MSG_ATTR(extack, + attrs[NL80211_ATTR_CHANNEL_WIDTH], + "bad channel width"); + return -EINVAL; + } + } 
if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); From e847ffe2d146cfd52980ca688d84358e024a6e70 Mon Sep 17 00:00:00 2001 From: Kieran Frewen Date: Wed, 20 Apr 2022 04:13:20 +0000 Subject: [PATCH 066/179] cfg80211: retrieve S1G operating channel number When retrieving the S1G channel number from IEs, we should retrieve the operating channel instead of the primary channel. The S1G operation element specifies the main channel of operation as the oper channel, unlike for HT and HE which specify their main channel of operation as the primary channel. Signed-off-by: Kieran Frewen Signed-off-by: Bassem Dawood Link: https://lore.kernel.org/r/20220420041321.3788789-1-kieran.frewen@morsemicro.com Signed-off-by: Johannes Berg --- net/wireless/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4a6d86432910..6d82bd9eaf8c 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1829,7 +1829,7 @@ int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, if (tmp && tmp->datalen >= sizeof(struct ieee80211_s1g_oper_ie)) { struct ieee80211_s1g_oper_ie *s1gop = (void *)tmp->data; - return s1gop->primary_ch; + return s1gop->oper_ch; } } else { tmp = cfg80211_find_elem(WLAN_EID_DS_PARAMS, ie, ielen); From 86af062f40a73bf63321694e6bf637144f0383fe Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Thu, 28 Apr 2022 10:57:44 +0530 Subject: [PATCH 067/179] mac80211: Reset MBSSID parameters upon connection Currently MBSSID parameters in struct ieee80211_bss_conf are not reset upon connection. This could be problematic with some drivers in a scenario where the device first connects to a non-transmit BSS and then connects to a transmit BSS of a Multi BSS AP. 
The MBSSID parameters which are set after connecting to a non-transmit BSS will not be reset and the same parameters will be passed on to the driver during the subsequent connection to a transmit BSS of a Multi BSS AP. For example, firmware running on the ath11k device uses the Multi BSS data for tracking the beacon of a non-transmit BSS and reports the driver when there is a beacon miss. If we do not reset the MBSSID parameters during the subsequent connection to a transmit BSS, then the driver would have wrong MBSSID data and FW would be looking for an incorrect BSSID in the MBSSID beacon of a Multi BSS AP and reports beacon loss leading to an unstable connection. Reset the MBSSID parameters upon every connection to solve this problem. Fixes: 78ac51f81532 ("mac80211: support multi-bssid") Signed-off-by: Manikanta Pubbisetty Link: https://lore.kernel.org/r/20220428052744.27040-1-quic_mpubbise@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1b30c724ca8d..dc8aec1a5d3d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3657,6 +3657,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, cbss->transmitted_bss->bssid); bss_conf->bssid_indicator = cbss->max_bssid_indicator; bss_conf->bssid_index = cbss->bssid_index; + } else { + bss_conf->nontransmitted = false; + memset(bss_conf->transmitter_bssid, 0, + sizeof(bss_conf->transmitter_bssid)); + bss_conf->bssid_indicator = 0; + bss_conf->bssid_index = 0; } /* From e9f3fb523dbf476dc86beea23f5b5ca8f9687c93 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 24 Apr 2022 18:17:50 -0500 Subject: [PATCH 068/179] mmc: sunxi-mmc: Fix DMA descriptors allocated above 32 bits Newer variants of the MMC controller support a 34-bit physical address space by using word addresses instead of byte addresses. 
However, the code truncates the DMA descriptor address to 32 bits before applying the shift. This breaks DMA for descriptors allocated above the 32-bit limit. Fixes: 3536b82e5853 ("mmc: sunxi: add support for A100 mmc controller") Signed-off-by: Samuel Holland Reviewed-by: Andre Przywara Reviewed-by: Jernej Skrabec Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220424231751.32053-1-samuel@sholland.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sunxi-mmc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index c62afd212692..46f9e2923d86 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -377,8 +377,9 @@ static void sunxi_mmc_init_idma_des(struct sunxi_mmc_host *host, pdes[i].buf_addr_ptr1 = cpu_to_le32(sg_dma_address(&data->sg[i]) >> host->cfg->idma_des_shift); - pdes[i].buf_addr_ptr2 = cpu_to_le32((u32)next_desc >> - host->cfg->idma_des_shift); + pdes[i].buf_addr_ptr2 = + cpu_to_le32(next_desc >> + host->cfg->idma_des_shift); } pdes[0].config |= cpu_to_le32(SDXC_IDMAC_DES0_FD); From 3e5a8e8494a8122fe4eb3f167662f406cab753b9 Mon Sep 17 00:00:00 2001 From: Shaik Sajida Bhanu Date: Sun, 24 Apr 2022 21:32:33 +0530 Subject: [PATCH 069/179] mmc: sdhci-msm: Reset GCC_SDCC_BCR register for SDHC Reset GCC_SDCC_BCR register before every fresh initialization. This will reset the whole SDHC-msm controller, clear the previous power control states and avoid software reset timeout issues such as the one below. [ 5.458061][ T262] mmc1: Reset 0x1 never completed. 
[ 5.462454][ T262] mmc1: sdhci: ============ SDHCI REGISTER DUMP =========== [ 5.469065][ T262] mmc1: sdhci: Sys addr: 0x00000000 | Version: 0x00007202 [ 5.475688][ T262] mmc1: sdhci: Blk size: 0x00000000 | Blk cnt: 0x00000000 [ 5.482315][ T262] mmc1: sdhci: Argument: 0x00000000 | Trn mode: 0x00000000 [ 5.488927][ T262] mmc1: sdhci: Present: 0x01f800f0 | Host ctl: 0x00000000 [ 5.495539][ T262] mmc1: sdhci: Power: 0x00000000 | Blk gap: 0x00000000 [ 5.502162][ T262] mmc1: sdhci: Wake-up: 0x00000000 | Clock: 0x00000003 [ 5.508768][ T262] mmc1: sdhci: Timeout: 0x00000000 | Int stat: 0x00000000 [ 5.515381][ T262] mmc1: sdhci: Int enab: 0x00000000 | Sig enab: 0x00000000 [ 5.521996][ T262] mmc1: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00000000 [ 5.528607][ T262] mmc1: sdhci: Caps: 0x362dc8b2 | Caps_1: 0x0000808f [ 5.535227][ T262] mmc1: sdhci: Cmd: 0x00000000 | Max curr: 0x00000000 [ 5.541841][ T262] mmc1: sdhci: Resp[0]: 0x00000000 | Resp[1]: 0x00000000 [ 5.548454][ T262] mmc1: sdhci: Resp[2]: 0x00000000 | Resp[3]: 0x00000000 [ 5.555079][ T262] mmc1: sdhci: Host ctl2: 0x00000000 [ 5.559651][ T262] mmc1: sdhci_msm: ----------- VENDOR REGISTER DUMP----------- [ 5.566621][ T262] mmc1: sdhci_msm: DLL sts: 0x00000000 | DLL cfg: 0x6000642c | DLL cfg2: 0x0020a000 [ 5.575465][ T262] mmc1: sdhci_msm: DLL cfg3: 0x00000000 | DLL usr ctl: 0x00010800 | DDR cfg: 0x80040873 [ 5.584658][ T262] mmc1: sdhci_msm: Vndr func: 0x00018a9c | Vndr func2 : 0xf88218a8 Vndr func3: 0x02626040 Fixes: 0eb0d9f4de34 ("mmc: sdhci-msm: Initial support for Qualcomm chipsets") Signed-off-by: Shaik Sajida Bhanu Acked-by: Adrian Hunter Reviewed-by: Philipp Zabel Tested-by: Konrad Dybcio Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1650816153-23797-1-git-send-email-quic_c_sbhanu@quicinc.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/mmc/host/sdhci-msm.c 
b/drivers/mmc/host/sdhci-msm.c index 50c71e0ba5e4..ff9f5b63c337 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "sdhci-pltfm.h" #include "cqhci.h" @@ -2482,6 +2483,43 @@ static inline void sdhci_msm_get_of_property(struct platform_device *pdev, of_property_read_u32(node, "qcom,dll-config", &msm_host->dll_config); } +static int sdhci_msm_gcc_reset(struct device *dev, struct sdhci_host *host) +{ + struct reset_control *reset; + int ret = 0; + + reset = reset_control_get_optional_exclusive(dev, NULL); + if (IS_ERR(reset)) + return dev_err_probe(dev, PTR_ERR(reset), + "unable to acquire core_reset\n"); + + if (!reset) + return ret; + + ret = reset_control_assert(reset); + if (ret) { + reset_control_put(reset); + return dev_err_probe(dev, ret, "core_reset assert failed\n"); + } + + /* + * The hardware requirement for delay between assert/deassert + * is at least 3-4 sleep clock (32.7KHz) cycles, which comes to + * ~125us (4/32768). To be on the safe side add 200us delay. + */ + usleep_range(200, 210); + + ret = reset_control_deassert(reset); + if (ret) { + reset_control_put(reset); + return dev_err_probe(dev, ret, "core_reset deassert failed\n"); + } + + usleep_range(200, 210); + reset_control_put(reset); + + return ret; +} static int sdhci_msm_probe(struct platform_device *pdev) { @@ -2529,6 +2567,10 @@ static int sdhci_msm_probe(struct platform_device *pdev) msm_host->saved_tuning_phase = INVALID_TUNING_PHASE; + ret = sdhci_msm_gcc_reset(&pdev->dev, host); + if (ret) + goto pltfm_free; + /* Setup SDCC bus voter clock. 
*/ msm_host->bus_clk = devm_clk_get(&pdev->dev, "bus"); if (!IS_ERR(msm_host->bus_clk)) { From 57831bfb5e78777dc399e351ed68ef77c3aee385 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Sat, 19 Mar 2022 02:28:09 -0700 Subject: [PATCH 070/179] powerpc/pseries/vas: Use QoS credits from the userspace The user can change the QoS credits dynamically with the management console interface which notifies OS with sysfs. After returning from the OS interface successfully, the management console updates the hypervisor. Since the VAS capabilities in the hypervisor is not updated when the OS gets the update, the kernel is using the old total credits value from the hypervisor. Fix this issue by using the new QoS credits from the userspace instead of depending on VAS capabilities from the hypervisor. Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/76d156f8af1e03cc09369d68e0bfad0c40031bcc.camel@linux.ibm.com --- arch/powerpc/platforms/pseries/vas-sysfs.c | 19 +++++++++++++----- arch/powerpc/platforms/pseries/vas.c | 23 +++++++++++----------- arch/powerpc/platforms/pseries/vas.h | 2 +- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c b/arch/powerpc/platforms/pseries/vas-sysfs.c index 909535ca513a..ec65586cbeb3 100644 --- a/arch/powerpc/platforms/pseries/vas-sysfs.c +++ b/arch/powerpc/platforms/pseries/vas-sysfs.c @@ -27,22 +27,31 @@ struct vas_caps_entry { /* * This function is used to get the notification from the drmgr when - * QoS credits are changed. Though receiving the target total QoS - * credits here, get the official QoS capabilities from the hypervisor. + * QoS credits are changed. 
*/ -static ssize_t update_total_credits_trigger(struct vas_cop_feat_caps *caps, +static ssize_t update_total_credits_store(struct vas_cop_feat_caps *caps, const char *buf, size_t count) { int err; u16 creds; err = kstrtou16(buf, 0, &creds); + /* + * The user space interface from the management console + * notifies OS with the new QoS credits and then the + * hypervisor. So OS has to use this new credits value + * and reconfigure VAS windows (close or reopen depends + * on the credits available) instead of depending on VAS + * QoS capabilities from the hypervisor. + */ if (!err) - err = vas_reconfig_capabilties(caps->win_type); + err = vas_reconfig_capabilties(caps->win_type, creds); if (err) return -EINVAL; + pr_info("Set QoS total credits %u\n", creds); + return count; } @@ -92,7 +101,7 @@ VAS_ATTR_RO(nr_total_credits); VAS_ATTR_RO(nr_used_credits); static struct vas_sysfs_entry update_total_credits_attribute = - __ATTR(update_total_credits, 0200, NULL, update_total_credits_trigger); + __ATTR(update_total_credits, 0200, NULL, update_total_credits_store); static struct attribute *vas_def_capab_attrs[] = { &nr_total_credits_attribute.attr, diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 1f59d78c77a1..ec643bbdb67f 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -779,10 +779,10 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * changes. Reconfig window configurations based on the credits * availability from this new capabilities. 
*/ -int vas_reconfig_capabilties(u8 type) +int vas_reconfig_capabilties(u8 type, int new_nr_creds) { struct vas_cop_feat_caps *caps; - int old_nr_creds, new_nr_creds; + int old_nr_creds; struct vas_caps *vcaps; int rc = 0, nr_active_wins; @@ -795,12 +795,6 @@ int vas_reconfig_capabilties(u8 type) caps = &vcaps->caps; mutex_lock(&vas_pseries_mutex); - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vcaps->feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (rc) - goto out; - - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); old_nr_creds = atomic_read(&caps->nr_total_credits); @@ -832,7 +826,6 @@ int vas_reconfig_capabilties(u8 type) false); } -out: mutex_unlock(&vas_pseries_mutex); return rc; } @@ -850,7 +843,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int len, rc = 0; + int new_nr_creds, len, rc = 0; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -862,7 +855,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE); + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, + new_nr_creds); + } + if (rc) pr_err("Failed reconfig VAS capabilities with DLPAR\n"); diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 34177881e998..333ffa2f9f42 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -135,7 +135,7 @@ struct pseries_vas_window { }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); -int vas_reconfig_capabilties(u8 type); +int vas_reconfig_capabilties(u8 type, int new_nr_creds); int __init 
sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS From 6d65028eb67dbb7627651adfc460d64196d38bd8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 2 May 2022 22:50:10 +1000 Subject: [PATCH 071/179] powerpc/vdso: Fix incorrect CFI in gettimeofday.S As reported by Alan, the CFI (Call Frame Information) in the VDSO time routines is incorrect since commit ce7d8056e38b ("powerpc/vdso: Prepare for switching VDSO to generic C implementation."). DWARF has a concept called the CFA (Canonical Frame Address), which on powerpc is calculated as an offset from the stack pointer (r1). That means when the stack pointer is changed there must be a corresponding CFI directive to update the calculation of the CFA. The current code is missing those directives for the changes to r1, which prevents gdb from being able to generate a backtrace from inside VDSO functions, eg: Breakpoint 1, 0x00007ffff7f804dc in __kernel_clock_gettime () (gdb) bt #0 0x00007ffff7f804dc in __kernel_clock_gettime () #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 #2 0x00007fffffffd960 in ?? () #3 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 Backtrace stopped: frame did not save the PC Alan helpfully describes some rules for correctly maintaining the CFI information: 1) Every adjustment to the current frame address reg (ie. r1) must be described, and exactly at the instruction where r1 changes. Why? Because stack unwinding might want to access previous frames. 2) If a function changes LR or any non-volatile register, the save location for those regs must be given. The CFI can be at any instruction after the saves up to the point that the reg is changed. (Exception: LR save should be described before a bl, not after) 3) If asynchronous unwind info is needed then restores of LR and non-volatile regs must also be described. 
The CFI can be at any instruction after the reg is restored up to the point where the save location is (potentially) trashed. Fix the inability to backtrace by adding CFI directives describing the changes to r1, ie. satisfying rule 1. Also change the information for LR to point to the copy saved on the stack, not the value in r0 that will be overwritten by the function call. Finally, add CFI directives describing the save/restore of r2. With the fix gdb can correctly back trace and navigate up and down the stack: Breakpoint 1, 0x00007ffff7f804dc in __kernel_clock_gettime () (gdb) bt #0 0x00007ffff7f804dc in __kernel_clock_gettime () #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 #2 0x0000000100015b60 in gettime () #3 0x000000010000c8bc in print_long_format () #4 0x000000010000d180 in print_current_files () #5 0x00000001000054ac in main () (gdb) up #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 (gdb) #2 0x0000000100015b60 in gettime () (gdb) #3 0x000000010000c8bc in print_long_format () (gdb) #4 0x000000010000d180 in print_current_files () (gdb) #5 0x00000001000054ac in main () (gdb) Initial frame selected; you cannot go up. 
(gdb) down #4 0x000000010000d180 in print_current_files () (gdb) #3 0x000000010000c8bc in print_long_format () (gdb) #2 0x0000000100015b60 in gettime () (gdb) #1 0x00007ffff7d8872c in clock_gettime@@GLIBC_2.17 () from /lib64/libc.so.6 (gdb) #0 0x00007ffff7f804dc in __kernel_clock_gettime () (gdb) Fixes: ce7d8056e38b ("powerpc/vdso: Prepare for switching VDSO to generic C implementation.") Cc: stable@vger.kernel.org # v5.11+ Reported-by: Alan Modra Signed-off-by: Michael Ellerman Reviewed-by: Segher Boessenkool Link: https://lore.kernel.org/r/20220502125010.1319370-1-mpe@ellerman.id.au --- arch/powerpc/kernel/vdso/gettimeofday.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index eb9c81e1c218..0c4ecc8fec5a 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -22,12 +22,15 @@ .macro cvdso_call funct call_time=0 .cfi_startproc PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM mflr r0 - .cfi_register lr, r0 PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) + .cfi_rel_offset lr, PPC_MIN_STKFRM + PPC_LR_STKOFF #ifdef __powerpc64__ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif get_datapage r5 .ifeq \call_time @@ -39,13 +42,15 @@ PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) #ifdef __powerpc64__ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_restore r2 #endif .ifeq \call_time cmpwi r3, 0 .endif mtlr r0 - .cfi_restore lr addi r1, r1, 2 * PPC_MIN_STKFRM + .cfi_restore lr + .cfi_def_cfa_offset 0 crclr so .ifeq \call_time beqlr+ From 19965d8259fdabc6806da92adda49684f5bcbec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 27 Apr 2022 01:57:15 +0200 Subject: [PATCH 072/179] drm/amdgpu: do not use passthrough mode in Xen dom0 MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While technically Xen dom0 is a virtual machine too, it does have access to most of the hardware so it doesn't need to be considered a "passthrough". Commit b818a5d37454 ("drm/amdgpu/gmc: use PCI BARs for APUs in passthrough") changed how FB is accessed based on passthrough mode. This breaks amdgpu in Xen dom0 with message like this: [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for DMUB idle: status=3 While the reason for this failure is unclear, the passthrough mode is not really necessary in Xen dom0 anyway. So, to unbreak booting affected kernels, disable passthrough mode in this case. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1985 Fixes: b818a5d37454 ("drm/amdgpu/gmc: use PCI BARs for APUs in passthrough") Signed-off-by: Marek Marczykowski-Górecki Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index a025f080aa6a..5e3756643da3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -24,6 +24,7 @@ #include #include +#include #include "amdgpu.h" #include "amdgpu_ras.h" @@ -710,7 +711,8 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) adev->virt.caps |= AMDGPU_SRIOV_CAPS_ENABLE_IOV; if (!reg) { - if (is_virtual_machine()) /* passthrough mode exclus sriov mod */ + /* passthrough mode exclus sriov mod */ + if (is_virtual_machine() && !xen_initial_domain()) adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; } From 3dfe85fa87b2a26bdbd292b66653bba065cf9941 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Tue, 19 Apr 2022 13:03:12 -0400 Subject: [PATCH 073/179] drm/amd/display: Avoid reading audio pattern past AUDIO_CHANNELS_COUNT A faulty receiver might report an erroneous channel count. 
We should guard against reading beyond AUDIO_CHANNELS_COUNT as that would overflow the dpcd_pattern_period array. Signed-off-by: Harry Wentland Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 22dabe596dfc..95b5b5bfa1ff 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -4440,7 +4440,7 @@ static void dp_test_get_audio_test_data(struct dc_link *link, bool disable_video &dpcd_pattern_type.value, sizeof(dpcd_pattern_type)); - channel_count = dpcd_test_mode.bits.channel_count + 1; + channel_count = min(dpcd_test_mode.bits.channel_count + 1, AUDIO_CHANNELS_COUNT); // read pattern periods for requested channels when sawTooth pattern is requested if (dpcd_pattern_type.value == AUDIO_TEST_PATTERN_SAWTOOTH || From 770fb0942c338545f93a584342b64848cff31efe Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 4 May 2022 11:07:45 -0700 Subject: [PATCH 074/179] MAINTAINERS: Update Josh Poimboeuf's email address Change to my kernel.org email address. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/1abc3de4b00dc6f915ac975a2ec29ed545d96dc4.1651687652.git.jpoimboe@redhat.com --- MAINTAINERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index edc96cdb85e8..1e1a2264792d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7499,7 +7499,7 @@ F: Documentation/hwmon/f71805f.rst F: drivers/hwmon/f71805f.c FADDR2LINE -M: Josh Poimboeuf +M: Josh Poimboeuf S: Maintained F: scripts/faddr2line @@ -11348,7 +11348,7 @@ F: drivers/mmc/host/litex_mmc.c N: litex LIVE PATCHING -M: Josh Poimboeuf +M: Josh Poimboeuf M: Jiri Kosina M: Miroslav Benes M: Petr Mladek @@ -14224,7 +14224,7 @@ F: lib/objagg.c F: lib/test_objagg.c OBJTOOL -M: Josh Poimboeuf +M: Josh Poimboeuf M: Peter Zijlstra S: Supported F: tools/objtool/ @@ -18792,7 +18792,7 @@ F: include/dt-bindings/reset/starfive-jh7100.h STATIC BRANCH/CALL M: Peter Zijlstra -M: Josh Poimboeuf +M: Josh Poimboeuf M: Jason Baron R: Steven Rostedt R: Ard Biesheuvel @@ -21444,7 +21444,7 @@ F: arch/x86/kernel/apic/x2apic_uv_x.c F: arch/x86/platform/uv/ X86 STACK UNWINDING -M: Josh Poimboeuf +M: Josh Poimboeuf M: Peter Zijlstra S: Supported F: arch/x86/include/asm/unwind*.h From b2b701b31e1c5827957c88e5c1f0c3dde1f55b2f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 29 Apr 2022 14:46:11 -0500 Subject: [PATCH 075/179] dt-bindings: pinctrl: Allow values for drive-push-pull and drive-open-drain A few platforms, at91 and tegra, use drive-push-pull and drive-open-drain with a 0 or 1 value. There's not really a need for values as '1' should be equivalent to no value (it wasn't treated that way) and drive-push-pull disabled is equivalent to drive-open-drain. So dropping the value can't be done without breaking existing OSs. As we don't want new cases, mark the case with values as deprecated. 
Cc: Arnd Bergmann Cc: Thierry Reding Cc: Jonathan Hunter Cc: Nicolas Ferre Cc: Claudiu Beznea Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220429194610.2741437-1-robh@kernel.org --- .../devicetree/bindings/pinctrl/pincfg-node.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pinctrl/pincfg-node.yaml b/Documentation/devicetree/bindings/pinctrl/pincfg-node.yaml index 4b22a9e3a447..f5a121311f61 100644 --- a/Documentation/devicetree/bindings/pinctrl/pincfg-node.yaml +++ b/Documentation/devicetree/bindings/pinctrl/pincfg-node.yaml @@ -52,11 +52,19 @@ properties: hardware supporting it the pull strength in Ohm. drive-push-pull: - type: boolean + oneOf: + - type: boolean + - $ref: /schemas/types.yaml#/definitions/uint32 + enum: [ 0, 1 ] + deprecated: true description: drive actively high and low drive-open-drain: - type: boolean + oneOf: + - type: boolean + - $ref: /schemas/types.yaml#/definitions/uint32 + const: 1 # No known cases of 0 + deprecated: true description: drive with open drain drive-open-source: From caf83e494de965dbb5f8add655c526b9af6a96cb Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 3 May 2022 11:27:38 -0500 Subject: [PATCH 076/179] dt-bindings: Drop redundant 'maxItems/minItems' in if/then schemas Another round of removing redundant minItems/maxItems when 'items' list is specified. This time it is in if/then schemas as the meta-schema was failing to check this case. If a property has an 'items' list, then a 'minItems' or 'maxItems' with the same size as the list is redundant and can be dropped. Note that this is DT schema specific behavior and not standard json-schema behavior. The tooling will fix up the final schema adding any unspecified minItems/maxItems. 
Signed-off-by: Rob Herring Acked-By: Vinod Koul Acked-by: Marc Kleine-Budde Acked-by: Mark Brown Acked-by: Ulf Hansson # For MMC Acked-by: Jonathan Cameron #for IIO Link: https://lore.kernel.org/r/20220503162738.3827041-1-robh@kernel.org --- .../bindings/clock/imx8m-clock.yaml | 4 ---- .../bindings/display/bridge/renesas,lvds.yaml | 4 ---- .../bindings/display/renesas,du.yaml | 23 ------------------- .../bindings/iio/adc/st,stm32-adc.yaml | 2 -- .../bindings/mmc/nvidia,tegra20-sdhci.yaml | 7 +----- .../devicetree/bindings/mtd/gpmi-nand.yaml | 2 -- .../bindings/net/can/bosch,c_can.yaml | 3 --- .../bindings/phy/brcm,sata-phy.yaml | 10 ++++---- .../bindings/rtc/allwinner,sun6i-a31-rtc.yaml | 10 -------- .../bindings/serial/samsung_uart.yaml | 4 ---- .../sound/allwinner,sun4i-a10-i2s.yaml | 1 - .../bindings/sound/ti,j721e-cpb-audio.yaml | 2 -- .../bindings/thermal/rcar-gen3-thermal.yaml | 1 - 13 files changed, 5 insertions(+), 68 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/imx8m-clock.yaml b/Documentation/devicetree/bindings/clock/imx8m-clock.yaml index 625f573a7b90..458c7645ee68 100644 --- a/Documentation/devicetree/bindings/clock/imx8m-clock.yaml +++ b/Documentation/devicetree/bindings/clock/imx8m-clock.yaml @@ -55,8 +55,6 @@ allOf: then: properties: clocks: - minItems: 7 - maxItems: 7 items: - description: 32k osc - description: 25m osc @@ -66,8 +64,6 @@ allOf: - description: ext3 clock input - description: ext4 clock input clock-names: - minItems: 7 - maxItems: 7 items: - const: ckil - const: osc_25m diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml index a51baf8a4c76..bb9dbfb9beaf 100644 --- a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml +++ b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml @@ -95,7 +95,6 @@ then: properties: clocks: minItems: 1 - maxItems: 4 items: - description: Functional clock - 
description: EXTAL input clock @@ -104,7 +103,6 @@ then: clock-names: minItems: 1 - maxItems: 4 items: - const: fck # The LVDS encoder can use the EXTAL or DU_DOTCLKINx clocks. @@ -128,12 +126,10 @@ then: else: properties: clocks: - maxItems: 1 items: - description: Functional clock clock-names: - maxItems: 1 items: - const: fck diff --git a/Documentation/devicetree/bindings/display/renesas,du.yaml b/Documentation/devicetree/bindings/display/renesas,du.yaml index 56cedcd6d576..b3e588022082 100644 --- a/Documentation/devicetree/bindings/display/renesas,du.yaml +++ b/Documentation/devicetree/bindings/display/renesas,du.yaml @@ -109,7 +109,6 @@ allOf: properties: clocks: minItems: 1 - maxItems: 3 items: - description: Functional clock - description: DU_DOTCLKIN0 input clock @@ -117,7 +116,6 @@ allOf: clock-names: minItems: 1 - maxItems: 3 items: - const: du.0 - pattern: '^dclkin\.[01]$' @@ -159,7 +157,6 @@ allOf: properties: clocks: minItems: 2 - maxItems: 4 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -168,7 +165,6 @@ allOf: clock-names: minItems: 2 - maxItems: 4 items: - const: du.0 - const: du.1 @@ -216,7 +212,6 @@ allOf: properties: clocks: minItems: 2 - maxItems: 4 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -225,7 +220,6 @@ allOf: clock-names: minItems: 2 - maxItems: 4 items: - const: du.0 - const: du.1 @@ -271,7 +265,6 @@ allOf: properties: clocks: minItems: 2 - maxItems: 4 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -280,7 +273,6 @@ allOf: clock-names: minItems: 2 - maxItems: 4 items: - const: du.0 - const: du.1 @@ -327,7 +319,6 @@ allOf: properties: clocks: minItems: 2 - maxItems: 4 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -336,7 +327,6 @@ allOf: clock-names: minItems: 2 - maxItems: 4 items: - const: du.0 - const: du.1 @@ -386,7 +376,6 @@ allOf: properties: clocks: minItems: 3 
- maxItems: 6 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -397,7 +386,6 @@ allOf: clock-names: minItems: 3 - maxItems: 6 items: - const: du.0 - const: du.1 @@ -448,7 +436,6 @@ allOf: properties: clocks: minItems: 4 - maxItems: 8 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -461,7 +448,6 @@ allOf: clock-names: minItems: 4 - maxItems: 8 items: - const: du.0 - const: du.1 @@ -525,7 +511,6 @@ allOf: properties: clocks: minItems: 3 - maxItems: 6 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -536,7 +521,6 @@ allOf: clock-names: minItems: 3 - maxItems: 6 items: - const: du.0 - const: du.1 @@ -596,7 +580,6 @@ allOf: properties: clocks: minItems: 3 - maxItems: 6 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -607,7 +590,6 @@ allOf: clock-names: minItems: 3 - maxItems: 6 items: - const: du.0 - const: du.1 @@ -666,14 +648,12 @@ allOf: properties: clocks: minItems: 1 - maxItems: 2 items: - description: Functional clock for DU0 - description: DU_DOTCLKIN0 input clock clock-names: minItems: 1 - maxItems: 2 items: - const: du.0 - const: dclkin.0 @@ -723,7 +703,6 @@ allOf: properties: clocks: minItems: 2 - maxItems: 4 items: - description: Functional clock for DU0 - description: Functional clock for DU1 @@ -732,7 +711,6 @@ allOf: clock-names: minItems: 2 - maxItems: 4 items: - const: du.0 - const: du.1 @@ -791,7 +769,6 @@ allOf: - description: Functional clock clock-names: - maxItems: 1 items: - const: du.0 diff --git a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.yaml b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.yaml index 4d6074518b5c..fa8da42cb1e6 100644 --- a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.yaml +++ b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.yaml @@ -138,7 +138,6 @@ allOf: - const: bus - const: adc minItems: 1 - maxItems: 2 interrupts: items: 
@@ -170,7 +169,6 @@ allOf: - const: bus - const: adc minItems: 1 - maxItems: 2 interrupts: items: diff --git a/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.yaml b/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.yaml index f3f4d5b02744..fe0270207622 100644 --- a/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.yaml +++ b/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.yaml @@ -202,22 +202,17 @@ allOf: clocks: items: - description: module clock - minItems: 1 - maxItems: 1 else: properties: clocks: items: - description: module clock - description: timeout clock - minItems: 2 - maxItems: 2 + clock-names: items: - const: sdhci - const: tmclk - minItems: 2 - maxItems: 2 required: - clock-names diff --git a/Documentation/devicetree/bindings/mtd/gpmi-nand.yaml b/Documentation/devicetree/bindings/mtd/gpmi-nand.yaml index 9d764e654e1d..849aeae319a9 100644 --- a/Documentation/devicetree/bindings/mtd/gpmi-nand.yaml +++ b/Documentation/devicetree/bindings/mtd/gpmi-nand.yaml @@ -147,8 +147,6 @@ allOf: - description: SoC gpmi io clock - description: SoC gpmi bch apb clock clock-names: - minItems: 2 - maxItems: 2 items: - const: gpmi_io - const: gpmi_bch_apb diff --git a/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml b/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml index 8bad328b184d..51aa89ac7e85 100644 --- a/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml +++ b/Documentation/devicetree/bindings/net/can/bosch,c_can.yaml @@ -80,8 +80,6 @@ if: then: properties: interrupts: - minItems: 4 - maxItems: 4 items: - description: Error and status IRQ - description: Message object IRQ @@ -91,7 +89,6 @@ then: else: properties: interrupts: - maxItems: 1 items: - description: Error and status IRQ diff --git a/Documentation/devicetree/bindings/phy/brcm,sata-phy.yaml b/Documentation/devicetree/bindings/phy/brcm,sata-phy.yaml index cb1aa325336f..435b971dfd9b 100644 --- 
a/Documentation/devicetree/bindings/phy/brcm,sata-phy.yaml +++ b/Documentation/devicetree/bindings/phy/brcm,sata-phy.yaml @@ -102,19 +102,17 @@ if: then: properties: reg: - maxItems: 2 + minItems: 2 + reg-names: - items: - - const: "phy" - - const: "phy-ctrl" + minItems: 2 else: properties: reg: maxItems: 1 + reg-names: maxItems: 1 - items: - - const: "phy" required: - compatible diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml index 0b767fec39d8..6b38bd7eb3b4 100644 --- a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml @@ -71,7 +71,6 @@ allOf: then: properties: clock-output-names: - minItems: 1 maxItems: 1 - if: @@ -102,7 +101,6 @@ allOf: properties: clock-output-names: minItems: 3 - maxItems: 3 - if: properties: @@ -113,16 +111,12 @@ allOf: then: properties: clocks: - minItems: 3 - maxItems: 3 items: - description: Bus clock for register access - description: 24 MHz oscillator - description: 32 kHz clock from the CCU clock-names: - minItems: 3 - maxItems: 3 items: - const: bus - const: hosc @@ -142,7 +136,6 @@ allOf: properties: clocks: minItems: 3 - maxItems: 4 items: - description: Bus clock for register access - description: 24 MHz oscillator @@ -151,7 +144,6 @@ allOf: clock-names: minItems: 3 - maxItems: 4 items: - const: bus - const: hosc @@ -174,14 +166,12 @@ allOf: then: properties: interrupts: - minItems: 1 maxItems: 1 else: properties: interrupts: minItems: 2 - maxItems: 2 required: - "#clock-cells" diff --git a/Documentation/devicetree/bindings/serial/samsung_uart.yaml b/Documentation/devicetree/bindings/serial/samsung_uart.yaml index d4688e317fc5..901c1e2cea28 100644 --- a/Documentation/devicetree/bindings/serial/samsung_uart.yaml +++ b/Documentation/devicetree/bindings/serial/samsung_uart.yaml @@ -100,7 +100,6 @@ allOf: maxItems: 3 clock-names: minItems: 2 - 
maxItems: 3 items: - const: uart - pattern: '^clk_uart_baud[0-1]$' @@ -118,11 +117,8 @@ allOf: then: properties: clocks: - minItems: 2 maxItems: 2 clock-names: - minItems: 2 - maxItems: 2 items: - const: uart - const: clk_uart_baud0 diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml index c21c807b667c..34f6ee9de392 100644 --- a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml @@ -89,7 +89,6 @@ allOf: properties: dmas: minItems: 1 - maxItems: 2 items: - description: RX DMA Channel - description: TX DMA Channel diff --git a/Documentation/devicetree/bindings/sound/ti,j721e-cpb-audio.yaml b/Documentation/devicetree/bindings/sound/ti,j721e-cpb-audio.yaml index 6806f53a4aed..20ea5883b7ff 100644 --- a/Documentation/devicetree/bindings/sound/ti,j721e-cpb-audio.yaml +++ b/Documentation/devicetree/bindings/sound/ti,j721e-cpb-audio.yaml @@ -80,7 +80,6 @@ allOf: then: properties: clocks: - minItems: 6 items: - description: AUXCLK clock for McASP used by CPB audio - description: Parent for CPB_McASP auxclk (for 48KHz) @@ -107,7 +106,6 @@ allOf: then: properties: clocks: - maxItems: 4 items: - description: AUXCLK clock for McASP used by CPB audio - description: Parent for CPB_McASP auxclk (for 48KHz) diff --git a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml index f963204e0b16..1368d90da0e8 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml @@ -67,7 +67,6 @@ then: properties: reg: minItems: 2 - maxItems: 3 items: - description: TSC1 registers - description: TSC2 registers From 5dc4630426511f641b7ac44fc550b8e21eafb237 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Mon, 2 May 2022 18:13:08 +0900 
Subject: [PATCH 077/179] dt-bindings: pci: apple,pcie: Drop max-link-speed from example We no longer use these since 111659c2a570 (and they never worked anyway); drop them from the example to avoid confusion. Fixes: 111659c2a570 ("arm64: dts: apple: t8103: Remove PCIe max-link-speed properties") Signed-off-by: Hector Martin Reviewed-by: Alyssa Rosenzweig Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220502091308.28233-1-marcan@marcan.st --- Documentation/devicetree/bindings/pci/apple,pcie.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/apple,pcie.yaml b/Documentation/devicetree/bindings/pci/apple,pcie.yaml index 7f01e15fc81c..daf602ac0d0f 100644 --- a/Documentation/devicetree/bindings/pci/apple,pcie.yaml +++ b/Documentation/devicetree/bindings/pci/apple,pcie.yaml @@ -142,7 +142,6 @@ examples: device_type = "pci"; reg = <0x0 0x0 0x0 0x0 0x0>; reset-gpios = <&pinctrl_ap 152 0>; - max-link-speed = <2>; #address-cells = <3>; #size-cells = <2>; @@ -153,7 +152,6 @@ examples: device_type = "pci"; reg = <0x800 0x0 0x0 0x0 0x0>; reset-gpios = <&pinctrl_ap 153 0>; - max-link-speed = <2>; #address-cells = <3>; #size-cells = <2>; @@ -164,7 +162,6 @@ examples: device_type = "pci"; reg = <0x1000 0x0 0x0 0x0 0x0>; reset-gpios = <&pinctrl_ap 33 0>; - max-link-speed = <1>; #address-cells = <3>; #size-cells = <2>; From ef91271c65c12d36e4c2b61c61d4849fb6d11aa0 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Sun, 24 Apr 2022 16:01:03 +0800 Subject: [PATCH 078/179] RDMA/siw: Fix a condition race issue in MPA request processing The calling of siw_cm_upcall and the detaching of new_cep from its listen_cep should have atomic semantics. Otherwise siw_reject may be called in a temporary state, e.g. siw_cm_upcall has been called but new_cep->listen_cep has not yet been cleared. 
This fixes a WARN: WARNING: CPU: 7 PID: 201 at drivers/infiniband/sw/siw/siw_cm.c:255 siw_cep_put+0x125/0x130 [siw] CPU: 2 PID: 201 Comm: kworker/u16:22 Kdump: loaded Tainted: G E 5.17.0-rc7 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Workqueue: iw_cm_wq cm_work_handler [iw_cm] RIP: 0010:siw_cep_put+0x125/0x130 [siw] Call Trace: siw_reject+0xac/0x180 [siw] iw_cm_reject+0x68/0xc0 [iw_cm] cm_work_handler+0x59d/0xe20 [iw_cm] process_one_work+0x1e2/0x3b0 worker_thread+0x50/0x3a0 ? rescuer_thread+0x390/0x390 kthread+0xe5/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Link: https://lore.kernel.org/r/d528d83466c44687f3872eadcb8c184528b2e2d4.1650526554.git.chengyou@linux.alibaba.com Reported-by: Luis Chamberlain Reviewed-by: Bernard Metzler Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 7acdd3c3a599..17f34d584cd9 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -968,14 +968,15 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_set_inuse(new_cep); rv = siw_proc_mpareq(new_cep); - siw_cep_set_free(new_cep); - if (rv != -EAGAIN) { siw_cep_put(cep); new_cep->listen_cep = NULL; - if (rv) + if (rv) { + siw_cep_set_free(new_cep); goto error; + } } + siw_cep_set_free(new_cep); } return; From a926a903b7dc39a8a949150258c09290998dd812 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 4 May 2022 15:28:17 -0500 Subject: [PATCH 079/179] RDMA/rxe: Do not call dev_mc_add/del() under a spinlock These routines were not intended to be called under a spinlock and will throw debugging warnings: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 13 PID: 3107 at kernel/locking/irqflag-debug.c:10 
warn_bogus_irq_restore+0x2f/0x50 CPU: 13 PID: 3107 Comm: python3 Tainted: G E 5.18.0-rc1+ #7 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 RIP: 0010:warn_bogus_irq_restore+0x2f/0x50 Call Trace: _raw_spin_unlock_irqrestore+0x75/0x80 rxe_attach_mcast+0x304/0x480 [rdma_rxe] ib_attach_mcast+0x88/0xa0 [ib_core] ib_uverbs_attach_mcast+0x186/0x1e0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xcd/0x140 [ib_uverbs] ib_uverbs_cmd_verbs+0xdb0/0xea0 [ib_uverbs] ib_uverbs_ioctl+0xd2/0x160 [ib_uverbs] do_syscall_64+0x5c/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Move them out of the spinlock; it is OK if there are some races setting up the MC reception at the ethernet layer with rbtree lookups. Fixes: 6090a0c4c7c6 ("RDMA/rxe: Cleanup rxe_mcast.c") Link: https://lore.kernel.org/r/20220504202817.98247-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 51 ++++++++++++--------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index ae8f11cb704a..77e45cabd8ea 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -38,13 +38,13 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) } /** - * rxe_mcast_delete - delete multicast address from rxe device + * rxe_mcast_del - delete multicast address from rxe device * @rxe: rxe device object * @mgid: multicast address as a gid * * Returns 0 on success else an error */ -static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) +static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; @@ -159,17 +159,10 @@ struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid) * @mcg: new mcg object * * Context: caller should hold rxe->mcg lock - * Returns: 0 on success else an error */ -static int 
__rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, - struct rxe_mcg *mcg) +static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mcg *mcg) { - int err; - - err = rxe_mcast_add(rxe, mgid); - if (unlikely(err)) - return err; - kref_init(&mcg->ref_cnt); memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); INIT_LIST_HEAD(&mcg->qp_list); @@ -184,8 +177,6 @@ static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, */ kref_get(&mcg->ref_cnt); __rxe_insert_mcg(mcg); - - return 0; } /** @@ -209,6 +200,12 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) if (mcg) return mcg; + /* check to see if we have reached limit */ + if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) { + err = -ENOMEM; + goto err_dec; + } + /* speculative alloc of new mcg */ mcg = kzalloc(sizeof(*mcg), GFP_KERNEL); if (!mcg) @@ -218,27 +215,23 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) /* re-check to see if someone else just added it */ tmp = __rxe_lookup_mcg(rxe, mgid); if (tmp) { + spin_unlock_irqrestore(&rxe->mcg_lock, flags); + atomic_dec(&rxe->mcg_num); kfree(mcg); - mcg = tmp; - goto out; + return tmp; } - if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) { - err = -ENOMEM; - goto err_dec; - } - - err = __rxe_init_mcg(rxe, mgid, mcg); - if (err) - goto err_dec; -out: + __rxe_init_mcg(rxe, mgid, mcg); spin_unlock_irqrestore(&rxe->mcg_lock, flags); - return mcg; + /* add mcast address outside of lock */ + err = rxe_mcast_add(rxe, mgid); + if (!err) + return mcg; + + kfree(mcg); err_dec: atomic_dec(&rxe->mcg_num); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); - kfree(mcg); return ERR_PTR(err); } @@ -268,7 +261,6 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg) __rxe_remove_mcg(mcg); kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); - rxe_mcast_delete(mcg->rxe, &mcg->mgid); atomic_dec(&rxe->mcg_num); } @@ -282,6 +274,9 @@ static void rxe_destroy_mcg(struct rxe_mcg *mcg) { 
unsigned long flags; + /* delete mcast address outside of lock */ + rxe_mcast_del(mcg->rxe, &mcg->mgid); + spin_lock_irqsave(&mcg->rxe->mcg_lock, flags); __rxe_destroy_mcg(mcg); spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags); From bfdc0edd11f9501b891a069b5bbd3b16731941e1 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 4 May 2022 15:28:17 -0500 Subject: [PATCH 080/179] RDMA/rxe: Change mcg_lock to a _bh lock rxe_mcast.c currently uses _irqsave spinlocks for rxe->mcg_lock while rxe_recv.c uses _bh spinlocks for the same lock. As there is no case where the mcg_lock can be taken from an IRQ, change these all to bh locks so we don't have confusing mismatched lock types on the same spinlock. Fixes: 6090a0c4c7c6 ("RDMA/rxe: Cleanup rxe_mcast.c") Link: https://lore.kernel.org/r/20220504202817.98247-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 36 +++++++++++---------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 77e45cabd8ea..873a9b10307c 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -143,11 +143,10 @@ static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe, struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid) { struct rxe_mcg *mcg; - unsigned long flags; - spin_lock_irqsave(&rxe->mcg_lock, flags); + spin_lock_bh(&rxe->mcg_lock); mcg = __rxe_lookup_mcg(rxe, mgid); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); return mcg; } @@ -189,7 +188,6 @@ static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) { struct rxe_mcg *mcg, *tmp; - unsigned long flags; int err; if (rxe->attr.max_mcast_grp == 0) @@ -211,18 +209,18 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid 
*mgid) if (!mcg) return ERR_PTR(-ENOMEM); - spin_lock_irqsave(&rxe->mcg_lock, flags); + spin_lock_bh(&rxe->mcg_lock); /* re-check to see if someone else just added it */ tmp = __rxe_lookup_mcg(rxe, mgid); if (tmp) { - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); atomic_dec(&rxe->mcg_num); kfree(mcg); return tmp; } __rxe_init_mcg(rxe, mgid, mcg); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); /* add mcast address outside of lock */ err = rxe_mcast_add(rxe, mgid); @@ -272,14 +270,12 @@ static void __rxe_destroy_mcg(struct rxe_mcg *mcg) */ static void rxe_destroy_mcg(struct rxe_mcg *mcg) { - unsigned long flags; - /* delete mcast address outside of lock */ rxe_mcast_del(mcg->rxe, &mcg->mgid); - spin_lock_irqsave(&mcg->rxe->mcg_lock, flags); + spin_lock_bh(&mcg->rxe->mcg_lock); __rxe_destroy_mcg(mcg); - spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags); + spin_unlock_bh(&mcg->rxe->mcg_lock); } /** @@ -334,25 +330,24 @@ static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) { struct rxe_dev *rxe = mcg->rxe; struct rxe_mca *mca, *tmp; - unsigned long flags; int err; /* check to see if the qp is already a member of the group */ - spin_lock_irqsave(&rxe->mcg_lock, flags); + spin_lock_bh(&rxe->mcg_lock); list_for_each_entry(mca, &mcg->qp_list, qp_list) { if (mca->qp == qp) { - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); return 0; } } - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); /* speculative alloc new mca without using GFP_ATOMIC */ mca = kzalloc(sizeof(*mca), GFP_KERNEL); if (!mca) return -ENOMEM; - spin_lock_irqsave(&rxe->mcg_lock, flags); + spin_lock_bh(&rxe->mcg_lock); /* re-check to see if someone else just attached qp */ list_for_each_entry(tmp, &mcg->qp_list, qp_list) { if (tmp->qp == qp) { @@ -366,7 +361,7 @@ static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) if (err) kfree(mca); out: - 
spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); return err; } @@ -400,9 +395,8 @@ static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) { struct rxe_dev *rxe = mcg->rxe; struct rxe_mca *mca, *tmp; - unsigned long flags; - spin_lock_irqsave(&rxe->mcg_lock, flags); + spin_lock_bh(&rxe->mcg_lock); list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) { if (mca->qp == qp) { __rxe_cleanup_mca(mca, mcg); @@ -416,13 +410,13 @@ static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) if (atomic_read(&mcg->qp_num) <= 0) __rxe_destroy_mcg(mcg); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); return 0; } } /* we didn't find the qp on the list */ - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + spin_unlock_bh(&rxe->mcg_lock); return -EINVAL; } From 59f5ede3bc0f00eb856425f636dab0c10feb06d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 1 May 2022 21:31:43 +0200 Subject: [PATCH 081/179] x86/fpu: Prevent FPU state corruption The FPU usage related to task FPU management is either protected by disabling interrupts (switch_to, return to user) or via fpregs_lock() which is a wrapper around local_bh_disable(). When kernel code wants to use the FPU then it has to check whether it is possible by calling irq_fpu_usable(). But the condition in irq_fpu_usable() is wrong. It allows FPU to be used when: !in_interrupt() || interrupted_user_mode() || interrupted_kernel_fpu_idle() The latter is checking whether some other context already uses FPU in the kernel, but if that's not the case then it allows FPU to be used unconditionally even if the calling context interrupted a fpregs_lock() critical region. If that happens then the FPU state of the interrupted context becomes corrupted. 
Allow in kernel FPU usage only when no other context has in kernel FPU usage and either the calling context is not hard interrupt context or the hard interrupt did not interrupt a local bottomhalf disabled region. It's hard to find a proper Fixes tag as the condition was broken in one way or the other for a very long time and the eager/lazy FPU changes caused a lot of churn. Picked something remotely connected from the history. This survived undetected for quite some time as FPU usage in interrupt context is rare, but the recent changes to the random code unearthed it at least on a kernel which had FPU debugging enabled. There is probably a higher rate of silent corruption as not all issues can be detected by the FPU debugging code. This will be addressed in a subsequent change. Fixes: 5d2bd7009f30 ("x86, fpu: decouple non-lazy/eager fpu restore from xsave") Reported-by: Filipe Manana Signed-off-by: Thomas Gleixner Tested-by: Filipe Manana Reviewed-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220501193102.588689270@linutronix.de --- arch/x86/kernel/fpu/core.c | 67 +++++++++++++++----------------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..e28ab0ecc537 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. 
- * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. 
+ */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); From 170f37d6aa6ad4582eefd7459015de79e244536e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 3 May 2022 00:09:31 -0400 Subject: [PATCH 082/179] block: Do not call folio_next() on an unreferenced folio It is unsafe to call folio_next() on a folio unless you hold a reference on it that prevents it from being split or freed. After returning from the iterator, iomap calls folio_end_writeback() which may drop the last reference to the page, or allow the page to be split. If that happens, the iterator will not advance far enough through the bio_vec, leading to assertion failures like the BUG() in folio_end_writeback() that checks we're not trying to end writeback on a page not currently under writeback. Other assertion failures were also seen, but they're all explained by this one bug. Fix the bug by remembering where the next folio starts before returning from the iterator. There are other ways of fixing this bug, but this seems the simplest. Reported-by: Darrick J. Wong Tested-by: Darrick J. 
Wong Reported-by: Brian Foster Tested-by: Brian Foster Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/bio.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 278cc81cc1e7..00450fd86bb4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -269,6 +269,7 @@ struct folio_iter { size_t offset; size_t length; /* private: for use by the iterator */ + struct folio *_next; size_t _seg_count; int _i; }; @@ -283,6 +284,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, PAGE_SIZE * (bvec->bv_page - &fi->folio->page); fi->_seg_count = bvec->bv_len; fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); + fi->_next = folio_next(fi->folio); fi->_i = i; } @@ -290,9 +292,10 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) { fi->_seg_count -= fi->length; if (fi->_seg_count) { - fi->folio = folio_next(fi->folio); + fi->folio = fi->_next; fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); + fi->_next = folio_next(fi->folio); } else if (fi->_i + 1 < bio->bi_vcnt) { bio_first_folio(fi, bio, fi->_i + 1); } else { From b9ff43dd27434dbd850b908e2e0e1f6e794efd9b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 27 Apr 2022 17:01:28 -0400 Subject: [PATCH 083/179] mm/readahead: Fix readahead with large folios Reading 100KB chunks from a big file (eg dd bs=100K) leads to poor readahead behaviour. Studying the traces in detail, I noticed two problems. The first is that we were setting the readahead flag on the folio which contains the last byte read from the block. This is wrong because we will trigger readahead at the end of the read without waiting to see if a subsequent read is going to use the pages we just read. Instead, we need to set the readahead flag on the first folio _after_ the one which contains the last byte that we're reading. 
The second is that we were looking for the index of the folio with the readahead flag set to exactly match the start + size - async_size. If we've rounded this, either down (as previously) or up (as now), we'll think we hit a folio marked as readahead by a different read, and try to read the wrong pages. So round the expected index to the order of the folio we hit. Reported-by: Guo Xuenan Signed-off-by: Matthew Wilcox (Oracle) --- mm/readahead.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 8e3775829513..4a60cdb64262 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -474,7 +474,8 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (!folio) return -ENOMEM; - if (mark - index < (1UL << order)) + mark = round_up(mark, 1UL << order); + if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) @@ -555,8 +556,9 @@ static void ondemand_readahead(struct readahead_control *ractl, struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - unsigned long index = readahead_index(ractl); - pgoff_t prev_index; + pgoff_t index = readahead_index(ractl); + pgoff_t expected, prev_index; + unsigned int order = folio ? folio_order(folio) : 0; /* * If the request exceeds the readahead window, allow the read to @@ -575,8 +577,9 @@ static void ondemand_readahead(struct readahead_control *ractl, * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. 
*/ - if ((index == (ra->start + ra->size - ra->async_size) || - index == (ra->start + ra->size))) { + expected = round_up(ra->start + ra->size - ra->async_size, + 1UL << order); + if (index == expected || index == (ra->start + ra->size)) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -662,7 +665,7 @@ readit: } ractl->_index = ra->start; - page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0); + page_cache_ra_order(ractl, ra, order); } void page_cache_sync_ra(struct readahead_control *ractl, From 2d3535ed2c73fee356160aed40714b27be07442a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 2 May 2022 11:34:16 +0200 Subject: [PATCH 084/179] MAINTAINERS: update the GPIO git tree entry My git tree has become the de facto main GPIO tree. Update the MAINTAINERS file to reflect that. Signed-off-by: Bartosz Golaszewski Reported-by: Baruch Siach Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index edc96cdb85e8..9d47c5e7c6ae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8385,7 +8385,7 @@ M: Linus Walleij M: Bartosz Golaszewski L: linux-gpio@vger.kernel.org S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git F: Documentation/ABI/obsolete/sysfs-gpio F: Documentation/ABI/testing/gpio-cdev F: Documentation/admin-guide/gpio/ From 8707898e22fd665bc1d7b18b809be4b56ce25bdd Mon Sep 17 00:00:00 2001 From: Thomas Pfaff Date: Mon, 2 May 2022 13:28:29 +0200 Subject: [PATCH 085/179] genirq: Synchronize interrupt thread startup A kernel hang can be observed when running setserial in a loop on a kernel with force threaded interrupts. 
The sequence of events is: setserial open("/dev/ttyXXX") request_irq() do_stuff() -> serial interrupt -> wake(irq_thread) desc->threads_active++; close() free_irq() kthread_stop(irq_thread) synchronize_irq() <- hangs because desc->threads_active != 0 The thread is created in request_irq() and woken up, but does not get on a CPU to reach the actual thread function, which would handle the pending wake-up. kthread_stop() sets the should-stop condition which makes the thread immediately exit, which in turn leaves the stale threads_active count around. This problem was introduced with commit 519cc8652b3a, which addressed an interrupt sharing issue in the PCIe code. Before that commit free_irq() invoked synchronize_irq(), which waits for the hard interrupt handler and also for associated threads to complete. To address the PCIe issue synchronize_irq() was replaced with __synchronize_hardirq(), which only waits for the hard interrupt handler to complete, but not for threaded handlers. This was done under the assumption that the interrupt thread already reached the thread function and waits for a wake-up, which is guaranteed to be handled before acting on the stop condition. The problematic case, that the thread would not reach the thread function, was obviously overlooked. Make sure that the interrupt thread is really started and reaches thread_fn() before returning from __setup_irq(). This utilizes the existing wait queue in the interrupt descriptor. The wait queue is unused for non-shared interrupts. For shared interrupts the usage might cause a spurious wake-up of a waiter in synchronize_irq() or the completion of a threaded handler might cause a spurious wake-up of the waiter for the ready flag. Both are harmless and have no functional impact.
[ tglx: Amended changelog ] Fixes: 519cc8652b3a ("genirq: Synchronize only with single thread on free_irq()") Signed-off-by: Thomas Pfaff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/552fe7b4-9224-b183-bb87-a8f36d335690@pcs.com --- kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 2 ++ kernel/irq/manage.c | 39 +++++++++++++++++++++++++++++---------- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99cbdf55a8bd..f09c60393e55 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -29,12 +29,14 @@ extern struct irqaction chained_action; * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded + * IRQTF_READY - signals that irq thread is ready */ enum { IRQTF_RUNTHREAD, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, + IRQTF_READY, }; /* diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 939d21cd55c3..0099b87dd853 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -407,6 +407,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); + init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); @@ -575,6 +576,7 @@ int __init early_irq_init(void) raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); mutex_init(&desc[i].request_mutex); + init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c03f71d5ec10..e3e245a4fd70 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1248,6 
+1248,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) raw_spin_unlock_irq(&desc->lock); } +/* + * Internal function to notify that a interrupt thread is ready. + */ +static void irq_thread_set_ready(struct irq_desc *desc, + struct irqaction *action) +{ + set_bit(IRQTF_READY, &action->thread_flags); + wake_up(&desc->wait_for_threads); +} + +/* + * Internal function to wake up a interrupt thread and wait until it is + * ready. + */ +static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, + struct irqaction *action) +{ + if (!action || !action->thread) + return; + + wake_up_process(action->thread); + wait_event(desc->wait_for_threads, + test_bit(IRQTF_READY, &action->thread_flags)); +} + /* * Interrupt handler thread */ @@ -1259,6 +1284,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irq_thread_set_ready(desc, action); + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, @@ -1683,8 +1710,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - init_waitqueue_head(&desc->wait_for_threads); - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, @@ -1780,14 +1805,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_setup_timings(desc, new); - /* - * Strictly no need to wake it up, but hung_task complains - * when no hard interrupt wakes the thread up. 
- */ - if (new->thread) - wake_up_process(new->thread); - if (new->secondary) - wake_up_process(new->secondary->thread); + wake_up_and_wait_for_irq_thread_ready(desc, new); + wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; From 171865dab096da1ab980a32eeea5d1b88cd7bc50 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Thu, 21 Apr 2022 18:42:28 +0900 Subject: [PATCH 086/179] gpio: visconti: Fix fwnode of GPIO IRQ The fwnode of GPIO IRQ must be set to its own fwnode, not the fwnode of the parent IRQ. Therefore, this sets own fwnode instead of the parent IRQ fwnode to GPIO IRQ's. Fixes: 2ad74f40dacc ("gpio: visconti: Add Toshiba Visconti GPIO support") Signed-off-by: Nobuhiro Iwamatsu Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-visconti.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-visconti.c b/drivers/gpio/gpio-visconti.c index 47455810bdb9..e6534ea1eaa7 100644 --- a/drivers/gpio/gpio-visconti.c +++ b/drivers/gpio/gpio-visconti.c @@ -130,7 +130,6 @@ static int visconti_gpio_probe(struct platform_device *pdev) struct gpio_irq_chip *girq; struct irq_domain *parent; struct device_node *irq_parent; - struct fwnode_handle *fwnode; int ret; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -150,14 +149,12 @@ static int visconti_gpio_probe(struct platform_device *pdev) } parent = irq_find_host(irq_parent); + of_node_put(irq_parent); if (!parent) { dev_err(dev, "No IRQ parent domain\n"); return -ENODEV; } - fwnode = of_node_to_fwnode(irq_parent); - of_node_put(irq_parent); - ret = bgpio_init(&priv->gpio_chip, dev, 4, priv->base + GPIO_IDATA, priv->base + GPIO_OSET, @@ -180,7 +177,7 @@ static int visconti_gpio_probe(struct platform_device *pdev) girq = &priv->gpio_chip.irq; girq->chip = irq_chip; - girq->fwnode = fwnode; + girq->fwnode = of_node_to_fwnode(dev->of_node); girq->parent_domain = parent; 
girq->child_to_parent_hwirq = visconti_gpio_child_to_parent_hwirq; girq->populate_parent_alloc_arg = visconti_gpio_populate_parent_fwspec; From 2685027fca387b602ae565bff17895188b803988 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 27 Apr 2022 10:54:28 -0400 Subject: [PATCH 087/179] cgroup/cpuset: Remove cpus_allowed/mems_allowed setup in cpuset_init_smp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 3 places where the cpu and node masks of the top cpuset can be initialized in the order they are executed: 1) start_kernel -> cpuset_init() 2) start_kernel -> cgroup_init() -> cpuset_bind() 3) kernel_init_freeable() -> do_basic_setup() -> cpuset_init_smp() The first cpuset_init() call just sets all the bits in the masks. The second cpuset_bind() call sets cpus_allowed and mems_allowed to the default v2 values. The third cpuset_init_smp() call sets them back to v1 values. For systems with cgroup v2 setup, cpuset_bind() is called once. As a result, cpu and memory node hot add may fail to update the cpu and node masks of the top cpuset to include the newly added cpu or node in a cgroup v2 environment. For systems with cgroup v1 setup, cpuset_bind() is called again by rebind_subsystem() when the v1 cpuset filesystem is mounted as shown in the dmesg log below with an instrumented kernel. [ 2.609781] cpuset_bind() called - v2 = 1 [ 3.079473] cpuset_init_smp() called [ 7.103710] cpuset_bind() called - v2 = 0 smp_init() is called after the first two init functions. So we don't have a complete list of active cpus and memory nodes until later in cpuset_init_smp() which is the right time to set up effective_cpus and effective_mems. To fix this cgroup v2 mask setup problem, the potentially incorrect cpus_allowed & mems_allowed setting in cpuset_init_smp() are removed. For cgroup v2 systems, the initial cpuset_bind() call will set the masks correctly. 
For cgroup v1 systems, the second call to cpuset_bind() will do the right setup. cc: stable@vger.kernel.org Signed-off-by: Waiman Long Tested-by: Feng Tang Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9390bfd9f1cd..71a418858a5e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3390,8 +3390,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + /* + * cpus_allowd/mems_allowed set to v2 values in the initial + * cpuset_bind() call will be reset to v1 values in another + * cpuset_bind() call when v1 cpuset is mounted. + */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); From 9f73f1aef98b2fa7252c0a89be64840271ce8ea0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 1 Apr 2022 15:29:37 +0800 Subject: [PATCH 088/179] btrfs: force v2 space cache usage for subpage mount [BUG] For a 4K sector sized btrfs with v1 cache enabled and only mounted on systems with 4K page size, if it's mounted on subpage (64K page size) systems, it can cause the following warning on v1 space cache: BTRFS error (device dm-1): csum mismatch on free space cache BTRFS warning (device dm-1): failed to load free space cache for block group 84082688, rebuilding it now Although not a big deal, as kernel can rebuild it without problem, such warning will bother end users, especially if they want to switch the same btrfs seamlessly between different page sized systems. [CAUSE] V1 free space cache is still using fixed PAGE_SIZE for various bitmap, like BITS_PER_BITMAP. Such hard-coded PAGE_SIZE usage will cause various mismatch, from v1 cache size to checksum. 
Thus kernel will always reject v1 cache with a different PAGE_SIZE with csum mismatch. [FIX] Although we should fix v1 cache, it's already going to be marked deprecated soon. And we have v2 cache based on metadata (which is already fully subpage compatible), and it has almost everything superior than v1 cache. So just force subpage mount to use v2 cache on mount. Reported-by: Matt Corallo CC: stable@vger.kernel.org # 5.15+ Link: https://lore.kernel.org/linux-btrfs/61aa27d1-30fc-c1a9-f0f4-9df544395ec3@bluematt.me/ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20e70eb88465..3e0acc362233 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3657,6 +3657,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); From 549577127afeb759c29cdeeff1bdccd86e6c0dbf Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 3 May 2022 14:10:04 -0700 Subject: [PATCH 089/179] btrfs: zoned: move non-changing condition check out of the loop btrfs_zone_activate() checks if block_group->alloc_offset == block_group->zone_capacity every time it iterates the loop. But, it is not depending on the index. Move out the check and do it only once. 
Fixes: f9a912a3c45f ("btrfs: zoned: make zone activation multi stripe capable") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1b1b310c3c51..b5eb794c1e23 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1835,6 +1835,12 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; physical = map->stripes[i].physical; @@ -1842,12 +1848,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) if (device->zone_info->max_active_zones == 0) continue; - /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { - ret = false; - goto out_unlock; - } - if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; From ceb4f60830a7cd38ff47b7dd54e9e06ddbaf413c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 3 May 2022 14:10:05 -0700 Subject: [PATCH 090/179] btrfs: zoned: activate block group properly on unlimited active zone device btrfs_zone_activate() checks if it activated all the underlying zones in the loop. However, that check is never hit on an unlimited active zone device (max_active_zones == 0). Fortunately, it still works without ENOSPC because btrfs_zone_activate() returns true in the end, even if block_group->zone_is_active == 0. But, it is confusing to have non zone_is_active block group still usable for allocation. Also, we are wasting CPU time to iterate the loop every time btrfs_zone_activate() is called for the block groups. Since error case in the loop is handled by out_unlock, we can just set zone_is_active and do the list stuff after the loop.
Fixes: f9a912a3c45f ("btrfs: zoned: make zone activation multi stripe capable") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5eb794c1e23..d31b0eda210f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1853,24 +1853,18 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) ret = false; goto out_unlock; } - - /* Successfully activated all the zones */ - if (i == map->num_stripes - 1) - block_group->zone_is_active = 1; - - } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; spin_unlock(&block_group->lock); - if (block_group->zone_is_active) { - /* For the active block group list */ - btrfs_get_block_group(block_group); + /* For the active block group list */ + btrfs_get_block_group(block_group); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&block_group->active_bg_list, - &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); return true; From 750ee454908e90a8792b1e2b157c2948da86e926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 May 2022 11:57:02 +0100 Subject: [PATCH 091/179] btrfs: fix assertion failure when logging directory key range item When inserting a key range item (BTRFS_DIR_LOG_INDEX_KEY) while logging a directory, we don't expect the insertion to fail with -EEXIST, because we are holding the directory's log_mutex and we have dropped all existing BTRFS_DIR_LOG_INDEX_KEY keys from the log tree before we started to log the directory. 
However it's possible that during the logging we attempt to insert the same BTRFS_DIR_LOG_INDEX_KEY key twice, but for this to happen we need to race with insertions of items from other inodes in the subvolume's tree while we are logging a directory. Here's how this can happen: 1) We are logging a directory with inode number 1000 that has its items spread across 3 leaves in the subvolume's tree: leaf A - has index keys from the range 2 to 20 for example. The last item in the leaf corresponds to a dir item for index number 20. All these dir items were created in a past transaction. leaf B - has index keys from the range 22 to 100 for example. It has no keys from other inodes, all its keys are dir index keys for our directory inode number 1000. Its first key is for the dir item with a sequence number of 22. All these dir items were also created in a past transaction. leaf C - has index keys for our directory for the range 101 to 120 for example. This leaf also has items from other inodes, and its first item corresponds to the dir item for index number 101 for our directory with inode number 1000; 2) When we finish processing the items from leaf A at log_dir_items(), we log a BTRFS_DIR_LOG_INDEX_KEY key with an offset of 21 and a last offset of 21, meaning the log is authoritative for the index range from 21 to 21 (a single sequence number). At this point leaf B was not yet modified in the current transaction; 3) When we return from log_dir_items() we have released our read lock on leaf B, and have set *last_offset_ret to 21 (index number of the first item on leaf B minus 1); 4) Some other task inserts an item for other inode (inode number 1001 for example) into leaf C. 
That resulted in pushing some items from leaf C into leaf B, in order to make room for the new item, so now leaf B has dir index keys for the sequence number range from 22 to 102 and leaf C has the dir items for the sequence number range 103 to 120; 5) At log_directory_changes() we call log_dir_items() again, passing it a 'min_offset' / 'min_key' value of 22 (*last_offset_ret from step 3 plus 1, so 21 + 1). Then btrfs_search_forward() leaves us at slot 0 of leaf B, since leaf B was modified in the current transaction. We have also initialized 'last_old_dentry_offset' to 20 after calling btrfs_previous_item() at log_dir_items(), as it left us at the last item of leaf A, which refers to the dir item with sequence number 20; 6) We then call process_dir_items_leaf() to process the dir items of leaf B, and when we process the first item, corresponding to slot 0, sequence number 22, we notice the dir item was created in a past transaction and its sequence number is greater than the value of *last_old_dentry_offset + 1 (20 + 1), so we decide to log again a BTRFS_DIR_LOG_INDEX_KEY key with an offset of 21 and an end range of 21 (key.offset - 1 == 22 - 1 == 21), which results in an -EEXIST error from insert_dir_log_key(), as we have already inserted that key at step 2, triggering the assertion at process_dir_items_leaf(). The trace produced in dmesg is like the following: assertion failed: ret != -EEXIST, in fs/btrfs/tree-log.c:3857 [198255.980839][ T7460] ------------[ cut here ]------------ [198255.981666][ T7460] kernel BUG at fs/btrfs/ctree.h:3617! [198255.983141][ T7460] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI [198255.984080][ T7460] CPU: 0 PID: 7460 Comm: repro-ghost-dir Not tainted 5.18.0-5314c78ac373-misc-next+ [198255.986027][ T7460] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 [198255.988600][ T7460] RIP: 0010:assertfail.constprop.0+0x1c/0x1e [198255.989465][ T7460] Code: 8b 4c 89 (...) 
[198255.992599][ T7460] RSP: 0018:ffffc90007387188 EFLAGS: 00010282 [198255.993414][ T7460] RAX: 000000000000003d RBX: 0000000000000065 RCX: 0000000000000000 [198255.996056][ T7460] RDX: 0000000000000001 RSI: ffffffff8b62b180 RDI: fffff52000e70e24 [198255.997668][ T7460] RBP: ffffc90007387188 R08: 000000000000003d R09: ffff8881f0e16507 [198255.999199][ T7460] R10: ffffed103e1c2ca0 R11: 0000000000000001 R12: 00000000ffffffef [198256.000683][ T7460] R13: ffff88813befc630 R14: ffff888116c16e70 R15: ffffc90007387358 [198256.007082][ T7460] FS: 00007fc7f7c24640(0000) GS:ffff8881f0c00000(0000) knlGS:0000000000000000 [198256.009939][ T7460] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [198256.014133][ T7460] CR2: 0000560bb16d0b78 CR3: 0000000140b34005 CR4: 0000000000170ef0 [198256.015239][ T7460] Call Trace: [198256.015674][ T7460] [198256.016313][ T7460] log_dir_items.cold+0x16/0x2c [198256.018858][ T7460] ? replay_one_extent+0xbf0/0xbf0 [198256.025932][ T7460] ? release_extent_buffer+0x1d2/0x270 [198256.029658][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.031114][ T7460] ? lock_acquired+0xbe/0x660 [198256.032633][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.034386][ T7460] ? lock_release+0xcf/0x8a0 [198256.036152][ T7460] log_directory_changes+0xf9/0x170 [198256.036993][ T7460] ? log_dir_items+0xba0/0xba0 [198256.037661][ T7460] ? do_raw_write_unlock+0x7d/0xe0 [198256.038680][ T7460] btrfs_log_inode+0x233b/0x26d0 [198256.041294][ T7460] ? log_directory_changes+0x170/0x170 [198256.042864][ T7460] ? btrfs_attach_transaction_barrier+0x60/0x60 [198256.045130][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.046568][ T7460] ? lock_release+0xcf/0x8a0 [198256.047504][ T7460] ? lock_downgrade+0x420/0x420 [198256.048712][ T7460] ? ilookup5_nowait+0x81/0xa0 [198256.049747][ T7460] ? lock_downgrade+0x420/0x420 [198256.050652][ T7460] ? do_raw_spin_unlock+0xa9/0x100 [198256.051618][ T7460] ? __might_resched+0x128/0x1c0 [198256.052511][ T7460] ? 
__might_sleep+0x66/0xc0 [198256.053442][ T7460] ? __kasan_check_read+0x11/0x20 [198256.054251][ T7460] ? iget5_locked+0xbd/0x150 [198256.054986][ T7460] ? run_delayed_iput_locked+0x110/0x110 [198256.055929][ T7460] ? btrfs_iget+0xc7/0x150 [198256.056630][ T7460] ? btrfs_orphan_cleanup+0x4a0/0x4a0 [198256.057502][ T7460] ? free_extent_buffer+0x13/0x20 [198256.058322][ T7460] btrfs_log_inode+0x2654/0x26d0 [198256.059137][ T7460] ? log_directory_changes+0x170/0x170 [198256.060020][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.060930][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.061905][ T7460] ? lock_contended+0x770/0x770 [198256.062682][ T7460] ? btrfs_log_inode_parent+0xd04/0x1750 [198256.063582][ T7460] ? lock_downgrade+0x420/0x420 [198256.064432][ T7460] ? preempt_count_sub+0x18/0xc0 [198256.065550][ T7460] ? __mutex_lock+0x580/0xdc0 [198256.066654][ T7460] ? stack_trace_save+0x94/0xc0 [198256.068008][ T7460] ? __kasan_check_write+0x14/0x20 [198256.072149][ T7460] ? __mutex_unlock_slowpath+0x12a/0x430 [198256.073145][ T7460] ? mutex_lock_io_nested+0xcd0/0xcd0 [198256.074341][ T7460] ? wait_for_completion_io_timeout+0x20/0x20 [198256.075345][ T7460] ? lock_downgrade+0x420/0x420 [198256.076142][ T7460] ? lock_contended+0x770/0x770 [198256.076939][ T7460] ? do_raw_spin_lock+0x1c0/0x1c0 [198256.078401][ T7460] ? btrfs_sync_file+0x5e6/0xa40 [198256.080598][ T7460] btrfs_log_inode_parent+0x523/0x1750 [198256.081991][ T7460] ? wait_current_trans+0xc8/0x240 [198256.083320][ T7460] ? lock_downgrade+0x420/0x420 [198256.085450][ T7460] ? btrfs_end_log_trans+0x70/0x70 [198256.086362][ T7460] ? rcu_read_lock_sched_held+0x16/0x80 [198256.087544][ T7460] ? lock_release+0xcf/0x8a0 [198256.088305][ T7460] ? lock_downgrade+0x420/0x420 [198256.090375][ T7460] ? dget_parent+0x8e/0x300 [198256.093538][ T7460] ? do_raw_spin_lock+0x1c0/0x1c0 [198256.094918][ T7460] ? lock_downgrade+0x420/0x420 [198256.097815][ T7460] ? 
do_raw_spin_unlock+0xa9/0x100 [198256.101822][ T7460] ? dget_parent+0xb7/0x300 [198256.103345][ T7460] btrfs_log_dentry_safe+0x48/0x60 [198256.105052][ T7460] btrfs_sync_file+0x629/0xa40 [198256.106829][ T7460] ? start_ordered_ops.constprop.0+0x120/0x120 [198256.109655][ T7460] ? __fget_files+0x161/0x230 [198256.110760][ T7460] vfs_fsync_range+0x6d/0x110 [198256.111923][ T7460] ? start_ordered_ops.constprop.0+0x120/0x120 [198256.113556][ T7460] __x64_sys_fsync+0x45/0x70 [198256.114323][ T7460] do_syscall_64+0x5c/0xc0 [198256.115084][ T7460] ? syscall_exit_to_user_mode+0x3b/0x50 [198256.116030][ T7460] ? do_syscall_64+0x69/0xc0 [198256.116768][ T7460] ? do_syscall_64+0x69/0xc0 [198256.117555][ T7460] ? do_syscall_64+0x69/0xc0 [198256.118324][ T7460] ? sysvec_call_function_single+0x57/0xc0 [198256.119308][ T7460] ? asm_sysvec_call_function_single+0xa/0x20 [198256.120363][ T7460] entry_SYSCALL_64_after_hwframe+0x44/0xae [198256.121334][ T7460] RIP: 0033:0x7fc7fe97b6ab [198256.122067][ T7460] Code: 0f 05 48 (...) [198256.125198][ T7460] RSP: 002b:00007fc7f7c23950 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [198256.126568][ T7460] RAX: ffffffffffffffda RBX: 00007fc7f7c239f0 RCX: 00007fc7fe97b6ab [198256.127942][ T7460] RDX: 0000000000000002 RSI: 000056167536bcf0 RDI: 0000000000000004 [198256.129302][ T7460] RBP: 0000000000000004 R08: 0000000000000000 R09: 000000007ffffeb8 [198256.130670][ T7460] R10: 00000000000001ff R11: 0000000000000293 R12: 0000000000000001 [198256.132046][ T7460] R13: 0000561674ca8140 R14: 00007fc7f7c239d0 R15: 000056167536dab8 [198256.133403][ T7460] Fix this by treating -EEXIST as expected at insert_dir_log_key() and have it update the item with an end offset corresponding to the maximum between the previously logged end offset and the new requested end offset. 
The end offsets may be different due to dir index key deletions that happened as part of unlink operations while we are logging a directory (triggered when fsyncing some other inode parented by the directory) or during renames which always attempt to log a single dir index deletion. Reported-by: Zygo Blaxell Link: https://lore.kernel.org/linux-btrfs/YmyefE9mc2xl5ZMz@hungrycats.org/ Fixes: 732d591a5d6c12 ("btrfs: stop copying old dir items when logging a directory") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 11399c8eed87..e65633686378 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3721,11 +3721,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. 
+ */ + last_offset = max(last_offset, curr_end); + } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); @@ -3849,13 +3867,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, ret = insert_dir_log_key(trans, log, dst_path, ino, *last_old_dentry_offset + 1, key.offset - 1); - /* - * -EEXIST should never happen because when we - * log a directory in full mode (LOG_INODE_ALL) - * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from - * the log tree. - */ - ASSERT(ret != -EEXIST); if (ret < 0) return ret; } @@ -7031,12 +7042,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or - * still has a dentry in the old directory. So take the old - * directory's log_mutex to prevent getting an -EEXIST when - * logging a key to record the deletion, or having that other - * task logging the old directory get an -EEXIST if it attempts - * to log the same key after we just did it. In both cases that - * would result in falling back to a transaction commit. + * still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in progress directory logging. 
*/ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), From 3e1ad196385c65c1454aceab1226d9a4baca27d5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 3 May 2022 17:35:25 +0200 Subject: [PATCH 092/179] btrfs: sysfs: export the balance paused state of exclusive operation The new state allowing device addition with paused balance is not exported to user space so it can't recognize it and actually start the operation. Fixes: efc0e69c2fea ("btrfs: introduce exclusive operation BALANCE_PAUSED state") CC: stable@vger.kernel.org # 5.17 Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 17389a42a3ab..ba78ca5aabbb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -922,6 +922,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; From 6997fbd7a3dafa754f81d541498ace35b43246d8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 5 May 2022 10:53:53 +0900 Subject: [PATCH 093/179] net: rds: use maybe_get_net() when acquiring refcount on TCP sockets Eric Dumazet is reporting addition on 0 problem at rds_tcp_tune(), for delayed works queued in rds_wq might be invoked after a net namespace's refcount already reached 0. Since rds_tcp_exit_net() from cleanup_net() calls flush_workqueue(rds_wq), it is guaranteed that we can instead use maybe_get_net() from delayed work functions until rds_tcp_exit_net() returns. Note that I'm not convinced that all works which might access a net namespace are already queued in rds_wq by the moment rds_tcp_exit_net() calls flush_workqueue(rds_wq). 
If some race is there, rds_tcp_exit_net() will fail to wait for work functions, and kmem_cache_free() could be called from net_free() before maybe_get_net() is called from rds_tcp_tune(). Reported-by: Eric Dumazet Fixes: 3a58f13a881ed351 ("net: rds: acquire refcount on TCP sockets") Signed-off-by: Tetsuo Handa Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/41d09faf-bc78-1a87-dfd1-c6d1b5984b61@I-love.SAKURA.ne.jp Signed-off-by: Jakub Kicinski --- net/rds/tcp.c | 12 +++++++++--- net/rds/tcp.h | 2 +- net/rds/tcp_connect.c | 5 ++++- net/rds/tcp_listen.c | 5 ++++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 2f638f8b7b1e..73ee2771093d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -487,11 +487,11 @@ struct rds_tcp_net { /* All module specific customizations to the RDS-TCP socket should be done in * rds_tcp_tune() and applied after socket creation. */ -void rds_tcp_tune(struct socket *sock) +bool rds_tcp_tune(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct rds_tcp_net *rtn; tcp_sock_set_nodelay(sock->sk); lock_sock(sk); @@ -499,10 +499,15 @@ void rds_tcp_tune(struct socket *sock) * a process which created this net namespace terminated. 
*/ if (!sk->sk_net_refcnt) { + if (!maybe_get_net(net)) { + release_sock(sk); + return false; + } sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); } + rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; sk->sk_userlocks |= SOCK_SNDBUF_LOCK; @@ -512,6 +517,7 @@ void rds_tcp_tune(struct socket *sock) sk->sk_userlocks |= SOCK_RCVBUF_LOCK; } release_sock(sk); + return true; } static void rds_tcp_accept_worker(struct work_struct *work) diff --git a/net/rds/tcp.h b/net/rds/tcp.h index dc8d745d6857..f8b5930d7b34 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -49,7 +49,7 @@ struct rds_tcp_statistics { }; /* tcp.c */ -void rds_tcp_tune(struct socket *sock); +bool rds_tcp_tune(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 5461d77fff4f..f0c477c5d1db 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -124,7 +124,10 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) if (ret < 0) goto out; - rds_tcp_tune(sock); + if (!rds_tcp_tune(sock)) { + ret = -EINVAL; + goto out; + } if (isv6) { sin6.sin6_family = AF_INET6; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 09cadd556d1e..7edf2e69d3fe 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -133,7 +133,10 @@ int rds_tcp_accept_one(struct socket *sock) __module_get(new_sock->ops->owner); rds_tcp_keepalive(new_sock); - rds_tcp_tune(new_sock); + if (!rds_tcp_tune(new_sock)) { + ret = -EINVAL; + goto out; + } inet = inet_sk(new_sock->sk); From e333eed63a091a09bd0db191b7710c594c6e995b Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 4 May 2022 11:31:03 -0300 Subject: [PATCH 094/179] 
net: phy: micrel: Do not use kszphy_suspend/resume for KSZ8061 Since commit f1131b9c23fb ("net: phy: micrel: use kszphy_suspend()/kszphy_resume for irq aware devices") the following NULL pointer dereference is observed on a board with KSZ8061: # udhcpc -i eth0 udhcpc: started, v1.35.0 8<--- cut here --- Unable to handle kernel NULL pointer dereference at virtual address 00000008 pgd = f73cef4e [00000008] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Modules linked in: CPU: 0 PID: 196 Comm: ifconfig Not tainted 5.15.37-dirty #94 Hardware name: Freescale i.MX6 SoloX (Device Tree) PC is at kszphy_config_reset+0x10/0x114 LR is at kszphy_resume+0x24/0x64 ... The KSZ8061 phy_driver structure does not have the .probe/.driver_data fields, which means that priv is not allocated. This causes the NULL pointer dereference inside kszphy_config_reset(). Fix the problem by using the generic suspend/resume functions as before. Another alternative would be to provide the .probe and .driver_data information into the structure, but to be on the safe side, let's just restore Ethernet functionality by using the generic suspend/resume. 
Cc: stable@vger.kernel.org Fixes: f1131b9c23fb ("net: phy: micrel: use kszphy_suspend()/kszphy_resume for irq aware devices") Signed-off-by: Fabio Estevam Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220504143104.1286960-1-festevam@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index fc53b71dc872..7c243cedde9f 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2782,8 +2782,8 @@ static struct phy_driver ksphy_driver[] = { .config_init = ksz8061_config_init, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, - .suspend = kszphy_suspend, - .resume = kszphy_resume, + .suspend = genphy_suspend, + .resume = genphy_resume, }, { .phy_id = PHY_ID_KSZ9021, .phy_id_mask = 0x000ffffe, From 15f03ffe4bb951e982457f44b6cf6b06ef4cbb93 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 4 May 2022 11:31:04 -0300 Subject: [PATCH 095/179] net: phy: micrel: Pass .probe for KS8737 Since commit f1131b9c23fb ("net: phy: micrel: use kszphy_suspend()/kszphy_resume for irq aware devices") the kszphy_suspend/ resume hooks are used. These functions require the probe function to be called so that priv can be allocated. Otherwise, a NULL pointer dereference happens inside kszphy_config_reset(). 
Cc: stable@vger.kernel.org Fixes: f1131b9c23fb ("net: phy: micrel: use kszphy_suspend()/kszphy_resume for irq aware devices") Reported-by: Andrew Lunn Signed-off-by: Fabio Estevam Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220504143104.1286960-2-festevam@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 7c243cedde9f..9d7dafed3931 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2657,6 +2657,7 @@ static struct phy_driver ksphy_driver[] = { .name = "Micrel KS8737", /* PHY_BASIC_FEATURES */ .driver_data = &ks8737_type, + .probe = kszphy_probe, .config_init = kszphy_config_init, .config_intr = kszphy_config_intr, .handle_interrupt = kszphy_handle_interrupt, From e1a7ac6f3ba6e157adcd0ca94d92a401f1943f56 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 4 May 2022 11:07:38 +0200 Subject: [PATCH 096/179] ping: fix address binding wrt vrf When ping_group_range is updated, 'ping' uses the DGRAM ICMP socket, instead of an IP raw socket. In this case, 'ping' is unable to bind its socket to a local address owned by a vrflite. 
Before the patch: $ sysctl -w net.ipv4.ping_group_range='0 2147483647' $ ip link add blue type vrf table 10 $ ip link add foo type dummy $ ip link set foo master blue $ ip link set foo up $ ip addr add 192.168.1.1/24 dev foo $ ip addr add 2001::1/64 dev foo $ ip vrf exec blue ping -c1 -I 192.168.1.1 192.168.1.2 ping: bind: Cannot assign requested address $ ip vrf exec blue ping6 -c1 -I 2001::1 2001::2 ping6: bind icmp socket: Cannot assign requested address CC: stable@vger.kernel.org Fixes: 1b69c6d0ae90 ("net: Introduce L3 Master device abstraction") Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 3ee947557b88..aa9a11b20d18 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -305,6 +305,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + u32 tb_id = RT_TABLE_LOCAL; int chk_addr_ret; if (addr_len < sizeof(*addr)) @@ -318,7 +319,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? 
: tb_id; + chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk), addr->sin_addr.s_addr, @@ -355,6 +357,14 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return -ENODEV; } } + + if (!dev && sk->sk_bound_dev_if) { + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + } has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, scoped); rcu_read_unlock(); From e71b7f1f44d3d88c677769c85ef0171caf9fc89f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 4 May 2022 11:07:39 +0200 Subject: [PATCH 097/179] selftests: add ping test with ping_group_range tuned The 'ping' utility is able to manage two kinds of sockets (raw or icmp), depending on the sysctl ping_group_range. By default, ping_group_range is set to '1 0', which forces ping to use an ip raw socket. Let's replay the ping tests by allowing 'ping' to use the ip icmp socket. After the previous patch, ipv4 test results are the same with both kinds of socket. For ipv6, there are a lot of new failures (the previous patch fixes only two cases). 
Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fcnal-test.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 47c4d4b4a44a..54701c8b0cd7 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -810,10 +810,16 @@ ipv4_ping() setup set_sysctl net.ipv4.raw_l3mdev_accept=1 2>/dev/null ipv4_ping_novrf + setup + set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null + ipv4_ping_novrf log_subsection "With VRF" setup "yes" ipv4_ping_vrf + setup "yes" + set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null + ipv4_ping_vrf } ################################################################################ @@ -2348,10 +2354,16 @@ ipv6_ping() log_subsection "No VRF" setup ipv6_ping_novrf + setup + set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null + ipv6_ping_novrf log_subsection "With VRF" setup "yes" ipv6_ping_vrf + setup "yes" + set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null + ipv6_ping_vrf } ################################################################################ From 85db6352fc8a158a893151baa1716463d34a20d0 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 4 May 2022 11:09:14 +0300 Subject: [PATCH 098/179] net: Fix features skip in for_each_netdev_feature() The find_next_netdev_feature() macro gets the "remaining length", not bit index. Passing "bit - 1" for the following iteration is wrong as it skips the adjacent bit. Pass "bit" instead. 
Fixes: 3b89ea9c5902 ("net: Fix for_each_netdev_feature on Big endian") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20220504080914.1918-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c6b9e416225..7c2d77d75a88 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -169,7 +169,7 @@ enum { #define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD) #define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP) -/* Finds the next feature with the highest number of the range of start till 0. +/* Finds the next feature with the highest number of the range of start-1 till 0. */ static inline int find_next_netdev_feature(u64 feature, unsigned long start) { @@ -188,7 +188,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) for ((bit) = find_next_netdev_feature((mask_addr), \ NETDEV_FEATURE_COUNT); \ (bit) >= 0; \ - (bit) = find_next_netdev_feature((mask_addr), (bit) - 1)) + (bit) = find_next_netdev_feature((mask_addr), (bit))) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ From 4e707344e18525b4edf5c2bc2e3eb60692e8c92e Mon Sep 17 00:00:00 2001 From: Jonathan Toppins Date: Wed, 4 May 2022 14:59:08 -0400 Subject: [PATCH 099/179] MAINTAINERS: add missing files for bonding definition The bonding entry did not include additional include files that have been added nor did it reference the documentation. Add these references for completeness. 
Signed-off-by: Jonathan Toppins Link: https://lore.kernel.org/r/903ed2906b93628b38a2015664a20d2802042863.1651690748.git.jtoppins@redhat.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index edc96cdb85e8..15037bd1ce48 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3571,8 +3571,9 @@ M: Andy Gospodarek L: netdev@vger.kernel.org S: Supported W: http://sourceforge.net/projects/bonding/ +F: Documentation/networking/bonding.rst F: drivers/net/bonding/ -F: include/net/bonding.h +F: include/net/bond* F: include/uapi/linux/if_bonding.h BOSCH SENSORTEC BMA400 ACCELEROMETER IIO DRIVER From 5b53a405e4658580e1faf7c217db3f55a21ba849 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 5 May 2022 16:17:29 +0200 Subject: [PATCH 100/179] s390/dasd: fix data corruption for ESE devices For ESE devices we get an error when accessing an unformatted track. The handling of this error will return zero data for read requests and format the track on demand before writing to it. To do this the code needs to distinguish between read and write requests. This is done with data from the blocklayer request. A pointer to the blocklayer request is stored in the CQR. If there is an error on the device an ERP request is built to do error recovery. While the ERP request is mostly a copy of the original CQR the pointer to the blocklayer request is not copied to not accidentally pass it back to the blocklayer without cleanup. This leads to the error that during ESE handling after an ERP request was built it is not possible to determine the IO direction. This leads to the formatting of a track for read requests which might in turn lead to data corruption. 
Fixes: 5e2b17e712cf ("s390/dasd: Add dynamic formatting support for ESE volumes") Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Link: https://lore.kernel.org/r/20220505141733.1989450-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 8 +++++++- drivers/s390/block/dasd_eckd.c | 2 +- drivers/s390/block/dasd_int.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 8e87a31e329d..76d13c5ff205 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1639,6 +1639,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, unsigned long now; int nrf_suppressed = 0; int fp_suppressed = 0; + struct request *req; u8 *sense = NULL; int expires; @@ -1739,7 +1740,12 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, } if (dasd_ese_needs_format(cqr->block, irb)) { - if (rq_data_dir((struct request *)cqr->callback_data) == READ) { + req = dasd_get_callback_data(cqr); + if (!req) { + cqr->status = DASD_CQR_ERROR; + return; + } + if (rq_data_dir(req) == READ) { device->discipline->ese_read(cqr, irb); cqr->status = DASD_CQR_SUCCESS; cqr->stopclk = now; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 8410a25a65c1..e3583502aca2 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3145,7 +3145,7 @@ dasd_eckd_ese_format(struct dasd_device *startdev, struct dasd_ccw_req *cqr, sector_t curr_trk; int rc; - req = cqr->callback_data; + req = dasd_get_callback_data(cqr); block = cqr->block; base = block->base; private = base->private; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 3b7af00a7825..07f9670ea61e 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -756,6 +756,18 @@ dasd_check_blocksize(int bsize) return 0; } +/* + * return the callback data 
of the original request in case there are + * ERP requests build on top of it + */ +static inline void *dasd_get_callback_data(struct dasd_ccw_req *cqr) +{ + while (cqr->refers) + cqr = cqr->refers; + + return cqr->callback_data; +} + /* externals in dasd.c */ #define DASD_PROFILE_OFF 0 #define DASD_PROFILE_ON 1 From 71f3871657370dbbaf942a1c758f64e49a36c70f Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 5 May 2022 16:17:30 +0200 Subject: [PATCH 101/179] s390/dasd: prevent double format of tracks for ESE devices For ESE devices we get an error for write operations on an unformatted track. Afterwards the track will be formatted and the IO operation restarted. When using alias devices a track might be accessed by multiple requests simultaneously and there is a race window that a track gets formatted twice resulting in data loss. Prevent this by remembering the amount of formatted tracks when starting a request and comparing this number before actually formatting a track on the fly. If the number has changed there is a chance that the current track was finally formatted in between. As a result do not format the track and restart the current IO to check. The number of formatted tracks does not match the overall number of formatted tracks on the device and it might wrap around but this is no problem. It is only needed to recognize that a track has been formatted at all in between. 
Fixes: 5e2b17e712cf ("s390/dasd: Add dynamic formatting support for ESE volumes") Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Link: https://lore.kernel.org/r/20220505141733.1989450-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 7 +++++++ drivers/s390/block/dasd_eckd.c | 19 +++++++++++++++++-- drivers/s390/block/dasd_int.h | 2 ++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 76d13c5ff205..d62a4c673962 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1422,6 +1422,13 @@ int dasd_start_IO(struct dasd_ccw_req *cqr) if (!cqr->lpm) cqr->lpm = dasd_path_get_opm(device); } + /* + * remember the amount of formatted tracks to prevent double format on + * ESE devices + */ + if (cqr->block) + cqr->trkcount = atomic_read(&cqr->block->trkcount); + if (cqr->cpmode == 1) { rc = ccw_device_tm_start(device->cdev, cqr->cpaddr, (long) cqr, cqr->lpm); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index e3583502aca2..649eba51e048 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3083,13 +3083,24 @@ static int dasd_eckd_format_device(struct dasd_device *base, } static bool test_and_set_format_track(struct dasd_format_entry *to_format, - struct dasd_block *block) + struct dasd_ccw_req *cqr) { + struct dasd_block *block = cqr->block; struct dasd_format_entry *format; unsigned long flags; bool rc = false; spin_lock_irqsave(&block->format_lock, flags); + if (cqr->trkcount != atomic_read(&block->trkcount)) { + /* + * The number of formatted tracks has changed after request + * start and we can not tell if the current track was involved. 
+ * To avoid data corruption treat it as if the current track is + * involved + */ + rc = true; + goto out; + } list_for_each_entry(format, &block->format_list, list) { if (format->track == to_format->track) { rc = true; @@ -3109,6 +3120,7 @@ static void clear_format_track(struct dasd_format_entry *format, unsigned long flags; spin_lock_irqsave(&block->format_lock, flags); + atomic_inc(&block->trkcount); list_del_init(&format->list); spin_unlock_irqrestore(&block->format_lock, flags); } @@ -3170,8 +3182,11 @@ dasd_eckd_ese_format(struct dasd_device *startdev, struct dasd_ccw_req *cqr, } format->track = curr_trk; /* test if track is already in formatting by another thread */ - if (test_and_set_format_track(format, block)) + if (test_and_set_format_track(format, cqr)) { + /* this is no real error so do not count down retries */ + cqr->retries++; return ERR_PTR(-EEXIST); + } fdata.start_unit = curr_trk; fdata.stop_unit = curr_trk; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 07f9670ea61e..83b918b84b4a 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -187,6 +187,7 @@ struct dasd_ccw_req { void (*callback)(struct dasd_ccw_req *, void *data); void *callback_data; unsigned int proc_bytes; /* bytes for partial completion */ + unsigned int trkcount; /* count formatted tracks */ }; /* @@ -610,6 +611,7 @@ struct dasd_block { struct list_head format_list; spinlock_t format_lock; + atomic_t trkcount; }; struct dasd_attention_data { From cd68c48ea15c85f1577a442dc4c285e112ff1b37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 5 May 2022 16:17:31 +0200 Subject: [PATCH 102/179] s390/dasd: Fix read for ESE with blksize < 4k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reading unformatted tracks on ESE devices, the corresponding memory areas are simply set to zero for each segment. This is done incorrectly for blocksizes < 4096. 
There are two problems. First, the increment of dst is done using the counter of the loop (off), which is increased by blksize every iteration. This leads to a much bigger increment for dst than actually intended. Second, the increment of dst is done before the memory area is set to 0, skipping a significant amount of bytes of memory. This leads to illegal overwriting of memory and ultimately to a kernel panic. This is not a problem with 4k blocksize because blk_queue_max_segment_size is set to PAGE_SIZE, always resulting in a single iteration for the inner segment loop (bv.bv_len == blksize). The incorrectly used 'off' value to increment dst is 0 and the correct memory area is used. In order to fix this for blksize < 4k, increment dst correctly using the blksize and only do it at the end of the loop. Fixes: 5e2b17e712cf ("s390/dasd: Add dynamic formatting support for ESE volumes") Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Jan Höppner Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20220505141733.1989450-4-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 649eba51e048..e46461b4d8a7 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3285,12 +3285,11 @@ static int dasd_eckd_ese_read(struct dasd_ccw_req *cqr, struct irb *irb) cqr->proc_bytes = blk_count * blksize; return 0; } - if (dst && !skip_block) { - dst += off; + if (dst && !skip_block) memset(dst, 0, blksize); - } else { + else skip_block--; - } + dst += blksize; blk_count++; } } From b9c10f68e23c13f56685559a0d6fdaca9f838324 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 5 May 2022 16:17:32 +0200 Subject: [PATCH 103/179] s390/dasd: Fix read inconsistency for ESE DASD devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Read requests that return with NRF error are partially completed in dasd_eckd_ese_read(). The function keeps track of the amount of processed bytes and the driver will eventually return this information back to the block layer for further processing via __dasd_cleanup_cqr() when the request is in the final stage of processing (from the driver's perspective). For this, blk_update_request() is used which requires the number of bytes to complete the request. As per documentation the nr_bytes parameter is described as follows: "number of bytes to complete for @req". This was mistakenly interpreted as "number of bytes _left_ for @req" leading to new requests with incorrect data length. The consequences are inconsistent and completely wrong read requests as data from random memory areas are read back. Fix this by correctly specifying the amount of bytes that should be used to complete the request. Fixes: 5e6bdd37c552 ("s390/dasd: fix data corruption for thin provisioned devices") Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Jan Höppner Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20220505141733.1989450-5-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index d62a4c673962..ba6d78789660 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2778,8 +2778,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) * complete a request partially.
*/ if (proc_bytes) { - blk_update_request(req, BLK_STS_OK, - blk_rq_bytes(req) - proc_bytes); + blk_update_request(req, BLK_STS_OK, proc_bytes); blk_mq_requeue_request(req, true); } else if (likely(!blk_should_fake_timeout(req->q))) { blk_mq_complete_request(req); From f1c8781ac9d87650ccf45a354c0bbfa3f9230371 Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Thu, 5 May 2022 16:17:33 +0200 Subject: [PATCH 104/179] s390/dasd: Use kzalloc instead of kmalloc/memset Use kzalloc rather than duplicating its implementation, which makes code simple and easy to understand. Signed-off-by: Haowen Bai Reviewed-by: Sven Schnelle Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20220505141733.1989450-6-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index e46461b4d8a7..836838f7d686 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1480,7 +1480,7 @@ static int dasd_eckd_pe_handler(struct dasd_device *device, { struct pe_handler_work_data *data; - data = kmalloc(sizeof(*data), GFP_ATOMIC | GFP_DMA); + data = kzalloc(sizeof(*data), GFP_ATOMIC | GFP_DMA); if (!data) { if (mutex_trylock(&dasd_pe_handler_mutex)) { data = pe_handler_worker; @@ -1488,9 +1488,6 @@ static int dasd_eckd_pe_handler(struct dasd_device *device, } else { return -ENOMEM; } - } else { - memset(data, 0, sizeof(*data)); - data->isglobal = 0; } INIT_WORK(&data->worker, do_pe_handler_work); dasd_get_device(device); From e1846cff2fe614d93a2f89461b5935678fd34bd9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 May 2022 02:54:59 +0300 Subject: [PATCH 105/179] net: mscc: ocelot: mark traps with a bool instead of keeping them in a list Since the blamed commit, VCAP filters can appear on more than one list. If their action is "trap", they are chained on ocelot->traps via filter->trap_list. 
This is in addition to their normal placement on the VCAP block->rules list head. Therefore, when we free a VCAP filter, we must remove it from all lists it is a member of, including ocelot->traps. There are at least 2 bugs which are direct consequences of this design decision. First is the incorrect usage of list_empty(), meant to denote whether "filter" is chained into ocelot->traps via filter->trap_list. This does not do the correct thing, because list_empty() checks whether "head->next == head", but in our case, head->next == head->prev == NULL. So we dereference NULL pointers and die when we call list_del(). Second is the fact that not all places that should remove the filter from ocelot->traps do so. One example is ocelot_vcap_block_remove_filter(), which is where we have the main kfree(filter). By keeping freed filters in ocelot->traps we end up in a use-after-free in felix_update_trapping_destinations(). Attempting to fix all the buggy patterns is a whack-a-mole game which makes the driver unmaintainable. Actually this is what the previous patch version attempted to do: https://patchwork.kernel.org/project/netdevbpf/patch/20220503115728.834457-3-vladimir.oltean@nxp.com/ but it introduced another set of bugs, because there are other places which create VCAP filters, not just ocelot_vcap_filter_create(): - ocelot_trap_add() - felix_tag_8021q_vlan_add_rx() - felix_tag_8021q_vlan_add_tx() Relying on the convention that all those code paths must call INIT_LIST_HEAD(&filter->trap_list) is not going to scale. So let's do what should have been done in the first place and keep a bool in struct ocelot_vcap_filter which denotes whether we are looking at a trapping rule or not. Iterating now happens over the main VCAP IS2 block->rules. The advantage is that we no longer risk having stale references to a freed filter, since it is only present in that list.
Fixes: e42bd4ed09aa ("net: mscc: ocelot: keep traps in a list") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 7 ++++++- drivers/net/ethernet/mscc/ocelot.c | 11 +++-------- drivers/net/ethernet/mscc/ocelot_flower.c | 4 +--- include/soc/mscc/ocelot_vcap.h | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 9e28219b223d..faccfb3f0158 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -403,6 +403,7 @@ static int felix_update_trapping_destinations(struct dsa_switch *ds, { struct ocelot *ocelot = ds->priv; struct felix *felix = ocelot_to_felix(ocelot); + struct ocelot_vcap_block *block_vcap_is2; struct ocelot_vcap_filter *trap; enum ocelot_mask_mode mask_mode; unsigned long port_mask; @@ -422,9 +423,13 @@ static int felix_update_trapping_destinations(struct dsa_switch *ds, /* We are sure that "cpu" was found, otherwise * dsa_tree_setup_default_cpu() would have failed earlier. */ + block_vcap_is2 = &ocelot->block[VCAP_IS2]; /* Make sure all traps are set up for that destination */ - list_for_each_entry(trap, &ocelot->traps, trap_list) { + list_for_each_entry(trap, &block_vcap_is2->rules, list) { + if (!trap->is_trap) + continue; + /* Figure out the current trapping destination */ if (using_tag_8021q) { /* Redirect to the tag_8021q CPU port. 
If timestamps diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ca71b62a44dc..20ceac81a2c2 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1622,7 +1622,7 @@ int ocelot_trap_add(struct ocelot *ocelot, int port, trap->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY; trap->action.port_mask = 0; trap->take_ts = take_ts; - list_add_tail(&trap->trap_list, &ocelot->traps); + trap->is_trap = true; new = true; } @@ -1634,10 +1634,8 @@ int ocelot_trap_add(struct ocelot *ocelot, int port, err = ocelot_vcap_filter_replace(ocelot, trap); if (err) { trap->ingress_port_mask &= ~BIT(port); - if (!trap->ingress_port_mask) { - list_del(&trap->trap_list); + if (!trap->ingress_port_mask) kfree(trap); - } return err; } @@ -1657,11 +1655,8 @@ int ocelot_trap_del(struct ocelot *ocelot, int port, unsigned long cookie) return 0; trap->ingress_port_mask &= ~BIT(port); - if (!trap->ingress_port_mask) { - list_del(&trap->trap_list); - + if (!trap->ingress_port_mask) return ocelot_vcap_filter_del(ocelot, trap); - } return ocelot_vcap_filter_replace(ocelot, trap); } diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index 03b5e59d033e..a9b26b3002be 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -295,7 +295,7 @@ static int ocelot_flower_parse_action(struct ocelot *ocelot, int port, filter->action.cpu_copy_ena = true; filter->action.cpu_qu_num = 0; filter->type = OCELOT_VCAP_FILTER_OFFLOAD; - list_add_tail(&filter->trap_list, &ocelot->traps); + filter->is_trap = true; break; case FLOW_ACTION_POLICE: if (filter->block_id == PSFP_BLOCK_ID) { @@ -878,8 +878,6 @@ int ocelot_cls_flower_replace(struct ocelot *ocelot, int port, ret = ocelot_flower_parse(ocelot, port, ingress, f, filter); if (ret) { - if (!list_empty(&filter->trap_list)) - list_del(&filter->trap_list); kfree(filter); return ret; } diff --git 
a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 7b2bf9b1fe69..de26c992f821 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -681,7 +681,6 @@ struct ocelot_vcap_id { struct ocelot_vcap_filter { struct list_head list; - struct list_head trap_list; enum ocelot_vcap_filter_type type; int block_id; @@ -695,6 +694,7 @@ struct ocelot_vcap_filter { struct ocelot_vcap_stats stats; /* For VCAP IS1 and IS2 */ bool take_ts; + bool is_trap; unsigned long ingress_port_mask; /* For VCAP ES0 */ struct ocelot_vcap_port ingress_port; From 16bbebd35629c93a8c68c6d8d28557e100bcee73 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 May 2022 02:55:00 +0300 Subject: [PATCH 106/179] net: mscc: ocelot: fix last VCAP IS1/IS2 filter persisting in hardware when deleted ocelot_vcap_filter_del() works by moving the next filters over the current one, and then deleting the last filter by calling vcap_entry_set() with a del_filter which was specially created by memsetting its memory to zeroes. vcap_entry_set() then programs this to the TCAM and action RAM via the cache registers. The problem is that vcap_entry_set() is a dispatch function which looks at del_filter->block_id. But since del_filter is zeroized memory, the block_id is 0, or otherwise said, VCAP_ES0. So practically, what we do is delete the entry at the same TCAM index from VCAP ES0 instead of IS1 or IS2. The code was not always like this. vcap_entry_set() used to simply be is2_entry_set(), and then, the logic used to work. Restore the functionality by populating the block_id of the del_filter based on the VCAP block of the filter that we're deleting. This makes vcap_entry_set() know what to do. 
Fixes: 1397a2eb52e2 ("net: mscc: ocelot: create TCAM skeleton from tc filter chains") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_vcap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index c8701ac955a8..2749df593ebc 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -1250,7 +1250,11 @@ int ocelot_vcap_filter_del(struct ocelot *ocelot, struct ocelot_vcap_filter del_filter; int i, index; + /* Need to inherit the block_id so that vcap_entry_set() + * does not get confused and knows where to install it. + */ memset(&del_filter, 0, sizeof(del_filter)); + del_filter.block_id = filter->block_id; /* Gets index of the filter */ index = ocelot_vcap_block_get_filter_index(block, filter); From 6741e11880003e35802d78cc58035057934f4dab Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 May 2022 02:55:01 +0300 Subject: [PATCH 107/179] net: mscc: ocelot: fix VCAP IS2 filters matching on both lookups The VCAP IS2 TCAM is looked up twice per packet, and each filter can be configured to only match during the first, second lookup, or both, or none. The blamed commit wrote the code for making VCAP IS2 filters match only on the given lookup. But right below that code, there was another line that explicitly made the lookup a "don't care", and this is overwriting the lookup we've selected. So the code had no effect. Some of the more noticeable effects of having filters match on both lookups: - in "tc -s filter show dev swp0 ingress", we see each packet matching a VCAP IS2 filter counted twice. This throws off scripts such as tools/testing/selftests/net/forwarding/tc_actions.sh and makes them fail. 
- a "tc-drop" action offloaded to VCAP IS2 needs a policer as well, because once the CPU port becomes a member of the destination port mask of a packet, nothing removes it, not even a PERMIT/DENY mask mode with a port mask of 0. But VCAP IS2 rules with the POLICE_ENA bit in the action vector can only appear in the first lookup. What happens when a filter matches both lookups is that the action vector is combined, and this makes the POLICE_ENA bit ineffective, since the last lookup in which it has appeared is the second one. In other words, "tc-drop" actions do not drop packets for the CPU port, dropped packets are still seen by software unless there was an FDB entry that directed those packets to some other place different from the CPU. The last bit used to work, because in the initial commit b596229448dd ("net: mscc: ocelot: Add support for tcam"), we were writing the FIRST field of the VCAP IS2 half key with a 1, not with a "don't care". The change to "don't care" was made inadvertently by me in commit c1c3993edb7c ("net: mscc: ocelot: generalize existing code for VCAP"), which I just realized, and which needs a separate fix from this one, for "stable" kernels that lack the commit blamed below. 
Fixes: 226e9cd82a96 ("net: mscc: ocelot: only install TCAM entries into a specific lookup and PAG") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_vcap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index 2749df593ebc..774cec377703 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -374,7 +374,6 @@ static void is2_entry_set(struct ocelot *ocelot, int ix, OCELOT_VCAP_BIT_0); vcap_key_set(vcap, &data, VCAP_IS2_HK_IGR_PORT_MASK, 0, ~filter->ingress_port_mask); - vcap_key_bit_set(vcap, &data, VCAP_IS2_HK_FIRST, OCELOT_VCAP_BIT_ANY); vcap_key_bit_set(vcap, &data, VCAP_IS2_HK_HOST_MATCH, OCELOT_VCAP_BIT_ANY); vcap_key_bit_set(vcap, &data, VCAP_IS2_HK_L2_MC, filter->dmac_mc); From 477d2b91623e682e9a8126ea92acb8f684969cc7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 May 2022 02:55:02 +0300 Subject: [PATCH 108/179] net: mscc: ocelot: restrict tc-trap actions to VCAP IS2 lookup 0 Once the CPU port was added to the destination port mask of a packet, it can never be cleared, so even packets marked as dropped by the MASK_MODE of a VCAP IS2 filter will still reach it. This is why we need the OCELOT_POLICER_DISCARD to "kill dropped packets dead" and make software stop seeing them. We disallow policer rules from being put on any other chain than the one for the first lookup, but we don't do this for "drop" rules, although we should. This change is merely ascertaining that the rules don't (completely) work and letting the user know. The blamed commit is the one that introduced the multi-chain architecture in ocelot. Prior to that, we should have always offloaded the filters to VCAP IS2 lookup 0, where they did work.
Fixes: 1397a2eb52e2 ("net: mscc: ocelot: create TCAM skeleton from tc filter chains") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_flower.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c index a9b26b3002be..51cf241ff7d0 100644 --- a/drivers/net/ethernet/mscc/ocelot_flower.c +++ b/drivers/net/ethernet/mscc/ocelot_flower.c @@ -280,9 +280,10 @@ static int ocelot_flower_parse_action(struct ocelot *ocelot, int port, filter->type = OCELOT_VCAP_FILTER_OFFLOAD; break; case FLOW_ACTION_TRAP: - if (filter->block_id != VCAP_IS2) { + if (filter->block_id != VCAP_IS2 || + filter->lookup != 0) { NL_SET_ERR_MSG_MOD(extack, - "Trap action can only be offloaded to VCAP IS2"); + "Trap action can only be offloaded to VCAP IS2 lookup 0"); return -EOPNOTSUPP; } if (filter->goto_target != -1) { From 93a8417088ea570b5721d2b526337a2d3aed9fa3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 May 2022 02:55:03 +0300 Subject: [PATCH 109/179] net: mscc: ocelot: avoid corrupting hardware counters when moving VCAP filters Given the following order of operations: (1) we add filter A using tc-flower (2) we send a packet that matches it (3) we read the filter's statistics to find a hit count of 1 (4) we add a second filter B with a higher preference than A, and A moves one position to the right to make room in the TCAM for it (5) we send another packet, and this matches the second filter B (6) we read the filter statistics again. When this happens, the hit count of filter A is 2 and of filter B is 1, despite a single packet having matched each filter. Furthermore, in an alternate history, reading the filter stats a second time between steps (3) and (4) makes the hit count of filter A remain at 1 after step (6), as expected. 
The reason why this happens has to do with the filter->stats.pkts field, which is written to hardware through the call path below: vcap_entry_set / | \ / | \ / | \ / | \ es0_entry_set is1_entry_set is2_entry_set \ | / \ | / \ | / vcap_data_set(data.counter, ...) The primary role of filter->stats.pkts is to transport the filter hit counters from the last readout all the way from vcap_entry_get() -> ocelot_vcap_filter_stats_update() -> ocelot_cls_flower_stats(). The reason why vcap_entry_set() writes it to hardware is so that the counters (saturating and having a limited bit width) are cleared after each user space readout. The writing of filter->stats.pkts to hardware during the TCAM entry movement procedure is an unintentional consequence of the code design, because the hit count isn't up to date at this point. So at step (4), when filter A is moved by ocelot_vcap_filter_add() to make room for filter B, the hardware hit count is 0 (no packet matched on it in the meantime), but filter->stats.pkts is 1, because the last readout saw the earlier packet. The movement procedure programs the old hit count back to hardware, so this creates the impression to user space that more packets have been matched than they really were. The bug can be seen when running the gact_drop_and_ok_test() from the tc_actions.sh selftest. Fix the issue by reading back the hit count to tmp->stats.pkts before migrating the VCAP filter. Sure, this is a best-effort technique, since the packets that hit the rule between vcap_entry_get() and vcap_entry_set() won't be counted, but at least it allows the counters to be reliably used for selftests where the traffic is under control. The vcap_entry_get() name is a bit unintuitive, but it only reads back the counter portion of the TCAM entry, not the entire entry. The index from which we retrieve the counter is also a bit unintuitive (i - 1 during add, i + 1 during del), but this is the way in which TCAM entry movement works. 
The "entry index" isn't a stored integer for a TCAM filter, instead it is dynamically computed by ocelot_vcap_block_get_filter_index() based on the entry's position in the &block->rules list. That position (as well as block->count) is automatically updated by ocelot_vcap_filter_add_to_block() on add, and by ocelot_vcap_block_remove_filter() on del. So "i" is the new filter index, and "i - 1" or "i + 1" respectively are the old addresses of that TCAM entry (we only support installing/deleting one filter at a time). Fixes: b596229448dd ("net: mscc: ocelot: Add support for tcam") Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_vcap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.c b/drivers/net/ethernet/mscc/ocelot_vcap.c index 774cec377703..eeb4cc07dd16 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.c +++ b/drivers/net/ethernet/mscc/ocelot_vcap.c @@ -1216,6 +1216,8 @@ int ocelot_vcap_filter_add(struct ocelot *ocelot, struct ocelot_vcap_filter *tmp; tmp = ocelot_vcap_block_find_filter_by_index(block, i); + /* Read back the filter's counters before moving it */ + vcap_entry_get(ocelot, i - 1, tmp); vcap_entry_set(ocelot, i, tmp); } @@ -1268,6 +1270,8 @@ int ocelot_vcap_filter_del(struct ocelot *ocelot, struct ocelot_vcap_filter *tmp; tmp = ocelot_vcap_block_find_filter_by_index(block, i); + /* Read back the filter's counters before moving it */ + vcap_entry_get(ocelot, i + 1, tmp); vcap_entry_set(ocelot, i, tmp); } From 348c71344111d7a48892e3e52264ff11956fc196 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 5 May 2022 21:04:51 +0530 Subject: [PATCH 110/179] powerpc/papr_scm: Fix buffer overflow issue with CONFIG_FORTIFY_SOURCE With CONFIG_FORTIFY_SOURCE enabled, string functions will also perform dynamic checks for string size which can panic the kernel, as in the case of overflow detection.
In papr_scm, papr_scm_pmu_check_events function uses stat->stat_id with string operations, to populate the nvdimm_events_map array. Since stat_id variable is not NULL terminated, the kernel panics with CONFIG_FORTIFY_SOURCE enabled at boot time. Below are the logs of kernel panic: detected buffer overflow in __fortify_strlen ------------[ cut here ]------------ kernel BUG at lib/string_helpers.c:980! Oops: Exception in kernel mode, sig: 5 [#1] NIP [c00000000077dad0] fortify_panic+0x28/0x38 LR [c00000000077dacc] fortify_panic+0x24/0x38 Call Trace: [c0000022d77836e0] [c00000000077dacc] fortify_panic+0x24/0x38 (unreliable) [c00800000deb2660] papr_scm_pmu_check_events.constprop.0+0x118/0x220 [papr_scm] [c00800000deb2cb0] papr_scm_probe+0x288/0x62c [papr_scm] [c0000000009b46a8] platform_probe+0x98/0x150 Fix this issue by using kmemdup_nul() to copy the content of stat->stat_id directly to the nvdimm_events_map array. mpe: stat->stat_id comes from the hypervisor, not userspace, so there is no security exposure. 
Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220505153451.35503-1-kjain@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f58728d5f10d..39962c905542 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -462,7 +462,6 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu { struct papr_scm_perf_stat *stat; struct papr_scm_perf_stats *stats; - char *statid; int index, rc, count; u32 available_events; @@ -493,14 +492,12 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu for (index = 0, stat = stats->scm_statistic, count = 0; index < available_events; index++, ++stat) { - statid = kzalloc(strlen(stat->stat_id) + 1, GFP_KERNEL); - if (!statid) { + p->nvdimm_events_map[count] = kmemdup_nul(stat->stat_id, 8, GFP_KERNEL); + if (!p->nvdimm_events_map[count]) { rc = -ENOMEM; goto out_nvdimm_events_map; } - strcpy(statid, stat->stat_id); - p->nvdimm_events_map[count] = statid; count++; } p->nvdimm_events_map[count] = NULL; From c25d7f32e3e209462cd82e6e93e66b72dbb2308f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 28 Apr 2022 22:05:00 -0500 Subject: [PATCH 111/179] platform/x86: thinkpad_acpi: Convert btusb DMI list to quirks DMI matching in thinkpad_acpi happens local to a function meaning quirks can only match that function. Future changes to thinkpad_acpi may need to quirk other code, so change this to use a quirk infrastructure. 
Signed-off-by: Mario Limonciello Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20220429030501.1909-2-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index c568fae56db2..2820205c01fd 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -309,6 +309,15 @@ struct ibm_init_struct { struct ibm_struct *data; }; +/* DMI Quirks */ +struct quirk_entry { + bool btusb_bug; +}; + +static struct quirk_entry quirk_btusb_bug = { + .btusb_bug = true, +}; + static struct { u32 bluetooth:1; u32 hotkey:1; @@ -338,6 +347,7 @@ static struct { u32 hotkey_poll_active:1; u32 has_adaptive_kbd:1; u32 kbd_lang:1; + struct quirk_entry *quirks; } tp_features; static struct { @@ -4359,9 +4369,10 @@ static void bluetooth_exit(void) bluetooth_shutdown(); } -static const struct dmi_system_id bt_fwbug_list[] __initconst = { +static const struct dmi_system_id fwbug_list[] __initconst = { { .ident = "ThinkPad E485", + .driver_data = &quirk_btusb_bug, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_BOARD_NAME, "20KU"), @@ -4369,6 +4380,7 @@ static const struct dmi_system_id bt_fwbug_list[] __initconst = { }, { .ident = "ThinkPad E585", + .driver_data = &quirk_btusb_bug, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_BOARD_NAME, "20KV"), @@ -4376,6 +4388,7 @@ static const struct dmi_system_id bt_fwbug_list[] __initconst = { }, { .ident = "ThinkPad A285 - 20MW", + .driver_data = &quirk_btusb_bug, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_BOARD_NAME, "20MW"), @@ -4383,6 +4396,7 @@ static const struct dmi_system_id bt_fwbug_list[] __initconst = { }, { .ident = "ThinkPad A285 - 20MX", + .driver_data = &quirk_btusb_bug, .matches = { DMI_MATCH(DMI_SYS_VENDOR, 
"LENOVO"), DMI_MATCH(DMI_BOARD_NAME, "20MX"), @@ -4390,6 +4404,7 @@ static const struct dmi_system_id bt_fwbug_list[] __initconst = { }, { .ident = "ThinkPad A485 - 20MU", + .driver_data = &quirk_btusb_bug, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_BOARD_NAME, "20MU"), @@ -4397,6 +4412,7 @@ static const struct dmi_system_id bt_fwbug_list[] __initconst = { }, { .ident = "ThinkPad A485 - 20MV", + .driver_data = &quirk_btusb_bug, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_BOARD_NAME, "20MV"), @@ -4419,7 +4435,8 @@ static int __init have_bt_fwbug(void) * Some AMD based ThinkPads have a firmware bug that calling * "GBDC" will cause bluetooth on Intel wireless cards blocked */ - if (dmi_check_system(bt_fwbug_list) && pci_dev_present(fwbug_cards_ids)) { + if (tp_features.quirks && tp_features.quirks->btusb_bug && + pci_dev_present(fwbug_cards_ids)) { vdbg_printk(TPACPI_DBG_INIT | TPACPI_DBG_RFKILL, FW_BUG "disable bluetooth subdriver for Intel cards\n"); return 1; @@ -11496,6 +11513,7 @@ static void thinkpad_acpi_module_exit(void) static int __init thinkpad_acpi_module_init(void) { + const struct dmi_system_id *dmi_id; int ret, i; tpacpi_lifecycle = TPACPI_LIFE_INIT; @@ -11535,6 +11553,10 @@ static int __init thinkpad_acpi_module_init(void) return -ENODEV; } + dmi_id = dmi_first_match(fwbug_list); + if (dmi_id) + tp_features.quirks = dmi_id->driver_data; + /* Device initialization */ tpacpi_pdev = platform_device_register_simple(TPACPI_DRVR_NAME, -1, NULL, 0); From 455cd867b85b53fd3602345f9b8a8facc551adc9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 28 Apr 2022 22:05:01 -0500 Subject: [PATCH 112/179] platform/x86: thinkpad_acpi: Add a s2idle resume quirk for a number of laptops Lenovo laptops that contain NVME SSDs across a variety of generations have trouble resuming from suspend to idle when the IOMMU translation layer is active for the NVME storage device. 
This generally manifests as a large resume delay or page faults. These delays and page faults occur as a result of a Lenovo BIOS specific SMI that runs during the D3->D0 transition on NVME devices. This SMI occurs because of a flag that is set during resume by Lenovo firmware: ``` OperationRegion (PM80, SystemMemory, 0xFED80380, 0x10) Field (PM80, AnyAcc, NoLock, Preserve) { SI3R, 1 } Method (_ON, 0, NotSerialized) // _ON_: Power On { TPST (0x60D0) If ((DAS3 == 0x00)) { If (SI3R) { TPST (0x60E0) M020 (NBRI, 0x00, 0x00, 0x04, (NCMD | 0x06)) M020 (NBRI, 0x00, 0x00, 0x10, NBAR) APMC = HDSI /* \HDSI */ SLPS = 0x01 SI3R = 0x00 TPST (0x60E1) } D0NV = 0x01 } } ``` Create a quirk that will run early in the resume process to prevent this SMI from running. As any of these machines are fixed, they can be peeled back from this quirk or narrowed down to individual firmware versions. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1910 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1689 Signed-off-by: Mario Limonciello Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20220429030501.1909-3-mario.limonciello@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 126 +++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 2820205c01fd..8180d7789f56 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -312,12 +312,17 @@ struct ibm_init_struct { /* DMI Quirks */ struct quirk_entry { bool btusb_bug; + u32 s2idle_bug_mmio; }; static struct quirk_entry quirk_btusb_bug = { .btusb_bug = true, }; +static struct quirk_entry quirk_s2idle_bug = { + .s2idle_bug_mmio = 0xfed80380, +}; + static struct { u32 bluetooth:1; u32 hotkey:1; @@ -4418,9 +4423,119 @@ static const struct dmi_system_id fwbug_list[] __initconst = { DMI_MATCH(DMI_BOARD_NAME, "20MV"), }, }, + { + .ident = "L14 Gen2 
AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20X5"), + } + }, + { + .ident = "T14s Gen2 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20XF"), + } + }, + { + .ident = "X13 Gen2 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20XH"), + } + }, + { + .ident = "T14 Gen2 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20XK"), + } + }, + { + .ident = "T14 Gen1 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20UD"), + } + }, + { + .ident = "T14 Gen1 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20UE"), + } + }, + { + .ident = "T14s Gen1 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20UH"), + } + }, + { + .ident = "P14s Gen1 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20Y1"), + } + }, + { + .ident = "P14s Gen2 AMD", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21A0"), + } + }, {} }; +#ifdef CONFIG_SUSPEND +/* + * Lenovo laptops from a variety of generations run a SMI handler during the D3->D0 + * transition that occurs specifically when exiting suspend to idle which can cause + * large delays during resume when the IOMMU translation layer is enabled (the default + * behavior) for NVME devices: + * + * To avoid this firmware problem, skip the SMI handler on these machines before the + * D0 transition occurs. 
+ */ +static void thinkpad_acpi_amd_s2idle_restore(void) +{ + struct resource *res; + void __iomem *addr; + u8 val; + + res = request_mem_region_muxed(tp_features.quirks->s2idle_bug_mmio, 1, + "thinkpad_acpi_pm80"); + if (!res) + return; + + addr = ioremap(tp_features.quirks->s2idle_bug_mmio, 1); + if (!addr) + goto cleanup_resource; + + val = ioread8(addr); + iowrite8(val & ~BIT(0), addr); + + iounmap(addr); +cleanup_resource: + release_resource(res); +} + +static struct acpi_s2idle_dev_ops thinkpad_acpi_s2idle_dev_ops = { + .restore = thinkpad_acpi_amd_s2idle_restore, +}; +#endif + static const struct pci_device_id fwbug_cards_ids[] __initconst = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x24F3) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x24FD) }, @@ -11472,6 +11587,10 @@ static void thinkpad_acpi_module_exit(void) tpacpi_lifecycle = TPACPI_LIFE_EXITING; +#ifdef CONFIG_SUSPEND + if (tp_features.quirks && tp_features.quirks->s2idle_bug_mmio) + acpi_unregister_lps0_dev(&thinkpad_acpi_s2idle_dev_ops); +#endif if (tpacpi_hwmon) hwmon_device_unregister(tpacpi_hwmon); if (tp_features.sensors_pdrv_registered) @@ -11645,6 +11764,13 @@ static int __init thinkpad_acpi_module_init(void) tp_features.input_device_registered = 1; } +#ifdef CONFIG_SUSPEND + if (tp_features.quirks && tp_features.quirks->s2idle_bug_mmio) { + if (!acpi_register_lps0_dev(&thinkpad_acpi_s2idle_dev_ops)) + pr_info("Using s2idle quirk to avoid %s platform firmware bug\n", + (dmi_id && dmi_id->ident) ? dmi_id->ident : ""); + } +#endif return 0; } From aa2fef6f40e6ccc22e932b36898f260f0e5a021a Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Mon, 2 May 2022 15:12:00 -0400 Subject: [PATCH 113/179] platform/x86: thinkpad_acpi: Correct dual fan probe There was an issue with the dual fan probe whereby the probe was failing as it assuming that second_fan support was not available. Corrected the logic so the probe works correctly. Cleaned up so quirks only used if 2nd fan not detected. 
Tested on X1 Carbon 10 (2 fans), X1 Carbon 9 (2 fans) and T490 (1 fan) Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20220502191200.63470-1-markpearson@lenovo.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 8180d7789f56..e6cb4a14cdd4 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -8880,24 +8880,27 @@ static int __init fan_init(struct ibm_init_struct *iibm) fan_status_access_mode = TPACPI_FAN_RD_TPEC; if (quirks & TPACPI_FAN_Q1) fan_quirk1_setup(); - if (quirks & TPACPI_FAN_2FAN) { - tp_features.second_fan = 1; - pr_info("secondary fan support enabled\n"); - } - if (quirks & TPACPI_FAN_2CTL) { - tp_features.second_fan = 1; - tp_features.second_fan_ctl = 1; - pr_info("secondary fan control enabled\n"); - } /* Try and probe the 2nd fan */ + tp_features.second_fan = 1; /* needed for get_speed to work */ res = fan2_get_speed(&speed); if (res >= 0) { /* It responded - so let's assume it's there */ tp_features.second_fan = 1; tp_features.second_fan_ctl = 1; pr_info("secondary fan control detected & enabled\n"); + } else { + /* Fan not auto-detected */ + tp_features.second_fan = 0; + if (quirks & TPACPI_FAN_2FAN) { + tp_features.second_fan = 1; + pr_info("secondary fan support enabled\n"); + } + if (quirks & TPACPI_FAN_2CTL) { + tp_features.second_fan = 1; + tp_features.second_fan_ctl = 1; + pr_info("secondary fan control enabled\n"); + } } - } else { pr_err("ThinkPad ACPI EC access misbehaving, fan status and control unavailable\n"); return -ENODEV; From 2cdfa0c20d58da3757054797c2974c967035926a Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Fri, 29 Apr 2022 08:23:22 -0400 Subject: [PATCH 114/179] platform/x86/intel: Fix 'rmmod pmt_telemetry' panic 'rmmod pmt_telemetry' panics with: 
BUG: kernel NULL pointer dereference, address: 0000000000000040 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 4 PID: 1697 Comm: rmmod Tainted: G S W -------- --- 5.18.0-rc4 #3 Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-P DDR5 RVP, BIOS ADLPFWI1.R00.3056.B00.2201310233 01/31/2022 RIP: 0010:device_del+0x1b/0x3d0 Code: e8 1a d9 e9 ff e9 58 ff ff ff 48 8b 08 eb dc 0f 1f 44 00 00 41 56 41 55 41 54 55 48 8d af 80 00 00 00 53 48 89 fb 48 83 ec 18 <4c> 8b 67 40 48 89 ef 65 48 8b 04 25 28 00 00 00 48 89 44 24 10 31 RSP: 0018:ffffb520415cfd60 EFLAGS: 00010286 RAX: 0000000000000070 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000080 R08: ffffffffffffffff R09: ffffb520415cfd78 R10: 0000000000000002 R11: ffffb520415cfd78 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f7e198e5740(0000) GS:ffff905c9f700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000040 CR3: 000000010782a005 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: ? __xa_erase+0x53/0xb0 device_unregister+0x13/0x50 intel_pmt_dev_destroy+0x34/0x60 [pmt_class] pmt_telem_remove+0x40/0x50 [pmt_telemetry] auxiliary_bus_remove+0x18/0x30 device_release_driver_internal+0xc1/0x150 driver_detach+0x44/0x90 bus_remove_driver+0x74/0xd0 auxiliary_driver_unregister+0x12/0x20 pmt_telem_exit+0xc/0xe4a [pmt_telemetry] __x64_sys_delete_module+0x13a/0x250 ? syscall_trace_enter.isra.19+0x11e/0x1a0 do_syscall_64+0x58/0x80 ? syscall_exit_to_user_mode+0x12/0x30 ? do_syscall_64+0x67/0x80 ? syscall_exit_to_user_mode+0x12/0x30 ? do_syscall_64+0x67/0x80 ? syscall_exit_to_user_mode+0x12/0x30 ? do_syscall_64+0x67/0x80 ? 
exc_page_fault+0x64/0x140 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f7e1803a05b Code: 73 01 c3 48 8b 0d 2d 4e 38 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd 4d 38 00 f7 d8 64 89 01 48 The probe function, pmt_telem_probe(), adds an entry for devices even if they have not been initialized. This results in the array of initialized devices containing both initialized and uninitialized entries. This causes a panic in the remove function, pmt_telem_remove() which expects the array to only contain initialized entries. Only use an entry when a device is initialized. Cc: "David E. Box" Cc: Hans de Goede Cc: Mark Gross Cc: platform-driver-x86@vger.kernel.org Signed-off-by: David Arcari Signed-off-by: Prarit Bhargava Reviewed-by: David E. Box Link: https://lore.kernel.org/r/20220429122322.2550003-1-prarit@redhat.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/intel/pmt/telemetry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c index 6b6f3e2a617a..f73ecfd4a309 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.c +++ b/drivers/platform/x86/intel/pmt/telemetry.c @@ -103,7 +103,7 @@ static int pmt_telem_probe(struct auxiliary_device *auxdev, const struct auxilia auxiliary_set_drvdata(auxdev, priv); for (i = 0; i < intel_vsec_dev->num_resources; i++) { - struct intel_pmt_entry *entry = &priv->entry[i]; + struct intel_pmt_entry *entry = &priv->entry[priv->num_entries]; ret = intel_pmt_dev_create(entry, &pmt_telem_ns, intel_vsec_dev, i); if (ret < 0) From ed13d4ac57474d959c40fd05d8860e2b1607becb Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Fri, 29 Apr 2022 20:00:49 +0200 Subject: [PATCH 115/179] platform/surface: gpe: Add support for Surface Pro 8 The new Surface Pro 8 uses GPEs for lid events as well. 
Add an entry for that so that the lid can be used to wake the device. Note that this is a device with a keyboard type-cover, where this acts as the "lid". Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20220429180049.1282447-1-luzmaximilian@gmail.com Signed-off-by: Hans de Goede --- drivers/platform/surface/surface_gpe.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/surface/surface_gpe.c b/drivers/platform/surface/surface_gpe.c index c1775db29efb..ec66fde28e75 100644 --- a/drivers/platform/surface/surface_gpe.c +++ b/drivers/platform/surface/surface_gpe.c @@ -99,6 +99,14 @@ static const struct dmi_system_id dmi_lid_device_table[] = { }, .driver_data = (void *)lid_device_props_l4D, }, + { + .ident = "Surface Pro 8", + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Surface Pro 8"), + }, + .driver_data = (void *)lid_device_props_l4B, + }, { .ident = "Surface Book 1", .matches = { From 44acfc22c7d055d9c4f8f0974ee28422405b971a Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Fri, 29 Apr 2022 21:57:38 +0200 Subject: [PATCH 116/179] platform/surface: aggregator: Fix initialization order when compiling as builtin module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building the Surface Aggregator Module (SAM) core, registry, and other SAM client drivers as builtin modules (=y), proper initialization order is not guaranteed. Due to this, client driver registration (triggered by device registration in the registry) races against bus initialization in the core. If any attempt is made at registering the device driver before the bus has been initialized (i.e. 
if bus initialization fails this race) driver registration will fail with a message similar to: Driver surface_battery was unable to register with bus_type surface_aggregator because the bus was not initialized Switch from module_init() to subsys_initcall() to resolve this issue. Note that the serdev subsystem uses postcore_initcall() so we are still able to safely register the serdev device driver for the core. Fixes: c167b9c7e3d6 ("platform/surface: Add Surface Aggregator subsystem") Reported-by: Blaž Hrastnik Signed-off-by: Maximilian Luz Link: https://lore.kernel.org/r/20220429195738.535751-1-luzmaximilian@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/surface/aggregator/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/surface/aggregator/core.c b/drivers/platform/surface/aggregator/core.c index d384d36098c2..a62c5dfe42d6 100644 --- a/drivers/platform/surface/aggregator/core.c +++ b/drivers/platform/surface/aggregator/core.c @@ -817,7 +817,7 @@ err_cpkg: err_bus: return status; } -module_init(ssam_core_init); +subsys_initcall(ssam_core_init); static void __exit ssam_core_exit(void) { From dba785798526a3282cc4d0f0ea751883715dbbb4 Mon Sep 17 00:00:00 2001 From: Puyou Lu Date: Fri, 6 May 2022 16:06:30 +0800 Subject: [PATCH 117/179] gpio: pca953x: fix irq_stat not updated when irq is disabled (irq_mask not set) When one port's input state gets inverted (e.g. from low to high) after pca953x_irq_setup but before setting irq_mask (by some other driver such as "gpio-keys"), the next inversion of this port (e.g. from high to low) will not be triggered any more (because irq_stat is not updated at the first time). Issue should be fixed after this commit.
Fixes: 89ea8bbe9c3e ("gpio: pca953x.c: add interrupt handling capability") Signed-off-by: Puyou Lu Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index d2fe76f3f34f..8726921a1129 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -762,11 +762,11 @@ static bool pca953x_irq_pending(struct pca953x_chip *chip, unsigned long *pendin bitmap_xor(cur_stat, new_stat, old_stat, gc->ngpio); bitmap_and(trigger, cur_stat, chip->irq_mask, gc->ngpio); + bitmap_copy(chip->irq_stat, new_stat, gc->ngpio); + if (bitmap_empty(trigger, gc->ngpio)) return false; - bitmap_copy(chip->irq_stat, new_stat, gc->ngpio); - bitmap_and(cur_stat, chip->irq_trig_fall, old_stat, gc->ngpio); bitmap_and(old_stat, chip->irq_trig_raise, new_stat, gc->ngpio); bitmap_or(new_stat, old_stat, cur_stat, gc->ngpio); From 0c2c7c069285374fc8feacddc0498f8ab7627117 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Mon, 2 May 2022 09:58:07 -0700 Subject: [PATCH 118/179] KVM: SEV: Mark nested locking of vcpu->lock svm_vm_migrate_from() uses sev_lock_vcpus_for_migration() to lock all source and target vcpu->locks. Unfortunately there is an 8 subclass limit, so a new subclass cannot be used for each vCPU. Instead maintain ownership of the first vcpu's mutex.dep_map using a role specific subclass: source vs target. Release the other vcpu's mutex.dep_maps. 
Fixes: b56639318bb2b ("KVM: SEV: Add support for SEV intra host migration") Reported-by: John Sperbeck Suggested-by: David Rientjes Suggested-by: Sean Christopherson Suggested-by: Paolo Bonzini Cc: Hillf Danton Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Gonda Message-Id: <20220502165807.529624-1-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0ad70c12c7c3..7c392873626f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1594,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } +/* vCPU mutex subclasses. */ +enum sev_migration_role { + SEV_MIGRATION_SOURCE = 0, + SEV_MIGRATION_TARGET, + SEV_NR_MIGRATION_ROLES, +}; -static int sev_lock_vcpus_for_migration(struct kvm *kvm) +static int sev_lock_vcpus_for_migration(struct kvm *kvm, + enum sev_migration_role role) { struct kvm_vcpu *vcpu; unsigned long i, j; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable(&vcpu->mutex)) + if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; + + if (first) { + /* + * Reset the role to one that avoids colliding with + * the role used for the first vcpu mutex. 
+ */ + role = SEV_NR_MIGRATION_ROLES; + first = false; + } else { + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); + } } return 0; out_unlock: + + first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); + + mutex_unlock(&vcpu->mutex); } return -EINTR; @@ -1621,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, + SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); + mutex_unlock(&vcpu->mutex); } } @@ -1748,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm); + ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm); + ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); if (ret) goto out_dst_vcpu; From 053d2290c0307e3642e75e0185ddadf084dc36c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 May 2022 22:18:50 +0000 Subject: [PATCH 119/179] KVM: VMX: Exit to userspace if vCPU has injected exception and invalid state Exit to userspace with an emulation error if KVM encounters an injected exception with invalid guest state, in addition to the existing check of bailing if there's a pending exception (KVM doesn't support emulating exceptions except when emulating real mode via vm86). In theory, KVM should never get to such a situation as KVM is supposed to exit to userspace before injecting an exception with invalid guest state. But in practice, userspace can intervene and manually inject an exception and/or stuff registers to force invalid guest state while a previously injected exception is awaiting reinjection. 
Fixes: fc4fad79fc3d ("KVM: VMX: Reject KVM_RUN if emulation is required with pending exception") Reported-by: syzbot+cfafed3bb76d3e37581b@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20220502221850.131873-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d58b763df855..610355b9ccce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5472,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending; + (vcpu->arch.exception.pending || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) From 486b9eee57ddca5c9a2d59fc41153f36002e0a00 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 23 Apr 2022 12:20:21 +0200 Subject: [PATCH 120/179] ice: Fix race during aux device (un)plugging Function ice_plug_aux_dev() assigns pf->adev field too early prior aux device initialization and on other side ice_unplug_aux_dev() starts aux device deinit and at the end assigns NULL to pf->adev. This is wrong because pf->adev should always be non-NULL only when aux device is fully initialized and ready. This wrong order causes a crash when ice_send_event_to_aux() call occurs because that function depends on non-NULL value of pf->adev and does not assume that aux device is half-initialized or half-destroyed. After order correction the race window is tiny but it is still there, as Leon mentioned and manipulation with pf->adev needs to be protected by mutex. Fix (un-)plugging functions so pf->adev field is set after aux device init and prior aux device destroy and protect pf->adev assignment by new mutex. 
This mutex is also held during ice_send_event_to_aux() call to ensure that aux device is valid during that call. Note that device lock used ice_send_event_to_aux() needs to be kept to avoid race with aux drv unload. Reproducer: cycle=1 while :;do echo "#### Cycle: $cycle" ip link set ens7f0 mtu 9000 ip link add bond0 type bond mode 1 miimon 100 ip link set bond0 up ifenslave bond0 ens7f0 ip link set bond0 mtu 9000 ethtool -L ens7f0 combined 1 ip link del bond0 ip link set ens7f0 mtu 1500 sleep 1 let cycle++ done In short when the device is added/removed to/from bond the aux device is unplugged/plugged. When MTU of the device is changed an event is sent to aux device asynchronously. This can race with (un)plugging operation and because pf->adev is set too early (plug) or too late (unplug) the function ice_send_event_to_aux() can touch uninitialized or destroyed fields. In the case of crash below pf->adev->dev.mutex. Crash: [ 53.372066] bond0: (slave ens7f0): making interface the new active one [ 53.378622] bond0: (slave ens7f0): Enslaving as an active interface with an u p link [ 53.386294] IPv6: ADDRCONF(NETDEV_CHANGE): bond0: link becomes ready [ 53.549104] bond0: (slave ens7f1): Enslaving as a backup interface with an up link [ 54.118906] ice 0000:ca:00.0 ens7f0: Number of in use tx queues changed inval idating tc mappings. Priority traffic classification disabled! [ 54.233374] ice 0000:ca:00.1 ens7f1: Number of in use tx queues changed inval idating tc mappings. Priority traffic classification disabled! 
[ 54.248204] bond0: (slave ens7f0): Releasing backup interface [ 54.253955] bond0: (slave ens7f1): making interface the new active one [ 54.274875] bond0: (slave ens7f1): Releasing backup interface [ 54.289153] bond0 (unregistering): Released all slaves [ 55.383179] MII link monitoring set to 100 ms [ 55.398696] bond0: (slave ens7f0): making interface the new active one [ 55.405241] BUG: kernel NULL pointer dereference, address: 0000000000000080 [ 55.405289] bond0: (slave ens7f0): Enslaving as an active interface with an u p link [ 55.412198] #PF: supervisor write access in kernel mode [ 55.412200] #PF: error_code(0x0002) - not-present page [ 55.412201] PGD 25d2ad067 P4D 0 [ 55.412204] Oops: 0002 [#1] PREEMPT SMP NOPTI [ 55.412207] CPU: 0 PID: 403 Comm: kworker/0:2 Kdump: loaded Tainted: G S 5.17.0-13579-g57f2d6540f03 #1 [ 55.429094] bond0: (slave ens7f1): Enslaving as a backup interface with an up link [ 55.430224] Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.4.4 10/07/ 2021 [ 55.430226] Workqueue: ice ice_service_task [ice] [ 55.468169] RIP: 0010:mutex_unlock+0x10/0x20 [ 55.472439] Code: 0f b1 13 74 96 eb e0 4c 89 ee eb d8 e8 79 54 ff ff 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 65 48 8b 04 25 40 ef 01 00 31 d2 48 0f b1 17 75 01 c3 e9 e3 fe ff ff 0f 1f 00 0f 1f 44 00 00 48 [ 55.491186] RSP: 0018:ff4454230d7d7e28 EFLAGS: 00010246 [ 55.496413] RAX: ff1a79b208b08000 RBX: ff1a79b2182e8880 RCX: 0000000000000001 [ 55.503545] RDX: 0000000000000000 RSI: ff4454230d7d7db0 RDI: 0000000000000080 [ 55.510678] RBP: ff1a79d1c7e48b68 R08: ff4454230d7d7db0 R09: 0000000000000041 [ 55.517812] R10: 00000000000000a5 R11: 00000000000006e6 R12: ff1a79d1c7e48bc0 [ 55.524945] R13: 0000000000000000 R14: ff1a79d0ffc305c0 R15: 0000000000000000 [ 55.532076] FS: 0000000000000000(0000) GS:ff1a79d0ffc00000(0000) knlGS:0000000000000000 [ 55.540163] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 55.545908] CR2: 0000000000000080 CR3: 00000003487ae003 CR4: 0000000000771ef0 [ 
55.553041] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 55.560173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 55.567305] PKRU: 55555554 [ 55.570018] Call Trace: [ 55.572474] [ 55.574579] ice_service_task+0xaab/0xef0 [ice] [ 55.579130] process_one_work+0x1c5/0x390 [ 55.583141] ? process_one_work+0x390/0x390 [ 55.587326] worker_thread+0x30/0x360 [ 55.590994] ? process_one_work+0x390/0x390 [ 55.595180] kthread+0xe6/0x110 [ 55.598325] ? kthread_complete_and_exit+0x20/0x20 [ 55.603116] ret_from_fork+0x1f/0x30 [ 55.606698] Fixes: f9f5301e7e2d ("ice: Register auxiliary device to provide RDMA") Reviewed-by: Leon Romanovsky Signed-off-by: Ivan Vecera Reviewed-by: Dave Ertman Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_idc.c | 25 +++++++++++++++-------- drivers/net/ethernet/intel/ice/ice_main.c | 2 ++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 8ed3c9ab7ff7..a895e3a8e988 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -540,6 +540,7 @@ struct ice_pf { struct mutex avail_q_mutex; /* protects access to avail_[rx|tx]qs */ struct mutex sw_mutex; /* lock for protecting VSI alloc flow */ struct mutex tc_mutex; /* lock to protect TC changes */ + struct mutex adev_mutex; /* lock to protect aux device access */ u32 msg_enable; struct ice_ptp ptp; struct tty_driver *ice_gnss_tty_driver; diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index 25a436d342c2..3e3b2ed4cd5d 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -37,14 +37,17 @@ void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event) if (WARN_ON_ONCE(!in_task())) return; + mutex_lock(&pf->adev_mutex); if 
(!pf->adev) - return; + goto finish; device_lock(&pf->adev->dev); iadrv = ice_get_auxiliary_drv(pf); if (iadrv && iadrv->event_handler) iadrv->event_handler(pf, event); device_unlock(&pf->adev->dev); +finish: + mutex_unlock(&pf->adev_mutex); } /** @@ -290,7 +293,6 @@ int ice_plug_aux_dev(struct ice_pf *pf) return -ENOMEM; adev = &iadev->adev; - pf->adev = adev; iadev->pf = pf; adev->id = pf->aux_idx; @@ -300,18 +302,20 @@ int ice_plug_aux_dev(struct ice_pf *pf) ret = auxiliary_device_init(adev); if (ret) { - pf->adev = NULL; kfree(iadev); return ret; } ret = auxiliary_device_add(adev); if (ret) { - pf->adev = NULL; auxiliary_device_uninit(adev); return ret; } + mutex_lock(&pf->adev_mutex); + pf->adev = adev; + mutex_unlock(&pf->adev_mutex); + return 0; } @@ -320,12 +324,17 @@ int ice_plug_aux_dev(struct ice_pf *pf) */ void ice_unplug_aux_dev(struct ice_pf *pf) { - if (!pf->adev) - return; + struct auxiliary_device *adev; - auxiliary_device_delete(pf->adev); - auxiliary_device_uninit(pf->adev); + mutex_lock(&pf->adev_mutex); + adev = pf->adev; pf->adev = NULL; + mutex_unlock(&pf->adev_mutex); + + if (adev) { + auxiliary_device_delete(adev); + auxiliary_device_uninit(adev); + } } /** diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 9a0a358a15c2..949669fed7d6 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3769,6 +3769,7 @@ u16 ice_get_avail_rxq_count(struct ice_pf *pf) static void ice_deinit_pf(struct ice_pf *pf) { ice_service_task_stop(pf); + mutex_destroy(&pf->adev_mutex); mutex_destroy(&pf->sw_mutex); mutex_destroy(&pf->tc_mutex); mutex_destroy(&pf->avail_q_mutex); @@ -3847,6 +3848,7 @@ static int ice_init_pf(struct ice_pf *pf) mutex_init(&pf->sw_mutex); mutex_init(&pf->tc_mutex); + mutex_init(&pf->adev_mutex); INIT_HLIST_HEAD(&pf->aq_wait_list); spin_lock_init(&pf->aq_wait_lock); From 6096dae926a22e2892ef9169f582589c16d39639 Mon Sep 17 00:00:00 2001 
From: Anatolii Gerasymenko Date: Thu, 28 Apr 2022 12:01:00 +0000 Subject: [PATCH 121/179] ice: clear stale Tx queue settings before configuring The iAVF driver uses 3 virtchnl op codes to communicate with the PF regarding the VF Tx queues: * VIRTCHNL_OP_CONFIG_VSI_QUEUES configures the hardware and firmware logic for the Tx queues * VIRTCHNL_OP_ENABLE_QUEUES configures the queue interrupts * VIRTCHNL_OP_DISABLE_QUEUES disables the queue interrupts and Tx rings. There is a bug in the iAVF driver due to the race condition between VF reset request and shutdown being executed in parallel. This leads to a break in logic and VIRTCHNL_OP_DISABLE_QUEUES is not being sent. If this occurs, the PF driver never cleans up the Tx queues. This results in leaving behind stale Tx queue settings in the hardware and firmware. The most obvious outcome is that upon the next VIRTCHNL_OP_CONFIG_VSI_QUEUES, the PF will fail to program the Tx scheduler node due to a lack of space. We need to protect ICE driver against such situation. To fix this, make sure we clear existing stale settings out when handling VIRTCHNL_OP_CONFIG_VSI_QUEUES. This ensures we remove the previous settings. Calling ice_vf_vsi_dis_single_txq should be safe as it will do nothing if the queue is not configured. The function already handles the case when the Tx queue is not currently configured and exits with a 0 return in that case. 
Fixes: 7ad15440acf8 ("ice: Refactor VIRTCHNL_OP_CONFIG_VSI_QUEUES handling") Signed-off-by: Jacob Keller Signed-off-by: Anatolii Gerasymenko Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 68 ++++++++++++++----- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index b72606c9e6d0..2889e050a4c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -1307,13 +1307,52 @@ error_param: NULL, 0); } +/** + * ice_vf_vsi_dis_single_txq - disable a single Tx queue + * @vf: VF to disable queue for + * @vsi: VSI for the VF + * @q_id: VF relative (0-based) queue ID + * + * Attempt to disable the Tx queue passed in. If the Tx queue was successfully + * disabled then clear q_id bit in the enabled queues bitmap and return + * success. Otherwise return error. + */ +static int +ice_vf_vsi_dis_single_txq(struct ice_vf *vf, struct ice_vsi *vsi, u16 q_id) +{ + struct ice_txq_meta txq_meta = { 0 }; + struct ice_tx_ring *ring; + int err; + + if (!test_bit(q_id, vf->txq_ena)) + dev_dbg(ice_pf_to_dev(vsi->back), "Queue %u on VSI %u is not enabled, but stopping it anyway\n", + q_id, vsi->vsi_num); + + ring = vsi->tx_rings[q_id]; + if (!ring) + return -EINVAL; + + ice_fill_txq_meta(vsi, ring, &txq_meta); + + err = ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, ring, &txq_meta); + if (err) { + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Tx ring %d on VSI %d\n", + q_id, vsi->vsi_num); + return err; + } + + /* Clear enabled queues flag */ + clear_bit(q_id, vf->txq_ena); + + return 0; +} + /** * ice_vc_dis_qs_msg * @vf: pointer to the VF info * @msg: pointer to the msg buffer * - * called from the VF to disable all or specific - * queue(s) + * called from the VF to disable all or specific queue(s) */ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) { @@ 
-1350,30 +1389,15 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) q_map = vqs->tx_queues; for_each_set_bit(vf_q_id, &q_map, ICE_MAX_RSS_QS_PER_VF) { - struct ice_tx_ring *ring = vsi->tx_rings[vf_q_id]; - struct ice_txq_meta txq_meta = { 0 }; - if (!ice_vc_isvalid_q_id(vf, vqs->vsi_id, vf_q_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - if (!test_bit(vf_q_id, vf->txq_ena)) - dev_dbg(ice_pf_to_dev(vsi->back), "Queue %u on VSI %u is not enabled, but stopping it anyway\n", - vf_q_id, vsi->vsi_num); - - ice_fill_txq_meta(vsi, ring, &txq_meta); - - if (ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, - ring, &txq_meta)) { - dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Tx ring %d on VSI %d\n", - vf_q_id, vsi->vsi_num); + if (ice_vf_vsi_dis_single_txq(vf, vsi, vf_q_id)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; } - - /* Clear enabled queues flag */ - clear_bit(vf_q_id, vf->txq_ena); } } @@ -1622,6 +1646,14 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) if (qpi->txq.ring_len > 0) { vsi->tx_rings[i]->dma = qpi->txq.dma_ring_addr; vsi->tx_rings[i]->count = qpi->txq.ring_len; + + /* Disable any existing queue first */ + if (ice_vf_vsi_dis_single_txq(vf, vsi, q_idx)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + /* Configure a queue with the requested settings */ if (ice_vsi_cfg_single_txq(vsi, vsi->tx_rings, q_idx)) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; From a11b6c1a383ff092f432e040c20e032503785d47 Mon Sep 17 00:00:00 2001 From: Michal Michalik Date: Wed, 20 Apr 2022 14:23:02 +0200 Subject: [PATCH 122/179] ice: fix PTP stale Tx timestamps cleanup Read stale PTP Tx timestamps from PHY on cleanup. After running out of Tx timestamps request handlers, hardware (HW) stops reporting finished requests. Function ice_ptp_tx_tstamp_cleanup() used to only clean up stale handlers in driver and was leaving the hardware registers not read. 
Not reading stale PTP Tx timestamps prevents next interrupts from arriving and makes timestamping unusable. Fixes: ea9b847cda64 ("ice: enable transmit timestamps for E810 devices") Signed-off-by: Michal Michalik Reviewed-by: Jacob Keller Reviewed-by: Paul Menzel Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index a1cd33273ca4..da025c204577 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -2287,6 +2287,7 @@ ice_ptp_init_tx_e810(struct ice_pf *pf, struct ice_ptp_tx *tx) /** * ice_ptp_tx_tstamp_cleanup - Cleanup old timestamp requests that got dropped + * @hw: pointer to the hw struct * @tx: PTP Tx tracker to clean up * * Loop through the Tx timestamp requests and see if any of them have been @@ -2295,7 +2296,7 @@ ice_ptp_init_tx_e810(struct ice_pf *pf, struct ice_ptp_tx *tx) * timestamp will never be captured. This might happen if the packet gets * discarded before it reaches the PHY timestamping block. 
*/ -static void ice_ptp_tx_tstamp_cleanup(struct ice_ptp_tx *tx) +static void ice_ptp_tx_tstamp_cleanup(struct ice_hw *hw, struct ice_ptp_tx *tx) { u8 idx; @@ -2304,11 +2305,16 @@ static void ice_ptp_tx_tstamp_cleanup(struct ice_ptp_tx *tx) for_each_set_bit(idx, tx->in_use, tx->len) { struct sk_buff *skb; + u64 raw_tstamp; /* Check if this SKB has been waiting for too long */ if (time_is_after_jiffies(tx->tstamps[idx].start + 2 * HZ)) continue; + /* Read tstamp to be able to use this register again */ + ice_read_phy_tstamp(hw, tx->quad, idx + tx->quad_offset, + &raw_tstamp); + spin_lock(&tx->lock); skb = tx->tstamps[idx].skb; tx->tstamps[idx].skb = NULL; @@ -2330,7 +2336,7 @@ static void ice_ptp_periodic_work(struct kthread_work *work) ice_ptp_update_cached_phctime(pf); - ice_ptp_tx_tstamp_cleanup(&pf->ptp.port.tx); + ice_ptp_tx_tstamp_cleanup(&pf->hw, &pf->ptp.port.tx); /* Run twice a second */ kthread_queue_delayed_work(ptp->kworker, &ptp->work, From 9e6c6d17d1d6a3f1515ce399f9a011629ec79aa0 Mon Sep 17 00:00:00 2001 From: Lokesh Dhoundiyal Date: Thu, 5 May 2022 14:00:17 +1200 Subject: [PATCH 123/179] ipv4: drop dst in multicast routing path kmemleak reports the following when routing multicast traffic over an ipsec tunnel. Kmemleak output: unreferenced object 0x8000000044bebb00 (size 256): comm "softirq", pid 0, jiffies 4294985356 (age 126.810s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 80 00 00 00 05 13 74 80 ..............t. 80 00 00 00 04 9b bf f9 00 00 00 00 00 00 00 00 ................ 
backtrace: [<00000000f83947e0>] __kmalloc+0x1e8/0x300 [<00000000b7ed8dca>] metadata_dst_alloc+0x24/0x58 [<0000000081d32c20>] __ipgre_rcv+0x100/0x2b8 [<00000000824f6cf1>] gre_rcv+0x178/0x540 [<00000000ccd4e162>] gre_rcv+0x7c/0xd8 [<00000000c024b148>] ip_protocol_deliver_rcu+0x124/0x350 [<000000006a483377>] ip_local_deliver_finish+0x54/0x68 [<00000000d9271b3a>] ip_local_deliver+0x128/0x168 [<00000000bd4968ae>] xfrm_trans_reinject+0xb8/0xf8 [<0000000071672a19>] tasklet_action_common.isra.16+0xc4/0x1b0 [<0000000062e9c336>] __do_softirq+0x1fc/0x3e0 [<00000000013d7914>] irq_exit+0xc4/0xe0 [<00000000a4d73e90>] plat_irq_dispatch+0x7c/0x108 [<000000000751eb8e>] handle_int+0x16c/0x178 [<000000001668023b>] _raw_spin_unlock_irqrestore+0x1c/0x28 The metadata dst is leaked when ip_route_input_mc() updates the dst for the skb. Commit f38a9eb1f77b ("dst: Metadata destinations") correctly handled dropping the dst in ip_route_input_slow() but missed the multicast case which is handled by ip_route_input_mc(). Drop the dst in ip_route_input_mc() avoiding the leak. Fixes: f38a9eb1f77b ("dst: Metadata destinations") Signed-off-by: Lokesh Dhoundiyal Signed-off-by: Chris Packham Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220505020017.3111846-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98c6f3429593..57abd27e842c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1753,6 +1753,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif RT_CACHE_STAT_INC(in_slow_mc); + skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return 0; } From d5076fe4049cadef1f040eda4aaa001bb5424225 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 May 2022 09:19:46 -0700 Subject: [PATCH 124/179] netlink: do not reset transport header in netlink_recvmsg() netlink_recvmsg() does not need to change transport header. 
If transport header was needed, it should have been reset by the producer (netlink_dump()), not the consumer(s). The following trace probably happened when multiple threads were using MSG_PEEK. BUG: KCSAN: data-race in netlink_recvmsg / netlink_recvmsg write to 0xffff88811e9f15b2 of 2 bytes by task 32012 on cpu 1: skb_reset_transport_header include/linux/skbuff.h:2760 [inline] netlink_recvmsg+0x1de/0x790 net/netlink/af_netlink.c:1978 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] __sys_recvfrom+0x204/0x2c0 net/socket.c:2097 __do_sys_recvfrom net/socket.c:2115 [inline] __se_sys_recvfrom net/socket.c:2111 [inline] __x64_sys_recvfrom+0x74/0x90 net/socket.c:2111 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff88811e9f15b2 of 2 bytes by task 32005 on cpu 0: skb_reset_transport_header include/linux/skbuff.h:2760 [inline] netlink_recvmsg+0x1de/0x790 net/netlink/af_netlink.c:1978 ____sys_recvmsg+0x162/0x2f0 ___sys_recvmsg net/socket.c:2674 [inline] __sys_recvmsg+0x209/0x3f0 net/socket.c:2704 __do_sys_recvmsg net/socket.c:2714 [inline] __se_sys_recvmsg net/socket.c:2711 [inline] __x64_sys_recvmsg+0x42/0x50 net/socket.c:2711 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffff -> 0x0000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 32005 Comm: syz-executor.4 Not tainted 5.18.0-rc1-syzkaller-00328-ge1f700ebd6be-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20220505161946.2867638-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/netlink/af_netlink.c 
b/net/netlink/af_netlink.c index 05a3795eac8e..73e9c0a9c187 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1975,7 +1975,6 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, copied = len; } - skb_reset_transport_header(data_skb); err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { From 1c7ab9cd98b78bef1657a5db7204d8d437e24c94 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 5 May 2022 16:31:01 -0700 Subject: [PATCH 125/179] net: chelsio: cxgb4: Avoid potential negative array offset Using min_t(int, ...) as a potential array index implies to the compiler that negative offsets should be allowed. This is not the case, though. Replace "int" with "unsigned int". Fixes the following warning exposed under future CONFIG_FORTIFY_SOURCE improvements: In file included from include/linux/string.h:253, from include/linux/bitmap.h:11, from include/linux/cpumask.h:12, from include/linux/smp.h:13, from include/linux/lockdep.h:14, from include/linux/rcupdate.h:29, from include/linux/rculist.h:11, from include/linux/pid.h:5, from include/linux/sched.h:14, from include/linux/delay.h:23, from drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:35: drivers/net/ethernet/chelsio/cxgb4/t4_hw.c: In function 't4_get_raw_vpd_params': include/linux/fortify-string.h:46:33: warning: '__builtin_memcpy' pointer overflow between offset 29 and size [2147483648, 4294967295] [-Warray-bounds] 46 | #define __underlying_memcpy __builtin_memcpy | ^ include/linux/fortify-string.h:388:9: note: in expansion of macro '__underlying_memcpy' 388 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ include/linux/fortify-string.h:433:26: note: in expansion of macro '__fortify_memcpy_chk' 433 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:2796:9: note: in expansion of macro 'memcpy' 2796 | memcpy(p->id, vpd + id, min_t(int, id_len, ID_LEN)); | ^~~~~~ 
include/linux/fortify-string.h:46:33: warning: '__builtin_memcpy' pointer overflow between offset 0 and size [2147483648, 4294967295] [-Warray-bounds] 46 | #define __underlying_memcpy __builtin_memcpy | ^ include/linux/fortify-string.h:388:9: note: in expansion of macro '__underlying_memcpy' 388 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ include/linux/fortify-string.h:433:26: note: in expansion of macro '__fortify_memcpy_chk' 433 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/chelsio/cxgb4/t4_hw.c:2798:9: note: in expansion of macro 'memcpy' 2798 | memcpy(p->sn, vpd + sn, min_t(int, sn_len, SERNUM_LEN)); | ^~~~~~ Additionally remove needless cast from u8[] to char * in last strim() call. Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202205031926.FVP7epJM-lkp@intel.com Fixes: fc9279298e3a ("cxgb4: Search VPD with pci_vpd_find_ro_info_keyword()") Fixes: 24c521f81c30 ("cxgb4: Use pci_vpd_find_id_string() to find VPD ID string") Cc: Raju Rangoju Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220505233101.1224230-1-keescook@chromium.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index e7b4e3ed056c..8d719f82854a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -2793,14 +2793,14 @@ int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p) goto out; na = ret; - memcpy(p->id, vpd + id, min_t(int, id_len, ID_LEN)); + memcpy(p->id, vpd + id, min_t(unsigned int, id_len, ID_LEN)); strim(p->id); - memcpy(p->sn, vpd + sn, min_t(int, sn_len, SERNUM_LEN)); + memcpy(p->sn, vpd + sn, min_t(unsigned int, sn_len, SERNUM_LEN)); strim(p->sn); - memcpy(p->pn, vpd + pn, min_t(int, 
pn_len, PN_LEN)); + memcpy(p->pn, vpd + pn, min_t(unsigned int, pn_len, PN_LEN)); strim(p->pn); - memcpy(p->na, vpd + na, min_t(int, na_len, MACADDR_LEN)); - strim((char *)p->na); + memcpy(p->na, vpd + na, min_t(unsigned int, na_len, MACADDR_LEN)); + strim(p->na); out: vfree(vpd); From f71f01394f742fc4558b3f9f4c7ef4c4cf3b07c8 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 8 May 2022 11:37:07 +0200 Subject: [PATCH 126/179] floppy: use a statically allocated error counter Interrupt handler bad_flp_intr() may cause a UAF on the recently freed request just to increment the error count. There's no point keeping that one in the request anyway, and since the interrupt handler uses a static pointer to the error which cannot be kept in sync with the pending request, better make it use a static error counter that's reset for each new request. This reset now happens when entering redo_fd_request() for a new request via set_next_request(). One initial concern about a single error counter was that errors on one floppy drive could be reported on another one, but this problem is not real given that the driver uses a single drive at a time, as PC-compatible controllers also have this limitation by using shared signals. As such the error count is always for the "current" drive. Reported-by: Minh Yuan Suggested-by: Linus Torvalds Tested-by: Denis Efremov Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- drivers/block/floppy.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index d5b9ff9bcbb2..015841f50f4e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -509,8 +509,8 @@ static unsigned long fdc_busy; static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); static DECLARE_WAIT_QUEUE_HEAD(command_done); -/* Errors during formatting are counted here. 
*/ -static int format_errors; +/* errors encountered on the current (or last) request */ +static int floppy_errors; /* Format request descriptor. */ static struct format_descr format_req; @@ -530,7 +530,6 @@ static struct format_descr format_req; static char *floppy_track_buffer; static int max_buffer_sectors; -static int *errors; typedef void (*done_f)(int); static const struct cont_t { void (*interrupt)(void); @@ -1455,7 +1454,7 @@ static int interpret_errors(void) if (drive_params[current_drive].flags & FTD_MSG) DPRINT("Over/Underrun - retrying\n"); bad = 0; - } else if (*errors >= drive_params[current_drive].max_errors.reporting) { + } else if (floppy_errors >= drive_params[current_drive].max_errors.reporting) { print_errors(); } if (reply_buffer[ST2] & ST2_WC || reply_buffer[ST2] & ST2_BC) @@ -2095,7 +2094,7 @@ static void bad_flp_intr(void) if (!next_valid_format(current_drive)) return; } - err_count = ++(*errors); + err_count = ++floppy_errors; INFBOUND(write_errors[current_drive].badness, err_count); if (err_count > drive_params[current_drive].max_errors.abort) cont->done(0); @@ -2241,9 +2240,8 @@ static int do_format(int drive, struct format_descr *tmp_format_req) return -EINVAL; } format_req = *tmp_format_req; - format_errors = 0; cont = &format_cont; - errors = &format_errors; + floppy_errors = 0; ret = wait_til_done(redo_format, true); if (ret == -EINTR) return -EINTR; @@ -2759,10 +2757,11 @@ static int set_next_request(void) current_req = list_first_entry_or_null(&floppy_reqs, struct request, queuelist); if (current_req) { - current_req->error_count = 0; + floppy_errors = 0; list_del_init(&current_req->queuelist); + return 1; } - return current_req != NULL; + return 0; } /* Starts or continues processing request. 
Will automatically unlock the @@ -2821,7 +2820,6 @@ do_request: _floppy = floppy_type + drive_params[current_drive].autodetect[drive_state[current_drive].probed_format]; } else probing = 0; - errors = &(current_req->error_count); tmp = make_raw_rw_request(); if (tmp < 2) { request_done(tmp); From f3b10a3c22c6a5f1d623b70eca2b4d1efafccd71 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 8 May 2022 11:37:08 +0200 Subject: [PATCH 127/179] ataflop: use a statically allocated error counters This is the last driver making use of fd_request->error_count, which is easy to get wrong as was shown in floppy.c. We don't need to keep it there, it can be moved to the atari_floppy_struct instead, so let's do this. Suggested-by: Linus Torvalds Cc: Minh Yuan Cc: Geert Uytterhoeven Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- drivers/block/ataflop.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 5d819a466e2f..e232cc4fd444 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -303,6 +303,7 @@ static struct atari_floppy_struct { int ref; int type; struct blk_mq_tag_set tag_set; + int error_count; } unit[FD_MAX_UNITS]; #define UD unit[drive] @@ -705,14 +706,14 @@ static void fd_error( void ) if (!fd_request) return; - fd_request->error_count++; - if (fd_request->error_count >= MAX_ERRORS) { + unit[SelectedDrive].error_count++; + if (unit[SelectedDrive].error_count >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); fd_end_request_cur(BLK_STS_IOERR); finish_fdc(); return; } - else if (fd_request->error_count == RECALIBRATE_ERRORS) { + else if (unit[SelectedDrive].error_count == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); if (SelectedDrive != -1) SUD.track = -1; @@ -1491,7 +1492,7 @@ static void setup_req_params( int drive ) ReqData = ReqBuffer + 512 * ReqCnt; if (UseTrackbuffer) - read_track = 
(ReqCmd == READ && fd_request->error_count == 0); + read_track = (ReqCmd == READ && unit[drive].error_count == 0); else read_track = 0; @@ -1520,6 +1521,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_RESOURCE; } fd_request = bd->rq; + unit[drive].error_count = 0; blk_mq_start_request(fd_request); atari_disable_irq( IRQ_MFP_FDC ); From 2e3afb42dd480d755cd7d97ca04586a5616c1a5e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 8 May 2022 11:37:09 +0200 Subject: [PATCH 128/179] blk-mq: remove the error_count from struct request The last two users were floppy.c and ataflop.c respectively, it was verified that no other drivers makes use of this, so let's remove it. Suggested-by: Linus Torvalds Cc: Minh Yuan Cc: Denis Efremov , Cc: Geert Uytterhoeven Cc: Christoph Hellwig Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7aa5c54901a9..9f07061418db 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -163,7 +163,6 @@ struct request { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; - int error_count; /* for legacy drivers, don't use */ }; From 9dc4241bb14afecd16518a0760bceb3d7359b12a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 7 May 2022 15:31:16 +0200 Subject: [PATCH 129/179] Revert "parisc: Mark cr16 CPU clocksource unstable on all SMP machines" This reverts commit afdb4a5b1d340e4afffc65daa21cc71890d7d589. It triggers RCU stalls at boot with a 32-bit kernel. 
Signed-off-by: Helge Deller Noticed-by: John David Anglin Cc: stable@vger.kernel.org # v5.16+ --- arch/parisc/kernel/time.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index bb27dfeeddfc..95ee9e1a364b 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -251,16 +251,30 @@ void __init time_init(void) static int __init init_cr16_clocksource(void) { /* - * The cr16 interval timers are not syncronized across CPUs, even if - * they share the same socket. + * The cr16 interval timers are not syncronized across CPUs on + * different sockets, so mark them unstable and lower rating on + * multi-socket SMP systems. */ if (num_online_cpus() > 1 && !running_on_qemu) { - /* mark sched_clock unstable */ - clear_sched_clock_stable(); + int cpu; + unsigned long cpu0_loc; + cpu0_loc = per_cpu(cpu_data, 0).cpu_loc; - clocksource_cr16.name = "cr16_unstable"; - clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; - clocksource_cr16.rating = 0; + for_each_online_cpu(cpu) { + if (cpu == 0) + continue; + if ((cpu0_loc != 0) && + (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc)) + continue; + + /* mark sched_clock unstable */ + clear_sched_clock_stable(); + + clocksource_cr16.name = "cr16_unstable"; + clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; + clocksource_cr16.rating = 0; + break; + } } /* register at clocksource framework */ From 7962c0896429af2a0e00ec6bc15d992536453b2d Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 7 May 2022 15:32:38 +0200 Subject: [PATCH 130/179] Revert "parisc: Mark sched_clock unstable only if clocks are not syncronized" This reverts commit d97180ad68bdb7ee10f327205a649bc2f558741d. It triggers RCU stalls at boot with a 32-bit kernel. 
Signed-off-by: Helge Deller Noticed-by: John David Anglin Cc: stable@vger.kernel.org # v5.15+ --- arch/parisc/kernel/setup.c | 2 ++ arch/parisc/kernel/time.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index b91cb45ffd4e..f005ddedb50e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -161,6 +161,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PA11 dma_ops_init(); #endif + + clear_sched_clock_stable(); } /* diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 95ee9e1a364b..19c31a72fe76 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -267,9 +267,6 @@ static int __init init_cr16_clocksource(void) (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc)) continue; - /* mark sched_clock unstable */ - clear_sched_clock_stable(); - clocksource_cr16.name = "cr16_unstable"; clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; clocksource_cr16.rating = 0; @@ -277,6 +274,10 @@ static int __init init_cr16_clocksource(void) } } + /* XXX: We may want to mark sched_clock stable here if cr16 clocks are + * in sync: + * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */ + /* register at clocksource framework */ clocksource_register_hz(&clocksource_cr16, 100 * PAGE0->mem_10msec); From 6c800d7f55fcd78e17deae5ae4374d8e73482c13 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 8 May 2022 10:18:40 +0200 Subject: [PATCH 131/179] Revert "parisc: Fix patch code locking and flushing" This reverts commit a9fe7fa7d874a536e0540469f314772c054a0323. Leads to segfaults on 32bit kernel. 
Signed-off-by: Helge Deller --- arch/parisc/kernel/patch.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index e59574f65e64..80a0ab372802 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -40,7 +40,10 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags, *need_unmap = 1; set_fixmap(fixmap, page_to_phys(page)); - raw_spin_lock_irqsave(&patch_lock, *flags); + if (flags) + raw_spin_lock_irqsave(&patch_lock, *flags); + else + __acquire(&patch_lock); return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); } @@ -49,7 +52,10 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags) { clear_fixmap(fixmap); - raw_spin_unlock_irqrestore(&patch_lock, *flags); + if (flags) + raw_spin_unlock_irqrestore(&patch_lock, *flags); + else + __release(&patch_lock); } void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) @@ -61,9 +67,8 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) int mapped; /* Make sure we don't have any aliases in cache */ - flush_kernel_dcache_range_asm(start, end); - flush_kernel_icache_range_asm(start, end); - flush_tlb_kernel_range(start, end); + flush_kernel_vmap_range(addr, len); + flush_icache_range(start, end); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped); @@ -76,10 +81,8 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) * We're crossing a page boundary, so * need to remap */ - flush_kernel_dcache_range_asm((unsigned long)fixmap, - (unsigned long)p); - flush_tlb_kernel_range((unsigned long)fixmap, - (unsigned long)p); + flush_kernel_vmap_range((void *)fixmap, + (p-fixmap) * sizeof(*p)); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, @@ -87,10 +90,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned 
int len) } } - flush_kernel_dcache_range_asm((unsigned long)fixmap, (unsigned long)p); - flush_tlb_kernel_range((unsigned long)fixmap, (unsigned long)p); + flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p)); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); + flush_icache_range(start, end); } void __kprobes __patch_text(void *addr, u32 insn) From 0921244f6f4f0d05698b953fe632a99b38907226 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 1 Apr 2022 09:19:11 +0200 Subject: [PATCH 132/179] parisc: Only list existing CPUs in cpu_possible_mask The inventory knows which CPUs are in the system, so this bitmask should be in cpu_possible_mask instead of the bitmask based on CONFIG_NR_CPUS. Reset the cpu_possible_mask before scanning the system for CPUs, and mark each existing CPU as possible during initialization of that CPU. This avoids those warnings later on too: register_cpu_capacity_sysctl: too early to get CPU4 device! Signed-off-by: Helge Deller Noticed-by: John David Anglin --- arch/parisc/kernel/processor.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index d98692115221..9e92b76b0ce0 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -171,6 +171,7 @@ static int __init processor_probe(struct parisc_device *dev) p->cpu_num = cpu_info.cpu_num; p->cpu_loc = cpu_info.cpu_loc; + set_cpu_possible(cpuid, true); store_cpu_topology(cpuid); #ifdef CONFIG_SMP @@ -461,6 +462,13 @@ static struct parisc_driver cpu_driver __refdata = { */ void __init processor_init(void) { + unsigned int cpu; + reset_cpu_topology(); + + /* reset possible mask. We will mark those which are possible. 
*/ + for_each_possible_cpu(cpu) + set_cpu_possible(cpu, false); + register_parisc_driver(&cpu_driver); } From 7e93a3dd63db2341d094ab1d9ba29b5d8d5093d1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 1 Apr 2022 18:55:27 +0200 Subject: [PATCH 133/179] parisc: Update 32- and 64-bit defconfigs Enable CONFIG_CGROUPS=y on 32-bit defconfig for systemd-support, and enable CONFIG_NAMESPACES and CONFIG_USER_NS. Signed-off-by: Helge Deller --- arch/parisc/configs/generic-32bit_defconfig | 4 +++- arch/parisc/configs/generic-64bit_defconfig | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index a5fee10d76ee..8ce0ae370680 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -6,6 +6,9 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=16 +CONFIG_CGROUPS=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y @@ -47,7 +50,6 @@ CONFIG_PARPORT=y CONFIG_PARPORT_PC=m CONFIG_PARPORT_1284=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_CRYPTOLOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=6144 CONFIG_BLK_DEV_SD=y diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 1b8fd80cbe7f..57501b0aed92 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -16,6 +16,7 @@ CONFIG_CGROUPS=y CONFIG_MEMCG=y CONFIG_CGROUP_PIDS=y CONFIG_CPUSETS=y +CONFIG_USER_NS=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -267,9 +268,9 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRC_CCITT=m CONFIG_LIBCRC32C=y CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_KERNEL=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_SCHED_DEBUG is not set From 
1955c4f879a130c7822f483cf593338ad747aed4 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 1 Apr 2022 22:24:20 +0200 Subject: [PATCH 134/179] parisc: Re-enable GENERIC_CPU_DEVICES for !SMP In commit 62773112acc5 ("parisc: Switch from GENERIC_CPU_DEVICES to GENERIC_ARCH_TOPOLOGY") GENERIC_CPU_DEVICES was unconditionally turned off, but this triggers a warning in topology_add_dev(). Turning it back on for the !SMP case avoids this warning. Reported-by: Guenter Roeck Tested-by: Guenter Roeck Fixes: 62773112acc5 ("parisc: Switch from GENERIC_CPU_DEVICES to GENERIC_ARCH_TOPOLOGY") Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 52e550b45692..bd22578859d0 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -38,6 +38,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW select SYSCTL_EXCEPTION_TRACE From 5b89966bc96a06f6ad65f64ae4b0461918fcc9d3 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 3 Apr 2022 21:57:51 +0200 Subject: [PATCH 135/179] parisc: Merge model and model name into one line in /proc/cpuinfo The Linux tool "lscpu" shows the double amount of CPUs if we have "model" and "model name" in two different lines in /proc/cpuinfo. This change combines the model and the model name into one line. 
Signed-off-by: Helge Deller Cc: stable@vger.kernel.org --- arch/parisc/kernel/processor.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 9e92b76b0ce0..26eb568f8b96 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -420,8 +420,7 @@ show_cpuinfo (struct seq_file *m, void *v) } seq_printf(m, " (0x%02lx)\n", boot_cpu_data.pdc.capabilities); - seq_printf(m, "model\t\t: %s\n" - "model name\t: %s\n", + seq_printf(m, "model\t\t: %s - %s\n", boot_cpu_data.pdc.sys_model_name, cpuinfo->dev ? cpuinfo->dev->name : "Unknown"); From 234ff4c585d704896450a3634a7c29fa4e1907e1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 5 Apr 2022 21:28:37 +0200 Subject: [PATCH 136/179] parisc: Change MAX_ADDRESS to become unsigned long long Dave noticed that for the 32-bit kernel MAX_ADDRESS should be a ULL, otherwise this define would become 0: MAX_ADDRESS (1UL << MAX_ADDRBITS) It has no real effect on the kernel. Signed-off-by: Helge Deller Noticed-by: John David Anglin --- arch/parisc/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 939db6fe620b..69765a6dbe89 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -160,7 +160,7 @@ extern void __update_cache(pte_t pte); #define SPACEID_SHIFT (MAX_ADDRBITS - 32) #else #define MAX_ADDRBITS (BITS_PER_LONG) -#define MAX_ADDRESS (1UL << MAX_ADDRBITS) +#define MAX_ADDRESS (1ULL << MAX_ADDRBITS) #define SPACEID_SHIFT 0 #endif From a65bcad5421507c2f6c52e1e2ca6a6ce02fd1ad6 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 30 Apr 2022 21:07:18 +0200 Subject: [PATCH 137/179] parisc: Fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. 
Signed-off-by: Julia Lawall Signed-off-by: Helge Deller --- arch/parisc/kernel/kprobes.c | 2 +- arch/parisc/kernel/traps.c | 2 +- arch/parisc/math-emu/dfadd.c | 2 +- arch/parisc/math-emu/dfsub.c | 2 +- arch/parisc/math-emu/sfadd.c | 2 +- arch/parisc/math-emu/sfsub.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 3343d2fb7889..6e0b86652f30 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -152,7 +152,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) /* for absolute branch instructions we can copy iaoq_b. for relative * branch instructions we need to calculate the new address based on the * difference between iaoq_f and iaoq_b. We cannot use iaoq_b without - * modificationt because it's based on our ainsn.insn address. + * modifications because it's based on our ainsn.insn address. */ if (p->post_handler) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index a6e61cf2cad0..b78f1b9d45c1 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -469,7 +469,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o * panic notifiers, and we should call panic * directly from the location that we wish. * e.g. We should not call panic from - * parisc_terminate, but rather the oter way around. + * parisc_terminate, but rather the other way around. * This hack works, prints the panic message twice, * and it enables reboot timers! */ diff --git a/arch/parisc/math-emu/dfadd.c b/arch/parisc/math-emu/dfadd.c index ec487e07f004..00e561d4aa55 100644 --- a/arch/parisc/math-emu/dfadd.c +++ b/arch/parisc/math-emu/dfadd.c @@ -253,7 +253,7 @@ dbl_fadd( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. 
*/ } else { diff --git a/arch/parisc/math-emu/dfsub.c b/arch/parisc/math-emu/dfsub.c index c4f30acf2d48..4f03782284bd 100644 --- a/arch/parisc/math-emu/dfsub.c +++ b/arch/parisc/math-emu/dfsub.c @@ -256,7 +256,7 @@ dbl_fsub( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/sfadd.c b/arch/parisc/math-emu/sfadd.c index 838758279d5b..9b98c874dfac 100644 --- a/arch/parisc/math-emu/sfadd.c +++ b/arch/parisc/math-emu/sfadd.c @@ -249,7 +249,7 @@ sgl_fadd( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/sfsub.c b/arch/parisc/math-emu/sfsub.c index 583d3ace4634..29d9eed09d12 100644 --- a/arch/parisc/math-emu/sfsub.c +++ b/arch/parisc/math-emu/sfsub.c @@ -252,7 +252,7 @@ sgl_fsub( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { From 340233dcc0160aafcce46ca893d1679f16acf409 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 8 May 2022 18:25:00 +0200 Subject: [PATCH 138/179] parisc: Mark cr16 clock unstable on all SMP machines The cr16 interval timers are not synchronized across CPUs, even with just one dual-core CPU. This becomes visible if the machines have a longer uptime. 
Signed-off-by: Helge Deller --- arch/parisc/kernel/time.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 19c31a72fe76..9714fbd7c42d 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -251,33 +251,14 @@ void __init time_init(void) static int __init init_cr16_clocksource(void) { /* - * The cr16 interval timers are not syncronized across CPUs on - * different sockets, so mark them unstable and lower rating on - * multi-socket SMP systems. + * The cr16 interval timers are not synchronized across CPUs. */ if (num_online_cpus() > 1 && !running_on_qemu) { - int cpu; - unsigned long cpu0_loc; - cpu0_loc = per_cpu(cpu_data, 0).cpu_loc; - - for_each_online_cpu(cpu) { - if (cpu == 0) - continue; - if ((cpu0_loc != 0) && - (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc)) - continue; - - clocksource_cr16.name = "cr16_unstable"; - clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; - clocksource_cr16.rating = 0; - break; - } + clocksource_cr16.name = "cr16_unstable"; + clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; + clocksource_cr16.rating = 0; } - /* XXX: We may want to mark sched_clock stable here if cr16 clocks are - * in sync: - * (clocksource_cr16.flags == CLOCK_SOURCE_IS_CONTINUOUS) */ - /* register at clocksource framework */ clocksource_register_hz(&clocksource_cr16, 100 * PAGE0->mem_10msec); From ba0c04104082ca211e108dd8eec6db2ad7676528 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 8 May 2022 19:55:13 +0200 Subject: [PATCH 139/179] Revert "parisc: Increase parisc_cache_flush_threshold setting" This reverts commit a58e9d0984e8dad53f17ec73ae3c1cc7f8d88151. Triggers segfaults with 32-bit kernels on PA8500 machines. 
Signed-off-by: Helge Deller --- arch/parisc/kernel/cache.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 23348199f3f8..e7911225a4f8 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -403,7 +403,7 @@ void __init parisc_setup_cache_timing(void) { unsigned long rangetime, alltime; unsigned long size; - unsigned long threshold, threshold2; + unsigned long threshold; alltime = mfctl(16); flush_data_cache(); @@ -418,20 +418,8 @@ void __init parisc_setup_cache_timing(void) alltime, size, rangetime); threshold = L1_CACHE_ALIGN(size * alltime / rangetime); - - /* - * The threshold computed above isn't very reliable since the - * flush times depend greatly on the percentage of dirty lines - * in the flush range. Further, the whole cache time doesn't - * include the time to refill lines that aren't in the mm/vma - * being flushed. By timing glibc build and checks on mako cpus, - * the following formula seems to work reasonably well. The - * value from the timing calculation is too small, and increases - * build and check times by almost a factor two. 
- */ - threshold2 = cache_info.dc_size * num_online_cpus(); - if (threshold2 > threshold) - threshold = threshold2; + if (threshold > cache_info.dc_size) + threshold = cache_info.dc_size; if (threshold) parisc_cache_flush_threshold = threshold; printk(KERN_INFO "Cache flush threshold set to %lu KiB\n", From c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 8 May 2022 13:54:17 -0700 Subject: [PATCH 140/179] Linux 5.18-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9a820c525b86..2284d1ca2503 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Superb Owl # *DOCUMENTATION* From 49e6123c65dac6393b04f39ceabf79c44f66b8be Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 4 May 2022 12:32:27 +0000 Subject: [PATCH 141/179] net: sfc: fix memory leak due to ptp channel It fixes memory leak in ring buffer change logic. When ring buffer size is changed(ethtool -G eth0 rx 4096), sfc driver works like below. 1. stop all channels and remove ring buffers. 2. allocates new buffer array. 3. allocates rx buffers. 4. start channels. While the above steps are working, it skips some steps if the channel doesn't have a ->copy callback function. Due to ptp channel doesn't have ->copy callback, these above steps are skipped for ptp channel. It eventually makes some problems. a. ptp channel's ring buffer size is not changed, it works only 1024(default). b. memory leak. The reason for memory leak is to use the wrong ring buffer values. There are some values, which is related to ring buffer size. a. efx->rxq_entries - This is global value of rx queue size. b. rx_queue->ptr_mask - used for access ring buffer as circular ring. - roundup_pow_of_two(efx->rxq_entries) - 1 c. rx_queue->max_fill - efx->rxq_entries - EFX_RXD_HEAD_ROOM These all values should be based on ring buffer size consistently. 
But the ptp channel's values are not. a. efx->rxq_entries - This is a global (for sfc) value, always the new ring buffer size. b. rx_queue->ptr_mask - This is always 1023 (default). c. rx_queue->max_fill - This is the new ring buffer size - EFX_RXD_HEAD_ROOM. Let's assume we set 4096 for the rx ring buffer; the resulting values (normal channel / ptp channel) are: efx->rxq_entries 4096 / 4096, rx_queue->ptr_mask 4095 / 1023, rx_queue->max_fill 4086 / 4086. The sfc driver allocates rx ring buffers based on these values. When it allocates the ptp channel's ring buffer, 4086 ring buffers are allocated and then attached to the allocated array. But the ptp channel's ring buffer array size is still 1024 (default) and ptr_mask is still 1023 too. So, 3062 ring buffers will be overwritten in the array. This is the reason for the memory leak. Test commands (with eth0 as the sfc interface): ethtool -G eth0 rx 4096; while :; do ip link set eth0 up; ip link set eth0 down; done In order to avoid this problem, it adds a ->copy callback to the ptp channel type, so that the rx_queue->ptr_mask value will be updated correctly. Fixes: 7c236c43b838 ("sfc: Add support for IEEE-1588 PTP") Signed-off-by: Taehee Yoo Signed-off-by: David S.
Miller --- drivers/net/ethernet/sfc/efx_channels.c | 7 ++++++- drivers/net/ethernet/sfc/ptp.c | 14 +++++++++++++- drivers/net/ethernet/sfc/ptp.h | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c index 377df8b7f015..40df910aa140 100644 --- a/drivers/net/ethernet/sfc/efx_channels.c +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -867,7 +867,9 @@ static void efx_set_xdp_channels(struct efx_nic *efx) int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) { - struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel; + struct efx_channel *other_channel[EFX_MAX_CHANNELS], *channel, + *ptp_channel = efx_ptp_channel(efx); + struct efx_ptp_data *ptp_data = efx->ptp_data; unsigned int i, next_buffer_table = 0; u32 old_rxq_entries, old_txq_entries; int rc, rc2; @@ -938,6 +940,7 @@ int efx_realloc_channels(struct efx_nic *efx, u32 rxq_entries, u32 txq_entries) efx_set_xdp_channels(efx); out: + efx->ptp_data = NULL; /* Destroy unused channel structures */ for (i = 0; i < efx->n_channels; i++) { channel = other_channel[i]; @@ -948,6 +951,7 @@ out: } } + efx->ptp_data = ptp_data; rc2 = efx_soft_enable_interrupts(efx); if (rc2) { rc = rc ? rc : rc2; @@ -966,6 +970,7 @@ rollback: efx->txq_entries = old_txq_entries; for (i = 0; i < efx->n_channels; i++) swap(efx->channel[i], other_channel[i]); + efx_ptp_update_channel(efx, ptp_channel); goto out; } diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index f0ef515e2ade..4625f85acab2 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -45,6 +45,7 @@ #include "farch_regs.h" #include "tx.h" #include "nic.h" /* indirectly includes ptp.h */ +#include "efx_channels.h" /* Maximum number of events expected to make up a PTP event */ #define MAX_EVENT_FRAGS 3 @@ -541,6 +542,12 @@ struct efx_channel *efx_ptp_channel(struct efx_nic *efx) return efx->ptp_data ? 
efx->ptp_data->channel : NULL; } +void efx_ptp_update_channel(struct efx_nic *efx, struct efx_channel *channel) +{ + if (efx->ptp_data) + efx->ptp_data->channel = channel; +} + static u32 last_sync_timestamp_major(struct efx_nic *efx) { struct efx_channel *channel = efx_ptp_channel(efx); @@ -1443,6 +1450,11 @@ int efx_ptp_probe(struct efx_nic *efx, struct efx_channel *channel) int rc = 0; unsigned int pos; + if (efx->ptp_data) { + efx->ptp_data->channel = channel; + return 0; + } + ptp = kzalloc(sizeof(struct efx_ptp_data), GFP_KERNEL); efx->ptp_data = ptp; if (!efx->ptp_data) @@ -2176,7 +2188,7 @@ static const struct efx_channel_type efx_ptp_channel_type = { .pre_probe = efx_ptp_probe_channel, .post_remove = efx_ptp_remove_channel, .get_name = efx_ptp_get_channel_name, - /* no copy operation; there is no need to reallocate this channel */ + .copy = efx_copy_channel, .receive_skb = efx_ptp_rx, .want_txqs = efx_ptp_want_txqs, .keep_eventq = false, diff --git a/drivers/net/ethernet/sfc/ptp.h b/drivers/net/ethernet/sfc/ptp.h index 9855e8c9e544..7b1ef7002b3f 100644 --- a/drivers/net/ethernet/sfc/ptp.h +++ b/drivers/net/ethernet/sfc/ptp.h @@ -16,6 +16,7 @@ struct ethtool_ts_info; int efx_ptp_probe(struct efx_nic *efx, struct efx_channel *channel); void efx_ptp_defer_probe_with_channel(struct efx_nic *efx); struct efx_channel *efx_ptp_channel(struct efx_nic *efx); +void efx_ptp_update_channel(struct efx_nic *efx, struct efx_channel *channel); void efx_ptp_remove(struct efx_nic *efx); int efx_ptp_set_ts_config(struct efx_nic *efx, struct ifreq *ifr); int efx_ptp_get_ts_config(struct efx_nic *efx, struct ifreq *ifr); From cf3ab8d4a797960b4be20565abb3bcd227b18a68 Mon Sep 17 00:00:00 2001 From: Lina Wang Date: Thu, 5 May 2022 13:48:49 +0800 Subject: [PATCH 142/179] net: fix wrong network header length When clatd starts with ebpf offloading and NETIF_F_GRO_FRAGLIST is enabled, several skbs are gathered in skb_shinfo(skb)->frag_list.
The first skb's ipv6 header will be changed to ipv4 after bpf_skb_proto_6_to_4; its network_header/transport_header/mac_header are updated for ipv4, but the other skbs in frag_list are not updated at all and remain ipv6 packets. udp_queue_rcv_skb will call skb_segment_list to traverse the other skbs in frag_list and make sure the right udp payload is delivered to user space. Unfortunately, the other skbs in frag_list, which are still ipv6 packets, are updated like the first skb and will have a wrong transport header length. E.g. before bpf_skb_proto_6_to_4, the first skb and the other skbs in frag_list have the same network_header (24) and transport_header (64). After bpf_skb_proto_6_to_4, the ipv6 protocol has been changed to ipv4: the first skb's network_header is 44 and its transport_header is 64, while the other skbs in frag_list did not change. After skb_segment_list, the other skbs in frag_list have a different network_header (24) and transport_header (44), so there will be a 20-byte difference from the original, which is the difference between the ipv6 header and the ipv4 header. Just change transport_header to be the same as the original. Actually, there are two solutions to fix it: one is traversing all skbs and changing every skb header in bpf_skb_proto_6_to_4, the other is modifying the frag_list skbs' headers in skb_segment_list. Considering efficiency, adopt the second one -- when the first skb and the other skbs in frag_list have different network_header lengths, restore them to make sure the right udp payload is delivered to user space. Signed-off-by: Lina Wang Signed-off-by: David S.
Miller --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 30b523fa4ad2..c90c74de90d5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3897,7 +3897,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; - int err; + int len_diff, err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3937,9 +3937,11 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); + len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); From edae34a3ed9293b5077dddf9e51a3d86c95dc76a Mon Sep 17 00:00:00 2001 From: Lina Wang Date: Thu, 5 May 2022 13:48:50 +0800 Subject: [PATCH 143/179] selftests net: add UDP GRO fraglist + bpf self-tests When NETIF_F_GRO_FRAGLIST is enabled and bpf_skb_change_proto is used, check if udp packets and tcp packets are successfully delivered to user space. If wrong udp packets are delivered, udpgso_bench_rx will exit with "Initial byte out of range". Signed-off-by: Maciej Żenczykowski Signed-off-by: Lina Wang Signed-off-by: David S.
Miller --- tools/testing/selftests/net/Makefile | 3 + tools/testing/selftests/net/bpf/Makefile | 14 + tools/testing/selftests/net/bpf/nat6to4.c | 285 ++++++++++++++++++ tools/testing/selftests/net/udpgro_frglist.sh | 101 +++++++ 4 files changed, 403 insertions(+) create mode 100644 tools/testing/selftests/net/bpf/Makefile create mode 100644 tools/testing/selftests/net/bpf/nat6to4.c create mode 100755 tools/testing/selftests/net/udpgro_frglist.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 0f2ebc38d893..e1f998defd10 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -25,6 +25,7 @@ TEST_PROGS += bareudp.sh TEST_PROGS += amt.sh TEST_PROGS += unicast_extensions.sh TEST_PROGS += udpgro_fwd.sh +TEST_PROGS += udpgro_frglist.sh TEST_PROGS += veth.sh TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh @@ -61,6 +62,8 @@ TEST_FILES := settings KSFT_KHDR_INSTALL := 1 include ../lib.mk +include bpf/Makefile + $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread $(OUTPUT)/tcp_inq: LDLIBS += -lpthread diff --git a/tools/testing/selftests/net/bpf/Makefile b/tools/testing/selftests/net/bpf/Makefile new file mode 100644 index 000000000000..f91bf14bbee7 --- /dev/null +++ b/tools/testing/selftests/net/bpf/Makefile @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 + +CLANG ?= clang +CCINCLUDE += -I../../bpf +CCINCLUDE += -I../../../../../usr/include/ + +TEST_CUSTOM_PROGS = $(OUTPUT)/bpf/nat6to4.o +all: $(TEST_CUSTOM_PROGS) + +$(OUTPUT)/%.o: %.c + $(CLANG) -O2 -target bpf -c $< $(CCINCLUDE) -o $@ + +clean: + rm -f $(TEST_CUSTOM_PROGS) diff --git a/tools/testing/selftests/net/bpf/nat6to4.c b/tools/testing/selftests/net/bpf/nat6to4.c new file mode 100644 index 000000000000..ac54c36b25fc --- /dev/null +++ b/tools/testing/selftests/net/bpf/nat6to4.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This code is taken from the Android Open Source 
Project and the author + * (Maciej Żenczykowski) has gave permission to relicense it under the + * GPLv2. Therefore this program is free software; + * You can redistribute it and/or modify it under the terms of the GNU + * General Public License version 2 as published by the Free Software + * Foundation + + * The original headers, including the original license headers, are + * included below for completeness. + * + * Copyright (C) 2019 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + +#include +#include + +#define IP_DF 0x4000 // Flag: "Don't Fragment" + +SEC("schedcls/ingress6/nat_6") +int sched_cls_ingress6_nat_6_prog(struct __sk_buff *skb) +{ + const int l2_header_size = sizeof(struct ethhdr); + void *data = (void *)(long)skb->data; + const void *data_end = (void *)(long)skb->data_end; + const struct ethhdr * const eth = data; // used iff is_ethernet + const struct ipv6hdr * const ip6 = (void *)(eth + 1); + + // Require ethernet dst mac address to be our unicast address. 
+ if (skb->pkt_type != PACKET_HOST) + return TC_ACT_OK; + + // Must be meta-ethernet IPv6 frame + if (skb->protocol != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + // Must have (ethernet and) ipv6 header + if (data + l2_header_size + sizeof(*ip6) > data_end) + return TC_ACT_OK; + + // Ethertype - if present - must be IPv6 + if (eth->h_proto != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + // IP version must be 6 + if (ip6->version != 6) + return TC_ACT_OK; + // Maximum IPv6 payload length that can be translated to IPv4 + if (bpf_ntohs(ip6->payload_len) > 0xFFFF - sizeof(struct iphdr)) + return TC_ACT_OK; + switch (ip6->nexthdr) { + case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 + case IPPROTO_UDP: // address means there is no need to update their checksums. + case IPPROTO_GRE: // We do not need to bother looking at GRE/ESP headers, + case IPPROTO_ESP: // since there is never a checksum to update. + break; + default: // do not know how to handle anything else + return TC_ACT_OK; + } + + struct ethhdr eth2; // used iff is_ethernet + + eth2 = *eth; // Copy over the ethernet header (src/dst mac) + eth2.h_proto = bpf_htons(ETH_P_IP); // But replace the ethertype + + struct iphdr ip = { + .version = 4, // u4 + .ihl = sizeof(struct iphdr) / sizeof(__u32), // u4 + .tos = (ip6->priority << 4) + (ip6->flow_lbl[0] >> 4), // u8 + .tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + sizeof(struct iphdr)), // u16 + .id = 0, // u16 + .frag_off = bpf_htons(IP_DF), // u16 + .ttl = ip6->hop_limit, // u8 + .protocol = ip6->nexthdr, // u8 + .check = 0, // u16 + .saddr = 0x0201a8c0, // u32 + .daddr = 0x0101a8c0, // u32 + }; + + // Calculate the IPv4 one's complement checksum of the IPv4 header. + __wsum sum4 = 0; + + for (int i = 0; i < sizeof(ip) / sizeof(__u16); ++i) + sum4 += ((__u16 *)&ip)[i]; + + // Note that sum4 is guaranteed to be non-zero by virtue of ip.version == 4 + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 
0x1FFFE + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 + ip.check = (__u16)~sum4; // sum4 cannot be zero, so this is never 0xFFFF + + // Calculate the *negative* IPv6 16-bit one's complement checksum of the IPv6 header. + __wsum sum6 = 0; + // We'll end up with a non-zero sum due to ip6->version == 6 (which has '0' bits) + for (int i = 0; i < sizeof(*ip6) / sizeof(__u16); ++i) + sum6 += ~((__u16 *)ip6)[i]; // note the bitwise negation + + // Note that there is no L4 checksum update: we are relying on the checksum neutrality + // of the ipv6 address chosen by netd's ClatdController. + + // Packet mutations begin - point of no return, but if this first modification fails + // the packet is probably still pristine, so let clatd handle it. + if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0)) + return TC_ACT_OK; + bpf_csum_update(skb, sum6); + + data = (void *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + if (data + l2_header_size + sizeof(struct iphdr) > data_end) + return TC_ACT_SHOT; + + struct ethhdr *new_eth = data; + + // Copy over the updated ethernet header + *new_eth = eth2; + + // Copy over the new ipv4 header. 
+ *(struct iphdr *)(new_eth + 1) = ip; + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); +} + +SEC("schedcls/egress4/snat4") +int sched_cls_egress4_snat4_prog(struct __sk_buff *skb) +{ + const int l2_header_size = sizeof(struct ethhdr); + void *data = (void *)(long)skb->data; + const void *data_end = (void *)(long)skb->data_end; + const struct ethhdr *const eth = data; // used iff is_ethernet + const struct iphdr *const ip4 = (void *)(eth + 1); + + // Must be meta-ethernet IPv4 frame + if (skb->protocol != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + // Must have ipv4 header + if (data + l2_header_size + sizeof(struct ipv6hdr) > data_end) + return TC_ACT_OK; + + // Ethertype - if present - must be IPv4 + if (eth->h_proto != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + // IP version must be 4 + if (ip4->version != 4) + return TC_ACT_OK; + + // We cannot handle IP options, just standard 20 byte == 5 dword minimal IPv4 header + if (ip4->ihl != 5) + return TC_ACT_OK; + + // Maximum IPv6 payload length that can be translated to IPv4 + if (bpf_htons(ip4->tot_len) > 0xFFFF - sizeof(struct ipv6hdr)) + return TC_ACT_OK; + + // Calculate the IPv4 one's complement checksum of the IPv4 header. + __wsum sum4 = 0; + + for (int i = 0; i < sizeof(*ip4) / sizeof(__u16); ++i) + sum4 += ((__u16 *)ip4)[i]; + + // Note that sum4 is guaranteed to be non-zero by virtue of ip4->version == 4 + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse u32 into range 1 .. 
0x1FFFE + sum4 = (sum4 & 0xFFFF) + (sum4 >> 16); // collapse any potential carry into u16 + // for a correct checksum we should get *a* zero, but sum4 must be positive, ie 0xFFFF + if (sum4 != 0xFFFF) + return TC_ACT_OK; + + // Minimum IPv4 total length is the size of the header + if (bpf_ntohs(ip4->tot_len) < sizeof(*ip4)) + return TC_ACT_OK; + + // We are incapable of dealing with IPv4 fragments + if (ip4->frag_off & ~bpf_htons(IP_DF)) + return TC_ACT_OK; + + switch (ip4->protocol) { + case IPPROTO_TCP: // For TCP & UDP the checksum neutrality of the chosen IPv6 + case IPPROTO_GRE: // address means there is no need to update their checksums. + case IPPROTO_ESP: // We do not need to bother looking at GRE/ESP headers, + break; // since there is never a checksum to update. + + case IPPROTO_UDP: // See above comment, but must also have UDP header... + if (data + sizeof(*ip4) + sizeof(struct udphdr) > data_end) + return TC_ACT_OK; + const struct udphdr *uh = (const struct udphdr *)(ip4 + 1); + // If IPv4/UDP checksum is 0 then fallback to clatd so it can calculate the + // checksum. Otherwise the network or more likely the NAT64 gateway might + // drop the packet because in most cases IPv6/UDP packets with a zero checksum + // are invalid. See RFC 6935. 
TODO: calculate checksum via bpf_csum_diff() + if (!uh->check) + return TC_ACT_OK; + break; + + default: // do not know how to handle anything else + return TC_ACT_OK; + } + struct ethhdr eth2; // used iff is_ethernet + + eth2 = *eth; // Copy over the ethernet header (src/dst mac) + eth2.h_proto = bpf_htons(ETH_P_IPV6); // But replace the ethertype + + struct ipv6hdr ip6 = { + .version = 6, // __u8:4 + .priority = ip4->tos >> 4, // __u8:4 + .flow_lbl = {(ip4->tos & 0xF) << 4, 0, 0}, // __u8[3] + .payload_len = bpf_htons(bpf_ntohs(ip4->tot_len) - 20), // __be16 + .nexthdr = ip4->protocol, // __u8 + .hop_limit = ip4->ttl, // __u8 + }; + ip6.saddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); + ip6.saddr.in6_u.u6_addr32[1] = 0; + ip6.saddr.in6_u.u6_addr32[2] = 0; + ip6.saddr.in6_u.u6_addr32[3] = bpf_htonl(1); + ip6.daddr.in6_u.u6_addr32[0] = bpf_htonl(0x20010db8); + ip6.daddr.in6_u.u6_addr32[1] = 0; + ip6.daddr.in6_u.u6_addr32[2] = 0; + ip6.daddr.in6_u.u6_addr32[3] = bpf_htonl(2); + + // Calculate the IPv6 16-bit one's complement checksum of the IPv6 header. + __wsum sum6 = 0; + // We'll end up with a non-zero sum due to ip6.version == 6 + for (int i = 0; i < sizeof(ip6) / sizeof(__u16); ++i) + sum6 += ((__u16 *)&ip6)[i]; + + // Packet mutations begin - point of no return, but if this first modification fails + // the packet is probably still pristine, so let clatd handle it. + if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0)) + return TC_ACT_OK; + + // This takes care of updating the skb->csum field for a CHECKSUM_COMPLETE packet. + // In such a case, skb->csum is a 16-bit one's complement sum of the entire payload, + // thus we need to subtract out the ipv4 header's sum, and add in the ipv6 header's sum. + // However, we've already verified the ipv4 checksum is correct and thus 0. + // Thus we only need to add the ipv6 header's sum. + // + // bpf_csum_update() always succeeds if the skb is CHECKSUM_COMPLETE and returns an error + // (-ENOTSUPP) if it isn't. 
So we just ignore the return code (see above for more details). + bpf_csum_update(skb, sum6); + + // bpf_skb_change_proto() invalidates all pointers - reload them. + data = (void *)(long)skb->data; + data_end = (void *)(long)skb->data_end; + + // I cannot think of any valid way for this error condition to trigger, however I do + // believe the explicit check is required to keep the in kernel ebpf verifier happy. + if (data + l2_header_size + sizeof(ip6) > data_end) + return TC_ACT_SHOT; + + struct ethhdr *new_eth = data; + + // Copy over the updated ethernet header + *new_eth = eth2; + // Copy over the new ipv4 header. + *(struct ipv6hdr *)(new_eth + 1) = ip6; + return TC_ACT_OK; +} + +char _license[] SEC("license") = ("GPL"); diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh new file mode 100755 index 000000000000..807b74c8fd80 --- /dev/null +++ b/tools/testing/selftests/net/udpgro_frglist.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Run a series of udpgro benchmarks + +readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" + +cleanup() { + local -r jobs="$(jobs -p)" + local -r ns="$(ip netns list|grep $PEER_NS)" + + [ -n "${jobs}" ] && kill -INT ${jobs} 2>/dev/null + [ -n "$ns" ] && ip netns del $ns 2>/dev/null +} +trap cleanup EXIT + +run_one() { + # use 'rx' as separator between sender args and receiver args + local -r all="$@" + local -r tx_args=${all%rx*} + local rx_args=${all#*rx} + + + + ip netns add "${PEER_NS}" + ip -netns "${PEER_NS}" link set lo up + ip link add type veth + ip link set dev veth0 up + ip addr add dev veth0 192.168.1.2/24 + ip addr add dev veth0 2001:db8::2/64 nodad + + ip link set dev veth1 netns "${PEER_NS}" + ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24 + ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad + ip -netns "${PEER_NS}" link set dev veth1 up + ip netns exec "${PEER_NS}" ethtool -K veth1 rx-gro-list on + + + ip -n 
"${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp_dummy + tc -n "${PEER_NS}" qdisc add dev veth1 clsact + tc -n "${PEER_NS}" filter add dev veth1 ingress prio 4 protocol ipv6 bpf object-file ../bpf/nat6to4.o section schedcls/ingress6/nat_6 direct-action + tc -n "${PEER_NS}" filter add dev veth1 egress prio 4 protocol ip bpf object-file ../bpf/nat6to4.o section schedcls/egress4/snat4 direct-action + echo ${rx_args} + ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r & + + # Hack: let bg programs complete the startup + sleep 0.1 + ./udpgso_bench_tx ${tx_args} +} + +run_in_netns() { + local -r args=$@ + echo ${args} + ./in_netns.sh $0 __subprocess ${args} +} + +run_udp() { + local -r args=$@ + + echo "udp gso - over veth touching data" + run_in_netns ${args} -u -S 0 rx -4 -v + + echo "udp gso and gro - over veth touching data" + run_in_netns ${args} -S 0 rx -4 -G +} + +run_tcp() { + local -r args=$@ + + echo "tcp - over veth touching data" + run_in_netns ${args} -t rx -4 -t +} + +run_all() { + local -r core_args="-l 4" + local -r ipv4_args="${core_args} -4 -D 192.168.1.1" + local -r ipv6_args="${core_args} -6 -D 2001:db8::1" + + echo "ipv6" + run_tcp "${ipv6_args}" + run_udp "${ipv6_args}" +} + +if [ ! -f ../bpf/xdp_dummy.o ]; then + echo "Missing xdp_dummy helper. Build bpf selftest first" + exit -1 +fi + +if [ ! -f bpf/nat6to4.o ]; then + echo "Missing nat6to4 helper. 
Build bpfnat6to4.o selftest first" + exit -1 +fi + +if [[ $# -eq 0 ]]; then + run_all +elif [[ $1 == "__subprocess" ]]; then + shift + run_one $@ +else + run_in_netns $@ +fi From ceaf69f8eadcafb323392be88e7a5248c415d423 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sat, 7 May 2022 11:00:28 +0300 Subject: [PATCH 144/179] fanotify: do not allow setting dirent events in mask of non-dir Dirent events (create/delete/move) are only reported on watched directory inodes, but in fanotify as well as in legacy inotify, it was always allowed to set them on non-dir inode, which does not result in any meaningful outcome. Until kernel v5.17, dirent events in fanotify also differed from events "on child" (e.g. FAN_OPEN) in the information provided in the event. For example, FAN_OPEN could be set in the mask of a non-dir or the mask of its parent and event would report the fid of the child regardless of the marked object. By contrast, FAN_DELETE is not reported if the child is marked and the child fid was not reported in the events. Since kernel v5.17, with fanotify group flag FAN_REPORT_TARGET_FID, the fid of the child is reported with dirent events, like events "on child", which may create confusion for users expecting the same behavior as events "on child" when setting events in the mask on a child. The desired semantics of setting dirent events in the mask of a child are not clear, so for now, deny this action for a group initialized with flag FAN_REPORT_TARGET_FID and for the new event FAN_RENAME. We may relax this restriction in the future if we decide on the semantics and implement them. 
Fixes: d61fd650e9d2 ("fanotify: introduce group flag FAN_REPORT_TARGET_FID") Fixes: 8cc3b1ccd930 ("fanotify: wire up FAN_RENAME event") Link: https://lore.kernel.org/linux-fsdevel/20220505133057.zm5t6vumc4xdcnsg@quack3.lan/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220507080028.219826-1-amir73il@gmail.com --- fs/notify/fanotify/fanotify_user.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9b32b76a9c30..a792e21c5309 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1657,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + /* + * FAN_RENAME is not allowed on non-dir (for now). + * We shouldn't have allowed setting any dirent events in mask of + * non-dir, but because we always allowed it, error only if group + * was initialized with the new flag FAN_REPORT_TARGET_FID. + */ + ret = -ENOTDIR; + if (inode && !S_ISDIR(inode->i_mode) && + ((mask & FAN_RENAME) || + ((mask & FANOTIFY_DIRENT_EVENTS) && + FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; From c7e34c1e263f71b2dd78a12b6b7259a89b3a1f44 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Fri, 6 May 2022 11:42:12 +0300 Subject: [PATCH 145/179] mailmap: update Kalle Valo's email MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I switched to use my kernel.org address, the old kvalo@codeaurora.org address doesn't work anymore. 
Signed-off-by: Kalle Valo Tested-by: Toke Høiland-Jørgensen Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220506084212.8952-1-kvalo@kernel.org --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index b9d358217586..2c05d7257509 100644 --- a/.mailmap +++ b/.mailmap @@ -204,6 +204,7 @@ Juha Yrjola Juha Yrjola Juha Yrjola Julien Thierry +Kalle Valo Kalyan Thota Kay Sievers Kees Cook From a59d55568d02bbbdf9c0cc15be9580180f855b4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 May 2022 23:04:21 +0200 Subject: [PATCH 146/179] mac80211_hwsim: fix RCU protected chanctx access We need to RCU protect the chanctx_conf access, so do that. Fixes: 585625c955b1 ("mac80211_hwsim: check TX and STA bandwidth") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220505230421.fb8055c081a2.Ic6da3307c77a909bd61a0ea25dc2a4b08fe1b03f@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 28bfa7b7b73c..3ac3693dbecb 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2202,11 +2202,14 @@ mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, if (!data->use_chanctx) { confbw = data->bw; } else { - struct ieee80211_chanctx_conf *chanctx_conf = - rcu_dereference(vif->chanctx_conf); + struct ieee80211_chanctx_conf *chanctx_conf; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(vif->chanctx_conf); if (!WARN_ON(!chanctx_conf)) confbw = chanctx_conf->def.width; + rcu_read_unlock(); } WARN(bw > hwsim_get_chanwidth(confbw), From 9e2db50f1ef2238fc2f71c5de1c0418b7a5b0ea2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 May 2022 23:04:22 +0200 Subject: [PATCH 147/179] mac80211_hwsim: call ieee80211_tx_prepare_skb under RCU protection This is needed since it might use (and pass out) pointers to e.g. 
keys protected by RCU. Can't really happen here as the frames aren't encrypted, but we need to still adhere to the rules. Fixes: cacfddf82baf ("mac80211_hwsim: initialize ieee80211_tx_info at hw_scan_work") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220505230421.5f139f9de173.I77ae111a28f7c0e9fd1ebcee7f39dbec5c606770@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 3ac3693dbecb..e9ec63e0e395 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2478,11 +2478,13 @@ static void hw_scan_work(struct work_struct *work) if (req->ie_len) skb_put_data(probe, req->ie, req->ie_len); + rcu_read_lock(); if (!ieee80211_tx_prepare_skb(hwsim->hw, hwsim->hw_scan_vif, probe, hwsim->tmp_chan->band, NULL)) { + rcu_read_unlock(); kfree_skb(probe); continue; } @@ -2490,6 +2492,7 @@ static void hw_scan_work(struct work_struct *work) local_bh_disable(); mac80211_hwsim_tx_frame(hwsim->hw, probe, hwsim->tmp_chan); + rcu_read_unlock(); local_bh_enable(); } } From f971e1887fdb3ab500c9bebf4b98f62d49a20655 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 May 2022 10:21:38 +0200 Subject: [PATCH 148/179] nl80211: fix locking in nl80211_set_tx_bitrate_mask() This accesses the wdev's chandef etc., so cannot safely be used without holding the lock. 
Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220506102136.06b7205419e6.I2a87c05fbd8bc5e565e84d190d4cfd2e92695a90@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index aa6094c3c9b0..1a3551b6d18b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11666,18 +11666,23 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct cfg80211_bitrate_mask mask; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; int err; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; + wdev_lock(wdev); err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &mask, dev, true); if (err) - return err; + goto out; - return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + err = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); +out: + wdev_unlock(wdev); + return err; } static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) From a36e07dfe6ee71e209383ea9288cd8d1617e14f9 Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Fri, 6 May 2022 17:24:54 +0000 Subject: [PATCH 149/179] rfkill: uapi: fix RFKILL_IOCTL_MAX_SIZE ioctl request definition The definition of RFKILL_IOCTL_MAX_SIZE introduced by commit 54f586a91532 ("rfkill: make new event layout opt-in") is unusable since it is based on RFKILL_IOC_EXT_SIZE which has not been defined. Fix that by replacing the undefined constant with the constant which is intended to be used in this definition. Fixes: 54f586a91532 ("rfkill: make new event layout opt-in") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Gleb Fotengauer-Malinovskiy Signed-off-by: Dmitry V. 
Levin Link: https://lore.kernel.org/r/20220506172454.120319-1-glebfm@altlinux.org [add commit message provided later by Dmitry] Signed-off-by: Johannes Berg --- include/uapi/linux/rfkill.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 283c5a7b3f2c..db6c8588c1d0 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -184,7 +184,7 @@ struct rfkill_event_ext { #define RFKILL_IOC_NOINPUT 1 #define RFKILL_IOCTL_NOINPUT _IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT) #define RFKILL_IOC_MAX_SIZE 2 -#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_EXT_SIZE, __u32) +#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_MAX_SIZE, __u32) /* and that's all userspace gets */ From e4b1045bf9cfec6f70ac6d3783be06c3a88dcb25 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 6 May 2022 11:40:40 +0800 Subject: [PATCH 150/179] ionic: fix missing pci_release_regions() on error in ionic_probe() If ionic_map_bars() fails, pci_release_regions() needs to be called. 
Fixes: fbfb8031533c ("ionic: Add hardware init and device commands") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220506034040.2614129-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index 6ffc62c41165..0a7a757494bc 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -256,7 +256,7 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = ionic_map_bars(ionic); if (err) - goto err_out_pci_disable_device; + goto err_out_pci_release_regions; /* Configure the device */ err = ionic_setup(ionic); @@ -360,6 +360,7 @@ err_out_teardown: err_out_unmap_bars: ionic_unmap_bars(ionic); +err_out_pci_release_regions: pci_release_regions(pdev); err_out_pci_disable_device: pci_disable_device(pdev); From 51ca86b4c9c7c75f5630fa0dbe5f8f0bd98e3c3e Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 6 May 2022 17:42:50 +0800 Subject: [PATCH 151/179] ethernet: tulip: fix missing pci_disable_device() on error in tulip_init_one() Fix the missing pci_disable_device() before return from tulip_init_one() in the error handling case. 
Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220506094250.3630615-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/dec/tulip/tulip_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 79df5a72877b..0040dcaab945 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1399,8 +1399,10 @@ static int tulip_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* alloc_etherdev ensures aligned and zeroed private structures */ dev = alloc_etherdev (sizeof (*tp)); - if (!dev) + if (!dev) { + pci_disable_device(pdev); return -ENOMEM; + } SET_NETDEV_DEV(dev, &pdev->dev); if (pci_resource_len (pdev, 0) < tulip_tbl[chip_idx].io_size) { @@ -1785,6 +1787,7 @@ err_out_free_res: err_out_free_netdev: free_netdev (dev); + pci_disable_device(pdev); return -ENODEV; } From 4bd46bb037f8e1883dbe1fc9e79896b7f885db3f Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Fri, 6 May 2022 15:37:39 -0700 Subject: [PATCH 152/179] ptp: ocp: Use DIV64_U64_ROUND_UP for rounding. The initial code used roundup() to round the starting time to a multiple of a period. This generated an error on 32-bit systems, so was replaced with DIV_ROUND_UP_ULL(). However, this truncates to 32-bits on a 64-bit system. Replace with DIV64_U64_ROUND_UP() instead. 
Fixes: b325af3cfab9 ("ptp: ocp: Add signal generators and update sysfs nodes") Signed-off-by: Jonathan Lemon Link: https://lore.kernel.org/r/20220506223739.1930-2-jonathan.lemon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 0feaa4b45317..dd45471f6780 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -1557,7 +1557,7 @@ ptp_ocp_signal_set(struct ptp_ocp *bp, int gen, struct ptp_ocp_signal *s) start_ns = ktime_set(ts.tv_sec, ts.tv_nsec) + NSEC_PER_MSEC; if (!s->start) { /* roundup() does not work on 32-bit systems */ - s->start = DIV_ROUND_UP_ULL(start_ns, s->period); + s->start = DIV64_U64_ROUND_UP(start_ns, s->period); s->start = ktime_add(s->start, s->phase); } From ee1444b5e1df4155b591d0d9b1e72853a99ea861 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 6 May 2022 18:10:38 -0700 Subject: [PATCH 153/179] dim: initialize all struct fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The W=2 build pointed out that the code wasn't initializing all the variables in the dim_cq_moder declarations with the struct initializers. The net change here is zero since these structs were already static const globals and were initialized with zeros by the compiler, but removing compiler warnings has value in and of itself. lib/dim/net_dim.c: At top level: lib/dim/net_dim.c:54:9: warning: missing initializer for field ‘comps’ of ‘const struct dim_cq_moder’ [-Wmissing-field-initializers] 54 | NET_DIM_RX_EQE_PROFILES, | ^~~~~~~~~~~~~~~~~~~~~~~ In file included from lib/dim/net_dim.c:6: ./include/linux/dim.h:45:13: note: ‘comps’ declared here 45 | u16 comps; | ^~~~~ and repeats for the tx struct, and once you fix the comps entry then the cq_period_mode field needs the same treatment. 
Use the commonly accepted style to indicate to the compiler that we know what we're doing, and add a comma at the end of each struct initializer to clean up the issue, and use explicit initializers for the fields we are initializing which makes the compiler happy. While here and fixing these lines, clean up the code slightly with a fix for the super long lines by removing the word "_MODERATION" from a couple defines only used in this file. Fixes: f8be17b81d44 ("lib/dim: Fix -Wunused-const-variable warnings") Signed-off-by: Jesse Brandeburg Link: https://lore.kernel.org/r/20220507011038.14568-1-jesse.brandeburg@intel.com Signed-off-by: Jakub Kicinski --- lib/dim/net_dim.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 06811d866775..53f6b9c6e936 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -12,41 +12,41 @@ * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES */ #define NET_DIM_PARAMS_NUM_PROFILES 5 -#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 -#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 #define NET_DIM_DEF_PROFILE_CQE 1 #define NET_DIM_DEF_PROFILE_EQE 1 #define NET_DIM_RX_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {.usec = 1, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 8, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 64, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 128, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 256, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,} \ } #define 
NET_DIM_RX_CQE_PROFILES { \ - {2, 256}, \ - {8, 128}, \ - {16, 64}, \ - {32, 64}, \ - {64, 64} \ + {.usec = 2, .pkts = 256,}, \ + {.usec = 8, .pkts = 128,}, \ + {.usec = 16, .pkts = 64,}, \ + {.usec = 32, .pkts = 64,}, \ + {.usec = 64, .pkts = 64,} \ } #define NET_DIM_TX_EQE_PROFILES { \ - {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ - {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ + {.usec = 1, .pkts = NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 8, .pkts = NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 32, .pkts = NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 64, .pkts = NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE,}, \ + {.usec = 128, .pkts = NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE,} \ } #define NET_DIM_TX_CQE_PROFILES { \ - {5, 128}, \ - {8, 64}, \ - {16, 32}, \ - {32, 32}, \ - {64, 32} \ + {.usec = 5, .pkts = 128,}, \ + {.usec = 8, .pkts = 64,}, \ + {.usec = 16, .pkts = 32,}, \ + {.usec = 32, .pkts = 32,}, \ + {.usec = 64, .pkts = 32,} \ } static const struct dim_cq_moder From 91a7cda1f4b8bdf770000a3b60640576dafe0cec Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Fri, 6 May 2022 08:08:15 +0200 Subject: [PATCH 154/179] net: phy: Fix race condition on link status change This fixes the following error caused by a race condition between phydev->adjust_link() and a MDIO transaction in the phy interrupt handler. The issue was reproduced with the ethernet FEC driver and a micrel KSZ9031 phy. 
[ 146.195696] fec 2188000.ethernet eth0: MDIO read timeout [ 146.201779] ------------[ cut here ]------------ [ 146.206671] WARNING: CPU: 0 PID: 571 at drivers/net/phy/phy.c:942 phy_error+0x24/0x6c [ 146.214744] Modules linked in: bnep imx_vdoa imx_sdma evbug [ 146.220640] CPU: 0 PID: 571 Comm: irq/128-2188000 Not tainted 5.18.0-rc3-00080-gd569e86915b7 #9 [ 146.229563] Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) [ 146.236257] unwind_backtrace from show_stack+0x10/0x14 [ 146.241640] show_stack from dump_stack_lvl+0x58/0x70 [ 146.246841] dump_stack_lvl from __warn+0xb4/0x24c [ 146.251772] __warn from warn_slowpath_fmt+0x5c/0xd4 [ 146.256873] warn_slowpath_fmt from phy_error+0x24/0x6c [ 146.262249] phy_error from kszphy_handle_interrupt+0x40/0x48 [ 146.268159] kszphy_handle_interrupt from irq_thread_fn+0x1c/0x78 [ 146.274417] irq_thread_fn from irq_thread+0xf0/0x1dc [ 146.279605] irq_thread from kthread+0xe4/0x104 [ 146.284267] kthread from ret_from_fork+0x14/0x28 [ 146.289164] Exception stack(0xe6fa1fb0 to 0xe6fa1ff8) [ 146.294448] 1fa0: 00000000 00000000 00000000 00000000 [ 146.302842] 1fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 146.311281] 1fe0: 00000000 00000000 00000000 00000000 00000013 00000000 [ 146.318262] irq event stamp: 12325 [ 146.321780] hardirqs last enabled at (12333): [] __up_console_sem+0x50/0x60 [ 146.330013] hardirqs last disabled at (12342): [] __up_console_sem+0x3c/0x60 [ 146.338259] softirqs last enabled at (12324): [] __do_softirq+0x2c0/0x624 [ 146.346311] softirqs last disabled at (12319): [] __irq_exit_rcu+0x138/0x178 [ 146.354447] ---[ end trace 0000000000000000 ]--- With the FEC driver, phydev->adjust_link() calls fec_enet_adjust_link(), which calls fec_stop()/fec_restart(); both of these functions reset and temporarily disable the FEC, disrupting any MII transaction that could be happening at the same time. 
fec_enet_adjust_link() and phy_read() can be running at the same time when we have one additional interrupt before the phy_state_machine() is able to terminate. Thread 1 (phylib WQ) | Thread 2 (phy interrupt) | | phy_interrupt() <-- PHY IRQ | handle_interrupt() | phy_read() | phy_trigger_machine() | --> schedule phylib WQ | | phy_state_machine() | phy_check_link_status() | phy_link_change() | phydev->adjust_link() | fec_enet_adjust_link() | --> FEC reset | phy_interrupt() <-- PHY IRQ | phy_read() | Fix this by acquiring the phydev lock in phy_interrupt(). Link: https://lore.kernel.org/all/20220422152612.GA510015@francesco-nb.int.toradex.com/ Fixes: c974bdbc3e77 ("net: phy: Use threaded IRQ, to allow IRQ from sleeping devices") cc: Signed-off-by: Francesco Dolcini Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220506060815.327382-1-francesco.dolcini@toradex.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index beb2b66da132..f122026c4682 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -970,8 +970,13 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) { struct phy_device *phydev = phy_dat; struct phy_driver *drv = phydev->drv; + irqreturn_t ret; - return drv->handle_interrupt(phydev); + mutex_lock(&phydev->lock); + ret = drv->handle_interrupt(phydev); + mutex_unlock(&phydev->lock); + + return ret; } /** From 630fd4822af2374cd75c682b7665dcb367613765 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 7 May 2022 16:45:50 +0300 Subject: [PATCH 155/179] net: dsa: flush switchdev workqueue on bridge join error path There is a race between switchdev_bridge_port_offload() and the dsa_port_switchdev_sync_attrs() call right below it. When switchdev_bridge_port_offload() finishes, FDB entries have been replayed by the bridge, but are scheduled for deferred execution later. 
However dsa_port_switchdev_sync_attrs -> dsa_port_can_apply_vlan_filtering() may impose restrictions on the vlan_filtering attribute and refuse offloading. When this happens, the delayed FDB entries will dereference dp->bridge, which is a NULL pointer because we have stopped the process of offloading this bridge. Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Workqueue: dsa_ordered dsa_slave_switchdev_event_work pc : dsa_port_bridge_host_fdb_del+0x64/0x100 lr : dsa_slave_switchdev_event_work+0x130/0x1bc Call trace: dsa_port_bridge_host_fdb_del+0x64/0x100 dsa_slave_switchdev_event_work+0x130/0x1bc process_one_work+0x294/0x670 worker_thread+0x80/0x460 ---[ end trace 0000000000000000 ]--- Error: dsa_core: Must first remove VLAN uppers having VIDs also present in bridge. Fix the bug by doing what we do on the normal bridge leave path as well, which is to wait until the deferred FDB entries complete executing, then exit. The placement of dsa_flush_workqueue() after switchdev_bridge_port_unoffload() guarantees that both the FDB additions and deletions on rollback are waited for. 
Fixes: d7d0d423dbaa ("net: dsa: flush switchdev workqueue when leaving the bridge") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220507134550.1849834-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/port.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dsa/port.c b/net/dsa/port.c index cdc56ba11f52..bdccb613285d 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -451,6 +451,7 @@ out_rollback_unoffload: switchdev_bridge_port_unoffload(brport_dev, dp, &dsa_slave_switchdev_notifier, &dsa_slave_switchdev_blocking_notifier); + dsa_flush_workqueue(); out_rollback_unbridge: dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); out_rollback: From 1809c30b6e5a83a1de1435fe01aaa4de4d626a7c Mon Sep 17 00:00:00 2001 From: Manuel Ullmann Date: Wed, 4 May 2022 21:30:44 +0200 Subject: [PATCH 156/179] net: atlantic: always deep reset on pm op, fixing up my null deref regression The impact of this regression is the same for resume that I saw on thaw: the kernel hangs and nothing except SysRq rebooting can be done. Fixes regression in commit cbe6c3a8f8f4 ("net: atlantic: invert deep par in pm functions, preventing null derefs"), where I disabled deep pm resets in suspend and resume, trying to make sense of the atl_resume_common() deep parameter in the first place. It turns out, that atlantic always has to deep reset on pm operations. Even though I expected that and tested resume, I screwed up by kexec-rebooting into an unpatched kernel, thus missing the breakage. This fixup obsoletes the deep parameter of atl_resume_common, but I leave the cleanup for the maintainers to post to mainline. Suspend and hibernation were successfully tested by the reporters. 
Fixes: cbe6c3a8f8f4 ("net: atlantic: invert deep par in pm functions, preventing null derefs") Link: https://lore.kernel.org/regressions/9-Ehc_xXSwdXcvZqKD5aSqsqeNj5Izco4MYEwnx5cySXVEc9-x_WC4C3kAoCqNTi-H38frroUK17iobNVnkLtW36V6VWGSQEOHXhmVMm5iQ=@protonmail.com/ Reported-by: Jordan Leppert Reported-by: Holger Hoffstaette Tested-by: Jordan Leppert Tested-by: Holger Hoffstaette CC: # 5.10+ Signed-off-by: Manuel Ullmann Link: https://lore.kernel.org/r/87bkw8dfmp.fsf@posteo.de Signed-off-by: Paolo Abeni --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 3a529ee8c834..831833911a52 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -449,7 +449,7 @@ static int aq_pm_freeze(struct device *dev) static int aq_pm_suspend_poweroff(struct device *dev) { - return aq_suspend_common(dev, false); + return aq_suspend_common(dev, true); } static int aq_pm_thaw(struct device *dev) @@ -459,7 +459,7 @@ static int aq_pm_thaw(struct device *dev) static int aq_pm_resume_restore(struct device *dev) { - return atl_resume_common(dev, false); + return atl_resume_common(dev, true); } static const struct dev_pm_ops aq_pm_ops = { From 846a3351ddfe4a86eede4bb26a205c3f38ef84d3 Mon Sep 17 00:00:00 2001 From: Jing Xia Date: Tue, 10 May 2022 10:35:14 +0800 Subject: [PATCH 157/179] writeback: Avoid skipping inode writeback We have run into an issue where a task gets stuck in balance_dirty_pages_ratelimited() when performing I/O stress testing. The reason we observed is that an I_DIRTY_PAGES inode with lots of dirty pages is in b_dirty_time list and standard background writeback cannot write back the inode. 
After studying the relevant code, the following scenario may lead to the issue: task1 task2 ----- ----- fuse_flush write_inode_now //in b_dirty_time writeback_single_inode __writeback_single_inode fuse_write_end filemap_dirty_folio __xa_set_mark:PAGECACHE_TAG_DIRTY lock inode->i_lock if mapping tagged PAGECACHE_TAG_DIRTY inode->i_state |= I_DIRTY_PAGES unlock inode->i_lock __mark_inode_dirty:I_DIRTY_PAGES lock inode->i_lock -was dirty,inode stays in -b_dirty_time unlock inode->i_lock if(!(inode->i_state & I_DIRTY_ALL)) -not true,so nothing done This patch moves the dirty inode to b_dirty list when the inode currently is not queued in b_io or b_more_io list at the end of writeback_single_inode. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig CC: stable@vger.kernel.org Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") Signed-off-by: Jing Xia Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220510023514.27399-1-jing.xia@unisoc.com --- fs/fs-writeback.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 591fe9cf1659..1fae0196292a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode, */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED) && + (inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: From dc5306a8c0eace6c113aded2e36ae5e15fdca4d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 8 May 2022 03:22:17 -0700 Subject: [PATCH 158/179] decnet: Use container_of() for struct dn_neigh casts Clang's structure layout randomization feature gets upset when it sees struct neighbour (which is randomized) cast to struct dn_neigh: net/decnet/dn_route.c:1123:15: error: casting from randomized structure pointer type 'struct neighbour *' to 'struct dn_neigh *' gateway = ((struct 
dn_neigh *)neigh)->addr; ^ Update all the open-coded casts to use container_of() to do the conversion instead of depending on strict member ordering. Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202205041247.WKBEHGS5-lkp@intel.com Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Yajun Deng Cc: Zheng Yongjun Cc: Bill Wendling Cc: linux-decnet-user@lists.sourceforge.net Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220508102217.2647184-1-keescook@chromium.org Signed-off-by: Paolo Abeni --- net/decnet/dn_dev.c | 4 ++-- net/decnet/dn_neigh.c | 3 ++- net/decnet/dn_route.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 0ee7d4c0c955..a09ba642b5e7 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -854,7 +854,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) memcpy(msg->neighbor, dn_hiord, ETH_ALEN); if (dn_db->router) { - struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + struct dn_neigh *dn = container_of(dn_db->router, struct dn_neigh, n); dn_dn2eth(msg->neighbor, dn->addr); } @@ -902,7 +902,7 @@ static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) { int n; struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + struct dn_neigh *dn = container_of(dn_db->router, struct dn_neigh, n); struct sk_buff *skb; size_t size; unsigned char *ptr; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 94b306f6d551..fbd98ac853ea 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -426,7 +426,8 @@ int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) if (!dn_db->router) { dn_db->router = neigh_clone(neigh); } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + if (msg->priority > container_of(dn_db->router, + struct dn_neigh, 
n)->priority) neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); } } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 7e85f2a1ae25..d1d78a463a06 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1120,7 +1120,7 @@ source_ok: /* Ok then, we assume its directly connected and move on */ select_source: if (neigh) - gateway = ((struct dn_neigh *)neigh)->addr; + gateway = container_of(neigh, struct dn_neigh, n)->addr; if (gateway == 0) gateway = fld.daddr; if (fld.saddr == 0) { @@ -1429,7 +1429,7 @@ static int dn_route_input_slow(struct sk_buff *skb) /* Use the default router if there is one */ neigh = neigh_clone(dn_db->router); if (neigh) { - gateway = ((struct dn_neigh *)neigh)->addr; + gateway = container_of(neigh, struct dn_neigh, n)->addr; goto make_route; } From 7ff960a6fe399fdcbca6159063684671ae57eee9 Mon Sep 17 00:00:00 2001 From: Shunsuke Mie Date: Tue, 10 May 2022 19:27:23 +0900 Subject: [PATCH 159/179] virtio: fix virtio transitional ids This commit fixes the transitional PCI device ID. Fixes: d61914ea6ada ("virtio: update virtio id table, add transitional ids") Signed-off-by: Shunsuke Mie Link: https://lore.kernel.org/r/20220510102723.87666-1-mie@igel.co.jp Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/virtio_ids.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 80d76b75bccd..7aa2eb766205 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -73,12 +73,12 @@ * Virtio Transitional IDs */ -#define VIRTIO_TRANS_ID_NET 1000 /* transitional virtio net */ -#define VIRTIO_TRANS_ID_BLOCK 1001 /* transitional virtio block */ -#define VIRTIO_TRANS_ID_BALLOON 1002 /* transitional virtio balloon */ -#define VIRTIO_TRANS_ID_CONSOLE 1003 /* transitional virtio console */ -#define VIRTIO_TRANS_ID_SCSI 1004 /* transitional virtio SCSI */ -#define VIRTIO_TRANS_ID_RNG 1005 /* transitional virtio rng */ -#define VIRTIO_TRANS_ID_9P 1009 /* transitional virtio 9p console */ +#define VIRTIO_TRANS_ID_NET 0x1000 /* transitional virtio net */ +#define VIRTIO_TRANS_ID_BLOCK 0x1001 /* transitional virtio block */ +#define VIRTIO_TRANS_ID_BALLOON 0x1002 /* transitional virtio balloon */ +#define VIRTIO_TRANS_ID_CONSOLE 0x1003 /* transitional virtio console */ +#define VIRTIO_TRANS_ID_SCSI 0x1004 /* transitional virtio SCSI */ +#define VIRTIO_TRANS_ID_RNG 0x1005 /* transitional virtio rng */ +#define VIRTIO_TRANS_ID_9P 0x1009 /* transitional virtio 9p console */ #endif /* _LINUX_VIRTIO_IDS_H */ From c1ad35dd0548ce947d97aaf92f7f2f9a202951cf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 May 2022 12:36:04 +0200 Subject: [PATCH 160/179] udf: Avoid using stale lengthOfImpUse udf_write_fi() uses lengthOfImpUse of the entry it is writing to. However this field has not yet been initialized so it either contains completely bogus value or value from last directory entry at that place. In either case this is wrong and can lead to filesystem corruption or kernel crashes. 
Reported-by: butt3rflyh4ck CC: stable@vger.kernel.org Fixes: 979a6e28dd96 ("udf: Get rid of 0-length arrays in struct fileIdentDesc") Signed-off-by: Jan Kara --- fs/udf/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0ed4861b038f..b3d5f97f16cd 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy(udf_get_fi_ident(sfi), fileident, lfi); + memcpy(sfi->impUse + liu, fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy(udf_get_fi_ident(sfi), fileident, -offset); + memcpy(sfi->impUse + liu, fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); + memset(sfi->impUse + liu + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); + memset(sfi->impUse + liu + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } From 12a4d677b1c34717443470c1492fe520638ef39a Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 9 May 2022 22:45:19 +0800 Subject: [PATCH 161/179] net: phy: micrel: Fix incorrect variable type in micrel In lanphy_read_page_reg, calling __phy_read() might return a negative error code. Use 'int' to check the error code. 
Fixes: 7c2dcfa295b1 ("net: phy: micrel: Add support for LAN8804 PHY") Signed-off-by: Wan Jiabing Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220509144519.2343399-1-wanjiabing@vivo.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 9d7dafed3931..cd9aa353b653 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1743,7 +1743,7 @@ static int ksz886x_cable_test_get_status(struct phy_device *phydev, static int lanphy_read_page_reg(struct phy_device *phydev, int page, u32 addr) { - u32 data; + int data; phy_lock_mdio_bus(phydev); __phy_write(phydev, LAN_EXT_PAGE_ACCESS_CONTROL, page); @@ -2444,8 +2444,7 @@ static int lan8804_config_init(struct phy_device *phydev) static irqreturn_t lan8814_handle_interrupt(struct phy_device *phydev) { - u16 tsu_irq_status; - int irq_status; + int irq_status, tsu_irq_status; irq_status = phy_read(phydev, LAN8814_INTS); if (irq_status > 0 && (irq_status & LAN8814_INT_LINK)) From 0807ce0b010418a191e0e4009803b2d74c3245d5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 10 May 2022 11:13:16 +0800 Subject: [PATCH 162/179] net: stmmac: fix missing pci_disable_device() on error in stmmac_pci_probe() Switch to using pcim_enable_device() to avoid missing pci_disable_device(). 
Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220510031316.1780409-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index fcf17d8a0494..644bb54f5f02 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -181,7 +181,7 @@ static int stmmac_pci_probe(struct pci_dev *pdev, return -ENOMEM; /* Enable pci device */ - ret = pci_enable_device(pdev); + ret = pcim_enable_device(pdev); if (ret) { dev_err(&pdev->dev, "%s: ERROR: failed to enable device\n", __func__); @@ -241,8 +241,6 @@ static void stmmac_pci_remove(struct pci_dev *pdev) pcim_iounmap_regions(pdev, BIT(i)); break; } - - pci_disable_device(pdev); } static int __maybe_unused stmmac_pci_suspend(struct device *dev) From 62e0ae0f4020250f961cf8d0103a4621be74e077 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Mon, 9 May 2022 19:28:23 -0700 Subject: [PATCH 163/179] net: atlantic: fix "frag[0] not initialized" In aq_ring_rx_clean(), if buff->is_eop is not set AND buff->len < AQ_CFG_RX_HDR_SIZE, then hdr_len remains equal to buff->len and skb_add_rx_frag(xxx, *0*, ...) is not called. The loop following this code starts calling skb_add_rx_frag() starting with i=1 and thus frag[0] is never initialized. Since i is initialized to zero at the top of the primary loop, we can just reference and post-increment i instead of hardcoding the 0 when calling skb_add_rx_frag() the first time. Reported-by: Aashay Shringarpure Reported-by: Yi Chou Reported-by: Shervin Oloumi Signed-off-by: Grant Grundler Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 77e76c9efd32..440423b0e8ea 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -446,7 +446,7 @@ int aq_ring_rx_clean(struct aq_ring_s *self, ALIGN(hdr_len, sizeof(long))); if (buff->len - hdr_len > 0) { - skb_add_rx_frag(skb, 0, buff->rxdata.page, + skb_add_rx_frag(skb, i++, buff->rxdata.page, buff->rxdata.pg_off + hdr_len, buff->len - hdr_len, AQ_CFG_RX_FRAME_MAX); @@ -455,7 +455,6 @@ int aq_ring_rx_clean(struct aq_ring_s *self, if (!buff->is_eop) { buff_ = buff; - i = 1U; do { next_ = buff_->next; buff_ = &self->buff_ring[next_]; From 79784d77ebbd3ec516b7a5ce555d979fb7946202 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Mon, 9 May 2022 19:28:24 -0700 Subject: [PATCH 164/179] net: atlantic: reduce scope of is_rsc_complete Don't defer handling the err case outside the loop. That's pointless. And since is_rsc_complete is only used inside this loop, declare it inside the loop to reduce its scope. Signed-off-by: Grant Grundler Signed-off-by: David S.
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 440423b0e8ea..bc1952131799 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -346,7 +346,6 @@ int aq_ring_rx_clean(struct aq_ring_s *self, int budget) { struct net_device *ndev = aq_nic_get_ndev(self->aq_nic); - bool is_rsc_completed = true; int err = 0; for (; (self->sw_head != self->hw_head) && budget; @@ -366,6 +365,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self, if (!buff->is_eop) { buff_ = buff; do { + bool is_rsc_completed = true; + if (buff_->next >= self->size) { err = -EIO; goto err_exit; @@ -377,18 +378,16 @@ int aq_ring_rx_clean(struct aq_ring_s *self, next_, self->hw_head); - if (unlikely(!is_rsc_completed)) - break; + if (unlikely(!is_rsc_completed)) { + err = 0; + goto err_exit; + } buff->is_error |= buff_->is_error; buff->is_cso_err |= buff_->is_cso_err; } while (!buff_->is_eop); - if (!is_rsc_completed) { - err = 0; - goto err_exit; - } if (buff->is_error || (buff->is_lro && buff->is_cso_err)) { buff_ = buff; From 6aecbba12b5c90b26dc062af3b9de8c4b3a2f19f Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Mon, 9 May 2022 19:28:25 -0700 Subject: [PATCH 165/179] net: atlantic: add check for MAX_SKB_FRAGS Enforce that the CPU can not get stuck in an infinite loop. Reported-by: Aashay Shringarpure Reported-by: Yi Chou Reported-by: Shervin Oloumi Signed-off-by: Grant Grundler Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index bc1952131799..8201ce7adb77 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -363,6 +363,7 @@ int aq_ring_rx_clean(struct aq_ring_s *self, continue; if (!buff->is_eop) { + unsigned int frag_cnt = 0U; buff_ = buff; do { bool is_rsc_completed = true; @@ -371,6 +372,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self, err = -EIO; goto err_exit; } + + frag_cnt++; next_ = buff_->next, buff_ = &self->buff_ring[next_]; is_rsc_completed = @@ -378,7 +381,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self, next_, self->hw_head); - if (unlikely(!is_rsc_completed)) { + if (unlikely(!is_rsc_completed) || + frag_cnt > MAX_SKB_FRAGS) { err = 0; goto err_exit; } From 2120b7f4d128433ad8c5f503a9584deba0684901 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Mon, 9 May 2022 19:28:26 -0700 Subject: [PATCH 166/179] net: atlantic: verify hw_head_ lies within TX buffer ring Bounds check hw_head index provided by NIC to verify it lies within the TX buffer ring. Reported-by: Aashay Shringarpure Reported-by: Yi Chou Reported-by: Shervin Oloumi Signed-off-by: Grant Grundler Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index d875ce3ec759..15ede7285fb5 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -889,6 +889,13 @@ int hw_atl_b0_hw_ring_tx_head_update(struct aq_hw_s *self, err = -ENXIO; goto err_exit; } + + /* Validate that the new hw_head_ is reasonable. 
*/ + if (hw_head_ >= ring->size) { + err = -ENXIO; + goto err_exit; + } + ring->hw_head = hw_head_; err = aq_hw_err_from_flags(self); From 2c50c6867c85afee6f2b3bcbc50fc9d0083d1343 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Tue, 10 May 2022 09:05:06 +0200 Subject: [PATCH 167/179] s390/ctcm: fix variable dereferenced before check Found by cppcheck and smatch. smatch complains about drivers/s390/net/ctcm_sysfs.c:43 ctcm_buffer_write() warn: variable dereferenced before check 'priv' (see line 42) Fixes: 3c09e2647b5e ("ctcm: rename READ/WRITE defines to avoid redefinitions") Reported-by: Colin Ian King Signed-off-by: Alexandra Winter Signed-off-by: David S. Miller --- drivers/s390/net/ctcm_sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/ctcm_sysfs.c b/drivers/s390/net/ctcm_sysfs.c index ded1930a00b2..e3813a7aa5e6 100644 --- a/drivers/s390/net/ctcm_sysfs.c +++ b/drivers/s390/net/ctcm_sysfs.c @@ -39,11 +39,12 @@ static ssize_t ctcm_buffer_write(struct device *dev, struct ctcm_priv *priv = dev_get_drvdata(dev); int rc; - ndev = priv->channel[CTCM_READ]->netdev; - if (!(priv && priv->channel[CTCM_READ] && ndev)) { + if (!(priv && priv->channel[CTCM_READ] && + priv->channel[CTCM_READ]->netdev)) { CTCM_DBF_TEXT(SETUP, CTC_DBF_ERROR, "bfnondev"); return -ENODEV; } + ndev = priv->channel[CTCM_READ]->netdev; rc = kstrtouint(buf, 0, &bs1); if (rc) From 0c0b20587b9f25a2ad14db7f80ebe49bdf29920a Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Tue, 10 May 2022 09:05:07 +0200 Subject: [PATCH 168/179] s390/ctcm: fix potential memory leak smatch complains about drivers/s390/net/ctcm_mpc.c:1210 ctcmpc_unpack_skb() warn: possible memory leak of 'mpcginfo' mpc_action_discontact() did not free mpcginfo. Consolidate the freeing in ctcmpc_unpack_skb(). Fixes: 293d984f0e36 ("ctcm: infrastructure for replaced ctc driver") Signed-off-by: Alexandra Winter Signed-off-by: David S. 
Miller --- drivers/s390/net/ctcm_mpc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index 88abfb5e8045..8ac213a55141 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -626,8 +626,6 @@ static void mpc_rcvd_sweep_resp(struct mpcg_info *mpcginfo) ctcm_clear_busy_do(dev); } - kfree(mpcginfo); - return; } @@ -1192,10 +1190,10 @@ static void ctcmpc_unpack_skb(struct channel *ch, struct sk_buff *pskb) CTCM_FUNTAIL, dev->name); priv->stats.rx_dropped++; /* mpcginfo only used for non-data transfers */ - kfree(mpcginfo); if (do_debug_data) ctcmpc_dump_skb(pskb, -8); } + kfree(mpcginfo); } done: @@ -1977,7 +1975,6 @@ static void mpc_action_rcvd_xid0(fsm_instance *fsm, int event, void *arg) } break; } - kfree(mpcginfo); CTCM_PR_DEBUG("ctcmpc:%s() %s xid2:%i xid7:%i xidt_p2:%i \n", __func__, ch->id, grp->outstanding_xid2, @@ -2038,7 +2035,6 @@ static void mpc_action_rcvd_xid7(fsm_instance *fsm, int event, void *arg) mpc_validate_xid(mpcginfo); break; } - kfree(mpcginfo); return; } From 671bb35c8e746439f0ed70815968f9a4f20a8deb Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Tue, 10 May 2022 09:05:08 +0200 Subject: [PATCH 169/179] s390/lcs: fix variable dereferenced before check smatch complains about drivers/s390/net/lcs.c:1741 lcs_get_control() warn: variable dereferenced before check 'card->dev' (see line 1739) Fixes: 27eb5ac8f015 ("[PATCH] s390: lcs driver bug fixes and improvements [1/2]") Signed-off-by: Alexandra Winter Signed-off-by: David S. 
Miller --- drivers/s390/net/lcs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c index bab9b34926c6..84c8981317b4 100644 --- a/drivers/s390/net/lcs.c +++ b/drivers/s390/net/lcs.c @@ -1736,10 +1736,11 @@ lcs_get_control(struct lcs_card *card, struct lcs_cmd *cmd) lcs_schedule_recovery(card); break; case LCS_CMD_STOPLAN: - pr_warn("Stoplan for %s initiated by LGW\n", - card->dev->name); - if (card->dev) + if (card->dev) { + pr_warn("Stoplan for %s initiated by LGW\n", + card->dev->name); netif_carrier_off(card->dev); + } break; default: LCS_DBF_TEXT(5, trace, "noLGWcmd"); From 8b796475fd7882663a870456466a4fb315cc1bd6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 10 May 2022 16:57:34 +0200 Subject: [PATCH 170/179] net/sched: act_pedit: really ensure the skb is writable Currently pedit tries to ensure that the accessed skb offset is writable via skb_unclone(). The action potentially allows touching any skb bytes, so it may end-up modifying shared data. The above causes some sporadic MPTCP self-test failures, due to this code: tc -n $ns2 filter add dev ns2eth$i egress \ protocol ip prio 1000 \ handle 42 fw \ action pedit munge offset 148 u8 invert \ pipe csum tcp \ index 100 The above modifies a data byte outside the skb head and the skb is a cloned one, carrying a TCP output packet. This change addresses the issue by keeping track of a rough over-estimate highest skb offset accessed by the action and ensuring such offset is really writable. Note that this may cause performance regressions in some scenarios, but hopefully pedit is not in the critical path. 
Fixes: db2c24175d14 ("act_pedit: access skb->data safely") Acked-by: Mat Martineau Tested-by: Geliang Tang Signed-off-by: Paolo Abeni Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/1fcf78e6679d0a287dd61bb0f04730ce33b3255d.1652194627.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_pedit.h | 1 + net/sched/act_pedit.c | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 748cf87a4d7e..3e02709a1df6 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -14,6 +14,7 @@ struct tcf_pedit { struct tc_action common; unsigned char tcfp_nkeys; unsigned char tcfp_flags; + u32 tcfp_off_max_hint; struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 31fcd279c177..0eaaf1f45de1 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -149,7 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; - int ksize; + int i, ksize; u32 index; if (!nla) { @@ -228,6 +228,18 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + p->tcfp_off_max_hint = 0; + for (i = 0; i < p->tcfp_nkeys; ++i) { + u32 cur = p->tcfp_keys[i].off; + + /* The AT option can read a single byte, we can bound the actual + * value with uchar max. 
+ */ + cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift; + + /* Each key touches 4 bytes starting from the computed offset */ + p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4); + } p->tcfp_flags = parm->flags; goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -308,13 +320,18 @@ static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); + u32 max_offset; int i; - if (skb_unclone(skb, GFP_ATOMIC)) - return p->tcf_action; - spin_lock(&p->tcf_lock); + max_offset = (skb_transport_header_was_set(skb) ? + skb_transport_offset(skb) : + skb_network_offset(skb)) + + p->tcfp_off_max_hint; + if (skb_ensure_writable(skb, min(skb->len, max_offset))) + goto unlock; + tcf_lastuse_update(&p->tcf_tm); if (p->tcfp_nkeys > 0) { @@ -403,6 +420,7 @@ bad: p->tcf_qstats.overlimits++; done: bstats_update(&p->tcf_bstats, skb); +unlock: spin_unlock(&p->tcf_lock); return p->tcf_action; } From 3f95a7472d14abef284d8968734fe2ae7ff4845f Mon Sep 17 00:00:00 2001 From: Xiaomeng Tong Date: Tue, 10 May 2022 13:48:46 -0700 Subject: [PATCH 171/179] i40e: i40e_main: fix a missing check on list iterator The bug is here: ret = i40e_add_macvlan_filter(hw, ch->seid, vdev->dev_addr, &aq_err); The list iterator 'ch' will point to a bogus position containing HEAD if the list is empty or no element is found. This case must be checked before any use of the iterator, otherwise it will lead to an invalid memory access. To fix this bug, use a new variable 'iter' as the list iterator, while using the original variable 'ch' as a dedicated pointer to point to the found element.
Cc: stable@vger.kernel.org Fixes: 1d8d80b4e4ff6 ("i40e: Add macvlan support on i40e") Signed-off-by: Xiaomeng Tong Tested-by: Gurucharan (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20220510204846.2166999-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 27 +++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6778df2177a1..98871f014994 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7549,42 +7549,43 @@ static void i40e_free_macvlan_channels(struct i40e_vsi *vsi) static int i40e_fwd_ring_up(struct i40e_vsi *vsi, struct net_device *vdev, struct i40e_fwd_adapter *fwd) { + struct i40e_channel *ch = NULL, *ch_tmp, *iter; int ret = 0, num_tc = 1, i, aq_err; - struct i40e_channel *ch, *ch_tmp; struct i40e_pf *pf = vsi->back; struct i40e_hw *hw = &pf->hw; - if (list_empty(&vsi->macvlan_list)) - return -EINVAL; - /* Go through the list and find an available channel */ - list_for_each_entry_safe(ch, ch_tmp, &vsi->macvlan_list, list) { - if (!i40e_is_channel_macvlan(ch)) { - ch->fwd = fwd; + list_for_each_entry_safe(iter, ch_tmp, &vsi->macvlan_list, list) { + if (!i40e_is_channel_macvlan(iter)) { + iter->fwd = fwd; /* record configuration for macvlan interface in vdev */ for (i = 0; i < num_tc; i++) netdev_bind_sb_channel_queue(vsi->netdev, vdev, i, - ch->num_queue_pairs, - ch->base_queue); - for (i = 0; i < ch->num_queue_pairs; i++) { + iter->num_queue_pairs, + iter->base_queue); + for (i = 0; i < iter->num_queue_pairs; i++) { struct i40e_ring *tx_ring, *rx_ring; u16 pf_q; - pf_q = ch->base_queue + i; + pf_q = iter->base_queue + i; /* Get to TX ring ptr */ tx_ring = vsi->tx_rings[pf_q]; - tx_ring->ch = ch; + tx_ring->ch = iter; /* Get the RX ring ptr */ rx_ring = vsi->rx_rings[pf_q]; - 
rx_ring->ch = ch; + rx_ring->ch = iter; } + ch = iter; break; } } + if (!ch) + return -EINVAL; + /* Guarantee all rings are updated before we update the * MAC address filter. */ From 103a2f3255a95991252f8f13375c3a96a75011cd Mon Sep 17 00:00:00 2001 From: Itay Iellin Date: Sat, 7 May 2022 08:32:48 -0400 Subject: [PATCH 172/179] Bluetooth: Fix the creation of hdev->name Set a size limit of 8 bytes of the written buffer to "hdev->name" including the terminating null byte, as the size of "hdev->name" is 8 bytes. If an id value which is greater than 9999 is allocated, then the "snprintf(hdev->name, sizeof(hdev->name), "hci%d", id)" function call would lead to a truncation of the id value in decimal notation. Set an explicit maximum id parameter in the id allocation function call. The id allocation function defines the maximum allocated id value as the maximum id parameter value minus one. Therefore, HCI_MAX_ID is defined as 10000. Signed-off-by: Itay Iellin Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +++ net/bluetooth/hci_core.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8abd08245326..62d7b81b1cb7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -36,6 +36,9 @@ /* HCI priority */ #define HCI_PRIO_MAX 7 +/* HCI maximum id value */ +#define HCI_MAX_ID 10000 + /* HCI Core structures */ struct inquiry_data { bdaddr_t bdaddr; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b4782a6c1025..45c2dd2e1590 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2555,10 +2555,10 @@ int hci_register_dev(struct hci_dev *hdev) */ switch (hdev->dev_type) { case HCI_PRIMARY: - id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL); + id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); break; case HCI_AMP: - id = ida_simple_get(&hci_index_ida, 1, 0, GFP_KERNEL); + 
id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); break; default: return -EINVAL; @@ -2567,7 +2567,7 @@ int hci_register_dev(struct hci_dev *hdev) if (id < 0) return id; - sprintf(hdev->name, "hci%d", id); + snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); From 00832b1d1a393dfb1b9491d085e5b27e8c25d103 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 11 May 2022 11:08:29 +0800 Subject: [PATCH 173/179] net: ethernet: mediatek: ppe: fix wrong size passed to memset() 'foe_table' is a pointer, the real size of struct mtk_foe_entry should be pass to memset(). Fixes: ba37b7caf1ed ("net: ethernet: mtk_eth_soc: add support for initializing the PPE") Signed-off-by: Yang Yingliang Acked-by: Felix Fietkau Link: https://lore.kernel.org/r/20220511030829.3308094-1-yangyingliang@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_ppe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 3ad10c793308..66298e2235c9 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -395,7 +395,7 @@ static void mtk_ppe_init_foe_table(struct mtk_ppe *ppe) static const u8 skip[] = { 12, 25, 38, 51, 76, 89, 102 }; int i, k; - memset(ppe->foe_table, 0, MTK_PPE_ENTRIES * sizeof(ppe->foe_table)); + memset(ppe->foe_table, 0, MTK_PPE_ENTRIES * sizeof(*ppe->foe_table)); if (!IS_ENABLED(CONFIG_SOC_MT7621)) return; From 6b77c06655b8a749c1a3d9ebc51e9717003f7e5a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 10 May 2022 20:17:51 -0700 Subject: [PATCH 174/179] net: bcmgenet: Check for Wake-on-LAN interrupt probe deferral The interrupt controller supplying the Wake-on-LAN interrupt line maybe modular on some platforms (irq-bcm7038-l1.c) and might be probed at a later time than the GENET driver. 
We need to specifically check for -EPROBE_DEFER and propagate that error to ensure that we eventually fetch the interrupt descriptor. Fixes: 9deb48b53e7f ("bcmgenet: add WOL IRQ check") Fixes: 5b1f0e62941b ("net: bcmgenet: Avoid touching non-existent interrupt") Signed-off-by: Florian Fainelli Reviewed-by: Stefan Wahren Link: https://lore.kernel.org/r/20220511031752.2245566-1-f.fainelli@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index bf1ec8fdc2ad..e87e46c47387 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3999,6 +3999,10 @@ static int bcmgenet_probe(struct platform_device *pdev) goto err; } priv->wol_irq = platform_get_irq_optional(pdev, 2); + if (priv->wol_irq == -EPROBE_DEFER) { + err = priv->wol_irq; + goto err; + } priv->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->base)) { From 810c2f0a3f86158c1e02e74947b66d811473434a Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 11 May 2022 14:57:47 +0300 Subject: [PATCH 175/179] mlxsw: Avoid warning during ip6gre device removal IPv6 addresses which are used for tunnels are stored in a hash table with reference counting. When a new GRE tunnel is configured, the driver is notified and configures it in hardware. Currently, any change in the tunnel is not applied in the driver. It means that if the remote address is changed, the driver is not aware of this change and the first address will be used. This behavior results in a warning [1] in scenarios such as the following: # ip link add name gre1 type ip6gre local 2000::3 remote 2000::fffe tos inherit ttl inherit # ip link set name gre1 type ip6gre local 2000::3 remote 2000::ffff ttl inherit # ip link delete gre1 The change of the address is not applied in the driver. 
Currently, the driver uses the remote address which is stored in the 'parms' of the overlay device. When the tunnel is removed, the new IPv6 address is used, the driver tries to release it, but as it is not aware of the change, this address is not configured and it warns about releasing a non-existent IPv6 address. Fix it by using the IPv6 address which is cached in the IPIP entry; this address is the last one that the driver used, so even in cases such as the above, the first address will be released, without any warning. [1]: WARNING: CPU: 1 PID: 2197 at drivers/net/ethernet/mellanox/mlxsw/spectrum.c:2920 mlxsw_sp_ipv6_addr_put+0x146/0x220 [mlxsw_spectrum] ... CPU: 1 PID: 2197 Comm: ip Not tainted 5.17.0-rc8-custom-95062-gc1e5ded51a9a #84 Hardware name: Mellanox Technologies Ltd. MSN4700/VMOD0010, BIOS 5.11 07/12/2021 RIP: 0010:mlxsw_sp_ipv6_addr_put+0x146/0x220 [mlxsw_spectrum] ... Call Trace: mlxsw_sp2_ipip_rem_addr_unset_gre6+0xf1/0x120 [mlxsw_spectrum] mlxsw_sp_netdevice_ipip_ol_event+0xdb/0x640 [mlxsw_spectrum] mlxsw_sp_netdevice_event+0xc4/0x850 [mlxsw_spectrum] raw_notifier_call_chain+0x3c/0x50 call_netdevice_notifiers_info+0x2f/0x80 unregister_netdevice_many+0x311/0x6d0 rtnl_dellink+0x136/0x360 rtnetlink_rcv_msg+0x12f/0x380 netlink_rcv_skb+0x49/0xf0 netlink_unicast+0x233/0x340 netlink_sendmsg+0x202/0x440 ____sys_sendmsg+0x1f3/0x220 ___sys_sendmsg+0x70/0xb0 __sys_sendmsg+0x54/0xa0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: e846efe2737b ("mlxsw: spectrum: Add hash table for IPv6 address mapping") Reported-by: Maksym Yaremchuk Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20220511115747.238602-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c index
01cf5a6a26bd..a2ee695a3f17 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c @@ -568,10 +568,8 @@ static int mlxsw_sp2_ipip_rem_addr_set_gre6(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_ipip_entry *ipip_entry) { - struct __ip6_tnl_parm parms6; - - parms6 = mlxsw_sp_ipip_netdev_parms6(ipip_entry->ol_dev); - return mlxsw_sp_ipv6_addr_kvdl_index_get(mlxsw_sp, &parms6.raddr, + return mlxsw_sp_ipv6_addr_kvdl_index_get(mlxsw_sp, + &ipip_entry->parms.daddr.addr6, &ipip_entry->dip_kvdl_index); } @@ -579,10 +577,7 @@ static void mlxsw_sp2_ipip_rem_addr_unset_gre6(struct mlxsw_sp *mlxsw_sp, const struct mlxsw_sp_ipip_entry *ipip_entry) { - struct __ip6_tnl_parm parms6; - - parms6 = mlxsw_sp_ipip_netdev_parms6(ipip_entry->ol_dev); - mlxsw_sp_ipv6_addr_put(mlxsw_sp, &parms6.raddr); + mlxsw_sp_ipv6_addr_put(mlxsw_sp, &ipip_entry->parms.daddr.addr6); } static const struct mlxsw_sp_ipip_ops mlxsw_sp2_ipip_gre6_ops = { From b7be130c5d52e5224ac7d89568737b37b4c4b785 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 May 2022 19:17:31 -0700 Subject: [PATCH 176/179] net: dsa: bcm_sf2: Fix Wake-on-LAN with mac_link_down() After commit 2d1f90f9ba83 ("net: dsa/bcm_sf2: fix incorrect usage of state->link") the interface suspend path would call our mac_link_down() call back which would forcibly set the link down, thus preventing Wake-on-LAN packets from reaching our management port. Fix this by looking at whether the port is enabled for Wake-on-LAN and not clearing the link status in that case to let packets go through. 
Fixes: 2d1f90f9ba83 ("net: dsa/bcm_sf2: fix incorrect usage of state->link") Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20220512021731.2494261-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/bcm_sf2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index cf82b1fa9725..87e81c636339 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -809,6 +809,9 @@ static void bcm_sf2_sw_mac_link_down(struct dsa_switch *ds, int port, struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); u32 reg, offset; + if (priv->wol_ports_mask & BIT(port)) + return; + if (port != core_readl(priv, CORE_IMP0_PRT_ID)) { if (priv->type == BCM4908_DEVICE_ID || priv->type == BCM7445_DEVICE_ID) From f3c46e41b32b6266cf60b0985c61748f53bf1c61 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Thu, 12 May 2022 11:08:20 +0800 Subject: [PATCH 177/179] net/smc: non blocking recvmsg() return -EAGAIN when no data and signal_pending Non-blocking sendmsg returns -EAGAIN when a signal is pending and no send space is left, while non-blocking recvmsg returns -EINTR when a signal is pending and no data has been received. This is confusing, as TCP returns -EAGAIN in the conditions described above. Align the behavior of smc with TCP.
Fixes: 846e344eb722 ("net/smc: add receive timeout check") Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Karsten Graul Link: https://lore.kernel.org/r/20220512030820.73848-1-guangguan.wang@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 51e8eb2933ff..338b9ef806e8 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -355,12 +355,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } break; } + if (!timeo) + return -EAGAIN; if (signal_pending(current)) { read_done = sock_intr_errno(timeo); break; } - if (!timeo) - return -EAGAIN; } if (!smc_rx_data_available(conn)) { From 1fa89ffbc04545b7582518e57f4b63e2a062870f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 12 May 2022 05:47:09 +0000 Subject: [PATCH 178/179] net: sfc: ef10: fix memory leak in efx_ef10_mtd_probe() In the NIC ->probe() callback, ->mtd_probe() callback is called. If NIC has 2 ports, ->probe() is called twice and ->mtd_probe() too. In the ->mtd_probe(), which is efx_ef10_mtd_probe(), it allocates and initializes mtd partition. But mtd partition for sfc is shared data. So that allocated mtd partition data from last called efx_ef10_mtd_probe() will not be used. Therefore it must be freed. But efx_ef10_mtd_probe() doesn't free the unused mtd partition data. kmemleak reports: unreferenced object 0xffff88811ddb0000 (size 63168): comm "systemd-udevd", pid 265, jiffies 4294681048 (age 348.586s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace: [] kmalloc_order_trace+0x19/0x120 [] __kmalloc+0x20e/0x250 [] efx_ef10_mtd_probe+0x11f/0x270 [sfc] [] efx_pci_probe.cold.17+0x3df/0x53d [sfc] [] local_pci_probe+0xdc/0x170 [] pci_device_probe+0x235/0x680 [] really_probe+0x1c2/0x8f0 [] __driver_probe_device+0x2ab/0x460 [] driver_probe_device+0x4a/0x120 [] __driver_attach+0x16e/0x320 [] bus_for_each_dev+0x110/0x190 [] bus_add_driver+0x39e/0x560 [] driver_register+0x18e/0x310 [] 0xffffffffc02e2055 [] do_one_initcall+0xc3/0x450 [] do_init_module+0x1b4/0x700 Acked-by: Martin Habets Fixes: 8127d661e77f ("sfc: Add support for Solarflare SFC9100 family") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20220512054709.12513-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/ef10.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 50d535981a35..f8edb3f1b73a 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -3579,6 +3579,11 @@ static int efx_ef10_mtd_probe(struct efx_nic *efx) n_parts++; } + if (!n_parts) { + kfree(parts); + return 0; + } + rc = efx_mtd_add(efx, &parts[0].common, n_parts, sizeof(*parts)); fail: if (rc) From 3740651bf7e200109dd42d5b2fb22226b26f960a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 12 May 2022 12:18:30 +0300 Subject: [PATCH 179/179] tls: Fix context leak on tls_device_down The commit cited below claims to fix a use-after-free condition after tls_device_down. Apparently, the description wasn't fully accurate. The context stayed alive, but ctx->netdev became NULL, and the offload was torn down without a proper fallback, so a bug was present, but a different kind of bug. Due to misunderstanding of the issue, the original patch dropped the refcount_dec_and_test line for the context to avoid the alleged premature deallocation. 
That line has to be restored, because it matches the refcount_inc_not_zero from the same function, otherwise the contexts that survived tls_device_down are leaked. This patch fixes the described issue by restoring refcount_dec_and_test. After this change, there is no leak anymore, and the fallback to software kTLS still works. Fixes: c55dcdd435aa ("net/tls: Fix use-after-free after the TLS device goes down and up") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220512091830.678684-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index af875ad4a822..3919fe2c58c5 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1347,7 +1347,10 @@ static int tls_device_down(struct net_device *netdev) /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. + * Now release the ref taken above. */ + if (refcount_dec_and_test(&ctx->refcount)) + tls_device_free_ctx(ctx); } up_write(&device_offload_lock);