From f54f0d0e2b1f74de85ff02013fa4886e4154aca5 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 17 Oct 2024 10:45:24 -0700
Subject: [PATCH 1/5] nvme: enhance cns version checking

The number of CNS bits in the Identify command depends on which NVMe
specification version the controller complies with. The existing check
is not sufficient for all the CNS values the driver uses, which may
create confusion between host and device, so enhance the check to
consider both the version and the desired CNS value.

Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 876c8e6311db..09466e208729 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1390,17 +1390,30 @@ static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
 	nvme_start_keep_alive(ctrl);
 }
 
-/*
- * In NVMe 1.0 the CNS field was just a binary controller or namespace
- * flag, thus sending any new CNS opcodes has a big chance of not working.
- * Qemu unfortunately had that bug after reporting a 1.1 version compliance
- * (but not for any later version).
- */
-static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+static bool nvme_id_cns_ok(struct nvme_ctrl *ctrl, u8 cns)
 {
-	if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
-		return ctrl->vs < NVME_VS(1, 2, 0);
-	return ctrl->vs < NVME_VS(1, 1, 0);
+	/*
+	 * The CNS field occupies a full byte starting with NVMe 1.2
+	 */
+	if (ctrl->vs >= NVME_VS(1, 2, 0))
+		return true;
+
+	/*
+	 * NVMe 1.1 expanded the CNS value to two bits, which means values
+	 * larger than that could get truncated and treated as an incorrect
+	 * value.
+	 *
+	 * Qemu implemented 1.0 behavior for controllers claiming 1.1
+	 * compliance, so they need to be quirked here.
+	 */
+	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS))
+		return cns <= 3;
+
+	/*
+	 * NVMe 1.0 used a single bit for the CNS value.
+	 */
+	return cns <= 1;
 }
 
 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
@@ -3104,7 +3117,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
 	ctrl->max_zeroes_sectors = 0;
 
 	if (ctrl->subsys->subtype != NVME_NQN_NVME ||
-	    nvme_ctrl_limited_cns(ctrl) ||
+	    !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) ||
 	    test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
 		return 0;
 
@@ -4200,7 +4213,7 @@ static void nvme_scan_work(struct work_struct *work)
 	}
 
 	mutex_lock(&ctrl->scan_lock);
-	if (nvme_ctrl_limited_cns(ctrl)) {
+	if (!nvme_id_cns_ok(ctrl, NVME_ID_CNS_NS_ACTIVE_LIST)) {
 		nvme_scan_ns_sequential(ctrl);
 	} else {
 		/*

From be0e822bb3f5259c7f9424ba97e8175211288813 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 28 Oct 2024 10:07:48 +0100
Subject: [PATCH 2/5] block: fix queue limits checks in blk_rq_map_user_bvec for real

blk_rq_map_user_bvec currently only has ad-hoc checks for queue limits,
and the last fix to it enabled valid NVMe I/O to pass, but also allowed
invalid I/O for drivers that set a max_segment_size or seg_boundary
limit. Fix it once and for all by using the bio_split_rw_at helper from
the I/O path, which indicates if and where a bio would have to be split
to adhere to the queue limits. If it returns a positive value, turn
that into -EREMOTEIO to retry using the copy path.
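For clarity, a minimal sketch (not part of the patch) of the calling
convention this relies on: bio_split_rw_at returns a negative errno on
failure, 0 when the bio already adheres to the queue limits, and a
positive split offset when the bio would have to be split:

	ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes);
	if (ret < 0)
		return ret;		/* hard error from the limits check */
	if (ret > 0)
		return -EREMOTEIO;	/* split needed: fall back to the copy path */
	/* ret == 0: safe to map the user bvecs directly */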
Fixes: 2ff949441802 ("block: fix sanity checks in blk_rq_map_user_bvec")
Signed-off-by: Christoph Hellwig
Reviewed-by: John Garry
Link: https://lore.kernel.org/r/20241028090840.446180-1-hch@lst.de
Signed-off-by: Jens Axboe
---
 block/blk-map.c | 56 +++++++++++++++----------------------------------
 1 file changed, 17 insertions(+), 39 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index 6ef2ec1f7d78..b5fd1d857461 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -561,55 +561,33 @@ EXPORT_SYMBOL(blk_rq_append_bio);
 /* Prepare bio for passthrough IO given ITER_BVEC iter */
 static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 {
-	struct request_queue *q = rq->q;
-	size_t nr_iter = iov_iter_count(iter);
-	size_t nr_segs = iter->nr_segs;
-	struct bio_vec *bvecs, *bvprvp = NULL;
-	const struct queue_limits *lim = &q->limits;
-	unsigned int nsegs = 0, bytes = 0;
+	const struct queue_limits *lim = &rq->q->limits;
+	unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT;
+	unsigned int nsegs;
 	struct bio *bio;
-	size_t i;
+	int ret;
 
-	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
-		return -EINVAL;
-	if (nr_segs > queue_max_segments(q))
+	if (!iov_iter_count(iter) || iov_iter_count(iter) > max_bytes)
 		return -EINVAL;
 
-	/* no iovecs to alloc, as we already have a BVEC iterator */
+	/* reuse the bvecs from the iterator instead of allocating new ones */
 	bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
-	if (bio == NULL)
+	if (!bio)
 		return -ENOMEM;
-
 	bio_iov_bvec_set(bio, (struct iov_iter *)iter);
-	blk_rq_bio_prep(rq, bio, nr_segs);
-
-	/* loop to perform a bunch of sanity checks */
-	bvecs = (struct bio_vec *)iter->bvec;
-	for (i = 0; i < nr_segs; i++) {
-		struct bio_vec *bv = &bvecs[i];
-
-		/*
-		 * If the queue doesn't support SG gaps and adding this
-		 * offset would create a gap, fallback to copy.
-		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
-			blk_mq_map_bio_put(bio);
-			return -EREMOTEIO;
-		}
-
-		/* check full condition */
-		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
-			goto put_bio;
-		if (bytes + bv->bv_len > nr_iter)
-			break;
-
-		nsegs++;
-		bytes += bv->bv_len;
-		bvprvp = bv;
+	/* check that the data layout matches the hardware restrictions */
+	ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes);
+	if (ret) {
+		/* if we would have to split the bio, copy instead */
+		if (ret > 0)
+			ret = -EREMOTEIO;
+		blk_mq_map_bio_put(bio);
+		return ret;
 	}
+
+	blk_rq_bio_prep(rq, bio, nsegs);
 	return 0;
-put_bio:
-	blk_mq_map_bio_put(bio);
-	return -EINVAL;
 }
 
 /**

From 42ab37eaad17aee458489c553a367621ee04e0bc Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 23 Oct 2024 08:40:26 -0700
Subject: [PATCH 3/5] nvme: module parameter to disable pi with offsets

A recent commit enables integrity checks for formats that previous
kernel versions registered with the "nop" integrity profile. This means
namespaces using such a format become unreadable when upgrading the
kernel past that commit. Introduce a module parameter to restore the
"nop" integrity profile so that the storage remains readable. The
affected namespace could be a boot device, so the setting needs to
happen at module load time.
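For example, the old behavior can then be restored from the kernel
command line (assuming the parameter is exposed through the nvme_core
module, since module_param() is added in drivers/nvme/host/core.c):

	nvme_core.disable_pi_offsets=1

or, for a non-boot device, at manual module load time:

	modprobe nvme_core disable_pi_offsets=1

Note the 0444 permissions below: the parameter is read-only at runtime
and can only be set at load time, matching the boot device requirement
stated above.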
Fixes: 921e81db524d17 ("nvme: allow integrity when PI is not in first bytes")
Reported-by: David Wei
Reviewed-by: Christoph Hellwig
Reviewed-by: Kanchan Joshi
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 09466e208729..ce20b916301a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -91,6 +91,17 @@ module_param(apst_secondary_latency_tol_us, ulong, 0644);
 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
 	"secondary APST latency tolerance in us");
 
+/*
+ * Older kernels didn't enable protection information if it was at an offset.
+ * Newer kernels do, so it breaks reads on the upgrade if such formats were
+ * used in prior kernels since the metadata written did not contain a valid
+ * checksum.
+ */
+static bool disable_pi_offsets = false;
+module_param(disable_pi_offsets, bool, 0444);
+MODULE_PARM_DESC(disable_pi_offsets,
+	"disable protection information if it has an offset");
+
 /*
  * nvme_wq - hosts nvme related works that are not reset or delete
  * nvme_reset_wq - hosts nvme reset works
@@ -1926,8 +1937,12 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
 	if (head->pi_size && head->ms >= head->pi_size)
 		head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
 
-	if (!(id->dps & NVME_NS_DPS_PI_FIRST))
-		info->pi_offset = head->ms - head->pi_size;
+	if (!(id->dps & NVME_NS_DPS_PI_FIRST)) {
+		if (disable_pi_offsets)
+			head->pi_type = 0;
+		else
+			info->pi_offset = head->ms - head->pi_size;
+	}
 
 	if (ctrl->ops->flags & NVME_F_FABRICS) {
 		/*

From d2f551b1f72b4c508ab9298419f6feadc3b5d791 Mon Sep 17 00:00:00 2001
From: Vitaliy Shevtsov
Date: Mon, 16 Sep 2024 22:41:37 +0500
Subject: [PATCH 4/5] nvmet-auth: assign dh_key to NULL after kfree_sensitive

ctrl->dh_key might be used across multiple calls to nvmet_setup_dhgroup()
for the same controller, so nullify it after release on the error path
in order to avoid a double free later in nvmet_destroy_auth().

Found by Linux Verification Center (linuxtesting.org) with Svace.

Fixes: 7a277c37d352 ("nvmet-auth: Diffie-Hellman key exchange support")
Cc: stable@vger.kernel.org
Signed-off-by: Vitaliy Shevtsov
Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Signed-off-by: Keith Busch
---
 drivers/nvme/target/auth.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 7897d02c681d..b0fd211ec57e 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -115,6 +115,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
 		pr_debug("%s: ctrl %d failed to generate private key, err %d\n",
 			 __func__, ctrl->cntlid, ret);
 		kfree_sensitive(ctrl->dh_key);
+		ctrl->dh_key = NULL;
 		return ret;
 	}
 	ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm);

From 5eed4fb274cd6579f2fb4190b11c4c86c553cd06 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Mon, 28 Oct 2024 13:45:46 -0700
Subject: [PATCH 5/5] nvme: re-fix error-handling for io_uring nvme-passthrough

This was previously fixed with commit 1147dd0503564fa0e0348 ("nvme: fix
error-handling for io_uring nvme-passthrough"), but the change was
mistakenly undone in a later commit.
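In other words, the completion handler should pick the status reported
back to io_uring in this order (a rough restatement of the hunk below
for clarity, not additional code in the patch):

	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
		pdu->status = -EINTR;			/* cancelled request */
	else if (nvme_req(req)->status)
		pdu->status = nvme_req(req)->status;	/* NVMe status field */
	else
		pdu->status = blk_status_to_errno(err);	/* block layer error */

so a block layer failure in err is no longer silently dropped when the
NVMe status itself is zero.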
Fixes: d6aacee9255e7f ("nvme: use bio_integrity_map_user")
Cc: stable@vger.kernel.org
Reported-by: Jens Axboe
Reviewed-by: Christoph Hellwig
Reviewed-by: Anuj Gupta
Reviewed-by: Kanchan Joshi
Signed-off-by: Keith Busch
---
 drivers/nvme/host/ioctl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index b9b79ccfabf8..a96976b22fa7 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -421,10 +421,13 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 
-	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED) {
 		pdu->status = -EINTR;
-	else
+	} else {
 		pdu->status = nvme_req(req)->status;
+		if (!pdu->status)
+			pdu->status = blk_status_to_errno(err);
+	}
 	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*