From 8c936f9ea11ec4e35e288810a7503b5c841a355f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 26 Apr 2022 19:01:01 -1000
Subject: [PATCH 1/3] iocost: don't reset the inuse weight of under-weighted
 debtors

When an iocg is in debt, its inuse weight is owned by debt handling and
should stay at 1. This invariant was broken when determining the amount of
surpluses at the beginning of donation calculation - when an iocg's
hierarchical weight is too low, the iocg is excluded from donation
calculation and its inuse is reset to its active regardless of its
indebtedness, triggering warnings like the following:

 WARNING: CPU: 5 PID: 0 at block/blk-iocost.c:1416 iocg_kick_waitq+0x392/0x3a0
 ...
 RIP: 0010:iocg_kick_waitq+0x392/0x3a0
 Code: 00 00 be ff ff ff ff 48 89 4d a8 e8 98 b2 70 00 48 8b 4d a8 85 c0 0f 85 4a fe ff ff 0f 0b e9 43 fe ff ff 0f 0b e9 4d fe ff ff <0f> 0b e9 50 fe ff ff e8 a2 ae 70 00 66 90 0f 1f 44 00 00 55 48 89
 RSP: 0018:ffffc90000200d08 EFLAGS: 00010016
 ...
  <IRQ>
  ioc_timer_fn+0x2e0/0x1470
  call_timer_fn+0xa1/0x2c0
 ...

As this happens only when an iocg's hierarchical weight is negligible, its
impact likely is limited to triggering the warnings. Fix it by skipping
resetting inuse of under-weighted debtors.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Rik van Riel <riel@surriel.com>
Fixes: c421a3eb2e27 ("blk-iocost: revamp debt handling")
Cc: stable@vger.kernel.org # v5.10+
Link: https://lore.kernel.org/r/YmjODd4aif9BzFuO@slm.duckdns.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-iocost.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 70a0a3d680a3..9bd670999d0a 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2322,7 +2322,17 @@ static void ioc_timer_fn(struct timer_list *timer)
 				iocg->hweight_donating = hwa;
 				iocg->hweight_after_donation = new_hwi;
 				list_add(&iocg->surplus_list, &surpluses);
-			} else {
+			} else if (!iocg->abs_vdebt) {
+				/*
+				 * @iocg doesn't have enough to donate. Reset
+				 * its inuse to active.
+				 *
+				 * Don't reset debtors as their inuse's are
+				 * owned by debt handling. This shouldn't affect
+				 * donation calculuation in any meaningful way
+				 * as @iocg doesn't have a meaningful amount of
+				 * share anyway.
+				 */
 				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
 						iocg->inuse, iocg->active,
 						iocg->hweight_inuse, new_hwi);

From 4cddeacad6d4b23493a108d0705e7d2ab89ba5a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 27 Apr 2022 09:49:12 -1000
Subject: [PATCH 2/3] Revert "block: inherit request start time from bio for
 BLK_CGROUP"

This reverts commit 0006707723233cb2a9a23ca19fc3d0864835704c. It has a
couple problems:

* bio_issue_time() is stored in bio->bi_issue truncated to 51 bits. This
  overflows in slightly over 26 days. Setting rq->io_start_time_ns with it
  means that io duration calculation would yield >26days after 26 days of
  uptime. This, for example, confuses kyber making it cause high IO
  latencies.

* rq->io_start_time_ns should record the time that the IO is issued to the
  device so that on-device latency can be measured. However,
  bio_issue_time() is set before the bio goes through the rq-qos controllers
  (wbt, iolatency, iocost), so when the bio gets throttled in any of the
  mechanisms, the measured latencies make no sense - on-device latencies end
  up higher than request-alloc-to-completion latencies.

We'll need a smarter way to avoid calling ktime_get_ns() repeatedly
back-to-back. For now, let's revert the commit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org # v5.16+
Link: https://lore.kernel.org/r/YmmeOLfo5lzc+8yI@slm.duckdns.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c4370d276170..84d749511f55 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1131,14 +1131,7 @@ void blk_mq_start_request(struct request *rq)
 	trace_block_rq_issue(rq);
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		u64 start_time;
-#ifdef CONFIG_BLK_CGROUP
-		if (rq->bio)
-			start_time = bio_issue_time(&rq->bio->bi_issue);
-		else
-#endif
-			start_time = ktime_get_ns();
-		rq->io_start_time_ns = start_time;
+		rq->io_start_time_ns = ktime_get_ns();
 		rq->stats_sectors = blk_rq_sectors(rq);
 		rq->rq_flags |= RQF_STATS;
 		rq_qos_issue(q, rq);

From 09df6a75fffa68169c5ef9bef990cd7ba94f3eef Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 7 Apr 2022 16:07:38 +0200
Subject: [PATCH 3/3] bfq: Fix warning in bfqq_request_over_limit()

People are occasionally reporting a warning bfqq_request_over_limit()
triggering reporting that BFQ's idea of cgroup hierarchy (and its depth)
does not match what generic blkcg code thinks. This can actually happen
when bfqq gets moved between BFQ groups while bfqq_request_over_limit()
is running. Make sure the code is safe against BFQ queue being moved to
a different BFQ group.

Fixes: 76f1df88bbc2 ("bfq: Limit number of requests consumed by each cgroup")
CC: stable@vger.kernel.org
Link: https://lore.kernel.org/all/CAJCQCtTw_2C7ZSz7as5Gvq=OmnDiio=HRkQekqWpKot84sQhFA@mail.gmail.com/
Reported-by: Chris Murphy <lists@colorremedies.com>
Reported-by: "yukuai (C)" <yukuai3@huawei.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220407140738.9723-1-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 2e0dd68a3cbe..1f62dbdc521f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -569,7 +569,7 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
 	struct bfq_entity *entity = &bfqq->entity;
 	struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
 	struct bfq_entity **entities = inline_entities;
-	int depth, level;
+	int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
 	int class_idx = bfqq->ioprio_class - 1;
 	struct bfq_sched_data *sched_data;
 	unsigned long wsum;
@@ -578,15 +578,21 @@ static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
 	if (!entity->on_st_or_in_serv)
 		return false;
 
+retry:
+	spin_lock_irq(&bfqd->lock);
 	/* +1 for bfqq entity, root cgroup not included */
 	depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
-	if (depth > BFQ_LIMIT_INLINE_DEPTH) {
+	if (depth > alloc_depth) {
+		spin_unlock_irq(&bfqd->lock);
+		if (entities != inline_entities)
+			kfree(entities);
 		entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
 		if (!entities)
 			return false;
+		alloc_depth = depth;
+		goto retry;
 	}
 
-	spin_lock_irq(&bfqd->lock);
 	sched_data = entity->sched_data;
 	/* Gather our ancestors as we need to traverse them in reverse order */
 	level = 0;