mirror of
https://github.com/torvalds/linux.git
synced 2024-11-22 12:11:40 +00:00
320fb0f91e
Test scripts:

cd /sys/fs/cgroup/blkio/
echo "8:0 1024" > blkio.throttle.write_bps_device
echo $$ > cgroup.procs
dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &
dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct &

Test result:

10240 bytes (10 kB, 10 KiB) copied, 10.0134 s, 1.0 kB/s
10240 bytes (10 kB, 10 KiB) copied, 10.0135 s, 1.0 kB/s

The problem is that the second bio finishes after 10s instead of 20s.

Root cause:

1) The second bio will be flagged:

__blk_throtl_bio
 while (true) {
  ...
  if (sq->nr_queued[rw]) -> some bio is throttled already
   break
 };
 bio_set_flag(bio, BIO_THROTTLED); -> flag the bio

2) The flagged bio will be dispatched without waiting:

throtl_dispatch_tg
 tg_may_dispatch
  tg_with_in_bps_limit
   if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED))
    *wait = 0; -> wait time is zero
    return true;

Commit 9f5ede3c01 ("block: throttle split bio in case of iops limit") added support for counting split bios against the iops limit. To do so it added a flagged-bio check in tg_with_in_bps_limit() so that split bios are only counted once for the bps limit. However, it introduced a new problem: io throttling doesn't work if multiple bios are throttled.

In order to fix the problem, handle the iops/bps limits in different ways:

1) for the iops limit, there is no flag to record whether the bio is throttled, and iops is always applied.
2) for the bps limit, the original bio will be flagged with BIO_BPS_THROTTLED, and io throttle will ignore a bio with the flag.

Note that this patch also removes the code that sets the flag in __bio_clone(). That code was introduced in commit 111be88398 ("block-throttle: avoid double charge"), whose author thought a split bio could be resubmitted and throttled again, which is wrong because a split bio will continue to dispatch from the caller.

Fixes: 9f5ede3c01 ("block: throttle split bio in case of iops limit")
Cc: <stable@vger.kernel.org>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220829022240.3348319-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
190 lines
6.0 KiB
C
190 lines
6.0 KiB
C
#ifndef BLK_THROTTLE_H
|
|
#define BLK_THROTTLE_H
|
|
|
|
#include "blk-cgroup-rwstat.h"
|
|
|
|
/*
|
|
* To implement hierarchical throttling, throtl_grps form a tree and bios
|
|
* are dispatched upwards level by level until they reach the top and get
|
|
* issued. When dispatching bios from the children and local group at each
|
|
* level, if the bios are dispatched into a single bio_list, there's a risk
|
|
* of a local or child group which can queue many bios at once filling up
|
|
* the list starving others.
|
|
*
|
|
* To avoid such starvation, dispatched bios are queued separately
|
|
* according to where they came from. When they are again dispatched to
|
|
* the parent, they're popped in round-robin order so that no single source
|
|
* hogs the dispatch window.
|
|
*
|
|
* throtl_qnode is used to keep the queued bios separated by their sources.
|
|
* Bios are queued to throtl_qnode which in turn is queued to
|
|
* throtl_service_queue and then dispatched in round-robin order.
|
|
*
|
|
* It's also used to track the reference counts on blkg's. A qnode always
|
|
* belongs to a throtl_grp and gets queued on itself or the parent, so
|
|
* incrementing the reference of the associated throtl_grp when a qnode is
|
|
* queued and decrementing when dequeued is enough to keep the whole blkg
|
|
* tree pinned while bios are in flight.
|
|
*/
|
|
struct throtl_qnode {
|
|
struct list_head node; /* service_queue->queued[] */
|
|
struct bio_list bios; /* queued bios */
|
|
struct throtl_grp *tg; /* tg this qnode belongs to */
|
|
};
|
|
|
|
struct throtl_service_queue {
|
|
struct throtl_service_queue *parent_sq; /* the parent service_queue */
|
|
|
|
/*
|
|
* Bios queued directly to this service_queue or dispatched from
|
|
* children throtl_grp's.
|
|
*/
|
|
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
|
|
unsigned int nr_queued[2]; /* number of queued bios */
|
|
|
|
/*
|
|
* RB tree of active children throtl_grp's, which are sorted by
|
|
* their ->disptime.
|
|
*/
|
|
struct rb_root_cached pending_tree; /* RB tree of active tgs */
|
|
unsigned int nr_pending; /* # queued in the tree */
|
|
unsigned long first_pending_disptime; /* disptime of the first tg */
|
|
struct timer_list pending_timer; /* fires on first_pending_disptime */
|
|
};
|
|
|
|
/* Bit flags; each constant occupies its own bit. */
enum tg_state_flags {
	/* bit 0: on parent's pending tree */
	THROTL_TG_PENDING		= 0x1,
	/* bit 1: bio_lists[] became non-empty */
	THROTL_TG_WAS_EMPTY		= 0x2,
	/* bit 2: tg has iops limit */
	THROTL_TG_HAS_IOPS_LIMIT	= 0x4,
	/* bit 3: starts to cancel bio */
	THROTL_TG_CANCELING		= 0x8,
};
|
|
|
|
/*
 * Limit kinds.  LIMIT_CNT is the number of kinds and is used as an array
 * dimension (e.g. bps[2][LIMIT_CNT]); presumably LIMIT_LOW and LIMIT_MAX
 * serve as the matching indices - confirm against the users.
 */
enum {
	LIMIT_LOW = 0,
	LIMIT_MAX = 1,
	LIMIT_CNT = 2,
};
|
|
|
|
struct throtl_grp {
|
|
/* must be the first member */
|
|
struct blkg_policy_data pd;
|
|
|
|
/* active throtl group service_queue member */
|
|
struct rb_node rb_node;
|
|
|
|
/* throtl_data this group belongs to */
|
|
struct throtl_data *td;
|
|
|
|
/* this group's service queue */
|
|
struct throtl_service_queue service_queue;
|
|
|
|
/*
|
|
* qnode_on_self is used when bios are directly queued to this
|
|
* throtl_grp so that local bios compete fairly with bios
|
|
* dispatched from children. qnode_on_parent is used when bios are
|
|
* dispatched from this throtl_grp into its parent and will compete
|
|
* with the sibling qnode_on_parents and the parent's
|
|
* qnode_on_self.
|
|
*/
|
|
struct throtl_qnode qnode_on_self[2];
|
|
struct throtl_qnode qnode_on_parent[2];
|
|
|
|
/*
|
|
* Dispatch time in jiffies. This is the estimated time when group
|
|
* will unthrottle and is ready to dispatch more bio. It is used as
|
|
* key to sort active groups in service tree.
|
|
*/
|
|
unsigned long disptime;
|
|
|
|
unsigned int flags;
|
|
|
|
/* are there any throtl rules between this group and td? */
|
|
bool has_rules[2];
|
|
|
|
/* internally used bytes per second rate limits */
|
|
uint64_t bps[2][LIMIT_CNT];
|
|
/* user configured bps limits */
|
|
uint64_t bps_conf[2][LIMIT_CNT];
|
|
|
|
/* internally used IOPS limits */
|
|
unsigned int iops[2][LIMIT_CNT];
|
|
/* user configured IOPS limits */
|
|
unsigned int iops_conf[2][LIMIT_CNT];
|
|
|
|
/* Number of bytes dispatched in current slice */
|
|
uint64_t bytes_disp[2];
|
|
/* Number of bio's dispatched in current slice */
|
|
unsigned int io_disp[2];
|
|
|
|
unsigned long last_low_overflow_time[2];
|
|
|
|
uint64_t last_bytes_disp[2];
|
|
unsigned int last_io_disp[2];
|
|
|
|
unsigned long last_check_time;
|
|
|
|
unsigned long latency_target; /* us */
|
|
unsigned long latency_target_conf; /* us */
|
|
/* When did we start a new slice */
|
|
unsigned long slice_start[2];
|
|
unsigned long slice_end[2];
|
|
|
|
unsigned long last_finish_time; /* ns / 1024 */
|
|
unsigned long checked_last_finish_time; /* ns / 1024 */
|
|
unsigned long avg_idletime; /* ns / 1024 */
|
|
unsigned long idletime_threshold; /* us */
|
|
unsigned long idletime_threshold_conf; /* us */
|
|
|
|
unsigned int bio_cnt; /* total bios */
|
|
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
|
|
unsigned long bio_cnt_reset_time;
|
|
|
|
struct blkg_rwstat stat_bytes;
|
|
struct blkg_rwstat stat_ios;
|
|
};
|
|
|
|
/* Policy object handed to blkg_to_pd() by blkg_to_tg(); defined outside this header. */
extern struct blkcg_policy blkcg_policy_throtl;
|
|
|
|
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
|
|
{
|
|
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
|
|
}
|
|
|
|
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
|
|
{
|
|
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
|
|
}
|
|
|
|
/*
|
|
* Internal throttling interface
|
|
*/
|
|
#ifndef CONFIG_BLK_DEV_THROTTLING
|
|
/*
 * CONFIG_BLK_DEV_THROTTLING is not set: the interface is reduced to
 * stubs that do nothing.
 */

/* Always returns 0. */
static inline int blk_throtl_init(struct request_queue *q)
{
	return 0;
}

static inline void blk_throtl_exit(struct request_queue *q)
{
}

static inline void blk_throtl_register_queue(struct request_queue *q)
{
}

/* Always returns false. */
static inline bool blk_throtl_bio(struct bio *bio)
{
	return false;
}

static inline void blk_throtl_cancel_bios(struct request_queue *q)
{
}
|
|
#else /* CONFIG_BLK_DEV_THROTTLING */
|
|
/*
 * CONFIG_BLK_DEV_THROTTLING is set: these are only declared here and are
 * defined outside this header.
 */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
void blk_throtl_cancel_bios(struct request_queue *q);

/* Called by blk_throtl_bio() after its inline checks. */
bool __blk_throtl_bio(struct bio *bio);
|
|
static inline bool blk_throtl_bio(struct bio *bio)
|
|
{
|
|
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
|
|
|
|
/* no need to throttle bps any more if the bio has been throttled */
|
|
if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
|
|
!(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
|
|
return false;
|
|
|
|
if (!tg->has_rules[bio_data_dir(bio)])
|
|
return false;
|
|
|
|
return __blk_throtl_bio(bio);
|
|
}
|
|
#endif /* CONFIG_BLK_DEV_THROTTLING */
|
|
|
|
#endif
|