5bebf7486d
Commit 83ff9318c4
("bcache: not use hard coded memset size in
bch_cache_accounting_clear()") tried to make the code easier to
understand by replacing the hard-coded number with the following change:
void bch_cache_accounting_clear(...)
{
memset(&acc->total.cache_hits,
0,
- sizeof(unsigned long) * 7);
+ sizeof(struct cache_stats));
}
Unfortunately the change was wrong (it also tells us the original code
was not easy to correctly understand). The hard coded number 7 is used
because in struct cache_stats,
15 struct cache_stats {
16 struct kobject kobj;
17
18 unsigned long cache_hits;
19 unsigned long cache_misses;
20 unsigned long cache_bypass_hits;
21 unsigned long cache_bypass_misses;
22 unsigned long cache_readaheads;
23 unsigned long cache_miss_collisions;
24 unsigned long sectors_bypassed;
25
26 unsigned int rescale;
27 };
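only the members on lines 18-24 should be set to 0. It is wrong to replace
'sizeof(unsigned long) * 7' with 'sizeof(struct cache_stats)': because the
memset() starts at acc->total.cache_hits rather than at the start of the
structure, the larger size runs past the end of acc->total and corrupts the
memory objects that follow it.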
Сорокин Артем Сергеевич reports that by the following steps, kernel
panic will be triggered,
1. Create new set: make-bcache -B /dev/nvme1n1 -C /dev/sda --wipe-bcache
2. Run in /sys/fs/bcache/<uuid>:
echo 1 > clear_stats && cat stats_five_minute/cache_bypass_hits
I can reproduce the panic and get following dmesg with KASAN enabled,
[22613.172742] ==================================================================
[22613.172862] BUG: KASAN: null-ptr-deref in sysfs_kf_seq_show+0x117/0x230
[22613.172864] Read of size 8 at addr 0000000000000000 by task cat/6753
[22613.172870] CPU: 1 PID: 6753 Comm: cat Not tainted 5.5.0-rc7-lp151.28.16-default+ #11
[22613.172872] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/29/2019
[22613.172873] Call Trace:
[22613.172964] dump_stack+0x8b/0xbb
[22613.172968] ? sysfs_kf_seq_show+0x117/0x230
[22613.172970] ? sysfs_kf_seq_show+0x117/0x230
[22613.173031] __kasan_report+0x176/0x192
[22613.173064] ? pr_cont_kernfs_name+0x40/0x60
[22613.173067] ? sysfs_kf_seq_show+0x117/0x230
[22613.173070] kasan_report+0xe/0x20
[22613.173072] sysfs_kf_seq_show+0x117/0x230
[22613.173105] seq_read+0x199/0x6d0
[22613.173110] vfs_read+0xa5/0x1a0
[22613.173113] ksys_read+0x110/0x160
[22613.173115] ? kernel_write+0xb0/0xb0
[22613.173177] do_syscall_64+0x77/0x290
[22613.173238] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[22613.173241] RIP: 0033:0x7fc2c886ac61
[22613.173244] Code: fe ff ff 48 8d 3d c7 a0 09 00 48 83 ec 08 e8 46 03 02 00 66 0f 1f 44 00 00 8b 05 ca fb 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89
[22613.173245] RSP: 002b:00007ffebe776d68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[22613.173248] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2c886ac61
[22613.173249] RDX: 0000000000020000 RSI: 00007fc2c8cca000 RDI: 0000000000000003
[22613.173250] RBP: 0000000000020000 R08: ffffffffffffffff R09: 0000000000000000
[22613.173251] R10: 000000000000038c R11: 0000000000000246 R12: 00007fc2c8cca000
[22613.173253] R13: 0000000000000003 R14: 00007fc2c8cca00f R15: 0000000000020000
[22613.173255] ==================================================================
[22613.173256] Disabling lock debugging due to kernel taint
[22613.173350] BUG: kernel NULL pointer dereference, address: 0000000000000000
[22613.178380] #PF: supervisor read access in kernel mode
[22613.180959] #PF: error_code(0x0000) - not-present page
[22613.183444] PGD 0 P4D 0
[22613.184867] Oops: 0000 [#1] SMP KASAN PTI
[22613.186797] CPU: 1 PID: 6753 Comm: cat Tainted: G B 5.5.0-rc7-lp151.28.16-default+ #11
[22613.191253] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/29/2019
[22613.196706] RIP: 0010:sysfs_kf_seq_show+0x117/0x230
[22613.199097] Code: ff 48 8b 0b 48 8b 44 24 08 48 01 e9 eb a6 31 f6 48 89 cf ba 00 10 00 00 48 89 4c 24 10 e8 b1 e6 e9 ff 4c 89 ff e8 19 07 ea ff <49> 8b 07 48 85 c0 48 89 44 24 08 0f 84 91 00 00 00 49 8b 6d 00 48
[22613.208016] RSP: 0018:ffff8881d4f8fd78 EFLAGS: 00010246
[22613.210448] RAX: 0000000000000000 RBX: ffff8881eb99b180 RCX: ffffffff810d9ef6
[22613.213691] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
[22613.216893] RBP: 0000000000001000 R08: fffffbfff072ddcd R09: fffffbfff072ddcd
[22613.220075] R10: 0000000000000001 R11: fffffbfff072ddcc R12: ffff8881de5c0200
[22613.223256] R13: ffff8881ed175500 R14: ffff8881eb99b198 R15: 0000000000000000
[22613.226290] FS: 00007fc2c8d3d500(0000) GS:ffff8881f2a80000(0000) knlGS:0000000000000000
[22613.229637] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[22613.231993] CR2: 0000000000000000 CR3: 00000001ec89a004 CR4: 00000000003606e0
[22613.234909] Call Trace:
[22613.235931] seq_read+0x199/0x6d0
[22613.237259] vfs_read+0xa5/0x1a0
[22613.239229] ksys_read+0x110/0x160
[22613.240590] ? kernel_write+0xb0/0xb0
[22613.242040] do_syscall_64+0x77/0x290
[22613.243625] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[22613.245450] RIP: 0033:0x7fc2c886ac61
[22613.246706] Code: fe ff ff 48 8d 3d c7 a0 09 00 48 83 ec 08 e8 46 03 02 00 66 0f 1f 44 00 00 8b 05 ca fb 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89
[22613.253296] RSP: 002b:00007ffebe776d68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[22613.255835] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2c886ac61
[22613.258472] RDX: 0000000000020000 RSI: 00007fc2c8cca000 RDI: 0000000000000003
[22613.260807] RBP: 0000000000020000 R08: ffffffffffffffff R09: 0000000000000000
[22613.263188] R10: 000000000000038c R11: 0000000000000246 R12: 00007fc2c8cca000
[22613.265598] R13: 0000000000000003 R14: 00007fc2c8cca00f R15: 0000000000020000
[22613.268729] Modules linked in: scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs vmw_vsock_vmci_transport vsock fuse bnep kvm_intel kvm irqbypass crc32_pclmul crc32c_intel ghash_clmulni_intel snd_ens1371 snd_ac97_codec ac97_bus bcache snd_pcm btusb btrtl btbcm btintel crc64 aesni_intel glue_helper crypto_simd vmw_balloon cryptd bluetooth snd_timer snd_rawmidi snd joydev pcspkr e1000 rfkill vmw_vmci soundcore ecdh_generic ecc gameport i2c_piix4 mptctl ac button hid_generic usbhid sr_mod cdrom ata_generic ehci_pci vmwgfx uhci_hcd drm_kms_helper syscopyarea serio_raw sysfillrect sysimgblt fb_sys_fops ttm ehci_hcd mptspi scsi_transport_spi mptscsih ata_piix mptbase ahci usbcore libahci drm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua
[22613.292429] CR2: 0000000000000000
[22613.293563] ---[ end trace a074b26a8508f378 ]---
[22613.295138] RIP: 0010:sysfs_kf_seq_show+0x117/0x230
[22613.296769] Code: ff 48 8b 0b 48 8b 44 24 08 48 01 e9 eb a6 31 f6 48 89 cf ba 00 10 00 00 48 89 4c 24 10 e8 b1 e6 e9 ff 4c 89 ff e8 19 07 ea ff <49> 8b 07 48 85 c0 48 89 44 24 08 0f 84 91 00 00 00 49 8b 6d 00 48
[22613.303553] RSP: 0018:ffff8881d4f8fd78 EFLAGS: 00010246
[22613.305280] RAX: 0000000000000000 RBX: ffff8881eb99b180 RCX: ffffffff810d9ef6
[22613.307924] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
[22613.310272] RBP: 0000000000001000 R08: fffffbfff072ddcd R09: fffffbfff072ddcd
[22613.312685] R10: 0000000000000001 R11: fffffbfff072ddcc R12: ffff8881de5c0200
[22613.315076] R13: ffff8881ed175500 R14: ffff8881eb99b198 R15: 0000000000000000
[22613.318116] FS: 00007fc2c8d3d500(0000) GS:ffff8881f2a80000(0000) knlGS:0000000000000000
[22613.320743] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[22613.322628] CR2: 0000000000000000 CR3: 00000001ec89a004 CR4: 00000000003606e0
This patch fixes the above problem by explicitly setting all 7
members to 0 in bch_cache_accounting_clear().
Reported-by: Сорокин Артем Сергеевич <a.sorokin@bank-hlynov.ru>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
248 lines
6.7 KiB
C
248 lines
6.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* bcache stats code
|
|
*
|
|
* Copyright 2012 Google, Inc.
|
|
*/
|
|
|
|
#include "bcache.h"
|
|
#include "stats.h"
|
|
#include "btree.h"
|
|
#include "sysfs.h"
|
|
|
|
/*
|
|
* We keep absolute totals of various statistics, and additionally a set of three
|
|
* rolling averages.
|
|
*
|
|
* Every so often, a timer goes off and rescales the rolling averages.
|
|
* accounting_rescale[] is how many times the timer has to go off before we
|
|
* rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
|
|
* and one day.
|
|
*
|
|
* accounting_delay is how often the timer goes off - 22 times in 5 minutes,
|
|
* and accounting_weight is what we use to rescale:
|
|
*
|
|
* pow(31 / 32, 22) ~= 1/2
|
|
*
|
|
* So that we don't have to increment each set of numbers every time we (say)
|
|
* get a cache hit, we increment a single atomic_t in acc->collector, and when
|
|
* the rescale function runs it resets the atomic counter to 0 and adds its
|
|
* old value to each of the exported numbers.
|
|
*
|
|
* To reduce rounding error, the numbers in struct cache_stats are all
|
|
* stored left shifted by 16, and scaled back in the sysfs show() function.
|
|
*/
|
|
|
|
static const unsigned int DAY_RESCALE = 288;
|
|
static const unsigned int HOUR_RESCALE = 12;
|
|
static const unsigned int FIVE_MINUTE_RESCALE = 1;
|
|
static const unsigned int accounting_delay = (HZ * 300) / 22;
|
|
static const unsigned int accounting_weight = 32;
|
|
|
|
/* sysfs reading/writing */
|
|
|
|
read_attribute(cache_hits);
|
|
read_attribute(cache_misses);
|
|
read_attribute(cache_bypass_hits);
|
|
read_attribute(cache_bypass_misses);
|
|
read_attribute(cache_hit_ratio);
|
|
read_attribute(cache_readaheads);
|
|
read_attribute(cache_miss_collisions);
|
|
read_attribute(bypassed);
|
|
|
|
SHOW(bch_stats)
|
|
{
|
|
struct cache_stats *s =
|
|
container_of(kobj, struct cache_stats, kobj);
|
|
#define var(stat) (s->stat >> 16)
|
|
var_print(cache_hits);
|
|
var_print(cache_misses);
|
|
var_print(cache_bypass_hits);
|
|
var_print(cache_bypass_misses);
|
|
|
|
sysfs_print(cache_hit_ratio,
|
|
DIV_SAFE(var(cache_hits) * 100,
|
|
var(cache_hits) + var(cache_misses)));
|
|
|
|
var_print(cache_readaheads);
|
|
var_print(cache_miss_collisions);
|
|
sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
|
|
#undef var
|
|
return 0;
|
|
}
|
|
|
|
STORE(bch_stats)
|
|
{
|
|
return size;
|
|
}
|
|
|
|
static void bch_stats_release(struct kobject *k)
|
|
{
|
|
}
|
|
|
|
static struct attribute *bch_stats_files[] = {
|
|
&sysfs_cache_hits,
|
|
&sysfs_cache_misses,
|
|
&sysfs_cache_bypass_hits,
|
|
&sysfs_cache_bypass_misses,
|
|
&sysfs_cache_hit_ratio,
|
|
&sysfs_cache_readaheads,
|
|
&sysfs_cache_miss_collisions,
|
|
&sysfs_bypassed,
|
|
NULL
|
|
};
|
|
static KTYPE(bch_stats);
|
|
|
|
int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
|
|
struct kobject *parent)
|
|
{
|
|
int ret = kobject_add(&acc->total.kobj, parent,
|
|
"stats_total");
|
|
ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
|
|
"stats_five_minute");
|
|
ret = ret ?: kobject_add(&acc->hour.kobj, parent,
|
|
"stats_hour");
|
|
ret = ret ?: kobject_add(&acc->day.kobj, parent,
|
|
"stats_day");
|
|
return ret;
|
|
}
|
|
|
|
void bch_cache_accounting_clear(struct cache_accounting *acc)
|
|
{
|
|
acc->total.cache_hits = 0;
|
|
acc->total.cache_misses = 0;
|
|
acc->total.cache_bypass_hits = 0;
|
|
acc->total.cache_bypass_misses = 0;
|
|
acc->total.cache_readaheads = 0;
|
|
acc->total.cache_miss_collisions = 0;
|
|
acc->total.sectors_bypassed = 0;
|
|
}
|
|
|
|
void bch_cache_accounting_destroy(struct cache_accounting *acc)
|
|
{
|
|
kobject_put(&acc->total.kobj);
|
|
kobject_put(&acc->five_minute.kobj);
|
|
kobject_put(&acc->hour.kobj);
|
|
kobject_put(&acc->day.kobj);
|
|
|
|
atomic_set(&acc->closing, 1);
|
|
if (del_timer_sync(&acc->timer))
|
|
closure_return(&acc->cl);
|
|
}
|
|
|
|
/* EWMA scaling */
|
|
|
|
static void scale_stat(unsigned long *stat)
|
|
{
|
|
*stat = ewma_add(*stat, 0, accounting_weight, 0);
|
|
}
|
|
|
|
static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
|
|
{
|
|
if (++stats->rescale == rescale_at) {
|
|
stats->rescale = 0;
|
|
scale_stat(&stats->cache_hits);
|
|
scale_stat(&stats->cache_misses);
|
|
scale_stat(&stats->cache_bypass_hits);
|
|
scale_stat(&stats->cache_bypass_misses);
|
|
scale_stat(&stats->cache_readaheads);
|
|
scale_stat(&stats->cache_miss_collisions);
|
|
scale_stat(&stats->sectors_bypassed);
|
|
}
|
|
}
|
|
|
|
static void scale_accounting(struct timer_list *t)
|
|
{
|
|
struct cache_accounting *acc = from_timer(acc, t, timer);
|
|
|
|
#define move_stat(name) do { \
|
|
unsigned int t = atomic_xchg(&acc->collector.name, 0); \
|
|
t <<= 16; \
|
|
acc->five_minute.name += t; \
|
|
acc->hour.name += t; \
|
|
acc->day.name += t; \
|
|
acc->total.name += t; \
|
|
} while (0)
|
|
|
|
move_stat(cache_hits);
|
|
move_stat(cache_misses);
|
|
move_stat(cache_bypass_hits);
|
|
move_stat(cache_bypass_misses);
|
|
move_stat(cache_readaheads);
|
|
move_stat(cache_miss_collisions);
|
|
move_stat(sectors_bypassed);
|
|
|
|
scale_stats(&acc->total, 0);
|
|
scale_stats(&acc->day, DAY_RESCALE);
|
|
scale_stats(&acc->hour, HOUR_RESCALE);
|
|
scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
|
|
|
|
acc->timer.expires += accounting_delay;
|
|
|
|
if (!atomic_read(&acc->closing))
|
|
add_timer(&acc->timer);
|
|
else
|
|
closure_return(&acc->cl);
|
|
}
|
|
|
|
static void mark_cache_stats(struct cache_stat_collector *stats,
|
|
bool hit, bool bypass)
|
|
{
|
|
if (!bypass)
|
|
if (hit)
|
|
atomic_inc(&stats->cache_hits);
|
|
else
|
|
atomic_inc(&stats->cache_misses);
|
|
else
|
|
if (hit)
|
|
atomic_inc(&stats->cache_bypass_hits);
|
|
else
|
|
atomic_inc(&stats->cache_bypass_misses);
|
|
}
|
|
|
|
void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
|
|
bool hit, bool bypass)
|
|
{
|
|
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
|
|
|
|
mark_cache_stats(&dc->accounting.collector, hit, bypass);
|
|
mark_cache_stats(&c->accounting.collector, hit, bypass);
|
|
}
|
|
|
|
void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
|
|
{
|
|
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
|
|
|
|
atomic_inc(&dc->accounting.collector.cache_readaheads);
|
|
atomic_inc(&c->accounting.collector.cache_readaheads);
|
|
}
|
|
|
|
void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
|
|
{
|
|
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
|
|
|
|
atomic_inc(&dc->accounting.collector.cache_miss_collisions);
|
|
atomic_inc(&c->accounting.collector.cache_miss_collisions);
|
|
}
|
|
|
|
void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
|
|
int sectors)
|
|
{
|
|
atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
|
|
atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
|
|
}
|
|
|
|
void bch_cache_accounting_init(struct cache_accounting *acc,
|
|
struct closure *parent)
|
|
{
|
|
kobject_init(&acc->total.kobj, &bch_stats_ktype);
|
|
kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
|
|
kobject_init(&acc->hour.kobj, &bch_stats_ktype);
|
|
kobject_init(&acc->day.kobj, &bch_stats_ktype);
|
|
|
|
closure_init(&acc->cl, parent);
|
|
timer_setup(&acc->timer, scale_accounting, 0);
|
|
acc->timer.expires = jiffies + accounting_delay;
|
|
add_timer(&acc->timer);
|
|
}
|