Commit 83ff9318c4 ("bcache: not use hard coded memset size in
bch_cache_accounting_clear()") tries to make the code easier to
understand by replacing the hard coded number with the following change,
	void bch_cache_accounting_clear(...)
	{
		memset(&acc->total.cache_hits,
			0,
	-		sizeof(unsigned long) * 7);
	+		sizeof(struct cache_stats));
	}
Unfortunately the change was wrong (which also shows that the original
code was not easy to understand correctly). The hard coded number 7 is
used because in struct cache_stats,
 15 struct cache_stats {
 16         struct kobject          kobj;
 17
 18         unsigned long cache_hits;
 19         unsigned long cache_misses;
 20         unsigned long cache_bypass_hits;
 21         unsigned long cache_bypass_misses;
 22         unsigned long cache_readaheads;
 23         unsigned long cache_miss_collisions;
 24         unsigned long sectors_bypassed;
 25
 26         unsigned int            rescale;
 27 };
only the members on lines 18-24 should be set to 0. It is wrong to use
'sizeof(struct cache_stats)' to replace 'sizeof(unsigned long) * 7'; the
memory objects following acc->total are corrupted by this change.
Сорокин Артем Сергеевич reports that the following steps trigger a
kernel panic,
1. Create new set: make-bcache -B /dev/nvme1n1 -C /dev/sda --wipe-bcache
2. Run in /sys/fs/bcache/<uuid>:
   echo 1 > clear_stats && cat stats_five_minute/cache_bypass_hits
I can reproduce the panic and get the following dmesg with KASAN enabled,
[22613.172742] ==================================================================
[22613.172862] BUG: KASAN: null-ptr-deref in sysfs_kf_seq_show+0x117/0x230
[22613.172864] Read of size 8 at addr 0000000000000000 by task cat/6753
[22613.172870] CPU: 1 PID: 6753 Comm: cat Not tainted 5.5.0-rc7-lp151.28.16-default+ #11
[22613.172872] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/29/2019
[22613.172873] Call Trace:
[22613.172964]  dump_stack+0x8b/0xbb
[22613.172968]  ? sysfs_kf_seq_show+0x117/0x230
[22613.172970]  ? sysfs_kf_seq_show+0x117/0x230
[22613.173031]  __kasan_report+0x176/0x192
[22613.173064]  ? pr_cont_kernfs_name+0x40/0x60
[22613.173067]  ? sysfs_kf_seq_show+0x117/0x230
[22613.173070]  kasan_report+0xe/0x20
[22613.173072]  sysfs_kf_seq_show+0x117/0x230
[22613.173105]  seq_read+0x199/0x6d0
[22613.173110]  vfs_read+0xa5/0x1a0
[22613.173113]  ksys_read+0x110/0x160
[22613.173115]  ? kernel_write+0xb0/0xb0
[22613.173177]  do_syscall_64+0x77/0x290
[22613.173238]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[22613.173241] RIP: 0033:0x7fc2c886ac61
[22613.173244] Code: fe ff ff 48 8d 3d c7 a0 09 00 48 83 ec 08 e8 46 03 02 00 66 0f 1f 44 00 00 8b 05 ca fb 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89
[22613.173245] RSP: 002b:00007ffebe776d68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[22613.173248] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2c886ac61
[22613.173249] RDX: 0000000000020000 RSI: 00007fc2c8cca000 RDI: 0000000000000003
[22613.173250] RBP: 0000000000020000 R08: ffffffffffffffff R09: 0000000000000000
[22613.173251] R10: 000000000000038c R11: 0000000000000246 R12: 00007fc2c8cca000
[22613.173253] R13: 0000000000000003 R14: 00007fc2c8cca00f R15: 0000000000020000
[22613.173255] ==================================================================
[22613.173256] Disabling lock debugging due to kernel taint
[22613.173350] BUG: kernel NULL pointer dereference, address: 0000000000000000
[22613.178380] #PF: supervisor read access in kernel mode
[22613.180959] #PF: error_code(0x0000) - not-present page
[22613.183444] PGD 0 P4D 0
[22613.184867] Oops: 0000 [#1] SMP KASAN PTI
[22613.186797] CPU: 1 PID: 6753 Comm: cat Tainted: G    B             5.5.0-rc7-lp151.28.16-default+ #11
[22613.191253] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/29/2019
[22613.196706] RIP: 0010:sysfs_kf_seq_show+0x117/0x230
[22613.199097] Code: ff 48 8b 0b 48 8b 44 24 08 48 01 e9 eb a6 31 f6 48 89 cf ba 00 10 00 00 48 89 4c 24 10 e8 b1 e6 e9 ff 4c 89 ff e8 19 07 ea ff <49> 8b 07 48 85 c0 48 89 44 24 08 0f 84 91 00 00 00 49 8b 6d 00 48
[22613.208016] RSP: 0018:ffff8881d4f8fd78 EFLAGS: 00010246
[22613.210448] RAX: 0000000000000000 RBX: ffff8881eb99b180 RCX: ffffffff810d9ef6
[22613.213691] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
[22613.216893] RBP: 0000000000001000 R08: fffffbfff072ddcd R09: fffffbfff072ddcd
[22613.220075] R10: 0000000000000001 R11: fffffbfff072ddcc R12: ffff8881de5c0200
[22613.223256] R13: ffff8881ed175500 R14: ffff8881eb99b198 R15: 0000000000000000
[22613.226290] FS:  00007fc2c8d3d500(0000) GS:ffff8881f2a80000(0000) knlGS:0000000000000000
[22613.229637] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[22613.231993] CR2: 0000000000000000 CR3: 00000001ec89a004 CR4: 00000000003606e0
[22613.234909] Call Trace:
[22613.235931]  seq_read+0x199/0x6d0
[22613.237259]  vfs_read+0xa5/0x1a0
[22613.239229]  ksys_read+0x110/0x160
[22613.240590]  ? kernel_write+0xb0/0xb0
[22613.242040]  do_syscall_64+0x77/0x290
[22613.243625]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[22613.245450] RIP: 0033:0x7fc2c886ac61
[22613.246706] Code: fe ff ff 48 8d 3d c7 a0 09 00 48 83 ec 08 e8 46 03 02 00 66 0f 1f 44 00 00 8b 05 ca fb 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89
[22613.253296] RSP: 002b:00007ffebe776d68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[22613.255835] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2c886ac61
[22613.258472] RDX: 0000000000020000 RSI: 00007fc2c8cca000 RDI: 0000000000000003
[22613.260807] RBP: 0000000000020000 R08: ffffffffffffffff R09: 0000000000000000
[22613.263188] R10: 000000000000038c R11: 0000000000000246 R12: 00007fc2c8cca000
[22613.265598] R13: 0000000000000003 R14: 00007fc2c8cca00f R15: 0000000000020000
[22613.268729] Modules linked in: scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs vmw_vsock_vmci_transport vsock fuse bnep kvm_intel kvm irqbypass crc32_pclmul crc32c_intel ghash_clmulni_intel snd_ens1371 snd_ac97_codec ac97_bus bcache snd_pcm btusb btrtl btbcm btintel crc64 aesni_intel glue_helper crypto_simd vmw_balloon cryptd bluetooth snd_timer snd_rawmidi snd joydev pcspkr e1000 rfkill vmw_vmci soundcore ecdh_generic ecc gameport i2c_piix4 mptctl ac button hid_generic usbhid sr_mod cdrom ata_generic ehci_pci vmwgfx uhci_hcd drm_kms_helper syscopyarea serio_raw sysfillrect sysimgblt fb_sys_fops ttm ehci_hcd mptspi scsi_transport_spi mptscsih ata_piix mptbase ahci usbcore libahci drm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua
[22613.292429] CR2: 0000000000000000
[22613.293563] ---[ end trace a074b26a8508f378 ]---
[22613.295138] RIP: 0010:sysfs_kf_seq_show+0x117/0x230
[22613.296769] Code: ff 48 8b 0b 48 8b 44 24 08 48 01 e9 eb a6 31 f6 48 89 cf ba 00 10 00 00 48 89 4c 24 10 e8 b1 e6 e9 ff 4c 89 ff e8 19 07 ea ff <49> 8b 07 48 85 c0 48 89 44 24 08 0f 84 91 00 00 00 49 8b 6d 00 48
[22613.303553] RSP: 0018:ffff8881d4f8fd78 EFLAGS: 00010246
[22613.305280] RAX: 0000000000000000 RBX: ffff8881eb99b180 RCX: ffffffff810d9ef6
[22613.307924] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
[22613.310272] RBP: 0000000000001000 R08: fffffbfff072ddcd R09: fffffbfff072ddcd
[22613.312685] R10: 0000000000000001 R11: fffffbfff072ddcc R12: ffff8881de5c0200
[22613.315076] R13: ffff8881ed175500 R14: ffff8881eb99b198 R15: 0000000000000000
[22613.318116] FS:  00007fc2c8d3d500(0000) GS:ffff8881f2a80000(0000) knlGS:0000000000000000
[22613.320743] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[22613.322628] CR2: 0000000000000000 CR3: 00000001ec89a004 CR4: 00000000003606e0
This patch fixes the above problem by explicitly setting all 7
members to 0 in bch_cache_accounting_clear().
Reported-by: Сорокин Артем Сергеевич <a.sorokin@bank-hlynov.ru>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
		
	
			
		
			
				
	
	
		
			248 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			248 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| /*
 | |
|  * bcache stats code
 | |
|  *
 | |
|  * Copyright 2012 Google, Inc.
 | |
|  */
 | |
| 
 | |
| #include "bcache.h"
 | |
| #include "stats.h"
 | |
| #include "btree.h"
 | |
| #include "sysfs.h"
 | |
| 
 | |
| /*
 | |
|  * We keep absolute totals of various statistics, and additionally a set of three
 | |
|  * rolling averages.
 | |
|  *
 | |
|  * Every so often, a timer goes off and rescales the rolling averages.
 | |
|  * accounting_rescale[] is how many times the timer has to go off before we
 | |
|  * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
 | |
|  * and one day.
 | |
|  *
 | |
|  * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
 | |
|  * and accounting_weight is what we use to rescale:
 | |
|  *
 | |
|  * pow(31 / 32, 22) ~= 1/2
 | |
|  *
 | |
|  * So that we don't have to increment each set of numbers every time we (say)
 | |
|  * get a cache hit, we increment a single atomic_t in acc->collector, and when
 | |
|  * the rescale function runs it resets the atomic counter to 0 and adds its
 | |
|  * old value to each of the exported numbers.
 | |
|  *
 | |
|  * To reduce rounding error, the numbers in struct cache_stats are all
 | |
|  * stored left shifted by 16, and scaled back in the sysfs show() function.
 | |
|  */
 | |
| 
 | |
| static const unsigned int DAY_RESCALE		= 288;
 | |
| static const unsigned int HOUR_RESCALE		= 12;
 | |
| static const unsigned int FIVE_MINUTE_RESCALE	= 1;
 | |
| static const unsigned int accounting_delay	= (HZ * 300) / 22;
 | |
| static const unsigned int accounting_weight	= 32;
 | |
| 
 | |
| /* sysfs reading/writing */
 | |
| 
 | |
/*
 * One read-only attribute per exported statistic.
 * NOTE(review): read_attribute() is defined outside this file; presumably
 * each invocation defines the sysfs_<name> object that bch_stats_files[]
 * takes the address of - confirm in sysfs.h.
 */
read_attribute(cache_hits);
read_attribute(cache_misses);
read_attribute(cache_bypass_hits);
read_attribute(cache_bypass_misses);
read_attribute(cache_hit_ratio);
read_attribute(cache_readaheads);
read_attribute(cache_miss_collisions);
read_attribute(bypassed);
 | |
| 
 | |
| SHOW(bch_stats)
 | |
| {
 | |
| 	struct cache_stats *s =
 | |
| 		container_of(kobj, struct cache_stats, kobj);
 | |
| #define var(stat)		(s->stat >> 16)
 | |
| 	var_print(cache_hits);
 | |
| 	var_print(cache_misses);
 | |
| 	var_print(cache_bypass_hits);
 | |
| 	var_print(cache_bypass_misses);
 | |
| 
 | |
| 	sysfs_print(cache_hit_ratio,
 | |
| 		    DIV_SAFE(var(cache_hits) * 100,
 | |
| 			     var(cache_hits) + var(cache_misses)));
 | |
| 
 | |
| 	var_print(cache_readaheads);
 | |
| 	var_print(cache_miss_collisions);
 | |
| 	sysfs_hprint(bypassed,	var(sectors_bypassed) << 9);
 | |
| #undef var
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| STORE(bch_stats)
 | |
| {
 | |
| 	return size;
 | |
| }
 | |
| 
 | |
/*
 * Release callback for the stats kobjects; it intentionally does nothing.
 * The kobjects are members of struct cache_stats rather than separate
 * allocations, so there is nothing to free here.
 */
static void bch_stats_release(struct kobject *k)
{
	/* nothing to do */
}
 | |
| 
 | |
| static struct attribute *bch_stats_files[] = {
 | |
| 	&sysfs_cache_hits,
 | |
| 	&sysfs_cache_misses,
 | |
| 	&sysfs_cache_bypass_hits,
 | |
| 	&sysfs_cache_bypass_misses,
 | |
| 	&sysfs_cache_hit_ratio,
 | |
| 	&sysfs_cache_readaheads,
 | |
| 	&sysfs_cache_miss_collisions,
 | |
| 	&sysfs_bypassed,
 | |
| 	NULL
 | |
| };
 | |
| static KTYPE(bch_stats);
 | |
| 
 | |
| int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
 | |
| 				   struct kobject *parent)
 | |
| {
 | |
| 	int ret = kobject_add(&acc->total.kobj, parent,
 | |
| 			      "stats_total");
 | |
| 	ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
 | |
| 				 "stats_five_minute");
 | |
| 	ret = ret ?: kobject_add(&acc->hour.kobj, parent,
 | |
| 				 "stats_hour");
 | |
| 	ret = ret ?: kobject_add(&acc->day.kobj, parent,
 | |
| 				 "stats_day");
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| void bch_cache_accounting_clear(struct cache_accounting *acc)
 | |
| {
 | |
| 	acc->total.cache_hits = 0;
 | |
| 	acc->total.cache_misses = 0;
 | |
| 	acc->total.cache_bypass_hits = 0;
 | |
| 	acc->total.cache_bypass_misses = 0;
 | |
| 	acc->total.cache_readaheads = 0;
 | |
| 	acc->total.cache_miss_collisions = 0;
 | |
| 	acc->total.sectors_bypassed = 0;
 | |
| }
 | |
| 
 | |
| void bch_cache_accounting_destroy(struct cache_accounting *acc)
 | |
| {
 | |
| 	kobject_put(&acc->total.kobj);
 | |
| 	kobject_put(&acc->five_minute.kobj);
 | |
| 	kobject_put(&acc->hour.kobj);
 | |
| 	kobject_put(&acc->day.kobj);
 | |
| 
 | |
| 	atomic_set(&acc->closing, 1);
 | |
| 	if (del_timer_sync(&acc->timer))
 | |
| 		closure_return(&acc->cl);
 | |
| }
 | |
| 
 | |
| /* EWMA scaling */
 | |
| 
 | |
| static void scale_stat(unsigned long *stat)
 | |
| {
 | |
| 	*stat =  ewma_add(*stat, 0, accounting_weight, 0);
 | |
| }
 | |
| 
 | |
| static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
 | |
| {
 | |
| 	if (++stats->rescale == rescale_at) {
 | |
| 		stats->rescale = 0;
 | |
| 		scale_stat(&stats->cache_hits);
 | |
| 		scale_stat(&stats->cache_misses);
 | |
| 		scale_stat(&stats->cache_bypass_hits);
 | |
| 		scale_stat(&stats->cache_bypass_misses);
 | |
| 		scale_stat(&stats->cache_readaheads);
 | |
| 		scale_stat(&stats->cache_miss_collisions);
 | |
| 		scale_stat(&stats->sectors_bypassed);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void scale_accounting(struct timer_list *t)
 | |
| {
 | |
| 	struct cache_accounting *acc = from_timer(acc, t, timer);
 | |
| 
 | |
| #define move_stat(name) do {						\
 | |
| 	unsigned int t = atomic_xchg(&acc->collector.name, 0);		\
 | |
| 	t <<= 16;							\
 | |
| 	acc->five_minute.name += t;					\
 | |
| 	acc->hour.name += t;						\
 | |
| 	acc->day.name += t;						\
 | |
| 	acc->total.name += t;						\
 | |
| } while (0)
 | |
| 
 | |
| 	move_stat(cache_hits);
 | |
| 	move_stat(cache_misses);
 | |
| 	move_stat(cache_bypass_hits);
 | |
| 	move_stat(cache_bypass_misses);
 | |
| 	move_stat(cache_readaheads);
 | |
| 	move_stat(cache_miss_collisions);
 | |
| 	move_stat(sectors_bypassed);
 | |
| 
 | |
| 	scale_stats(&acc->total, 0);
 | |
| 	scale_stats(&acc->day, DAY_RESCALE);
 | |
| 	scale_stats(&acc->hour, HOUR_RESCALE);
 | |
| 	scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
 | |
| 
 | |
| 	acc->timer.expires += accounting_delay;
 | |
| 
 | |
| 	if (!atomic_read(&acc->closing))
 | |
| 		add_timer(&acc->timer);
 | |
| 	else
 | |
| 		closure_return(&acc->cl);
 | |
| }
 | |
| 
 | |
| static void mark_cache_stats(struct cache_stat_collector *stats,
 | |
| 			     bool hit, bool bypass)
 | |
| {
 | |
| 	if (!bypass)
 | |
| 		if (hit)
 | |
| 			atomic_inc(&stats->cache_hits);
 | |
| 		else
 | |
| 			atomic_inc(&stats->cache_misses);
 | |
| 	else
 | |
| 		if (hit)
 | |
| 			atomic_inc(&stats->cache_bypass_hits);
 | |
| 		else
 | |
| 			atomic_inc(&stats->cache_bypass_misses);
 | |
| }
 | |
| 
 | |
| void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
 | |
| 			       bool hit, bool bypass)
 | |
| {
 | |
| 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 | |
| 
 | |
| 	mark_cache_stats(&dc->accounting.collector, hit, bypass);
 | |
| 	mark_cache_stats(&c->accounting.collector, hit, bypass);
 | |
| }
 | |
| 
 | |
| void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
 | |
| {
 | |
| 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 | |
| 
 | |
| 	atomic_inc(&dc->accounting.collector.cache_readaheads);
 | |
| 	atomic_inc(&c->accounting.collector.cache_readaheads);
 | |
| }
 | |
| 
 | |
| void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
 | |
| {
 | |
| 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 | |
| 
 | |
| 	atomic_inc(&dc->accounting.collector.cache_miss_collisions);
 | |
| 	atomic_inc(&c->accounting.collector.cache_miss_collisions);
 | |
| }
 | |
| 
 | |
| void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
 | |
| 			       int sectors)
 | |
| {
 | |
| 	atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
 | |
| 	atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
 | |
| }
 | |
| 
 | |
| void bch_cache_accounting_init(struct cache_accounting *acc,
 | |
| 			       struct closure *parent)
 | |
| {
 | |
| 	kobject_init(&acc->total.kobj,		&bch_stats_ktype);
 | |
| 	kobject_init(&acc->five_minute.kobj,	&bch_stats_ktype);
 | |
| 	kobject_init(&acc->hour.kobj,		&bch_stats_ktype);
 | |
| 	kobject_init(&acc->day.kobj,		&bch_stats_ktype);
 | |
| 
 | |
| 	closure_init(&acc->cl, parent);
 | |
| 	timer_setup(&acc->timer, scale_accounting, 0);
 | |
| 	acc->timer.expires	= jiffies + accounting_delay;
 | |
| 	add_timer(&acc->timer);
 | |
| }
 |