From b954ebba296bb2eb2e38322f17aaa6426934bd7e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Apr 2022 20:52:35 +0900 Subject: [PATCH 1/8] zonefs: Clear inode information flags on inode creation Ensure that the i_flags field of struct zonefs_inode_info is cleared to 0 when initializing a zone file inode, avoiding seeing the flag ZONEFS_ZONE_OPEN being incorrectly set. Fixes: b5c00e975779 ("zonefs: open/close zone on file open/close") Cc: Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hans Holmberg --- fs/zonefs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3614c7834007..75d8dabe0807 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1142,6 +1142,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; + zi->i_flags = 0; return &zi->i_vnode; } From 19139539207934aef6335bdef09c9e4bd70d1808 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Apr 2022 17:41:37 +0900 Subject: [PATCH 2/8] zonefs: Fix management of open zones The mount option "explicit_open" manages the device open zone resources to ensure that if an application opens a sequential file for writing, the file zone can always be written by explicitly opening the zone and accounting for that state with the s_open_zones counter. However, if some zones are already open when mounting, the device open zone resource usage status will be larger than the initial s_open_zones value of 0. Ensure that this inconsistency does not happen by closing any sequential zone that is open when mounting. Furthermore, with ZNS drives, closing an explicitly open zone that has not been written will change the zone state to "closed", that is, the zone will remain in an active state. Since this can then cause failures of explicit open operations on other zones if the drive active zone resources are exceeded, we need to make sure that the zone is not active anymore by resetting it instead of closing it. To address this, zonefs_zone_mgmt() is modified to change a REQ_OP_ZONE_CLOSE request into a REQ_OP_ZONE_RESET for sequential zones that have not been written. Fixes: b5c00e975779 ("zonefs: open/close zone on file open/close") Cc: Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg --- fs/zonefs/super.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 75d8dabe0807..e20e7c841489 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -35,6 +35,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode, lockdep_assert_held(&zi->i_truncate_mutex); + /* + * With ZNS drives, closing an explicitly open zone that has not been + * written will change the zone state to "closed", that is, the zone + * will remain active. Since this can then cause failure of explicit + * open operation on other zones if the drive active zone resources + * are exceeded, make sure that the zone does not remain active by + * resetting it. + */ + if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + op = REQ_OP_ZONE_RESET; + trace_zonefs_zone_mgmt(inode, op); ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); @@ -1294,12 +1305,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, inc_nlink(parent); } -static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; @@ -1324,6 +1336,22 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + + /* + * For sequential zones, make sure that any open zone is closed first + * to ensure that the initial number of open zones is 0, in sync with + * the open zone accounting done when the mount option + * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (type == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + mutex_lock(&zi->i_truncate_mutex); + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + mutex_unlock(&zi->i_truncate_mutex); + } + + return ret; } static struct dentry *zonefs_create_inode(struct dentry *parent, @@ -1333,6 +1361,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, struct inode *dir = d_inode(parent); struct dentry *dentry; struct inode *inode; + int ret; dentry = d_alloc_name(parent, name); if (!dentry) @@ -1343,10 +1372,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, goto dput; inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; - if (zone) - zonefs_init_file_inode(inode, zone, type); - else + if (zone) { + ret = zonefs_init_file_inode(inode, zone, type); + if (ret) { + iput(inode); + goto dput; + } + } else { zonefs_init_dir_inode(dir, inode, type); + } + d_add(dentry, inode); dir->i_size++; From 2b95a23c4f50c42fe85f0d345612075d0f2c3118 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Apr 2022 17:00:13 +0900 Subject: [PATCH 3/8] zonefs: Rename super block information fields The s_open_zones field of struct zonefs_sb_info is used to count the number of files that are open for writing and may not necessarilly correspond to the number of open zones on the device. For instance, an application may open for writing a sequential zone file, fully write it and keep the file open. In such case, the zone of the file is not open anymore (it is in the full state). Avoid confusion about this counter meaning by renaming it to s_wro_seq_files. To keep things consistent, the field s_max_open_zones is renamed to s_max_wro_seq_files. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg --- fs/zonefs/super.c | 17 ++++++++++------- fs/zonefs/zonefs.h | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e20e7c841489..dafacde65659 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1035,8 +1035,10 @@ static int zonefs_open_zone(struct inode *inode) mutex_lock(&zi->i_truncate_mutex); if (!zi->i_wr_refcnt) { - if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { - atomic_dec(&sbi->s_open_zones); + unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); + + if (wro > sbi->s_max_wro_seq_files) { + atomic_dec(&sbi->s_wro_seq_files); ret = -EBUSY; goto unlock; } @@ -1044,7 +1046,7 @@ static int zonefs_open_zone(struct inode *inode) if (i_size_read(inode) < zi->i_max_size) { ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); if (ret) { - atomic_dec(&sbi->s_open_zones); + atomic_dec(&sbi->s_wro_seq_files); goto unlock; } zi->i_flags |= ZONEFS_ZONE_OPEN; @@ -1108,7 +1110,7 @@ static void zonefs_close_zone(struct inode *inode) } zi->i_flags &= ~ZONEFS_ZONE_OPEN; dec: - atomic_dec(&sbi->s_open_zones); + atomic_dec(&sbi->s_wro_seq_files); } mutex_unlock(&zi->i_truncate_mutex); } @@ -1688,9 +1690,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; - sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); - atomic_set(&sbi->s_open_zones, 0); - if (!sbi->s_max_open_zones && + + atomic_set(&sbi->s_wro_seq_files, 0); + sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); + if (!sbi->s_max_wro_seq_files && sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 7b147907c328..67fd00ab173f 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -182,8 +182,8 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; - unsigned int s_max_open_zones; - atomic_t s_open_zones; + unsigned int s_max_wro_seq_files; + atomic_t s_wro_seq_files; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) From 7d6dfbe03bd3bef51ead25d129dabebd8bae1ec4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Apr 2022 22:38:07 +0900 Subject: [PATCH 4/8] zonefs: Always do seq file write open accounting The explicit_open mount option forces an explicitly open of the zone of sequential files that are open for writing to ensure that the open file can be written without the device failing write operations due to open zone resources limit being exceeded. To implement this, zonefs accounts all write open seq file when this mount option is used. This accounting however can be easily performed even when the explicit_open mount option is not used, thus allowing applications to control zone resources on their own, without relying on open() system call failures from zonefs. To implement this, the helper zonefs_file_use_exp_open() is removed and replaced with the helper zonefs_seq_file_need_wro() which test if a file is a sequential file being open with write access. zonefs_open_zone() and zonefs_close_zone() are renamed respectively to zonefs_seq_file_write_open() and zonefs_seq_file_write_close() and modified to update the s_wro_seq_files counter regardless of the explicit_open mount option use. If the explicit_open mount option is used, zonefs_seq_file_write_open() execute an explicit zone open operation for a sequential file open for writing for the first time, as before. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg --- fs/zonefs/super.c | 80 +++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index dafacde65659..02dbdec32b2f 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1009,13 +1009,13 @@ inode_unlock: return ret; } -static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +/* + * Write open accounting is done only for sequential files. + */ +static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) { struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - - if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) - return false; if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) return false; @@ -1026,30 +1026,33 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi return true; } -static int zonefs_open_zone(struct inode *inode) +static int zonefs_seq_file_write_open(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); int ret = 0; mutex_lock(&zi->i_truncate_mutex); if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - if (wro > sbi->s_max_wro_seq_files) { - atomic_dec(&sbi->s_wro_seq_files); - ret = -EBUSY; - goto unlock; - } + if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { + if (wro > sbi->s_max_wro_seq_files) { atomic_dec(&sbi->s_wro_seq_files); + ret = -EBUSY; goto unlock; } - zi->i_flags |= ZONEFS_ZONE_OPEN; + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + } } } @@ -1069,30 +1072,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file) if (ret) return ret; - if (zonefs_file_use_exp_open(inode, file)) - return zonefs_open_zone(inode); + if (zonefs_seq_file_need_wro(inode, file)) + return zonefs_seq_file_write_open(inode); return 0; } -static void zonefs_close_zone(struct inode *inode) +static void zonefs_seq_file_write_close(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); int ret = 0; mutex_lock(&zi->i_truncate_mutex); + zi->i_wr_refcnt--; - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - struct super_block *sb = inode->i_sb; - - /* - * If the file zone is full, it is not open anymore and we only - * need to decrement the open count. - */ - if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) - goto dec; + if (zi->i_wr_refcnt) + goto unlock; + /* + * The file zone may not be open anymore (e.g. the file was truncated to + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); if (ret) { __zonefs_io_error(inode, false); @@ -1104,14 +1108,22 @@ static void zonefs_close_zone(struct inode *inode) */ if (zi->i_flags & ZONEFS_ZONE_OPEN && !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + zonefs_warn(sb, + "closing zone at %llu failed %d\n", + zi->i_zsector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; } + goto unlock; } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; -dec: - atomic_dec(&sbi->s_wro_seq_files); } + + atomic_dec(&sbi->s_wro_seq_files); + +unlock: mutex_unlock(&zi->i_truncate_mutex); } @@ -1123,8 +1135,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file) * the zone has gone offline or read-only). Make sure we don't fail the * close(2) for user-space. */ - if (zonefs_file_use_exp_open(inode, file)) - zonefs_close_zone(inode); + if (zonefs_seq_file_need_wro(inode, file)) + zonefs_seq_file_write_close(inode); return 0; } From 9277a6d4fbd4aaa668b19b819015f87f0da53a38 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Apr 2022 16:25:34 +0900 Subject: [PATCH 5/8] zonefs: Export open zone resource information through sysfs To allow applications to easily check the current usage status of the open zone resources of the mounted device, export through sysfs the counter of write open sequential files s_wro_seq_files field of struct zonefs_sb_info. The attribute is named nr_wro_seq_files and is read only. The maximum number of write open sequential files (zones) indicated by the s_max_wro_seq_files field of struct zonefs_sb_info is also exported as the read only attribute max_wro_seq_files. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg --- fs/zonefs/Makefile | 2 +- fs/zonefs/super.c | 24 +++++++-- fs/zonefs/sysfs.c | 125 +++++++++++++++++++++++++++++++++++++++++++++ fs/zonefs/zonefs.h | 10 ++++ 4 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 fs/zonefs/sysfs.c diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 33c1a4f1132e..9fe54f5319f2 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -3,4 +3,4 @@ ccflags-y += -I$(src) obj-$(CONFIG_ZONEFS_FS) += zonefs.o -zonefs-y := super.o +zonefs-y := super.o sysfs.o diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 02dbdec32b2f..aa359f27102e 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1725,6 +1725,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) goto cleanup; + ret = zonefs_sysfs_register(sb); + if (ret) + goto cleanup; + zonefs_info(sb, "Mounting %u zones", blkdev_nr_zones(sb->s_bdev->bd_disk)); @@ -1770,6 +1774,8 @@ static void zonefs_kill_super(struct super_block *sb) if (sb->s_root) d_genocide(sb->s_root); + + zonefs_sysfs_unregister(sb); kill_block_super(sb); kfree(sbi); } @@ -1817,16 +1823,26 @@ static int __init zonefs_init(void) return ret; ret = register_filesystem(&zonefs_type); - if (ret) { - zonefs_destroy_inodecache(); - return ret; - } + if (ret) + goto destroy_inodecache; + + ret = zonefs_sysfs_init(); + if (ret) + goto unregister_fs; return 0; + +unregister_fs: + unregister_filesystem(&zonefs_type); +destroy_inodecache: + zonefs_destroy_inodecache(); + + return ret; } static void __exit zonefs_exit(void) { + zonefs_sysfs_exit(); zonefs_destroy_inodecache(); unregister_filesystem(&zonefs_type); } diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c new file mode 100644 index 000000000000..eaeaf983ed87 --- /dev/null +++ b/fs/zonefs/sysfs.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2022 Western Digital Corporation or its affiliates. + */ +#include +#include +#include + +#include "zonefs.h" + +struct zonefs_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf); +}; + +static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr) +{ + return container_of(attr, struct zonefs_sysfs_attr, attr); +} + +#define ZONEFS_SYSFS_ATTR_RO(name) \ +static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name) + +#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr + +static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct zonefs_sb_info *sbi = + container_of(kobj, struct zonefs_sb_info, s_kobj); + struct zonefs_sysfs_attr *zonefs_attr = + container_of(attr, struct zonefs_sysfs_attr, attr); + + if (!zonefs_attr->show) + return 0; + + return zonefs_attr->show(sbi, buf); +} + +static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files); +} +ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files); + +static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files)); +} +ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files); + +static struct attribute *zonefs_sysfs_attrs[] = { + ATTR_LIST(max_wro_seq_files), + ATTR_LIST(nr_wro_seq_files), + NULL, +}; +ATTRIBUTE_GROUPS(zonefs_sysfs); + +static void zonefs_sysfs_sb_release(struct kobject *kobj) +{ + struct zonefs_sb_info *sbi = + container_of(kobj, struct zonefs_sb_info, s_kobj); + + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops zonefs_sysfs_attr_ops = { + .show = zonefs_sysfs_attr_show, +}; + +static struct kobj_type zonefs_sb_ktype = { + .default_groups = zonefs_sysfs_groups, + .sysfs_ops = &zonefs_sysfs_attr_ops, + .release = zonefs_sysfs_sb_release, +}; + +static struct kobject *zonefs_sysfs_root; + +int zonefs_sysfs_register(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret; + + init_completion(&sbi->s_kobj_unregister); + ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype, + zonefs_sysfs_root, "%s", sb->s_id); + if (ret) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return ret; + } + + sbi->s_sysfs_registered = true; + + return 0; +} + +void zonefs_sysfs_unregister(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + + if (!sbi || !sbi->s_sysfs_registered) + return; + + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} + +int __init zonefs_sysfs_init(void) +{ + zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj); + if (!zonefs_sysfs_root) + return -ENOMEM; + + return 0; +} + +void zonefs_sysfs_exit(void) +{ + kobject_put(zonefs_sysfs_root); + zonefs_sysfs_root = NULL; +} diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 67fd00ab173f..77d2d153c59d 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -12,6 +12,7 @@ #include #include #include +#include /* * Maximum length of file names: this only needs to be large enough to fit @@ -184,6 +185,10 @@ struct zonefs_sb_info { unsigned int s_max_wro_seq_files; atomic_t s_wro_seq_files; + + bool s_sysfs_registered; + struct kobject s_kobj; + struct completion s_kobj_unregister; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) @@ -198,4 +203,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) #define zonefs_warn(sb, format, args...) \ pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) +int zonefs_sysfs_register(struct super_block *sb); +void zonefs_sysfs_unregister(struct super_block *sb); +int zonefs_sysfs_init(void); +void zonefs_sysfs_exit(void); + #endif From 87c9ce3ffec9060cf7556ed4d3c9e582c8baf575 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 Apr 2022 18:54:39 +0900 Subject: [PATCH 6/8] zonefs: Add active seq file accounting Modify struct zonefs_sb_info to add the s_active_seq_files atomic to count the number of seq files representing a zone that is partially written or explicitly open, that is, to count sequential files with a zone that is in an active state on the device. The helper function zonefs_account_active() is introduced to update this counter whenever a file is written or truncated. This helper is also used in the zonefs_seq_file_write_open() and zonefs_seq_file_write_close() functions when the explicit_open mount option is used. The s_active_seq_files counter is exported through sysfs using the read-only attribute nr_active_seq_files. The device maximum number of active zones is also exported through sysfs with the read-only attribute max_active_seq_files. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg --- fs/zonefs/super.c | 71 ++++++++++++++++++++++++++++++++++++++++++---- fs/zonefs/sysfs.c | 14 +++++++++ fs/zonefs/zonefs.h | 4 +++ 3 files changed, 83 insertions(+), 6 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index aa359f27102e..e65da43f1453 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -27,6 +27,39 @@ #define CREATE_TRACE_POINTS #include "trace.h" +/* + * Manage the active zone count. Called with zi->i_truncate_mutex held. + */ +static void zonefs_account_active(struct inode *inode) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + lockdep_assert_held(&zi->i_truncate_mutex); + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return; + + /* + * If the zone is active, that is, if it is explicitly open or + * partially written, check if it was already accounted as active. + */ + if ((zi->i_flags & ZONEFS_ZONE_OPEN) || + (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { + if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { + zi->i_flags |= ZONEFS_ZONE_ACTIVE; + atomic_inc(&sbi->s_active_seq_files); + } + return; + } + + /* The zone is not active. If it was, update the active count */ + if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { + zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; + atomic_dec(&sbi->s_active_seq_files); + } +} + static inline int zonefs_zone_mgmt(struct inode *inode, enum req_opf op) { @@ -68,8 +101,13 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) * A full zone is no longer open/active and does not need * explicit closing. */ - if (isize >= zi->i_max_size) - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + if (isize >= zi->i_max_size) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (zi->i_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); + zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); + } } static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -397,6 +435,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; + zonefs_account_active(inode); return 0; } @@ -508,6 +547,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) zonefs_update_stats(inode, isize); truncate_setsize(inode, isize); zi->i_wpoffset = isize; + zonefs_account_active(inode); unlock: mutex_unlock(&zi->i_truncate_mutex); @@ -866,8 +906,15 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) count = ret; + + /* + * Update the zone write pointer offset assuming the write + * operation succeeded. If it did not, the error recovery path + * will correct it. Also do active seq file accounting. + */ mutex_lock(&zi->i_truncate_mutex); zi->i_wpoffset += count; + zonefs_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -1052,6 +1099,7 @@ static int zonefs_seq_file_write_open(struct inode *inode) goto unlock; } zi->i_flags |= ZONEFS_ZONE_OPEN; + zonefs_account_active(inode); } } } @@ -1119,6 +1167,7 @@ static void zonefs_seq_file_write_close(struct inode *inode) } zi->i_flags &= ~ZONEFS_ZONE_OPEN; + zonefs_account_active(inode); } atomic_dec(&sbi->s_wro_seq_files); @@ -1325,7 +1374,7 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; + int ret; inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; @@ -1351,6 +1400,8 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + mutex_lock(&zi->i_truncate_mutex); + /* * For sequential zones, make sure that any open zone is closed first * to ensure that the initial number of open zones is 0, in sync with @@ -1360,12 +1411,17 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, if (type == ZONEFS_ZTYPE_SEQ && (zone->cond == BLK_ZONE_COND_IMP_OPEN || zone->cond == BLK_ZONE_COND_EXP_OPEN)) { - mutex_lock(&zi->i_truncate_mutex); ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - mutex_unlock(&zi->i_truncate_mutex); + if (ret) + goto unlock; } - return ret; + zonefs_account_active(inode); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return 0; } static struct dentry *zonefs_create_inode(struct dentry *parent, @@ -1711,6 +1767,9 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } + atomic_set(&sbi->s_active_seq_files, 0); + sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); + ret = zonefs_read_super(sb); if (ret) return ret; diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c index eaeaf983ed87..9cb6755ce39a 100644 --- a/fs/zonefs/sysfs.c +++ b/fs/zonefs/sysfs.c @@ -51,9 +51,23 @@ static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) } ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files); +static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files); +} +ZONEFS_SYSFS_ATTR_RO(max_active_seq_files); + +static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files)); +} +ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files); + static struct attribute *zonefs_sysfs_attrs[] = { ATTR_LIST(max_wro_seq_files), ATTR_LIST(nr_wro_seq_files), + ATTR_LIST(max_active_seq_files), + ATTR_LIST(nr_active_seq_files), NULL, }; ATTRIBUTE_GROUPS(zonefs_sysfs); diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 77d2d153c59d..4b3de66c3233 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -40,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) } #define ZONEFS_ZONE_OPEN (1 << 0) +#define ZONEFS_ZONE_ACTIVE (1 << 1) /* * In-memory inode data. @@ -186,6 +187,9 @@ struct zonefs_sb_info { unsigned int s_max_wro_seq_files; atomic_t s_wro_seq_files; + unsigned int s_max_active_seq_files; + atomic_t s_active_seq_files; + bool s_sysfs_registered; struct kobject s_kobj; struct completion s_kobj_unregister; From ae4303886652248ed2568c9cb2ab0da485bfd7a7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 18 Apr 2022 09:06:17 +0900 Subject: [PATCH 7/8] documentation: zonefs: Cleanup the mount options section Use subsections to separate the descriptions of the "error=" and "explicit-open" mount sections. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg --- Documentation/filesystems/zonefs.rst | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/zonefs.rst b/Documentation/filesystems/zonefs.rst index 6b213fe9a33e..72d4baba0b6a 100644 --- a/Documentation/filesystems/zonefs.rst +++ b/Documentation/filesystems/zonefs.rst @@ -306,8 +306,15 @@ Further notes: Mount options ------------- -zonefs define the "errors=" mount option to allow the user to specify -zonefs behavior in response to I/O errors, inode size inconsistencies or zone +zonefs defines several mount options: +* errors= +* explicit-open + +"errors=" option +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The "errors=" option mount option allows the user to specify zonefs +behavior in response to I/O errors, inode size inconsistencies or zone condition changes. The defined behaviors are as follow: * remount-ro (default) @@ -326,6 +333,9 @@ discover the amount of data that has been written to the zone. In the case of a read-only zone discovered at run-time, as indicated in the previous section. The size of the zone file is left unchanged from its last updated value. +"explicit-open" option +~~~~~~~~~~~~~~~~~~~~~~ + A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on the number of zones that can be active, that is, zones that are in the implicit open, explicit open or closed conditions. This potential limitation From 31a644b3c2ae6d0c47e84614ded3ce9bef1adb7a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 18 Apr 2022 09:32:57 +0900 Subject: [PATCH 8/8] documentation: zonefs: Document sysfs attributes Document the max_wro_seq_files, nr_wro_seq_files, max_active_seq_files and nr_active_seq_files sysfs attributes. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hans Holmberg Reviewed-by: Sergey Shtylyov --- Documentation/filesystems/zonefs.rst | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/Documentation/filesystems/zonefs.rst b/Documentation/filesystems/zonefs.rst index 72d4baba0b6a..394b9f15dce0 100644 --- a/Documentation/filesystems/zonefs.rst +++ b/Documentation/filesystems/zonefs.rst @@ -351,6 +351,44 @@ guaranteed that write requests can be processed. Conversely, the to the device on the last close() of a zone file if the zone is not full nor empty. +Runtime sysfs attributes +------------------------ + +zonefs defines several sysfs attributes for mounted devices. All attributes +are user readable and can be found in the directory /sys/fs/zonefs//, +where is the name of the mounted zoned block device. + +The attributes defined are as follows. + +* **max_wro_seq_files**: This attribute reports the maximum number of + sequential zone files that can be open for writing. This number corresponds + to the maximum number of explicitly or implicitly open zones that the device + supports. A value of 0 means that the device has no limit and that any zone + (any file) can be open for writing and written at any time, regardless of the + state of other zones. When the *explicit-open* mount option is used, zonefs + will fail any open() system call requesting to open a sequential zone file for + writing when the number of sequential zone files already open for writing has + reached the *max_wro_seq_files* limit. +* **nr_wro_seq_files**: This attribute reports the current number of sequential + zone files open for writing. When the "explicit-open" mount option is used, + this number can never exceed *max_wro_seq_files*. If the *explicit-open* + mount option is not used, the reported number can be greater than + *max_wro_seq_files*. In such case, it is the responsibility of the + application to not write simultaneously more than *max_wro_seq_files* + sequential zone files. Failure to do so can result in write errors. +* **max_active_seq_files**: This attribute reports the maximum number of + sequential zone files that are in an active state, that is, sequential zone + files that are partially writen (not empty nor full) or that have a zone that + is explicitly open (which happens only if the *explicit-open* mount option is + used). This number is always equal to the maximum number of active zones that + the device supports. A value of 0 means that the mounted device has no limit + on the number of sequential zone files that can be active. +* **nr_active_seq_files**: This attributes reports the current number of + sequential zone files that are active. If *max_active_seq_files* is not 0, + then the value of *nr_active_seq_files* can never exceed the value of + *nr_active_seq_files*, regardless of the use of the *explicit-open* mount + option. + Zonefs User Space Tools =======================