From 5b6e7e120e716231a0bf9ad201438d72473e396d Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 14 Oct 2021 14:57:44 +0800 Subject: [PATCH 01/15] erofs: remove the fast path of per-CPU buffer decompression As Xiang mentioned, such path has no real impact to our current decompression strategy, remove it directly. Also, update the return value of z_erofs_lz4_decompress() to 0 if success to keep consistent with LZMA which will return 0 as well for that case. Link: https://lore.kernel.org/r/20211014065744.1787-1-zbestahu@gmail.com Reviewed-by: Gao Xiang Signed-off-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 63 ++++++----------------------------------- 1 file changed, 8 insertions(+), 55 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index a5bc4b1b7813..dce06ac61893 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -242,6 +242,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EIO; + } else { + ret = 0; } if (maptype == 0) { @@ -268,33 +270,6 @@ static struct z_erofs_decompressor decompressors[] = { }, }; -static void copy_from_pcpubuf(struct page **out, const char *dst, - unsigned short pageofs_out, - unsigned int outputsize) -{ - const char *end = dst + outputsize; - const unsigned int righthalf = PAGE_SIZE - pageofs_out; - const char *cur = dst - pageofs_out; - - while (cur < end) { - struct page *const page = *out++; - - if (page) { - char *buf = kmap_atomic(page); - - if (cur >= dst) { - memcpy(buf, cur, min_t(uint, PAGE_SIZE, - end - cur)); - } else { - memcpy(buf + pageofs_out, cur + pageofs_out, - min_t(uint, righthalf, end - cur)); - } - kunmap_atomic(buf); - } - cur += PAGE_SIZE; - } -} - static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, struct list_head *pagepool) { @@ -305,34 +280,12 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, void *dst; int ret; - /* two optimized fast paths only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE) { - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } - - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. - */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(1); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; - } + /* one optimized fast path only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; } /* general decoding path which can be used for all cases */ From e62424651f43cb37e17ca26a7ee9ee42675f24bd Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 7 Oct 2021 15:02:23 +0800 Subject: [PATCH 02/15] erofs: decouple basic mount options from fs_context Previously, EROFS mount options are all in the basic types, so erofs_fs_context can be directly copied with assignment. However, when the multiple device feature is introduced, it's hard to handle multiple device information like the other basic mount options. 
Let's separate basic mount option usage from fs_context, thus multiple device information can be handled gracefully then. No logic changes. Link: https://lore.kernel.org/r/20211007070224.12833-1-hsiangkao@linux.alibaba.com Reviewed-by: Chao Yu Reviewed-by: Liu Bo Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 2 +- fs/erofs/internal.h | 16 ++++++++----- fs/erofs/super.c | 58 ++++++++++++++++++++++----------------------- fs/erofs/xattr.c | 4 ++-- fs/erofs/zdata.c | 8 +++---- 5 files changed, 45 insertions(+), 43 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index a552399e211d..2345f1de438e 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_flags &= ~S_DAX; - if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9524e155b38f..b1b9d1b5cb66 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,7 +47,7 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; -struct erofs_fs_context { +struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; @@ -60,6 +60,10 @@ struct erofs_fs_context { unsigned int mount_opt; }; +struct erofs_fs_context { + struct erofs_mount_opts opt; +}; + /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ @@ -69,6 +73,8 @@ struct erofs_sb_lz4_info { }; struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ + #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -108,8 +114,6 @@ struct erofs_sb_info { u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; - - struct erofs_fs_context ctx; /* options */ }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -121,9 +125,9 @@ struct erofs_sb_info { #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 -#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) -#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) -#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option) +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) enum { EROFS_ZIP_CACHE_DISABLED, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 11b88559f8bf..25f6b8b37f28 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -340,15 +340,15 @@ out: static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->max_sync_decompress_pages = 3; - ctx->readahead_sync_decompress = false; + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; + ctx->opt.readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); #endif } @@ -392,12 +392,12 
@@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(ctx, DAX_ALWAYS); - clear_opt(ctx, DAX_NEVER); + set_opt(&ctx->opt, DAX_ALWAYS); + clear_opt(&ctx->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(ctx, DAX_NEVER); - clear_opt(ctx, DAX_ALWAYS); + set_opt(&ctx->opt, DAX_NEVER); + clear_opt(&ctx->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -424,9 +424,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); else - clear_opt(ctx, XATTR_USER); + clear_opt(&ctx->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -434,16 +434,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); else - clear_opt(ctx, POSIX_ACL); + clear_opt(&ctx->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = result.uint_32; + ctx->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -540,15 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->opt = ctx->opt; sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); err = erofs_read_superblock(sb); if (err) return err; - if (test_opt(ctx, DAX_ALWAYS) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); - clear_opt(ctx, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_ALWAYS); } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -557,13 +558,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sb->s_xattr = erofs_xattr_handlers; - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; - #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif @@ -607,12 +606,12 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; + sbi->opt = ctx->opt; fc->sb_flags |= SB_RDONLY; return 0; @@ -640,7 +639,6 @@ static int erofs_init_fs_context(struct fs_context *fc) erofs_default_options(fc->fs_private); fc->ops = &erofs_context_ops; - return 0; } @@ -763,31 +761,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx = &sbi->ctx; + struct erofs_mount_opts *opt = &sbi->opt; #ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(ctx, XATTR_USER)) + if (test_opt(opt, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(opt, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif #ifdef CONFIG_EROFS_FS_ZIP - if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) seq_puts(seq, ",cache_strategy=disabled"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) seq_puts(seq, ",cache_strategy=readahead"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif - if (test_opt(ctx, DAX_ALWAYS)) + if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); - if (test_opt(ctx, DAX_NEVER)) + if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 778f2c52295d..01c581e93c5f 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) static bool erofs_xattr_user_list(struct dentry *dentry) { - return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER); + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) @@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, switch (handler->flags) { case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->ctx, XATTR_USER)) + if (!test_opt(&sbi->opt, XATTR_USER)) return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 11c7a1aaebad..e59e22852c78 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -695,7 +695,7 @@ restart_now: goto err_out; /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = TRYALLOC; 
else cache_strategy = DONTALLOC; @@ -796,7 +796,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); - sbi->ctx.readahead_sync_decompress = true; + sbi->opt.readahead_sync_decompress = true; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -1411,8 +1411,8 @@ static void z_erofs_readahead(struct readahead_control *rac) struct erofs_sb_info *const sbi = EROFS_I_SB(inode); unsigned int nr_pages = readahead_count(rac); - bool sync = (sbi->ctx.readahead_sync_decompress && - nr_pages <= sbi->ctx.max_sync_decompress_pages); + bool sync = (sbi->opt.readahead_sync_decompress && + nr_pages <= sbi->opt.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); From dfeab2e95a75a424adf39992ac62dcb9e9517d4a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 14 Oct 2021 16:10:10 +0800 Subject: [PATCH 03/15] erofs: add multiple device support In order to support multi-layer container images, add multiple device feature to EROFS. Two ways are available to use for now: - Devices can be mapped into 32-bit global block address space; - Device ID can be specified with the chunk indexes format. Note that it assumes no extent would cross device boundary and mkfs should take care of it seriously. In the future, a dedicated device manager could be introduced then thus extra devices can be automatically scanned by UUID as well. Link: https://lore.kernel.org/r/20211014081010.43485-1-hsiangkao@linux.alibaba.com Reviewed-by: Chao Yu Reviewed-by: Liu Bo Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 12 ++- fs/erofs/Kconfig | 24 +++-- fs/erofs/data.c | 73 ++++++++++--- fs/erofs/erofs_fs.h | 22 +++- fs/erofs/internal.h | 35 ++++++- fs/erofs/super.c | 156 ++++++++++++++++++++++++++-- fs/erofs/zdata.c | 20 +++- 7 files changed, 296 insertions(+), 46 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index b97579b7d8fb..01df283c7d04 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios: immutable and bit-for-bit identical to the official golden image for their releases due to security and other considerations and - - hope to save some extra storage space with guaranteed end-to-end performance - by using reduced metadata and transparent file compression, especially - for those embedded devices with limited memory (ex, smartphone); + - hope to minimize extra storage space with guaranteed end-to-end performance + by using compact layout, transparent file compression and direct access, + especially for those embedded devices with limited memory and high-density + hosts with numerous containers; Here is the main features of EROFS: @@ -51,7 +52,9 @@ Here is the main features of EROFS: - Support POSIX.1e ACLs by using xattrs; - Support transparent data compression as an option: - LZ4 algorithm with the fixed-sized output compression for high performance. + LZ4 algorithm with the fixed-sized output compression for high performance; + + - Multiple device support for multi-layer container images. 
The following git tree provides the file system user-space tools under development (ex, formatting tool mkfs.erofs): @@ -87,6 +90,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on: dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. +device=%s Specify a path to an extra device to be used together. =================== ========================================================= On-disk details diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 14b747026742..addfe608d08e 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -6,16 +6,22 @@ config EROFS_FS select FS_IOMAP select LIBCRC32C help - EROFS (Enhanced Read-Only File System) is a lightweight - read-only file system with modern designs (eg. page-sized - blocks, inline xattrs/data, etc.) for scenarios which need - high-performance read-only requirements, e.g. Android OS - for mobile phones and LIVECDs. + EROFS (Enhanced Read-Only File System) is a lightweight read-only + file system with modern designs (e.g. no buffer heads, inline + xattrs/data, chunk-based deduplication, multiple devices, etc.) for + scenarios which need high-performance read-only solutions, e.g. + smartphones with Android OS, LiveCDs and high-density hosts with + numerous containers; - It also provides fixed-sized output compression support, - which improves storage density, keeps relatively higher - compression ratios, which is more useful to achieve high - performance for embedded devices with limited memory. + It also provides fixed-sized output compression support in order to + improve storage density as well as keep relatively higher compression + ratios and implements in-place decompression to reuse the file page + for compressed data temporarily with proper strategies, which is + quite useful to ensure guaranteed end-to-end runtime decompression + performance under extremely memory pressure without extra cost. + + See the documentation at + for more details. If unsure, say N. 
diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 9db829715652..808234d9190c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode, erofs_off_t pos; int err = 0; + map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; @@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode, map->m_flags = 0; break; default: - /* only one device is supported for now */ - if (idx->device_id) { - erofs_err(sb, "invalid device id %u @ %llu for nid %llu", - le16_to_cpu(idx->device_id), - chunknr, vi->nid); - err = -EFSCORRUPTED; - goto out_unlock; - } + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; @@ -155,11 +150,55 @@ out: return err; } +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + /* primary device by default */ + map->m_bdev = sb->s_bdev; + map->m_daxdev = EROFS_SB(sb)->dax_dev; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; + } + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + up_read(&devs->rwsem); + } else if (devs->extra_devices) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = blknr_to_addr(dif->mapped_blkaddr); + length = blknr_to_addr(dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + break; + } + } + up_read(&devs->rwsem); + } + return 0; +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map; + struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; @@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(inode->i_sb, &mdev); + if (ret) + return ret; + + iomap->bdev = mdev.m_bdev; + iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; iomap->length = map.m_llen; iomap->flags = 0; @@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_INLINE; ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(map.m_pa)); + erofs_blknr(mdev.m_pa)); if (IS_ERR(ipage)) return PTR_ERR(ipage); iomap->inline_data = page_address(ipage) + - erofs_blkoff(map.m_pa); + erofs_blkoff(mdev.m_pa); iomap->private = ipage; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = mdev.m_pa; } return 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b0b23f41abc3..e480b3854d88 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -21,14 +21,27 @@ #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 #define 
EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE) #define EROFS_SB_EXTSLOT_SIZE 16 +struct erofs_deviceslot { + union { + u8 uuid[16]; /* used for device manager later */ + u8 userdata[64]; /* digest(sha256), etc. */ + } u; + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) + /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ @@ -54,7 +67,9 @@ struct erofs_super_block { /* customized sliding window size instead of 64k by default */ __le16 lz4_max_distance; } __packed u1; - __u8 reserved2[42]; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 reserved2[38]; }; /* @@ -238,7 +253,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 8-byte inode chunk indexes */ struct erofs_inode_chunk_index { __le16 advise; /* always 0, don't care for now */ - __le16 device_id; /* back-end storage id, always 0 for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ __le32 blkaddr; /* start block address of this inode chunk */ }; @@ -384,6 +399,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != sizeof(struct z_erofs_vle_decompressed_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index b1b9d1b5cb66..0661d7d6969a 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,6 +47,15 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; +struct erofs_device_info { + char *path; + struct block_device *bdev; + struct dax_device *dax_dev; + + u32 blocks; + u32 mapped_blkaddr; +}; + struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ @@ -60,8 +69,16 @@ struct erofs_mount_opts { unsigned int mount_opt; }; +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; +}; + struct erofs_fs_context { struct erofs_mount_opts opt; + struct erofs_dev_context *devs; }; /* all filesystem-wide lz4 configurations */ @@ -74,7 +91,6 @@ struct erofs_sb_lz4_info { struct erofs_sb_info { struct erofs_mount_opts opt; /* options */ - #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -91,12 +107,16 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct erofs_dev_context *devs; struct dax_device *dax_dev; - u32 blocks; + u64 total_blocks; + u32 primarydevice_blocks; + u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; #endif + u16 device_id_mask; /* valid bits of device id to be used */ /* inode slot unit size in bit shift */ unsigned char islotbits; @@ -241,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, 
INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -359,6 +380,7 @@ struct erofs_map_blocks { erofs_off_t m_pa, m_la; u64 m_plen, m_llen; + unsigned short m_deviceid; unsigned int m_flags; struct page *mpage; @@ -390,9 +412,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, } #endif /* !CONFIG_EROFS_FS_ZIP */ +struct erofs_map_dev { + struct block_device *m_bdev; + struct dax_device *m_daxdev; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; + /* data.c */ extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 25f6b8b37f28..2cfe1ce0f766 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -252,6 +252,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } #endif +static int erofs_init_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct page *page = NULL; + struct erofs_device_info *dif; + struct erofs_deviceslot *dis; + void *ptr; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + idr_for_each_entry(&sbi->devs->tree, dif, id) { + erofs_blk_t blk = erofs_blknr(pos); + struct block_device *bdev; + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + up_read(&sbi->devs->rwsem); + return PTR_ERR(page); + } + ptr = kmap(page); + } + dis = ptr + erofs_blkoff(pos); + + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto err_out; + } + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + pos += EROFS_DEVT_SLOT_SIZE; + } +err_out: + up_read(&sbi->devs->rwsem); + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + return err; +} + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -303,7 +376,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -330,6 +403,11 @@ static int erofs_read_superblock(struct 
super_block *sb) ret = erofs_load_compr_cfgs(sb, dsb); else ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + if (ret < 0) + goto out; + + /* handle multiple devices */ + ret = erofs_init_devices(sb, dsb); out: kunmap(page); put_page(page); @@ -358,6 +436,7 @@ enum { Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_err }; @@ -381,6 +460,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), + fsparam_string("device", Opt_device), {} }; @@ -412,9 +492,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx __maybe_unused = fc->fs_private; + struct erofs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + struct erofs_device_info *dif; + int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) @@ -456,6 +537,25 @@ static int erofs_fc_parse_param(struct fs_context *fc, if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if (ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; default: return -ENOPARAM; } @@ -542,6 +642,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->devs = ctx->devs; + ctx->devs = NULL; + err = erofs_read_superblock(sb); if (err) return err; @@ -617,9 +720,33 @@ static int erofs_fc_reconfigure(struct fs_context *fc) return 0; } +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + fs_put_dax(dif->dax_dev); + if (dif->bdev) + blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + static void erofs_fc_free(struct fs_context *fc) { - kfree(fc->fs_private); + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx); } static const struct fs_context_operations erofs_context_ops = { @@ -631,13 +758,20 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL); - if (!fc->fs_private) + struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + + if (!ctx) return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + fc->fs_private = ctx; - /* set default mount options */ - erofs_default_options(fc->fs_private); - + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); fc->ops = &erofs_context_ops; return 0; } @@ -657,6 +791,8 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + + 
erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; @@ -746,7 +882,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; - buf->f_blocks = sbi->blocks; + buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index e59e22852c78..8c947ed49299 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1266,8 +1266,9 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; z_erofs_next_pcluster_t owned_head = f->clt.owned_head; - /* since bio will be NULL, no need to initialize last_index */ + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; @@ -1279,6 +1280,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1290,7 +1292,13 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; /* close the main owned chain at first */ @@ -1306,7 +1314,8 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -1314,9 +1323,10 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + bio_set_dev(bio, mdev.m_bdev); + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; From 8f89926290c4b3d31748d5089b27952243be0693 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 9 Oct 2021 04:08:37 +0800 Subject: [PATCH 04/15] erofs: get compression algorithms directly on mapping Currently, z_erofs_map_blocks_iter() returns whether extents are compressed or not, and the decompression frontend gets the specific algorithms then. It works but not quite well in many aspests, for example: - The decompression frontend has to deal with whether extents are compressed or not again and lookup the algorithms if compressed. It's duplicated and too detailed about the on-disk mapping. - A new secondary compression head will be introduced later so that each file can have 2 compression algorithms at most for different type of data. It could increase the complexity of the decompression frontend if still handled in this way; - A new readmore decompression strategy will be introduced to get better performance for much bigger pcluster and lzma, which needs the specific algorithm in advance as well. Let's look up compression algorithms in z_erofs_map_blocks_iter() directly instead. 
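To illustrate the simplification (a sketch only, mirroring the zdata.c hunk further below): the pcluster setup used to re-derive the algorithm from the mapping flag,

	if (map->m_flags & EROFS_MAP_ZIPPED)
		pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
	else
		pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;

whereas it can now simply take what the mapping layer has already resolved:

	pcl->algorithmformat = map->m_algorithmformat;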
Link: https://lore.kernel.org/r/20211008200839.24541-2-xiang@kernel.org Reviewed-by: Chao Yu Reviewed-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/compress.h | 5 ----- fs/erofs/internal.h | 12 +++++++++--- fs/erofs/zdata.c | 12 ++++++------ fs/erofs/zmap.c | 19 ++++++++++--------- include/trace/events/erofs.h | 2 +- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3701c72bacb2..ad62d1b4d371 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -8,11 +8,6 @@ #include "internal.h" -enum { - Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, - Z_EROFS_COMPRESSION_RUNTIME_MAX -}; - struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 0661d7d6969a..f8537ffdefeb 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -363,7 +363,7 @@ extern const struct address_space_operations z_erofs_aops; * of the corresponding uncompressed data in the file. */ enum { - BH_Zipped = BH_PrivateStart, + BH_Encoded = BH_PrivateStart, BH_FullMapped, }; @@ -371,8 +371,8 @@ enum { #define EROFS_MAP_MAPPED (1 << BH_Mapped) /* Located in metadata (could be copied from bd_inode) */ #define EROFS_MAP_META (1 << BH_Meta) -/* The extent has been compressed */ -#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The extent is encoded */ +#define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) @@ -381,6 +381,7 @@ struct erofs_map_blocks { u64 m_plen, m_llen; unsigned short m_deviceid; + char m_algorithmformat; unsigned int m_flags; struct page *mpage; @@ -394,6 +395,11 @@ struct erofs_map_blocks { */ #define EROFS_GET_BLOCKS_FIEMAP 0x0002 +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; + /* zmap.c */ extern const struct iomap_ops z_erofs_iomap_report_ops; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8c947ed49299..a9dced07c3c6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -476,6 +476,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; + if (!(map->m_flags & EROFS_MAP_ENCODED)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + /* no available pcluster, let's allocate one */ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) @@ -483,16 +488,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; - + pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
Z_EROFS_PCLUSTER_FULL_LENGTH : 0); - if (map->m_flags & EROFS_MAP_ZIPPED) - pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; - else - pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7a6df35fdc91..1c3b068e5a42 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -111,7 +111,7 @@ struct z_erofs_maprecorder { unsigned long lcn; /* compression extent information gathered */ - u8 type; + u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; @@ -446,9 +446,8 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + m->headtype = m->type; map->m_la = (lcn << lclusterbits) | m->clusterofs; break; default: @@ -472,7 +471,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); - if (!(map->m_flags & EROFS_MAP_ZIPPED) || + if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { map->m_plen = 1 << lclusterbits; return 0; @@ -609,16 +608,14 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; - map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - if (endoff >= m.clusterofs) - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: if (endoff >= m.clusterofs) { + m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; break; } @@ -650,12 +647,16 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_llen = end - map->m_la; map->m_pa = blknr_to_addr(m.pblk); - map->m_flags |= EROFS_MAP_MAPPED; err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + else + map->m_algorithmformat = vi->z_algorithmtype[0]; + if (flags & EROFS_GET_BLOCKS_FIEMAP) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index db4f2cec8360..16ae7b666810 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -24,7 +24,7 @@ struct erofs_map_blocks; #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ - { EROFS_MAP_ZIPPED, "Z" }) + { EROFS_MAP_ENCODED, "E" }) TRACE_EVENT(erofs_lookup, From 72bb52620fdffca95a14ee52188a33cd84e574e2 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 18 Oct 2021 00:57:21 +0800 Subject: [PATCH 05/15] erofs: introduce the secondary compression head Previously, for each HEAD lcluster, it can be either HEAD or PLAIN lcluster to indicate whether the whole pcluster is compressed or not. In this patch, a new HEAD2 head type is introduced to specify another compression algorithm other than the primary algorithm for each compressed file, which can be used for upcoming LZMA compression and LZ4 range dictionary compression for various data patterns. It has been stayed in the EROFS roadmap for years. Complete it now! 
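For illustration, both per-file algorithms are carried by the existing 4-bit fields of h_algorithmtype, which the inode initialization below keeps parsing as (sketch, values hypothetical):

	vi->z_algorithmtype[0] = h->h_algorithmtype & 15;	/* HEAD1 */
	vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;	/* HEAD2 */

so, for example, a file whose HEAD1 lclusters use algorithm 0 and whose HEAD2 lclusters use algorithm 1 would store h_algorithmtype == 0x10.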
Link: https://lore.kernel.org/r/20211017165721.2442-1-xiang@kernel.org Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 39 ++++++++++++++++++++------------------- fs/erofs/zmap.c | 41 ++++++++++++++++++++++++++++------------- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index e480b3854d88..1c2917181346 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -22,12 +22,14 @@ #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE) + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -303,35 +305,34 @@ struct z_erofs_map_header { #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 /* - * Fixed-sized output compression ondisk Logical Extent cluster type: - * 0 - literal (uncompressed) cluster - * 1 - compressed cluster (for the head logical cluster) - * 2 - compressed cluster (for the other logical clusters) + * Fixed-sized output compression on-disk logical cluster type: + * 0 - literal (uncompressed) lcluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * * In detail, - * 0 - literal (uncompressed) cluster, + * 0 - literal (uncompressed) lcluster, * di_advise = 0 - * di_clusterofs = the literal data offset of the cluster - * di_blkaddr = the blkaddr of the literal cluster + * di_clusterofs = the literal data offset of the lcluster + * di_blkaddr = the blkaddr of the literal pcluster * - * 1 - compressed cluster (for the head logical cluster) - * di_advise = 1 - * di_clusterofs = the decompressed data offset of the cluster - * di_blkaddr = the blkaddr of the compressed cluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * di_advise = 1 or 3 + * di_clusterofs = the decompressed data offset of the lcluster + * di_blkaddr = the blkaddr of the compressed pcluster * - * 2 - compressed cluster (for the other logical clusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * di_advise = 2 * di_clusterofs = - * the decompressed data offset in its own head cluster - * di_u.delta[0] = distance to its corresponding head cluster - * di_u.delta[1] = distance to its corresponding tail cluster - * (di_advise could be 0, 1 or 2) + * the decompressed data offset in its own HEAD lcluster + * di_u.delta[0] = distance to this HEAD lcluster + * di_u.delta[1] = distance to the next HEAD lcluster */ enum { Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, Z_EROFS_VLE_CLUSTER_TYPE_MAX }; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1c3b068e5a42..85d0289429b3 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err; + int err, headnr; erofs_off_t pos; struct page *page; void *kaddr; @@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct 
inode *inode) vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", - vi->z_algorithmtype[0], vi->nid); + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto unmap_done; } @@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { - if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -446,7 +450,8 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: m->headtype = m->type; map->m_la = (lcn << lclusterbits) | m->clusterofs; break; @@ -470,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, int err; DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != m->headtype); + if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1 << lclusterbits; return 0; } - lcn = m->lcn + 1; if (m->compressedlcs) goto out; @@ -498,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, switch (m->type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. 
@@ -553,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -613,7 +625,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, switch (m.type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; @@ -654,6 +667,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + map->m_algorithmformat = vi->z_algorithmtype[1]; else map->m_algorithmformat = vi->z_algorithmtype[0]; From 386292919c255da42ff6d6d3c51594f796ac146a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 9 Oct 2021 04:08:39 +0800 Subject: [PATCH 06/15] erofs: introduce readmore decompression strategy Previously, the readahead window was strictly followed by EROFS decompression strategy in order to minimize extra memory footprint. However, it could become inefficient if just reading the partial requested data for much big LZ4 pclusters and the upcoming LZMA implementation. Let's try to request the leading data in a pcluster without triggering memory reclaiming instead for the LZ4 approach first to boost up 100% randread of large big pclusters, and it has no real impact on low memory scenarios. It also introduces a way to expand read lengths in order to decompress the whole pcluster, which is useful for LZMA since the algorithm itself is relatively slow and causes CPU bound, but LZ4 is not. Link: https://lore.kernel.org/r/20211008200839.24541-4-xiang@kernel.org Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/internal.h | 13 ++++++ fs/erofs/zdata.c | 99 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f8537ffdefeb..354ce3cb2b32 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -332,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) EROFS_I_DATALAYOUT_BITS); } +/* + * Different from grab_cache_page_nowait(), reclaiming is never triggered + * when allocating new pages. + */ +static inline +struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); +} + extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a9dced07c3c6..98d3bd25d894 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1387,6 +1387,72 @@ static void z_erofs_runqueue(struct super_block *sb, z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } +/* + * Since partial uptodate is still unimplemented for now, we have to use + * approximate readmore strategies as a start. 
+ */ +static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, + struct readahead_control *rac, + erofs_off_t end, + struct list_head *pagepool, + bool backmost) +{ + struct inode *inode = f->inode; + struct erofs_map_blocks *map = &f->map; + erofs_off_t cur; + int err; + + if (backmost) { + map->m_la = end; + /* TODO: pass in EROFS_GET_BLOCKS_READMORE for LZMA later */ + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + return; + + /* expend ra for the trailing edge if readahead */ + if (rac) { + loff_t newstart = readahead_pos(rac); + + cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); + readahead_expand(rac, newstart, cur - newstart); + return; + } + end = round_up(end, PAGE_SIZE); + } else { + end = round_up(map->m_la, PAGE_SIZE); + + if (!map->m_llen) + return; + } + + cur = map->m_la + map->m_llen - 1; + while (cur >= end) { + pgoff_t index = cur >> PAGE_SHIFT; + struct page *page; + + page = erofs_grab_cache_page_nowait(inode->i_mapping, index); + if (!page) + goto skip; + + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + goto skip; + } + + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + put_page(page); +skip: + if (cur < PAGE_SIZE) + break; + cur = (index << PAGE_SHIFT) - 1; + } +} + static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; @@ -1395,10 +1461,13 @@ static int z_erofs_readpage(struct file *file, struct page *page) LIST_HEAD(pagepool); trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, + &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); + z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); + (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1419,29 +1488,20 @@ static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - unsigned int nr_pages = readahead_count(rac); - bool sync = (sbi->opt.readahead_sync_decompress && - nr_pages <= sbi->opt.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; + unsigned int nr_pages; LIST_HEAD(pagepool); - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); - f.readahead = true; f.headoffset = readahead_pos(rac); + z_erofs_pcluster_readmore(&f, rac, f.headoffset + + readahead_length(rac) - 1, &pagepool, true); + nr_pages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - /* - * A pure asynchronous readahead is indicated if - * a PG_readahead marked page is hitted at first. - * Let's also do asynchronous decompression for this case. 
- */ - sync &= !(PageReadahead(page) && !head); - set_page_private(page, (unsigned long)head); head = page; } @@ -1460,11 +1520,12 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - + z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); - + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + sbi->opt.readahead_sync_decompress && + nr_pages <= sbi->opt.max_sync_decompress_pages); if (f.map.mpage) put_page(f.map.mpage); From 83d3c4f22a36d005b55f44628f46cc0d319a75e8 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:39 +0800 Subject: [PATCH 07/15] lib/xz: Avoid overlapping memcpy() with invalid input with in-place decompression With valid files, the safety margin described in lib/decompress_unxz.c ensures that these buffers cannot overlap. But if the uncompressed size of the input is larger than the caller thought, which is possible when the input file is invalid/corrupt, the buffers can overlap. Obviously the result will then be garbage (and usually the decoder will return an error too) but no other harm will happen when such an over-run occurs. This change only affects uncompressed LZMA2 chunks and so this should have no effect on performance. Link: https://lore.kernel.org/r/20211010213145.17462-2-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang --- lib/decompress_unxz.c | 2 +- lib/xz/xz_dec_lzma2.c | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index a2f38e23004a..f7a3dc13316a 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -167,7 +167,7 @@ * memeq and memzero are not used much and any remotely sane implementation * is fast enough. memcpy/memmove speed matters in multi-call mode, but * the kernel image is decompressed in single-call mode, in which only - * memcpy speed can matter and only if there is a lot of uncompressible data + * memmove speed can matter and only if there is a lot of uncompressible data * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the * functions below should just be kept small; it's probably not worth * optimizing for speed. diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 7a6781e3f47b..d548cf0e59fe 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -387,7 +387,14 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, *left -= copy_size; - memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + /* + * If doing in-place decompression in single-call mode and the + * uncompressed size of the file is larger than the caller + * thought (i.e. it is invalid input!), the buffers below may + * overlap and cause undefined behavior with memcpy(). + * With valid inputs memcpy() would be fine here. + */ + memmove(dict->buf + dict->pos, b->in + b->in_pos, copy_size); dict->pos += copy_size; if (dict->full < dict->pos) @@ -397,7 +404,11 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, if (dict->pos == dict->end) dict->pos = 0; - memcpy(b->out + b->out_pos, b->in + b->in_pos, + /* + * Like above but for multi-call mode: use memmove() + * to avoid undefined behavior with invalid input. 
+ */ + memmove(b->out + b->out_pos, b->in + b->in_pos, copy_size); } @@ -421,6 +432,12 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) if (dict->pos == dict->end) dict->pos = 0; + /* + * These buffers cannot overlap even if doing in-place + * decompression because in multi-call mode dict->buf + * has been allocated by us in this file; it's not + * provided by the caller like in single-call mode. + */ memcpy(b->out + b->out_pos, dict->buf + dict->start, copy_size); } From 4f8d7abaa413c34da9d751289849dbfb7c977d05 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:40 +0800 Subject: [PATCH 08/15] lib/xz: Validate the value before assigning it to an enum variable This might matter, for example, if the underlying type of enum xz_check was a signed char. In such a case the validation wouldn't have caught an unsupported header. I don't know if this problem can occur in the kernel on any arch but it's still good to fix it because some people might copy the XZ code to their own projects from Linux instead of the upstream XZ Embedded repository. This change may increase the code size by a few bytes. An alternative would have been to use an unsigned int instead of enum xz_check but using an enumeration looks cleaner. Link: https://lore.kernel.org/r/20211010213145.17462-3-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang --- lib/xz/xz_dec_stream.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c index fea86deaaa01..683570b93a8c 100644 --- a/lib/xz/xz_dec_stream.c +++ b/lib/xz/xz_dec_stream.c @@ -402,12 +402,12 @@ static enum xz_ret dec_stream_header(struct xz_dec *s) * we will accept other check types too, but then the check won't * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. */ + if (s->temp.buf[HEADER_MAGIC_SIZE + 1] > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; #ifdef XZ_DEC_ANY_CHECK - if (s->check_type > XZ_CHECK_MAX) - return XZ_OPTIONS_ERROR; - if (s->check_type > XZ_CHECK_CRC32) return XZ_UNSUPPORTED_CHECK; #else From a98a25408b0e9b0264abcc3dabfafd9ff2ea1046 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:41 +0800 Subject: [PATCH 09/15] lib/xz: Move s->lzma.len = 0 initialization to lzma_reset() It's a more logical place even if the resetting needs to be done only once per LZMA2 stream (if lzma_reset() called in the middle of an LZMA2 stream, .len will already be 0). Link: https://lore.kernel.org/r/20211010213145.17462-4-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang --- lib/xz/xz_dec_lzma2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index d548cf0e59fe..22b789645ce5 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -791,6 +791,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s) s->lzma.rep1 = 0; s->lzma.rep2 = 0; s->lzma.rep3 = 0; + s->lzma.len = 0; /* * All probabilities are initialized to the same value. 
This hack @@ -1174,8 +1175,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) } } - s->lzma.len = 0; - s->lzma2.sequence = SEQ_CONTROL; s->lzma2.need_dict_reset = true; From aaa2975f2b07b04ee16b2cad1072cbdea3e1c50a Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:42 +0800 Subject: [PATCH 10/15] lib/xz: Add MicroLZMA decoder MicroLZMA is a yet another header format variant where the first byte of a raw LZMA stream (without the end of stream marker) has been replaced with a bitwise-negation of the lc/lp/pb properties byte. MicroLZMA was created to be used in EROFS but can be used by other things too where wasting minimal amount of space for headers is important. This is implemented using most of the LZMA2 code as is so the amount of new code is small. The API has a few extra features compared to the XZ decoder. On the other hand, the API lacks XZ_BUF_ERROR support which is important to take into account when using this API. MicroLZMA doesn't support BCJ filters. In theory they could be added later as there are many unused/reserved values for the first byte of the compressed stream but in practice it is somewhat unlikely to happen due to a few implementation reasons. Link: https://lore.kernel.org/r/20211010213145.17462-5-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang --- include/linux/xz.h | 106 ++++++++++++++++++++++++++++ lib/xz/Kconfig | 13 ++++ lib/xz/xz_dec_lzma2.c | 156 +++++++++++++++++++++++++++++++++++++++++- lib/xz/xz_dec_syms.c | 9 ++- lib/xz/xz_private.h | 3 + 5 files changed, 284 insertions(+), 3 deletions(-) diff --git a/include/linux/xz.h b/include/linux/xz.h index 9884c8440188..7285ca5d56e9 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -233,6 +233,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s); */ XZ_EXTERN void xz_dec_end(struct xz_dec *s); +/* + * Decompressor for MicroLZMA, an LZMA variant with a very minimal header. + * See xz_dec_microlzma_alloc() below for details. + * + * These functions aren't used or available in preboot code and thus aren't + * marked with XZ_EXTERN. This avoids warnings about static functions that + * are never defined. + */ +/** + * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state + */ +struct xz_dec_microlzma; + +/** + * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder + * @mode XZ_SINGLE or XZ_PREALLOC + * @dict_size LZMA dictionary size. This must be at least 4 KiB and + * at most 3 GiB. + * + * In contrast to xz_dec_init(), this function only allocates the memory + * and remembers the dictionary size. xz_dec_microlzma_reset() must be used + * before calling xz_dec_microlzma_run(). + * + * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE. + * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated. + * + * On success, xz_dec_microlzma_alloc() returns a pointer to + * struct xz_dec_microlzma. If memory allocation fails or + * dict_size is invalid, NULL is returned. + * + * The compressed format supported by this decoder is a raw LZMA stream + * whose first byte (always 0x00) has been replaced with bitwise-negation + * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is + * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. + * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream + * marker must not be used. The unused values are reserved for future use. 
+ * This MicroLZMA header format was created for use in EROFS but may be used + * by others too. + */ +extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); + +/** + * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state + * @s Decoder state allocated using xz_dec_microlzma_alloc() + * @comp_size Compressed size of the input stream + * @uncomp_size Uncompressed size of the input stream. A value smaller + * than the real uncompressed size of the input stream can + * be specified if uncomp_size_is_exact is set to false. + * uncomp_size can never be set to a value larger than the + * expected real uncompressed size because it would eventually + * result in XZ_DATA_ERROR. + * @uncomp_size_is_exact This is an int instead of bool to avoid + * requiring stdbool.h. This should normally be set to true. + * When this is set to false, error detection is weaker. + */ +extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, + uint32_t comp_size, uint32_t uncomp_size, + int uncomp_size_is_exact); + +/** + * xz_dec_microlzma_run() - Run the MicroLZMA decoder + * @s Decoder state initialized using xz_dec_microlzma_reset() + * @b: Input and output buffers + * + * This works similarly to xz_dec_run() with a few important differences. + * Only the differences are documented here. + * + * The only possible return values are XZ_OK, XZ_STREAM_END, and + * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress + * is possible due to lack of input data or output space, this function will + * keep returning XZ_OK. Thus, the calling code must be written so that it + * will eventually provide input and output space matching (or exceeding) + * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset(). + * If the caller cannot do this (for example, if the input file is truncated + * or otherwise corrupt), the caller must detect this error by itself to + * avoid an infinite loop. + * + * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned. + * This can happen also when incorrect dictionary, uncompressed, or + * compressed sizes have been specified. + * + * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over + * uncompressed data. This way the caller doesn't need to provide a temporary + * output buffer for the bytes that will be ignored. + * + * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK + * is also possible and thus XZ_SINGLE is actually a limited multi-call mode. + * After XZ_OK the bytes decoded so far may be read from the output buffer. + * It is possible to continue decoding but the variables b->out and b->out_pos + * MUST NOT be changed by the caller. Increasing the value of b->out_size is + * allowed to make more output space available; one doesn't need to provide + * space for the whole uncompressed data on the first call. The input buffer + * may be changed normally like with XZ_PREALLOC. This way input data can be + * provided from non-contiguous memory. + */ +extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, + struct xz_buf *b); + +/** + * xz_dec_microlzma_end() - Free the memory allocated for the decoder state + * @s: Decoder state allocated using xz_dec_microlzma_alloc(). + * If s is NULL, this function does nothing. + */ +extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); + /* * Standalone build (userspace build or in-kernel build for boot time use) * needs a CRC32 implementation. 
For normal in-kernel use, kernel's own diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 5cb50245a878..adce22ac18d6 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -39,6 +39,19 @@ config XZ_DEC_SPARC default y select XZ_DEC_BCJ +config XZ_DEC_MICROLZMA + bool "MicroLZMA decoder" + default n + help + MicroLZMA is a header format variant where the first byte + of a raw LZMA stream (without the end of stream marker) has + been replaced with a bitwise-negation of the lc/lp/pb + properties byte. MicroLZMA was created to be used in EROFS + but can be used by other things too where wasting minimal + amount of space for headers is important. + + Unless you know that you need this, say N. + endif config XZ_DEC_BCJ diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 22b789645ce5..46b186d7eb45 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -248,6 +248,10 @@ struct lzma2_dec { * before the first LZMA chunk. */ bool need_props; + +#ifdef XZ_DEC_MICROLZMA + bool pedantic_microlzma; +#endif }; struct xz_dec_lzma2 { @@ -419,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, } } +#ifdef XZ_DEC_MICROLZMA +# define DICT_FLUSH_SUPPORTS_SKIPPING true +#else +# define DICT_FLUSH_SUPPORTS_SKIPPING false +#endif + /* * Flush pending data from dictionary to b->out. It is assumed that there is * enough space in b->out. This is guaranteed because caller uses dict_limit() @@ -437,9 +447,14 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) * decompression because in multi-call mode dict->buf * has been allocated by us in this file; it's not * provided by the caller like in single-call mode. + * + * With MicroLZMA, b->out can be NULL to skip bytes that + * the caller doesn't need. This cannot be done with XZ + * because it would break BCJ filters. */ - memcpy(b->out + b->out_pos, dict->buf + dict->start, - copy_size); + if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL) + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); } dict->start = dict->pos; @@ -1190,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) kfree(s); } + +#ifdef XZ_DEC_MICROLZMA +/* This is a wrapper struct to have a nice struct name in the public API. */ +struct xz_dec_microlzma { + struct xz_dec_lzma2 s; +}; + +enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr, + struct xz_buf *b) +{ + struct xz_dec_lzma2 *s = &s_ptr->s; + + /* + * sequence is SEQ_PROPERTIES before the first input byte, + * SEQ_LZMA_PREPARE until a total of five bytes have been read, + * and SEQ_LZMA_RUN for the rest of the input stream. + */ + if (s->lzma2.sequence != SEQ_LZMA_RUN) { + if (s->lzma2.sequence == SEQ_PROPERTIES) { + /* One byte is needed for the props. */ + if (b->in_pos >= b->in_size) + return XZ_OK; + + /* + * Don't increment b->in_pos here. The same byte is + * also passed to rc_read_init() which will ignore it. + */ + if (!lzma_props(s, ~b->in[b->in_pos])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + } + + /* + * xz_dec_microlzma_reset() doesn't validate the compressed + * size so we do it here. We have to limit the maximum size + * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice + * round number and much more than users of this code should + * ever need. 
+ */ + if (s->lzma2.compressed < RC_INIT_BYTES + || s->lzma2.compressed > (3U << 30)) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + dict_reset(&s->dict, b); + } + + /* This is to allow increasing b->out_size between calls. */ + if (DEC_IS_SINGLE(s->dict.mode)) + s->dict.end = b->out_size - b->out_pos; + + while (true) { + dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, + s->lzma2.uncompressed)); + + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.pedantic_microlzma) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + } + + return XZ_STREAM_END; + } + + if (b->out_pos == b->out_size) + return XZ_OK; + + if (b->in_pos == b->in_size + && s->temp.size < s->lzma2.compressed) + return XZ_OK; + } +} + +struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size) +{ + struct xz_dec_microlzma *s; + + /* Restrict dict_size to the same range as in the LZMA2 code. */ + if (dict_size < 4096 || dict_size > (3U << 30)) + return NULL; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->s.dict.mode = mode; + s->s.dict.size = dict_size; + + if (DEC_IS_MULTI(mode)) { + s->s.dict.end = dict_size; + + s->s.dict.buf = vmalloc(dict_size); + if (s->s.dict.buf == NULL) { + kfree(s); + return NULL; + } + } + + return s; +} + +void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, + uint32_t uncomp_size, int uncomp_size_is_exact) +{ + /* + * comp_size is validated in xz_dec_microlzma_run(). + * uncomp_size can safely be anything. 
+ */ + s->s.lzma2.compressed = comp_size; + s->s.lzma2.uncompressed = uncomp_size; + s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact; + + s->s.lzma2.sequence = SEQ_PROPERTIES; + s->s.temp.size = 0; +} + +void xz_dec_microlzma_end(struct xz_dec_microlzma *s) +{ + if (DEC_IS_MULTI(s->s.dict.mode)) + vfree(s->s.dict.buf); + + kfree(s); +} +#endif diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c index 32eb3c03aede..61098c67a413 100644 --- a/lib/xz/xz_dec_syms.c +++ b/lib/xz/xz_dec_syms.c @@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset); EXPORT_SYMBOL(xz_dec_run); EXPORT_SYMBOL(xz_dec_end); +#ifdef CONFIG_XZ_DEC_MICROLZMA +EXPORT_SYMBOL(xz_dec_microlzma_alloc); +EXPORT_SYMBOL(xz_dec_microlzma_reset); +EXPORT_SYMBOL(xz_dec_microlzma_run); +EXPORT_SYMBOL(xz_dec_microlzma_end); +#endif + MODULE_DESCRIPTION("XZ decompressor"); -MODULE_VERSION("1.0"); +MODULE_VERSION("1.1"); MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); /* diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 09360ebb510e..bf1e94ec7873 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -37,6 +37,9 @@ # ifdef CONFIG_XZ_DEC_SPARC # define XZ_DEC_SPARC # endif +# ifdef CONFIG_XZ_DEC_MICROLZMA +# define XZ_DEC_MICROLZMA +# endif # define memeq(a, b, size) (memcmp(a, b, size) == 0) # define memzero(buf, size) memset(buf, 0, size) # endif From 0a434e0a2c9f4395e4560aac22677ef25ab4afd9 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:43 +0800 Subject: [PATCH 11/15] lib/xz, lib/decompress_unxz.c: Fix spelling in comments uncompressible -> incompressible non-splitted -> non-split Link: https://lore.kernel.org/r/20211010213145.17462-6-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang --- lib/decompress_unxz.c | 10 +++++----- lib/xz/xz_dec_lzma2.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index f7a3dc13316a..9f4262ee33a5 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -20,8 +20,8 @@ * * The worst case for in-place decompression is that the beginning of * the file is compressed extremely well, and the rest of the file is - * uncompressible. Thus, we must look for worst-case expansion when the - * compressor is encoding uncompressible data. + * incompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding incompressible data. * * The structure of the .xz file in case of a compressed kernel is as follows. * Sizes (as bytes) of the fields are in parenthesis. @@ -58,7 +58,7 @@ * uncompressed size of the payload is in practice never less than the * payload size itself. The LZMA2 format would allow uncompressed size * to be less than the payload size, but no sane compressor creates such - * files. LZMA2 supports storing uncompressible data in uncompressed form, + * files. LZMA2 supports storing incompressible data in uncompressed form, * so there's never a need to create payloads whose uncompressed size is * smaller than the compressed size. * @@ -167,8 +167,8 @@ * memeq and memzero are not used much and any remotely sane implementation * is fast enough. memcpy/memmove speed matters in multi-call mode, but * the kernel image is decompressed in single-call mode, in which only - * memmove speed can matter and only if there is a lot of uncompressible data - * (LZMA2 stores uncompressible chunks in uncompressed form). 
Thus, the + * memmove speed can matter and only if there is a lot of incompressible data + * (LZMA2 stores incompressible chunks in uncompressed form). Thus, the * functions below should just be kept small; it's probably not worth * optimizing for speed. */ diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 46b186d7eb45..27ce34520e78 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -520,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc) * functions so that the compiler is supposed to be able to more easily avoid * an extra branch. In this particular version of the LZMA decoder, this * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 - * on x86). Using a non-splitted version results in nicer looking code too. + * on x86). Using a non-split version results in nicer looking code too. * * NOTE: This must return an int. Do not make it return a bool or the speed * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, From 966edfb0a3dc2adf5aab461c298fa0feb02f49d6 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 11 Oct 2021 05:31:44 +0800 Subject: [PATCH 12/15] erofs: rename some generic methods in decompressor Previously, some LZ4 methods were named with `generic'. However, while evaluating the effective LZMA approach, it seems they aren't quite generic at all (e.g. no need preparing dstpages for most LZMA cases.) Avoid such naming instead. Link: https://lore.kernel.org/r/20211010213145.17462-7-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 63 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index dce06ac61893..8fd7af9d6b38 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -17,13 +17,8 @@ #endif struct z_erofs_decompressor { - /* - * if destpages have sparsed pages, fill them with bounce pages. - * it also check whether destpages indicate continuous physical memory. - */ - int (*prepare_destpages)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); - int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); + int (*decompress)(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); char *name; }; @@ -63,8 +58,12 @@ int z_erofs_load_lz4_config(struct super_block *sb, return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } -static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +/* + * Fill all gaps with bounce pages if it's a sparse page list. Also check if + * all physical pages are consecutive, which can be seen for moderate CR. + */ +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) { const unsigned int nr = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -119,7 +118,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 
1 : 0; } -static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, void *inpage, unsigned int *inputmargin, int *maptype, bool support_0padding) { @@ -189,7 +188,8 @@ docopy: return src; } -static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, + u8 *out) { unsigned int inputmargin; u8 *headpage, *src; @@ -216,8 +216,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) } rq->inputsize -= inputmargin; - src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, - support_0padding); + src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, + &maptype, support_0padding); if (IS_ERR(src)) return PTR_ERR(src); @@ -259,23 +259,11 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) return ret; } -static struct z_erofs_decompressor decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { - .name = "shifted" - }, - [Z_EROFS_COMPRESSION_LZ4] = { - .prepare_destpages = z_erofs_lz4_prepare_destpages, - .decompress = z_erofs_lz4_decompress, - .name = "lz4" - }, -}; - -static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; int ret; @@ -289,7 +277,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, } /* general decoding path which can be used for all cases */ - ret = alg->prepare_destpages(rq, pagepool); + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); if (ret < 0) return ret; if (ret) { @@ -304,7 +292,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, dst_maptype = 2; dstmap_out: - ret = alg->decompress(rq, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); @@ -313,7 +301,7 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, +static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, struct list_head *pagepool) { const unsigned int nrpages_out = @@ -352,10 +340,19 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, return 0; } +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .decompress = z_erofs_shifted_transform, + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +}; + int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct list_head *pagepool) { - if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) - return z_erofs_shifted_transform(rq, pagepool); - return z_erofs_decompress_generic(rq, pagepool); + return decompressors[rq->alg].decompress(rq, pagepool); } From 622ceaddb7649ca328832f50ba1400af778d75fa Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 11 Oct 2021 05:31:45 +0800 Subject: [PATCH 13/15] erofs: lzma compression support Add MicroLZMA support in order to maximize compression ratios for specific scenarios. For example, it's useful for low-end embedded boards and as a secondary algorithm in a file for specific access patterns. 
MicroLZMA is a new container format for raw LZMA1, which was created by Lasse Collin aiming to minimize old LZMA headers and get rid of unnecessary EOPM (end of payload marker) as well as to enable fixed-sized output compression, especially for 4KiB pclusters. Similar to LZ4, inplace I/O approach is used to minimize runtime memory footprint when dealing with I/O. Overlapped decompression is handled with 1) bounced buffer for data under processing or 2) extra short-lived pages from the on-stack pagepool which will be shared in the same read request (128KiB for example). Link: https://lore.kernel.org/r/20211010213145.17462-8-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/Kconfig | 16 ++ fs/erofs/Makefile | 1 + fs/erofs/compress.h | 16 ++ fs/erofs/decompressor.c | 12 +- fs/erofs/decompressor_lzma.c | 290 +++++++++++++++++++++++++++++++++++ fs/erofs/erofs_fs.h | 14 +- fs/erofs/internal.h | 22 +++ fs/erofs/super.c | 17 +- fs/erofs/zdata.c | 4 +- fs/erofs/zdata.h | 7 - fs/erofs/zmap.c | 5 +- 11 files changed, 383 insertions(+), 21 deletions(-) create mode 100644 fs/erofs/decompressor_lzma.c diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index addfe608d08e..f57255ab88ed 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -82,3 +82,19 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. + +config EROFS_FS_ZIP_LZMA + bool "EROFS LZMA compressed data support" + depends on EROFS_FS_ZIP + select XZ_DEC + select XZ_DEC_MICROLZMA + help + Saying Y here includes support for reading EROFS file systems + containing LZMA compressed data, specifically called microLZMA. it + gives better compression ratios than the LZ4 algorithm, at the + expense of more CPU overhead. + + LZMA support is an experimental feature for now and so most file + systems will be readable without selecting this option. + + If unsure, say N. 
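The help text above restates the MicroLZMA framing documented in the xz.h comments earlier in this series: a raw LZMA stream whose first byte carries the bitwise negation of the lc/lp/pb properties byte, so the stream can never start with 0x00 and no further header is needed. As a rough illustration (plain userspace C, not part of the patch; the (pb * 5 + lp) * 9 + lc layout is the conventional LZMA properties encoding assumed here), this shows how the documented lc/lp/pb = 3/0/2 example maps to the 0xA2 first byte, and how a decoder recovers the properties by negating the byte again, cf. lzma_props(s, ~b->in[b->in_pos]) in the xz_dec_microlzma_run() hunk:

#include <stdio.h>
#include <stdint.h>

/* classic LZMA properties byte, assumed encoding: (pb * 5 + lp) * 9 + lc */
static uint8_t lzma_props_byte(unsigned int lc, unsigned int lp, unsigned int pb)
{
        return (uint8_t)((pb * 5 + lp) * 9 + lc);
}

int main(void)
{
        uint8_t props = lzma_props_byte(3, 0, 2);       /* 0x5D */
        uint8_t first = (uint8_t)~props;                /* 0xA2, never 0x00 */

        /* the decoder simply negates the first byte again to get props back */
        printf("first byte 0x%02X -> props 0x%02X\n",
               (unsigned int)first, (unsigned int)(uint8_t)~first);
        return 0;
}
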
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 1f9aced49070..756fe2d65272 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o +erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index ad62d1b4d371..8ea6a9b14962 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -20,6 +20,12 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +struct z_erofs_decompressor { + int (*decompress)(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + char *name; +}; + /* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) #define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) @@ -75,7 +81,17 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, return true; } +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct list_head *pagepool); +/* prototypes for specific algorithms */ +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); #endif diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 8fd7af9d6b38..8a624d73c185 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,12 +16,6 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif -struct z_erofs_decompressor { - int (*decompress)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); - char *name; -}; - int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -349,6 +343,12 @@ static struct z_erofs_decompressor decompressors[] = { .decompress = z_erofs_lz4_decompress, .name = "lz4" }, +#ifdef CONFIG_EROFS_FS_ZIP_LZMA + [Z_EROFS_COMPRESSION_LZMA] = { + .decompress = z_erofs_lzma_decompress, + .name = "lzma" + }, +#endif }; int z_erofs_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c new file mode 100644 index 000000000000..bd7d9809ecf7 --- /dev/null +++ b/fs/erofs/decompressor_lzma.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include "compress.h" + +struct z_erofs_lzma { + struct z_erofs_lzma *next; + struct xz_dec_microlzma *state; + struct xz_buf buf; + u8 bounce[PAGE_SIZE]; +}; + +/* considering the LZMA performance, no need to use a lockless list for now */ +static DEFINE_SPINLOCK(z_erofs_lzma_lock); +static unsigned int z_erofs_lzma_max_dictsize; +static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms; +static struct z_erofs_lzma *z_erofs_lzma_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); + +module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); + +void z_erofs_lzma_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_lzma_avail_strms) { + struct z_erofs_lzma *strm; + + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + DBG_BUGON(1); + return; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + while (strm) { + struct 
z_erofs_lzma *n = strm->next; + + if (strm->state) + xz_dec_microlzma_end(strm->state); + kfree(strm); + --z_erofs_lzma_avail_strms; + strm = n; + } + } +} + +int z_erofs_lzma_init(void) +{ + unsigned int i; + + /* by default, use # of possible CPUs instead */ + if (!z_erofs_lzma_nstrms) + z_erofs_lzma_nstrms = num_possible_cpus(); + + for (i = 0; i < z_erofs_lzma_nstrms; ++i) { + struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL); + + if (!strm) { + z_erofs_lzma_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + ++z_erofs_lzma_avail_strms; + } + return 0; +} + +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) +{ + static DEFINE_MUTEX(lzma_resize_mutex); + unsigned int dict_size, i; + struct z_erofs_lzma *strm, *head = NULL; + int err; + + if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) { + erofs_err(sb, "invalid lzma cfgs, size=%u", size); + return -EINVAL; + } + if (lzma->format) { + erofs_err(sb, "unidentified lzma format %x, please check kernel version", + le16_to_cpu(lzma->format)); + return -EINVAL; + } + dict_size = le32_to_cpu(lzma->dict_size); + if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) { + erofs_err(sb, "unsupported lzma dictionary size %u", + dict_size); + return -EINVAL; + } + + erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); + + /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ + mutex_lock(&lzma_resize_mutex); + + if (z_erofs_lzma_max_dictsize >= dict_size) { + mutex_unlock(&lzma_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + for (i = 0; i < z_erofs_lzma_avail_strms; ++i) { + struct z_erofs_lzma *last; + +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, + READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + for (last = strm; last->next; last = last->next) + ++i; + last->next = head; + head = strm; + } + + err = 0; + /* 2. walk each isolated stream and grow max dict_size if needed */ + for (strm = head; strm; strm = strm->next) { + if (strm->state) + xz_dec_microlzma_end(strm->state); + strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size); + if (!strm->state) + err = -ENOMEM; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_lzma_lock); + DBG_BUGON(z_erofs_lzma_head); + z_erofs_lzma_head = head; + spin_unlock(&z_erofs_lzma_lock); + + z_erofs_lzma_max_dictsize = dict_size; + mutex_unlock(&lzma_resize_mutex); + return err; +} + +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + unsigned int inputmargin, inlen, outlen, pageofs; + struct z_erofs_lzma *strm; + u8 *kin; + bool bounced = false; + int no, ni, j, err = 0; + + /* 1. get the exact LZMA compressed size */ + kin = kmap(*rq->in); + inputmargin = 0; + while (!kin[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= PAGE_SIZE) { + kunmap(*rq->in); + return -EFSCORRUPTED; + } + rq->inputsize -= inputmargin; + + /* 2. 
get an available lzma context */ +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = strm->next; + spin_unlock(&z_erofs_lzma_lock); + + /* 3. multi-call decompress */ + inlen = rq->inputsize; + outlen = rq->outputsize; + xz_dec_microlzma_reset(strm->state, inlen, outlen, + !rq->partial_decoding); + pageofs = rq->pageofs_out; + strm->buf.in = kin + inputmargin; + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + inlen -= strm->buf.in_size; + strm->buf.out = NULL; + strm->buf.out_pos = 0; + strm->buf.out_size = 0; + + for (ni = 0, no = -1;;) { + enum xz_ret xz_err; + + if (strm->buf.out_pos == strm->buf.out_size) { + if (strm->buf.out) { + kunmap(rq->out[no]); + strm->buf.out = NULL; + } + + if (++no >= nrpages_out || !outlen) { + erofs_err(rq->sb, "decompressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.out_pos = 0; + strm->buf.out_size = min_t(u32, outlen, + PAGE_SIZE - pageofs); + outlen -= strm->buf.out_size; + if (rq->out[no]) + strm->buf.out = kmap(rq->out[no]) + pageofs; + pageofs = 0; + } else if (strm->buf.in_pos == strm->buf.in_size) { + kunmap(rq->in[ni]); + + if (++ni >= nrpages_in || !inlen) { + erofs_err(rq->sb, "compressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); + inlen -= strm->buf.in_size; + kin = kmap(rq->in[ni]); + strm->buf.in = kin; + bounced = false; + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Otherwise, Use short-lived pages + * from the on-stack pagepool where pages share with the same + * request. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); + strm->buf.in = strm->bounce; + bounced = true; + } + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); + DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); + DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + + if (xz_err != XZ_OK) { + if (xz_err == XZ_STREAM_END && !outlen) + break; + erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + xz_err, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + if (no < nrpages_out && strm->buf.out) + kunmap(rq->in[no]); + if (ni < nrpages_in) + kunmap(rq->in[ni]); + /* 4. 
push back LZMA stream context to the global list */ + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + wake_up(&z_erofs_lzma_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 1c2917181346..083997a034e5 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -264,10 +264,11 @@ struct erofs_inode_chunk_index { /* available compression algorithm types (for h_algorithmtype) */ enum { - Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_MAX }; -#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) +#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { @@ -276,6 +277,15 @@ struct z_erofs_lz4_cfgs { u8 reserved[10]; } __packed; +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lzma_cfgs { + __le32 dict_size; + __le16 format; + u8 reserved[8]; +} __packed; + +#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 354ce3cb2b32..a6a53d22dfd6 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -407,6 +407,8 @@ struct erofs_map_blocks { * approach instead if possible since it's more metadata lightweight.) */ #define EROFS_GET_BLOCKS_FIEMAP 0x0002 +/* Used to map the whole extent if non-negligible data is requested for LZMA */ +#define EROFS_GET_BLOCKS_READMORE 0x0004 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -537,6 +539,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_ZIP_LZMA +int z_erofs_lzma_init(void); +void z_erofs_lzma_exit(void); +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size); +#else +static inline int z_erofs_lzma_init(void) { return 0; } +static inline int z_erofs_lzma_exit(void) { return 0; } +static inline int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) { + if (lzma) { + erofs_err(sb, "lzma algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 2cfe1ce0f766..6a969b1e0ee6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZ4: ret = z_erofs_load_lz4_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_LZMA: + ret = z_erofs_load_lzma_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -840,6 +843,10 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + err = z_erofs_lzma_init(); + if (err) + goto lzma_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -854,6 +861,8 @@ static int __init erofs_module_init(void) fs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_lzma_exit(); +lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); @@ -864,11 +873,13 @@ icache_err: static void __exit erofs_module_exit(void) { 
unregister_filesystem(&erofs_fs_type); - z_erofs_exit_zip_subsystem(); - erofs_exit_shrinker(); - /* Ensure all RCU free inodes are safe before cache is destroyed. */ + /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + + z_erofs_exit_zip_subsystem(); + z_erofs_lzma_exit(); + erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 98d3bd25d894..d55e6215cd44 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1404,8 +1404,8 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (backmost) { map->m_la = end; - /* TODO: pass in EROFS_GET_BLOCKS_READMORE for LZMA later */ - err = z_erofs_map_blocks_iter(inode, map, 0); + err = z_erofs_map_blocks_iter(inode, map, + EROFS_GET_BLOCKS_READMORE); if (err) return; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 3a008f1b9f78..879df5362777 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -94,13 +94,6 @@ struct z_erofs_decompressqueue { } u; }; -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} - #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 85d0289429b3..660489a7fb64 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -672,7 +672,10 @@ int z_erofs_map_blocks_iter(struct inode *inode, else map->m_algorithmformat = vi->z_algorithmtype[0]; - if (flags & EROFS_GET_BLOCKS_FIEMAP) { + if ((flags & EROFS_GET_BLOCKS_FIEMAP) || + ((flags & EROFS_GET_BLOCKS_READMORE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && + map->m_llen >= EROFS_BLKSIZ)) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; From eaa9172ad988b3ef5c59a051c825706252d435e1 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 22 Oct 2021 17:01:20 +0800 Subject: [PATCH 14/15] erofs: get rid of ->lru usage Currently, ->lru is a way to arrange non-LRU pages and has some in-kernel users. In order to minimize noticable issues of page reclaim and cache thrashing under high memory presure, limited temporary pages were all chained with ->lru and can be reused during the request. However, it seems that ->lru could be removed when folio is landing. Let's use page->private to chain temporary pages for now instead and transform EROFS formally after the topic of the folio / file page design is finalized. 
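Concretely, the pool becomes a singly linked list threaded through page->private: the head pointer lives on the caller's stack and each pooled page stores the previous head in its private field, so no list_head (and hence no ->lru) is needed. A condensed sketch of the helpers this patch introduces (bodies taken from the internal.h and utils.c hunks below, with comments added here; the debug assertion in erofs_allocpage() is omitted for brevity):

/* head of the pool is a plain pointer, e.g. struct page *pagepool = NULL; */
static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
{
        set_page_private(page, (unsigned long)*pagepool); /* link to old head */
        *pagepool = page;                                 /* page becomes the new head */
}

struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
        struct page *page = *pagepool;

        if (page)       /* reuse a pooled page first */
                *pagepool = (struct page *)page_private(page);
        else            /* otherwise fall back to the page allocator */
                page = alloc_page(gfp);
        return page;
}

void erofs_release_pages(struct page **pagepool)
{
        while (*pagepool) {     /* walk the chain and drop each page */
                struct page *page = *pagepool;

                *pagepool = (struct page *)page_private(page);
                put_page(page);
        }
}

Call sites then switch from LIST_HEAD(pagepool), list_add(&page->lru, ...) and put_pages_list() to a struct page *pagepool = NULL head plus erofs_pagepool_add() and erofs_release_pages(), as the pcpubuf.c and zdata.c hunks below do.
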
Link: https://lore.kernel.org/r/20211022090120.14675-1-hsiangkao@linux.alibaba.com Cc: Matthew Wilcox Reviewed-by: Kent Overstreet Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/compress.h | 11 +++++----- fs/erofs/decompressor.c | 8 +++---- fs/erofs/decompressor_lzma.c | 2 +- fs/erofs/internal.h | 9 +++++++- fs/erofs/pcpubuf.c | 6 +++--- fs/erofs/utils.c | 19 +++++++++++----- fs/erofs/zdata.c | 42 ++++++++++++++++-------------------- 7 files changed, 53 insertions(+), 44 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 8ea6a9b14962..579406504919 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -22,7 +22,7 @@ struct z_erofs_decompress_req { struct z_erofs_decompressor { int (*decompress)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); + struct page **pagepool); char *name; }; @@ -64,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page) return true; } -static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, +static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, struct page *page) { if (!z_erofs_is_shortlived_page(page)) @@ -75,8 +75,7 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, put_page(page); } else { /* follow the pcluster rule above. */ - set_page_private(page, 0); - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); } return true; } @@ -89,9 +88,9 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, } int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); + struct page **pagepool); /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); + struct page **pagepool); #endif diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 8a624d73c185..a0786b95cdf9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -57,7 +57,7 @@ int z_erofs_load_lz4_config(struct super_block *sb, * all physical pages are consecutive, which can be seen for moderate CR. 
*/ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { const unsigned int nr = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -254,7 +254,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, } static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -296,7 +296,7 @@ dstmap_out: } static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -352,7 +352,7 @@ static struct z_erofs_decompressor decompressors[] = { }; int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { return decompressors[rq->alg].decompress(rq, pagepool); } diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index bd7d9809ecf7..50045510a1f4 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -150,7 +150,7 @@ again: } int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a6a53d22dfd6..3265688af7f9 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -499,7 +499,14 @@ void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +static inline void erofs_pagepool_add(struct page **pagepool, + struct page *page) +{ + set_page_private(page, (unsigned long)*pagepool); + *pagepool = page; +} +void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c index 6c885575128a..a2efd833d1b6 100644 --- a/fs/erofs/pcpubuf.c +++ b/fs/erofs/pcpubuf.c @@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) { static DEFINE_MUTEX(pcb_resize_mutex); static unsigned int pcb_nrpages; - LIST_HEAD(pagepool); + struct page *pagepool = NULL; int delta, cpu, ret, i; mutex_lock(&pcb_resize_mutex); @@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) vunmap(old_ptr); free_pagearray: while (i) - list_add(&oldpages[--i]->lru, &pagepool); + erofs_pagepool_add(&pagepool, oldpages[--i]); kfree(oldpages); if (ret) break; } pcb_nrpages = nrpages; - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); out: mutex_unlock(&pcb_resize_mutex); return ret; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index bd86067a63f7..84da2c280012 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -6,20 +6,29 @@ #include "internal.h" #include -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { - struct page *page; + struct page *page = *pagepool; - if (!list_empty(pool)) { - page = lru_to_page(pool); + if (page) { DBG_BUGON(page_ref_count(page) != 1); - list_del(&page->lru); + *pagepool = (struct page *)page_private(page); } else { page = alloc_page(gfp); } return page; } +void erofs_release_pages(struct 
page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + put_page(page); + } +} + #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d55e6215cd44..bcb1b91b234f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + struct page **pagepool) { struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; @@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) { + if (page) put_page(page); - } else if (newpage) { - set_page_private(newpage, 0); - list_add(&newpage->lru, pagepool); - } + else if (newpage) + erofs_pagepool_add(pagepool, newpage); } /* @@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct list_head *pagepool) + struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct list_head *pagepool) + struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); struct z_erofs_pagevec_ctor ctor; @@ -1036,7 +1034,7 @@ out: } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct list_head *pagepool) + struct page **pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); - LIST_HEAD(pagepool); + struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, - struct list_head *pagepool, + struct page **pagepool, struct address_space *mc, gfp_t gfp) { @@ -1173,7 +1171,7 @@ repeat: out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } @@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, + struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { @@ -1365,7 +1363,7 @@ submit_bio_retry: static void z_erofs_runqueue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, bool force_fg) + struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; @@ -1394,7 +1392,7 @@ static void z_erofs_runqueue(struct super_block *sb, static void z_erofs_pcluster_readmore(struct 
z_erofs_decompress_frontend *f, struct readahead_control *rac, erofs_off_t end, - struct list_head *pagepool, + struct page **pagepool, bool backmost) { struct inode *inode = f->inode; @@ -1457,8 +1455,8 @@ static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + struct page *pagepool = NULL; int err; - LIST_HEAD(pagepool); trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; @@ -1479,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) if (f.map.mpage) put_page(f.map.mpage); - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); return err; } @@ -1489,9 +1486,8 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *page, *head = NULL; + struct page *pagepool = NULL, *head = NULL, *page; unsigned int nr_pages; - LIST_HEAD(pagepool); f.readahead = true; f.headoffset = readahead_pos(rac); @@ -1528,9 +1524,7 @@ static void z_erofs_readahead(struct readahead_control *rac) nr_pages <= sbi->opt.max_sync_decompress_pages); if (f.map.mpage) put_page(f.map.mpage); - - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); } const struct address_space_operations z_erofs_aops = { From a0961f351d82d43ab0b845304caa235dfe249ae9 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 25 Oct 2021 15:43:11 +0800 Subject: [PATCH 15/15] erofs: don't trigger WARN() when decompression fails syzbot reported a WARNING [1] due to corrupted compressed data. As Dmitry said, "If this is not a kernel bug, then the code should not use WARN. WARN if for kernel bugs and is recognized as such by all testing systems and humans." [1] https://lore.kernel.org/r/000000000000b3586105cf0ff45e@google.com Link: https://lore.kernel.org/r/20211025074311.130395-1-hsiangkao@linux.alibaba.com Cc: Dmitry Vyukov Reviewed-by: Chao Yu Reported-by: syzbot+d8aaffc3719597e8cfb4@syzkaller.appspotmail.com Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index a0786b95cdf9..bf37fc76b182 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -227,7 +227,6 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); - WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, 16, 1, src + inputmargin, rq->inputsize, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,