From 9ed50b8231e37b1ae863f5dec8153b98d9f389b4 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 9 Sep 2024 11:19:11 +0800 Subject: [PATCH 01/15] erofs: fix incorrect symlink detection in fast symlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fast symlink can be used if the on-disk symlink data is stored in the same block as the on-disk inode, so we don’t need to trigger another I/O for symlink data. However, currently fs correction could be reported _incorrectly_ if inode xattrs are too large. In fact, these should be valid images although they cannot be handled as fast symlinks. Many thanks to Colin for reporting this! Reported-by: Colin Walters Reported-by: https://honggfuzz.dev/ Link: https://lore.kernel.org/r/bb2dd430-7de0-47da-ae5b-82ab2dd4d945@app.fastmail.com Fixes: 431339ba9042 ("staging: erofs: add inode operations") [ Note that it's a runtime misbehavior instead of a security issue. ] Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240909031911.1174718-1-hsiangkao@linux.alibaba.com --- fs/erofs/inode.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 419432be3223..4e3ea6689cb4 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -178,12 +178,14 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); - unsigned int bsz = i_blocksize(inode); + loff_t off; char *lnk; - /* if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= bsz || inode->i_size < 0) { + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -192,16 +194,6 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, if (!lnk) return -ENOMEM; - m_pofs += vi->xattr_isize; - /* inline symlink data shouldn't cross block boundary */ - if (m_pofs + inode->i_size > bsz) { - kfree(lnk); - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } memcpy(lnk, kaddr + m_pofs, inode->i_size); lnk[inode->i_size] = '\0'; From 59aadaa7ebafbc57e642d772cfc02c2b907e5b89 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Aug 2024 17:52:32 +0800 Subject: [PATCH 02/15] erofs: clean up erofs_register_sysfs() After commit 684b290abc77 ("erofs: add support for FS_IOC_GETFSSYSFSPATH"), `sb->s_sysfs_name` is now valid. Just use it to get rid of duplicated logic. Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240828095232.571946-1-hsiangkao@linux.alibaba.com --- fs/erofs/super.c | 2 +- fs/erofs/sysfs.c | 30 ++++++------------------------ 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6cb5c8916174..aae3fd15899a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -644,7 +644,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - erofs_set_sysfs_name(sb); #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); @@ -682,6 +681,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + erofs_set_sysfs_name(sb); err = erofs_register_sysfs(sb); if (err) return err; diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 435e515c0792..63cffd0fd261 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -205,34 +205,16 @@ static struct kobject erofs_feat = { int erofs_register_sysfs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - char *name; - char *str = NULL; int err; - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) { - str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, - sbi->fsid); - if (!str) - return -ENOMEM; - name = str; - } else { - name = sbi->fsid; - } - } else { - name = sb->s_id; - } sbi->s_kobj.kset = &erofs_root; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name); - kfree(str); - if (err) - goto put_sb_kobj; - return 0; - -put_sb_kobj: - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", + sb->s_sysfs_name); + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + } return err; } From 3fc3e45fcdeaad4b7660b560fcbc827eb733f58e Mon Sep 17 00:00:00 2001 From: Sandeep Dhavale Date: Wed, 4 Sep 2024 23:00:25 -0700 Subject: [PATCH 03/15] erofs: fix error handling in z_erofs_init_decompressor If we get a failure at the first decompressor init (i = 0), the clean up while loop could enter infinite loop due to wrong while check. Check the value of i now to see if we need any clean up at all. Fixes: 5a7cce827ee9 ("erofs: refine z_erofs_{init,exit}_subsystem()") Reported-by: liujinbao1 Signed-off-by: Sandeep Dhavale Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240905060027.2388893-1-dhavale@google.com Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index c2253b6a5416..eb318c7ddd80 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -539,7 +539,7 @@ int __init z_erofs_init_decompressor(void) for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) { err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0; if (err) { - while (--i) + while (i--) if (z_erofs_decomp[i]) z_erofs_decomp[i]->exit(); return err; From 9e2f9d34dd12e6e5b244ec488bcebd0c2d566c50 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 10 Sep 2024 15:08:47 +0800 Subject: [PATCH 04/15] erofs: handle overlapped pclusters out of crafted images properly syzbot reported a task hang issue due to a deadlock case where it is waiting for the folio lock of a cached folio that will be used for cache I/Os. After looking into the crafted fuzzed image, I found it's formed with several overlapped big pclusters as below: Ext: logical offset | length : physical offset | length 0: 0.. 16384 | 16384 : 151552.. 167936 | 16384 1: 16384.. 32768 | 16384 : 155648.. 172032 | 16384 2: 32768.. 49152 | 16384 : 537223168.. 537239552 | 16384 ... Here, extent 0/1 are physically overlapped although it's entirely _impossible_ for normal filesystem images generated by mkfs. First, managed folios containing compressed data will be marked as up-to-date and then unlocked immediately (unlike in-place folios) when compressed I/Os are complete. If physical blocks are not submitted in the incremental order, there should be separate BIOs to avoid dependency issues. However, the current code mis-arranges z_erofs_fill_bio_vec() and BIO submission which causes unexpected BIO waits. Second, managed folios will be connected to their own pclusters for efficient inter-queries. However, this is somewhat hard to implement easily if overlapped big pclusters exist. Again, these only appear in fuzzed images so let's simply fall back to temporary short-lived pages for correctness. Additionally, it justifies that referenced managed folios cannot be truncated for now and reverts part of commit 2080ca1ed3e4 ("erofs: tidy up `struct z_erofs_bvec`") for simplicity although it shouldn't be any difference. Reported-by: syzbot+4fc98ed414ae63d1ada2@syzkaller.appspotmail.com Reported-by: syzbot+de04e06b28cfecf2281c@syzkaller.appspotmail.com Reported-by: syzbot+c8c8238b394be4a1087d@syzkaller.appspotmail.com Tested-by: syzbot+4fc98ed414ae63d1ada2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/0000000000002fda01061e334873@google.com Fixes: 8e6c8fa9f2e9 ("erofs: enable big pcluster feature") Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240910070847.3356592-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 71 ++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 424f656cd765..a0bae499c5ff 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1428,6 +1428,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, struct z_erofs_bvec zbv; struct address_space *mapping; struct folio *folio; + struct page *page; int bs = i_blocksize(f->inode); /* Except for inplace folios, the entire folio can be used for I/Os */ @@ -1450,7 +1451,6 @@ repeat: * file-backed folios will be used instead. */ if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { - folio->private = 0; tocache = true; goto out_tocache; } @@ -1468,7 +1468,7 @@ repeat: } folio_lock(folio); - if (folio->mapping == mc) { + if (likely(folio->mapping == mc)) { /* * The cached folio is still in managed cache but without * a valid `->private` pcluster hint. Let's reconnect them. @@ -1478,41 +1478,44 @@ repeat: /* compressed_bvecs[] already takes a ref before */ folio_put(folio); } - - /* no need to submit if it is already up-to-date */ - if (folio_test_uptodate(folio)) { - folio_unlock(folio); - bvec->bv_page = NULL; + if (likely(folio->private == pcl)) { + /* don't submit cache I/Os again if already uptodate */ + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + bvec->bv_page = NULL; + } + return; } - return; + /* + * Already linked with another pcluster, which only appears in + * crafted images by fuzzers for now. But handle this anyway. + */ + tocache = false; /* use temporary short-lived pages */ + } else { + DBG_BUGON(1); /* referenced managed folios can't be truncated */ + tocache = true; } - - /* - * It has been truncated, so it's unsafe to reuse this one. Let's - * allocate a new page for compressed data. - */ - DBG_BUGON(folio->mapping); - tocache = true; folio_unlock(folio); folio_put(folio); out_allocfolio: - zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); spin_lock(&pcl->obj.lockref.lock); - if (pcl->compressed_bvecs[nr].page) { - erofs_pagepool_add(&f->pagepool, zbv.page); + if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { + erofs_pagepool_add(&f->pagepool, page); spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } - bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page; - folio = page_folio(zbv.page); - /* first mark it as a temporary shortlived folio (now 1 ref) */ - folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; + bvec->bv_page = pcl->compressed_bvecs[nr].page = page; + folio = page_folio(page); spin_unlock(&pcl->obj.lockref.lock); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) + filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived folio (1 ref) */ + folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; + } folio_attach_private(folio, pcl); /* drop a refcount added by allocpage (then 2 refs in total here) */ folio_put(folio); @@ -1647,13 +1650,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, cur = mdev.m_pa; end = cur + pcl->pclustersize; do { - z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); - if (!bvec.bv_page) - continue; - + bvec.bv_page = NULL; if (bio && (cur != last_pa || bio->bi_bdev != mdev.m_bdev)) { -io_retry: +drain_io: if (!erofs_is_fscache_mode(sb)) submit_bio(bio); else @@ -1666,6 +1666,15 @@ io_retry: bio = NULL; } + if (!bvec.bv_page) { + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) + continue; + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + DBG_BUGON(bvec.bv_len < sb->s_blocksize); + } + if (unlikely(PageWorkingset(bvec.bv_page)) && !memstall) { psi_memstall_enter(&pflags); @@ -1685,13 +1694,9 @@ io_retry: ++nr_bios; } - if (cur + bvec.bv_len > end) - bvec.bv_len = end - cur; - DBG_BUGON(bvec.bv_len < sb->s_blocksize); if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset)) - goto io_retry; - + goto drain_io; last_pa = cur + bvec.bv_len; bypass = false; } while ((cur += bvec.bv_len) < end); From fb176750266a3d7f42ebdcf28e8ba40350b27847 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 30 Aug 2024 11:28:37 +0800 Subject: [PATCH 05/15] erofs: add file-backed mount support It actually has been around for years: For containers and other sandbox use cases, there will be thousands (and even more) of authenticated (sub)images running on the same host, unlike OS images. Of course, all scenarios can use the same EROFS on-disk format, but bdev-backed mounts just work well for OS images since golden data is dumped into real block devices. However, it's somewhat hard for container runtimes to manage and isolate so many unnecessary virtual block devices safely and efficiently [1]: they just look like a burden to orchestrators and file-backed mounts are preferred indeed. There were already enough attempts such as Incremental FS, the original ComposeFS and PuzzleFS acting in the same way for immutable fses. As for current EROFS users, ComposeFS, containerd and Android APEXs will be directly benefited from it. On the other hand, previous experimental feature "erofs over fscache" was once also intended to provide a similar solution (inspired by Incremental FS discussion [2]), but the following facts show file-backed mounts will be a better approach: - Fscache infrastructure has recently been moved into new Netfslib which is an unexpected dependency to EROFS really, although it originally claims "it could be used for caching other things such as ISO9660 filesystems too." [3] - It takes an unexpectedly long time to upstream Fscache/Cachefiles enhancements. For example, the failover feature took more than one year, and the deamonless feature is still far behind now; - Ongoing HSM "fanotify pre-content hooks" [4] together with this will perfectly supersede "erofs over fscache" in a simpler way since developers (mainly containerd folks) could leverage their existing caching mechanism entirely in userspace instead of strictly following the predefined in-kernel caching tree hierarchy. After "fanotify pre-content hooks" lands upstream to provide the same functionality, "erofs over fscache" will be removed then (as an EROFS internal improvement and EROFS will not have to bother with on-demand fetching and/or caching improvements anymore.) [1] https://github.com/containers/storage/pull/2039 [2] https://lore.kernel.org/r/CAOQ4uxjbVxnubaPjVaGYiSwoGDTdpWbB=w_AeM6YM=zVixsUfQ@mail.gmail.com [3] https://docs.kernel.org/filesystems/caching/fscache.html [4] https://lore.kernel.org/r/cover.1723670362.git.josef@toxicpanda.com Closes: https://github.com/containers/composefs/issues/144 Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240830032840.3783206-1-hsiangkao@linux.alibaba.com --- fs/erofs/Kconfig | 17 ++++++++++ fs/erofs/data.c | 35 ++++++++++++--------- fs/erofs/inode.c | 5 ++- fs/erofs/internal.h | 11 +++++-- fs/erofs/super.c | 76 +++++++++++++++++++++++++++++---------------- 5 files changed, 100 insertions(+), 44 deletions(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 7dcdce660cac..1428d0530e1c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -74,6 +74,23 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. +config EROFS_FS_BACKED_BY_FILE + bool "File-backed EROFS filesystem support" + depends on EROFS_FS + default y + help + This allows EROFS to use filesystem image files directly, without + the intercession of loopback block devices or likewise. It is + particularly useful for container images with numerous blobs and + other sandboxes, where loop devices behave intricately. It can also + be used to simplify error-prone lifetime management of unnecessary + virtual block devices. + + Note that this feature, along with ongoing fanotify pre-content + hooks, will eventually replace "EROFS over fscache." + + If you don't want to enable this feature, say N. + config EROFS_FS_ZIP bool "EROFS Data Compression Support" depends on EROFS_FS diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1b7eba38ba1e..0fb31c588ae0 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -59,8 +59,12 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { - if (erofs_is_fscache_mode(sb)) - buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping; + struct erofs_sb_info *sbi = EROFS_SB(sb); + + if (erofs_is_fileio_mode(sbi)) + buf->mapping = file_inode(sbi->fdev)->i_mapping; + else if (erofs_is_fscache_mode(sb)) + buf->mapping = sbi->s_fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -189,10 +193,22 @@ out: return err; } +static void erofs_fill_from_devinfo(struct erofs_map_dev *map, + struct erofs_device_info *dif) +{ + map->m_bdev = NULL; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); + map->m_daxdev = dif->dax_dev; + map->m_dax_part_off = dif->dax_part_off; + map->m_fscache = dif->fscache; +} + int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; + erofs_off_t startoff, length; int id; map->m_bdev = sb->s_bdev; @@ -212,29 +228,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - erofs_off_t startoff, length; - if (!dif->mapped_blkaddr) continue; + startoff = erofs_pos(sb, dif->mapped_blkaddr); length = erofs_pos(sb, dif->blocks); - if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_file ? - file_bdev(dif->bdev_file) : NULL; - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + erofs_fill_from_devinfo(map, dif); break; } } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 4e3ea6689cb4..83a14b55327f 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -250,7 +250,10 @@ static int erofs_fill_inode(struct inode *inode) } mapping_set_large_folios(inode->i_mapping); - if (erofs_inode_is_data_compressed(vi->datalayout)) { + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { + /* XXX: data I/Os will be implemented in the following patches */ + err = -EOPNOTSUPP; + } else if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 45dc15ebd870..9bf4fb1cfa09 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct file *bdev_file; + struct file *file; struct dax_device *dax_dev; u64 dax_part_off; @@ -130,6 +130,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; struct dax_device *dax_dev; @@ -190,9 +191,15 @@ struct erofs_sb_info { #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) +static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) +{ + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; +} + static inline bool erofs_is_fscache_mode(struct super_block *sb) { - return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev; + return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && + !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev; } enum { diff --git a/fs/erofs/super.c b/fs/erofs/super.c index aae3fd15899a..9a7e67eceed4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -161,7 +162,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct file *bdev_file; + struct file *file; dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); if (IS_ERR(dis)) @@ -183,13 +184,17 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ, - sb->s_type, NULL); - if (IS_ERR(bdev_file)) - return PTR_ERR(bdev_file); - dif->bdev_file = bdev_file; - dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file), - &dif->dax_part_off, NULL, NULL); + file = erofs_is_fileio_mode(sbi) ? + filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : + bdev_file_open_by_path(dif->path, + BLK_OPEN_READ, sb->s_type, NULL); + if (IS_ERR(file)) + return PTR_ERR(file); + + dif->file = file; + if (!erofs_is_fileio_mode(sbi)) + dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), + &dif->dax_part_off, NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -566,15 +571,16 @@ static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (erofs_is_fscache_mode(sb)) { - if (sbi->domain_id) - super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id, - sbi->fsid); - else - super_set_sysfs_name_generic(sb, "%s", sbi->fsid); - return; - } - super_set_sysfs_name_id(sb); + if (sbi->domain_id) + super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, + sbi->fsid); + else if (sbi->fsid) + super_set_sysfs_name_generic(sb, "%s", sbi->fsid); + else if (erofs_is_fileio_mode(sbi)) + super_set_sysfs_name_generic(sb, "%s", + bdi_dev_name(sb->s_bdi)); + else + super_set_sysfs_name_id(sb); } static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) @@ -589,14 +595,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sbi->blkszbits = PAGE_SHIFT; - if (erofs_is_fscache_mode(sb)) { + if (!sb->s_bdev) { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - err = erofs_fscache_register_fs(sb); - if (err) - return err; - + if (erofs_is_fscache_mode(sb)) { + err = erofs_fscache_register_fs(sb); + if (err) + return err; + } err = super_setup_bdi(sb); if (err) return err; @@ -693,11 +700,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; + int ret; if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - return get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev(fc, erofs_fc_fill_super); +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (ret == -ENOTBLK) { + if (!fc->source) + return invalf(fc, "No source specified"); + sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(sbi->fdev)) + return PTR_ERR(sbi->fdev); + + return get_tree_nodev(fc, erofs_fc_fill_super); + } +#endif + return ret; } static int erofs_fc_reconfigure(struct fs_context *fc) @@ -727,8 +747,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev_file) - fput(dif->bdev_file); + if (dif->file) + fput(dif->file); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -791,7 +811,7 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) kill_anon_super(sb); else kill_block_super(sb); @@ -801,6 +821,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_fscache_unregister_fs(sb); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->fdev) + fput(sbi->fdev); kfree(sbi); sb->s_fs_info = NULL; } @@ -903,7 +925,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; if (uuid_is_null(&sb->s_uuid)) - buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 : + buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 : huge_encode_dev(sb->s_bdev->bd_dev)); else buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); From ce63cb62d794c98c7631c2296fa845f2a8d0a4a1 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 5 Sep 2024 17:30:31 +0800 Subject: [PATCH 06/15] erofs: support unencoded inodes for fileio Since EROFS only needs to handle read requests in simple contexts, Just directly use vfs_iocb_iter_read() for data I/Os. Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240905093031.2745929-1-hsiangkao@linux.alibaba.com --- fs/erofs/Makefile | 1 + fs/erofs/data.c | 50 ++++++++++++- fs/erofs/fileio.c | 178 ++++++++++++++++++++++++++++++++++++++++++++ fs/erofs/inode.c | 17 +++-- fs/erofs/internal.h | 7 +- fs/erofs/zdata.c | 46 ++---------- 6 files changed, 248 insertions(+), 51 deletions(-) create mode 100644 fs/erofs/fileio.c diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 097d672e6b14..4331d53c7109 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0fb31c588ae0..b4c07ce7a294 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -132,7 +132,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; - map->m_plen = 0; + map->m_plen = map->m_llen; goto out; } @@ -197,8 +197,13 @@ static void erofs_fill_from_devinfo(struct erofs_map_dev *map, struct erofs_device_info *dif) { map->m_bdev = NULL; - if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) - map->m_bdev = file_bdev(dif->file); + map->m_fp = NULL; + if (dif->file) { + if (S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); + else + map->m_fp = dif->file; + } map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -215,6 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) map->m_daxdev = EROFS_SB(sb)->dax_dev; map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; map->m_fscache = EROFS_SB(sb)->s_fscache; + map->m_fp = EROFS_SB(sb)->fdev; if (map->m_deviceid) { down_read(&devs->rwsem); @@ -250,6 +256,42 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return 0; } +/* + * bit 30: I/O error occurred on this folio + * bit 0 - 29: remaining parts to complete this folio + */ +#define EROFS_ONLINEFOLIO_EIO (1 << 30) + +void erofs_onlinefolio_init(struct folio *folio) +{ + union { + atomic_t o; + void *v; + } u = { .o = ATOMIC_INIT(1) }; + + folio->private = u.v; /* valid only if file-backed folio is locked */ +} + +void erofs_onlinefolio_split(struct folio *folio) +{ + atomic_inc((atomic_t *)&folio->private); +} + +void erofs_onlinefolio_end(struct folio *folio, int err) +{ + int orig, v; + + do { + orig = atomic_read((atomic_t *)&folio->private); + v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); + + if (v & ~EROFS_ONLINEFOLIO_EIO) + return; + folio->private = 0; + folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { @@ -399,7 +441,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { +const struct address_space_operations erofs_aops = { .read_folio = erofs_read_folio, .readahead = erofs_readahead, .bmap = erofs_bmap, diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c new file mode 100644 index 000000000000..42b346593bf5 --- /dev/null +++ b/fs/erofs/fileio.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include "internal.h" +#include + +struct erofs_fileio_rq { + struct bio_vec bvecs[BIO_MAX_VECS]; + struct bio bio; + struct kiocb iocb; +}; + +struct erofs_fileio { + struct erofs_map_blocks map; + struct erofs_map_dev dev; + struct erofs_fileio_rq *rq; +}; + +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) +{ + struct erofs_fileio_rq *rq = + container_of(iocb, struct erofs_fileio_rq, iocb); + struct folio_iter fi; + + DBG_BUGON(rq->bio.bi_end_io); + if (ret > 0) { + if (ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); + } + ret = 0; + } + bio_for_each_folio_all(fi, &rq->bio) { + DBG_BUGON(folio_test_uptodate(fi.folio)); + erofs_onlinefolio_end(fi.folio, ret); + } + bio_uninit(&rq->bio); + kfree(rq); +} + +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) +{ + struct iov_iter iter; + int ret; + + if (!rq) + return; + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; + rq->iocb.ki_ioprio = get_current_ioprio(); + rq->iocb.ki_complete = erofs_fileio_ki_complete; + rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? + IOCB_DIRECT : 0; + iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, + rq->bio.bi_iter.bi_size); + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + if (ret != -EIOCBQUEUED) + erofs_fileio_ki_complete(&rq->iocb, ret); +} + +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) +{ + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), + GFP_KERNEL | __GFP_NOFAIL); + + bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); + rq->iocb.ki_filp = mdev->m_fp; + return rq; +} + +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) +{ + struct inode *inode = folio_inode(folio); + struct erofs_map_blocks *map = &io->map; + unsigned int cur = 0, end = folio_size(folio), len, attached = 0; + loff_t pos = folio_pos(folio), ofs; + struct iov_iter iter; + struct bio_vec bv; + int err = 0; + + erofs_onlinefolio_init(folio); + while (cur < end) { + if (!in_range(pos + cur, map->m_la, map->m_llen)) { + map->m_la = pos + cur; + map->m_llen = end - cur; + err = erofs_map_blocks(inode, map); + if (err) + break; + } + + ofs = folio_pos(folio) + cur - map->m_la; + len = min_t(loff_t, map->m_llen - ofs, end - cur); + if (map->m_flags & EROFS_MAP_META) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + void *src; + + src = erofs_read_metabuf(&buf, inode->i_sb, + map->m_pa + ofs, EROFS_KMAP); + if (IS_ERR(src)) { + err = PTR_ERR(src); + break; + } + bvec_set_folio(&bv, folio, len, cur); + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); + if (copy_to_iter(src, len, &iter) != len) { + erofs_put_metabuf(&buf); + err = -EIO; + break; + } + erofs_put_metabuf(&buf); + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { + folio_zero_segment(folio, cur, cur + len); + attached = 0; + } else { + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || + map->m_deviceid != io->dev.m_deviceid)) { +io_retry: + erofs_fileio_rq_submit(io->rq); + io->rq = NULL; + } + + if (!io->rq) { + io->dev = (struct erofs_map_dev) { + .m_pa = io->map.m_pa + ofs, + .m_deviceid = io->map.m_deviceid, + }; + err = erofs_map_dev(inode->i_sb, &io->dev); + if (err) + break; + io->rq = erofs_fileio_rq_alloc(&io->dev); + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + attached = 0; + } + if (!attached++) + erofs_onlinefolio_split(folio); + if (!bio_add_folio(&io->rq->bio, folio, len, cur)) + goto io_retry; + io->dev.m_pa += len; + } + cur += len; + } + erofs_onlinefolio_end(folio, err); + return err; +} + +static int erofs_fileio_read_folio(struct file *file, struct folio *folio) +{ + struct erofs_fileio io = {}; + int err; + + trace_erofs_read_folio(folio, true); + err = erofs_fileio_scan_folio(&io, folio); + erofs_fileio_rq_submit(io.rq); + return err; +} + +static void erofs_fileio_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct erofs_fileio io = {}; + struct folio *folio; + int err; + + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), true); + while ((folio = readahead_folio(rac))) { + err = erofs_fileio_scan_folio(&io, folio); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); + } + erofs_fileio_rq_submit(io.rq); +} + +const struct address_space_operations erofs_fileio_aops = { + .read_folio = erofs_fileio_read_folio, + .readahead = erofs_fileio_readahead, +}; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 83a14b55327f..f8eab339417d 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -250,11 +250,14 @@ static int erofs_fill_inode(struct inode *inode) } mapping_set_large_folios(inode->i_mapping); - if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { - /* XXX: data I/Os will be implemented in the following patches */ - err = -EOPNOTSUPP; - } else if (erofs_inode_is_data_compressed(vi->datalayout)) { + if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { + err = -EOPNOTSUPP; + goto out_unlock; + } +#endif DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); @@ -263,10 +266,14 @@ static int erofs_fill_inode(struct inode *inode) err = -EOPNOTSUPP; #endif } else { - inode->i_mapping->a_ops = &erofs_raw_access_aops; + inode->i_mapping->a_ops = &erofs_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; +#endif +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; #endif } out_unlock: diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9bf4fb1cfa09..9bc4dcfd06d7 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -372,6 +372,7 @@ struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; struct dax_device *m_daxdev; + struct file *m_fp; u64 m_dax_part_off; erofs_off_t m_pa; @@ -380,7 +381,8 @@ struct erofs_map_dev { extern const struct super_operations erofs_sops; -extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations erofs_aops; +extern const struct address_space_operations erofs_fileio_aops; extern const struct address_space_operations z_erofs_aops; extern const struct address_space_operations erofs_fscache_access_aops; @@ -411,6 +413,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); +void erofs_onlinefolio_init(struct folio *folio); +void erofs_onlinefolio_split(struct folio *folio); +void erofs_onlinefolio_end(struct folio *folio, int err); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a0bae499c5ff..3371dcb549dc 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -122,42 +122,6 @@ static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) return fo->mapping == MNGD_MAPPING(sbi); } -/* - * bit 30: I/O error occurred on this folio - * bit 0 - 29: remaining parts to complete this folio - */ -#define Z_EROFS_FOLIO_EIO (1 << 30) - -static void z_erofs_onlinefolio_init(struct folio *folio) -{ - union { - atomic_t o; - void *v; - } u = { .o = ATOMIC_INIT(1) }; - - folio->private = u.v; /* valid only if file-backed folio is locked */ -} - -static void z_erofs_onlinefolio_split(struct folio *folio) -{ - atomic_inc((atomic_t *)&folio->private); -} - -static void z_erofs_onlinefolio_end(struct folio *folio, int err) -{ - int orig, v; - - do { - orig = atomic_read((atomic_t *)&folio->private); - v = (orig - 1) | (err ? Z_EROFS_FOLIO_EIO : 0); - } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); - - if (v & ~Z_EROFS_FOLIO_EIO) - return; - folio->private = 0; - folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO)); -} - #define Z_EROFS_ONSTACK_PAGES 32 /* @@ -965,7 +929,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, int err = 0; tight = (bs == PAGE_SIZE); - z_erofs_onlinefolio_init(folio); + erofs_onlinefolio_init(folio); do { if (offset + end - 1 < map->m_la || offset + end - 1 >= map->m_la + map->m_llen) { @@ -1024,7 +988,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, if (err) break; - z_erofs_onlinefolio_split(folio); + erofs_onlinefolio_split(folio); if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) f->pcl->multibases = true; if (f->pcl->length < offset + end - map->m_la) { @@ -1044,7 +1008,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f, tight = (bs == PAGE_SIZE); } } while ((end = cur) > 0); - z_erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err); return err; } @@ -1147,7 +1111,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); list_del(p); kfree(bvi); } @@ -1302,7 +1266,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - z_erofs_onlinefolio_end(page_folio(page), err); + erofs_onlinefolio_end(page_folio(page), err); continue; } if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { From 283213718f5d618dfe88d16a3c63a077a12f15ec Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 30 Aug 2024 11:28:39 +0800 Subject: [PATCH 07/15] erofs: support compressed inodes for fileio Use pseudo bios just like the previous fscache approach since merged bio_vecs can be filled properly with unique interfaces. Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240830032840.3783206-3-hsiangkao@linux.alibaba.com --- fs/erofs/fileio.c | 22 ++++++++++++++++++---- fs/erofs/inode.c | 6 ------ fs/erofs/internal.h | 8 ++++++++ fs/erofs/zdata.c | 27 +++++++++++++++++---------- 4 files changed, 43 insertions(+), 20 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 42b346593bf5..3af96b1e2c2a 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -23,7 +23,6 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) container_of(iocb, struct erofs_fileio_rq, iocb); struct folio_iter fi; - DBG_BUGON(rq->bio.bi_end_io); if (ret > 0) { if (ret != rq->bio.bi_iter.bi_size) { bio_advance(&rq->bio, ret); @@ -31,9 +30,13 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) } ret = 0; } - bio_for_each_folio_all(fi, &rq->bio) { - DBG_BUGON(folio_test_uptodate(fi.folio)); - erofs_onlinefolio_end(fi.folio, ret); + if (rq->bio.bi_end_io) { + rq->bio.bi_end_io(&rq->bio); + } else { + bio_for_each_folio_all(fi, &rq->bio) { + DBG_BUGON(folio_test_uptodate(fi.folio)); + erofs_onlinefolio_end(fi.folio, ret); + } } bio_uninit(&rq->bio); kfree(rq); @@ -68,6 +71,17 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) return rq; } +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) +{ + return &erofs_fileio_rq_alloc(mdev)->bio; +} + +void erofs_fileio_submit_bio(struct bio *bio) +{ + return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq, + bio)); +} + static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) { struct inode *inode = folio_inode(folio); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index f8eab339417d..b9d57d42a158 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -252,12 +252,6 @@ static int erofs_fill_inode(struct inode *inode) mapping_set_large_folios(inode->i_mapping); if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP -#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE - if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { - err = -EOPNOTSUPP; - goto out_unlock; - } -#endif DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9bc4dcfd06d7..4efd578d7c62 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -489,6 +489,14 @@ static inline void z_erofs_exit_subsystem(void) {} static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE +struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev); +void erofs_fileio_submit_bio(struct bio *bio); +#else +static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; } +static inline void erofs_fileio_submit_bio(struct bio *bio) {} +#endif + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3371dcb549dc..dca11ab0ab75 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1618,10 +1618,12 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, if (bio && (cur != last_pa || bio->bi_bdev != mdev.m_bdev)) { drain_io: - if (!erofs_is_fscache_mode(sb)) - submit_bio(bio); - else + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); + else + submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); @@ -1646,10 +1648,13 @@ drain_io: } if (!bio) { - bio = erofs_is_fscache_mode(sb) ? - erofs_fscache_bio_alloc(&mdev) : - bio_alloc(mdev.m_bdev, BIO_MAX_VECS, - REQ_OP_READ, GFP_NOIO); + if (erofs_is_fileio_mode(EROFS_SB(sb))) + bio = erofs_fileio_bio_alloc(&mdev); + else if (erofs_is_fscache_mode(sb)) + bio = erofs_fscache_bio_alloc(&mdev); + else + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_endio; bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; @@ -1672,10 +1677,12 @@ drain_io: } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { - if (!erofs_is_fscache_mode(sb)) - submit_bio(bio); - else + if (erofs_is_fileio_mode(EROFS_SB(sb))) + erofs_fileio_submit_bio(bio); + else if (erofs_is_fscache_mode(sb)) erofs_fscache_submit_bio(bio); + else + submit_bio(bio); if (memstall) psi_memstall_leave(&pflags); } From 0d442ce0b3027e99491520a3a8585e8c4ffd79dc Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 30 Aug 2024 11:28:40 +0800 Subject: [PATCH 08/15] erofs: mark experimental fscache backend deprecated Although fscache is still described as "General Filesystem Caching" for network filesystems and other things such as ISO9660 filesystems, it has actually become a part of netfslib recently, which was unexpected at the time when "EROFS over fscache" proposed (2021) since EROFS is entirely a disk filesystem and the dependency is redundant. Mark it deprecated and it will be removed after "fanotify pre-content hooks" lands, which will provide the same functionality for EROFS. Reviewed-by: Sandeep Dhavale Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240830032840.3783206-4-hsiangkao@linux.alibaba.com --- fs/erofs/Kconfig | 5 ++++- fs/erofs/super.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 1428d0530e1c..6ea60661fa55 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -145,7 +145,7 @@ config EROFS_FS_ZIP_ZSTD If unsure, say N. config EROFS_FS_ONDEMAND - bool "EROFS fscache-based on-demand read support" + bool "EROFS fscache-based on-demand read support (deprecated)" depends on EROFS_FS select NETFS_SUPPORT select FSCACHE @@ -155,6 +155,9 @@ config EROFS_FS_ONDEMAND This permits EROFS to use fscache-backed data blobs with on-demand read support. + It is now deprecated and scheduled to be removed from the kernel + after fanotify pre-content hooks are landed. + If unsure, say N. config EROFS_FS_PCPU_KTHREAD diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 9a7e67eceed4..666873f745da 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -353,7 +353,7 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_scan_devices(sb, dsb); if (erofs_is_fscache_mode(sb)) - erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); + erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; From b1bbb9a637a329873e14b596b7e6fa2fd44b87b4 Mon Sep 17 00:00:00 2001 From: Yiyang Wu Date: Mon, 2 Sep 2024 16:31:46 +0800 Subject: [PATCH 09/15] erofs: use kmemdup_nul in erofs_fill_symlink Remove open coding in erofs_fill_symlink. Suggested-by: Al Viro Link: https://lore.kernel.org/all/20240425222847.GN2118490@ZenIV Signed-off-by: Yiyang Wu Link: https://lore.kernel.org/r/20240902083147.450558-2-toolmanp@tlmp.cc Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index b9d57d42a158..4ced52b29f5f 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -179,7 +179,6 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, { struct erofs_inode *vi = EROFS_I(inode); loff_t off; - char *lnk; m_pofs += vi->xattr_isize; /* check if it cannot be handled with fast symlink scheme */ @@ -190,14 +189,9 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, return 0; } - lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); - if (!lnk) + inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); + if (!inode->i_link) return -ENOMEM; - - memcpy(lnk, kaddr + m_pofs, inode->i_size); - lnk[inode->i_size] = '\0'; - - inode->i_link = lnk; inode->i_op = &erofs_fast_symlink_iops; return 0; } From 53d514b970106976fd64f593ea13b55ebf26b3ff Mon Sep 17 00:00:00 2001 From: Yiyang Wu Date: Mon, 2 Sep 2024 17:34:12 +0800 Subject: [PATCH 10/15] erofs: refactor read_inode calling convention Refactor out the iop binding behavior out of the erofs_fill_symlink and move erofs_buf into the erofs_read_inode, so that erofs_fill_inode can only deal with inode operation bindings and can be decoupled from metabuf operations. This results in better calling conventions. Note that after this patch, we do not need erofs_buf and ofs as parameters any more when calling erofs_read_inode as all the data operations are now included in itself. Suggested-by: Al Viro Link: https://lore.kernel.org/all/20240425222847.GN2118490@ZenIV/ Signed-off-by: Yiyang Wu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240902093412.509083-1-toolmanp@tlmp.cc Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 111 ++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 59 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 4ced52b29f5f..7a98df664814 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -8,8 +8,24 @@ #include -static void *erofs_read_inode(struct erofs_buf *buf, - struct inode *inode, unsigned int *ofs) +static int erofs_fill_symlink(struct inode *inode, void *kaddr, + unsigned int m_pofs) +{ + struct erofs_inode *vi = EROFS_I(inode); + loff_t off; + + m_pofs += vi->xattr_isize; + /* check if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 || + check_add_overflow(m_pofs, inode->i_size, &off) || + off > i_blocksize(inode)) + return 0; + + inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); + return inode->i_link ? 0 : -ENOMEM; +} + +static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -20,20 +36,21 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; union erofs_inode_i_u iu; - unsigned int ifmt; - int err; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int ifmt, ofs; + int err = 0; blkaddr = erofs_blknr(sb, inode_loc); - *ofs = erofs_blkoff(sb, inode_loc); + ofs = erofs_blkoff(sb, inode_loc); - kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", vi->nid, PTR_ERR(kaddr)); - return kaddr; + return PTR_ERR(kaddr); } - dic = kaddr + *ofs; + dic = kaddr + ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u of nid %llu", @@ -54,11 +71,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= sb->s_blocksize) { - *ofs += vi->inode_isize; + if (ofs + vi->inode_isize <= sb->s_blocksize) { + ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = sb->s_blocksize - *ofs; + const unsigned int gotten = sb->s_blocksize - ofs; copied = kmalloc(vi->inode_isize, GFP_KERNEL); if (!copied) { @@ -66,16 +83,16 @@ static void *erofs_read_inode(struct erofs_buf *buf, goto err_out; } memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr + 1), + kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), EROFS_KMAP); if (IS_ERR(kaddr)) { erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", vi->nid, PTR_ERR(kaddr)); kfree(copied); - return kaddr; + return PTR_ERR(kaddr); } - *ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, *ofs); + ofs = vi->inode_isize - gotten; + memcpy((u8 *)copied + gotten, kaddr, ofs); die = copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); @@ -91,11 +108,10 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_size = le64_to_cpu(die->i_size); kfree(copied); - copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); - *ofs += vi->inode_isize; + ofs += vi->inode_isize; vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); @@ -120,6 +136,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, case S_IFDIR: case S_IFLNK: vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + if(S_ISLNK(inode->i_mode)) { + err = erofs_fill_symlink(inode, kaddr, ofs); + if (err) + goto err_out; + } break; case S_IFCHR: case S_IFBLK: @@ -165,51 +186,24 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); - return kaddr; err_out: - DBG_BUGON(1); - kfree(copied); - erofs_put_metabuf(buf); - return ERR_PTR(err); -} - -static int erofs_fill_symlink(struct inode *inode, void *kaddr, - unsigned int m_pofs) -{ - struct erofs_inode *vi = EROFS_I(inode); - loff_t off; - - m_pofs += vi->xattr_isize; - /* check if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 || - check_add_overflow(m_pofs, inode->i_size, &off) || - off > i_blocksize(inode)) { - inode->i_op = &erofs_symlink_iops; - return 0; - } - - inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); - if (!inode->i_link) - return -ENOMEM; - inode->i_op = &erofs_fast_symlink_iops; - return 0; + DBG_BUGON(err); + erofs_put_metabuf(&buf); + return err; } static int erofs_fill_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - void *kaddr; - unsigned int ofs; - int err = 0; + int err; trace_erofs_fill_inode(inode); /* read inode base data from disk */ - kaddr = erofs_read_inode(&buf, inode, &ofs); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); + err = erofs_read_inode(inode); + if (err) + return err; /* setup the new inode */ switch (inode->i_mode & S_IFMT) { @@ -226,9 +220,10 @@ static int erofs_fill_inode(struct inode *inode) inode_nohighmem(inode); break; case S_IFLNK: - err = erofs_fill_symlink(inode, kaddr, ofs); - if (err) - goto out_unlock; + if (inode->i_link) + inode->i_op = &erofs_fast_symlink_iops; + else + inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; case S_IFCHR: @@ -237,10 +232,9 @@ static int erofs_fill_inode(struct inode *inode) case S_IFSOCK: inode->i_op = &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); - goto out_unlock; + return 0; default: - err = -EFSCORRUPTED; - goto out_unlock; + return -EFSCORRUPTED; } mapping_set_large_folios(inode->i_mapping); @@ -264,8 +258,7 @@ static int erofs_fill_inode(struct inode *inode) inode->i_mapping->a_ops = &erofs_fileio_aops; #endif } -out_unlock: - erofs_put_metabuf(&buf); + return err; } From 8bdb6a8393dc32e3ab2cf89081e5b0f95cb7221b Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Thu, 5 Sep 2024 11:03:39 +0800 Subject: [PATCH 11/15] erofs: simplify erofs_map_blocks_flatmode() Get rid of redundant variables (nblocks, offset) and a dead branch (!tailendpacking). Signed-off-by: Hongzhen Luo Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240905030339.1474396-1-hongzhen@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index b4c07ce7a294..61debd799cf9 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -79,38 +79,28 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, static int erofs_map_blocks_flatmode(struct inode *inode, struct erofs_map_blocks *map) { - erofs_blk_t nblocks, lastblk; - u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - nblocks = erofs_iblks(inode); - lastblk = nblocks - tailendpacking; - - /* there is no hole in flatmode */ - map->m_flags = EROFS_MAP_MAPPED; - if (offset < erofs_pos(sb, lastblk)) { + map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ + if (map->m_la < erofs_pos(sb, lastblk)) { map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - offset; - } else if (tailendpacking) { + map->m_plen = erofs_pos(sb, lastblk) - map->m_la; + } else { + DBG_BUGON(!tailendpacking); map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, offset); - map->m_plen = inode->i_size - offset; + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_plen = inode->i_size - map->m_la; /* inline data should be located in the same meta block */ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data cross block boundary @ nid %llu", - vi->nid); + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; - } else { - erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", - vi->nid, inode->i_size, map->m_la); - DBG_BUGON(1); - return -EIO; } return 0; } From 2349d2fa02db19ebc5e9033ddc3ed09e22c4abb5 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 5 Sep 2024 16:47:32 +0800 Subject: [PATCH 12/15] erofs: sunset unneeded NOFAILs With iterative development, our codebase can now deal with compressed buffer misses properly if both in-place I/O and compressed buffer allocation fail. Note that if readahead fails (with non-uptodate folios), the original request will then fall back to synchronous read, and `.read_folio()` should return appropriate errnos; otherwise -EIO will be passed to user space, which is unexpected. To simplify rarely encountered failure paths, a mimic decompression will be just used. Before that, failure reasons are recorded in compressed_bvecs[] and they also act as placeholders to avoid in-place pages. They will be parsed just before decompression and then pass back to `.read_folio()`. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240905084732.2684515-1-hsiangkao@linux.alibaba.com --- fs/erofs/zdata.c | 57 ++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index dca11ab0ab75..b720dba14edd 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1154,9 +1154,10 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; - /* compressed data ought to be valid before decompressing */ - if (!page) { - err = -EIO; + /* compressed data ought to be valid when decompressing */ + if (IS_ERR(page) || !page) { + bvec->page = NULL; /* clear the failure reason */ + err = page ? PTR_ERR(page) : -EIO; continue; } be->compressed_pages[i] = page; @@ -1232,8 +1233,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, - .gfp = pcl->besteffort ? - GFP_KERNEL | __GFP_NOFAIL : + .gfp = pcl->besteffort ? GFP_KERNEL : GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); @@ -1297,8 +1297,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, return err; } -static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct page **pagepool) +static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct page **pagepool) { struct z_erofs_decompress_backend be = { .sb = io->sb, @@ -1307,6 +1307,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, LIST_HEAD_INIT(be.decompressed_secondary_bvecs), }; z_erofs_next_pcluster_t owned = io->head; + int err = io->eio ? -EIO : 0; while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); @@ -1314,12 +1315,13 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, be.pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + err = z_erofs_decompress_pcluster(&be, err) ?: err; if (z_erofs_is_inline_pcluster(be.pcl)) z_erofs_free_pcluster(be.pcl); else erofs_workgroup_put(&be.pcl->obj); } + return err; } static void z_erofs_decompressqueue_work(struct work_struct *work) @@ -1462,17 +1464,21 @@ repeat: folio_unlock(folio); folio_put(folio); out_allocfolio: - page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + page = erofs_allocpage(&f->pagepool, gfp); spin_lock(&pcl->obj.lockref.lock); if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { - erofs_pagepool_add(&f->pagepool, page); + if (page) + erofs_pagepool_add(&f->pagepool, page); spin_unlock(&pcl->obj.lockref.lock); cond_resched(); goto repeat; } - bvec->bv_page = pcl->compressed_bvecs[nr].page = page; - folio = page_folio(page); + pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM); spin_unlock(&pcl->obj.lockref.lock); + bvec->bv_page = page; + if (!page) + return; + folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) { @@ -1698,26 +1704,28 @@ drain_io: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - bool force_fg, bool ra) +static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f, + unsigned int ra_folios) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); + bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios); + int err; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) - return; - z_erofs_submit_queue(f, io, &force_fg, ra); + return 0; + z_erofs_submit_queue(f, io, &force_fg, !!ra_folios); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); - + err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) - return; + return err; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); + return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err; } /* @@ -1779,7 +1787,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; @@ -1791,9 +1798,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); - /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - + /* if some pclusters are ready, need submit them anyway */ + err = z_erofs_runqueue(&f, 0) ?: err; if (err && err != -EINTR) erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", err, folio->index, EROFS_I(inode)->nid); @@ -1806,7 +1812,6 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct folio *head = NULL, *folio; unsigned int nr_folios; @@ -1836,7 +1841,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); + (void)z_erofs_runqueue(&f, nr_folios); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } From 79f504a2cd3c0b7d953d0015618a2a41559a2cfd Mon Sep 17 00:00:00 2001 From: Chunhai Guo Date: Fri, 6 Sep 2024 06:11:10 -0600 Subject: [PATCH 13/15] erofs: allocate more short-lived pages from reserved pool first This patch aims to allocate bvpages and short-lived compressed pages from the reserved pool first. After applying this patch, there are three benefits. 1. It reduces the page allocation time. The bvpages and short-lived compressed pages account for about 4% of the pages allocated from the system in the multi-app launch benchmarks [1]. It reduces the page allocation time accordingly and lowers the likelihood of blockage by page allocation in low memory scenarios. 2. The pages in the reserved pool will be allocated on demand. Currently, bvpages and short-lived compressed pages are short-lived pages allocated from the system, and the pages in the reserved pool all originate from short-lived pages. Consequently, the number of reserved pool pages will increase to z_erofs_rsv_nrpages over time. With this patch, all short-lived pages are allocated from the reserved pool first, so the number of reserved pool pages will only increase when there are not enough pages. Thus, even if z_erofs_rsv_nrpages is set to a large number for specific reasons, the actual number of reserved pool pages may remain low as per demand. In the multi-app launch benchmarks [1], z_erofs_rsv_nrpages is set at 256, while the number of reserved pool pages remains below 64. 3. When erofs cache decompression is disabled (EROFS_ZIP_CACHE_DISABLED), all pages will *only* be allocated from the reserved pool for erofs. This will significantly reduce the memory pressure from erofs. [1] For additional details on the multi-app launch benchmarks, please refer to commit 0f6273ab4637 ("erofs: add a reserved buffer pool for lz4 decompression"). Signed-off-by: Chunhai Guo Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20240906121110.3701889-1-guochunhai@vivo.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b720dba14edd..8936790618c6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -196,7 +196,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = erofs_allocpage(pagepool, GFP_KERNEL); + nextpage = __erofs_allocpage(pagepool, GFP_KERNEL, + true); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -1464,7 +1465,7 @@ repeat: folio_unlock(folio); folio_put(folio); out_allocfolio: - page = erofs_allocpage(&f->pagepool, gfp); + page = __erofs_allocpage(&f->pagepool, gfp, true); spin_lock(&pcl->obj.lockref.lock); if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { if (page) From 7c3ca1838a7831855cbf2e6927a10e0e4723edf6 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Sep 2024 15:41:56 +0800 Subject: [PATCH 14/15] erofs: restrict pcluster size limitations Error out if {en,de}encoded size of a pcluster is unsupported: Maximum supported encoded size (of a pcluster): 1 MiB Maximum supported decoded size (of a pcluster): 12 MiB Users can still choose to use supported large configurations (e.g., for archival purposes), but there may be performance penalties in low-memory scenarios compared to smaller pclusters. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240912074156.2925394-1-hsiangkao@linux.alibaba.com --- fs/erofs/erofs_fs.h | 5 ++++- fs/erofs/zmap.c | 42 ++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 6c0c270c42e1..c8f2ae845bd2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -288,9 +288,12 @@ struct erofs_dirent { #define EROFS_NAME_LEN 255 -/* maximum supported size of a physical compression cluster */ +/* maximum supported encoded size of a physical compressed cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) +/* maximum supported decoded size of a physical compressed cluster */ +#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 403af6e31d5b..e980e29873a5 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -687,32 +687,30 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int err = 0; trace_erofs_map_blocks_enter(inode, map, flags); - - /* when trying to read beyond EOF, leave it unmapped */ - if (map->m_la >= inode->i_size) { + if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */ map->m_llen = map->m_la + 1 - inode->i_size; map->m_la = inode->i_size; map->m_flags = 0; - goto out; + } else { + err = z_erofs_fill_inode_lazy(inode); + if (!err) { + if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + } else { + err = z_erofs_do_map_blocks(inode, map, flags); + } + } + if (!err && (map->m_flags & EROFS_MAP_ENCODED) && + unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + err = -EOPNOTSUPP; + if (err) + map->m_llen = 0; } - - err = z_erofs_fill_inode_lazy(inode); - if (err) - goto out; - - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED | - EROFS_MAP_FRAGMENT; - goto out; - } - - err = z_erofs_do_map_blocks(inode, map, flags); -out: - if (err) - map->m_llen = 0; trace_erofs_map_blocks_exit(inode, map, flags, err); return err; } From 025497e1d176a9e063d1e60699527e2f3a871935 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 12 Sep 2024 16:35:38 +0800 Subject: [PATCH 15/15] erofs: reject inodes with negative i_size Negative i_size is never supported, although crafted images with inodes having negative i_size will NOT lead to security issues in our current codebase: The following image can verify this (gzip+base64 encoded): H4sICCmk4mYAA3Rlc3QuaW1nAGNgGAWjYBSMVPDo4dcH3jP2aTED2TwMKgxMUHHNJY/SQDQX LxcDIw3tZwXit44MDNpQ/n8gQJZ/vxjijosPuSyZ0DUDgQqcZoKzVYFsDShbHeh6PT29ktTi Eqz2g/y2pBFiLxDMh4lhs5+W4TAKRsEoGAWjYBSMglEwCkYBPQAAS2DbowAQAAA= Mark as bad inodes for such corrupted inodes explicitly. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240912083538.3011860-1-hsiangkao@linux.alibaba.com --- fs/erofs/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 7a98df664814..db29190656eb 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -5,7 +5,6 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" - #include static int erofs_fill_symlink(struct inode *inode, void *kaddr, @@ -16,7 +15,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, m_pofs += vi->xattr_isize; /* check if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || inode->i_size < 0 || + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || check_add_overflow(m_pofs, inode->i_size, &off) || off > i_blocksize(inode)) return 0; @@ -131,6 +130,11 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } + if (unlikely(inode->i_size < 0)) { + erofs_err(sb, "negative i_size @ nid %llu", vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } switch (inode->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: @@ -186,7 +190,6 @@ static int erofs_read_inode(struct inode *inode) inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); - err_out: DBG_BUGON(err); erofs_put_metabuf(&buf);