diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index c4c6dcdc89ad..5eead7fdc7a6 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -52,8 +52,8 @@ struct erofs_sb_info { struct list_head list; struct mutex umount_mutex; - /* the dedicated workstation for compression */ - struct radix_tree_root workstn_tree; + /* managed XArray arranged in physical block number */ + struct xarray managed_pslots; /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; @@ -402,8 +402,8 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr) int erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, pgoff_t index); -int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp); void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 057e6d7b5b7f..b514c67e5fc2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -425,7 +425,7 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~SB_POSIXACL; #ifdef CONFIG_EROFS_FS_ZIP - INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); + xa_init(&sbi->managed_pslots); #endif /* get the root inode */ diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index fddc5059c930..4396b424373f 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -37,9 +37,6 @@ void *erofs_get_pcpubuf(unsigned int pagenr) /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; -#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) -#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) - static int erofs_workgroup_get(struct erofs_workgroup *grp) { int o; @@ -66,7 +63,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, repeat: rcu_read_lock(); - grp = radix_tree_lookup(&sbi->workstn_tree, index); + grp = xa_load(&sbi->managed_pslots, index); if (grp) { if (erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ @@ -80,43 +77,37 @@ repeat: return grp; } -int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp) +struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, + struct erofs_workgroup *grp) { - struct erofs_sb_info *sbi; - int err; - - /* grp shouldn't be broken or used before */ - if (atomic_read(&grp->refcount) != 1) { - DBG_BUGON(1); - return -EINVAL; - } - - err = radix_tree_preload(GFP_NOFS); - if (err) - return err; - - sbi = EROFS_SB(sb); - xa_lock(&sbi->workstn_tree); + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct erofs_workgroup *pre; /* - * Bump up reference count before making this workgroup - * visible to other users in order to avoid potential UAF - * without serialized by workstn_lock. + * Bump up a reference count before making this visible + * to others for the XArray in order to avoid potential + * UAF without serialized by xa_lock. */ - __erofs_workgroup_get(grp); + atomic_inc(&grp->refcount); - err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); - if (err) - /* - * it's safe to decrease since the workgroup isn't visible - * and refcount >= 2 (cannot be freezed). - */ - __erofs_workgroup_put(grp); - - xa_unlock(&sbi->workstn_tree); - radix_tree_preload_end(); - return err; +repeat: + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, + NULL, grp, GFP_NOFS); + if (pre) { + if (xa_is_err(pre)) { + pre = ERR_PTR(xa_err(pre)); + } else if (erofs_workgroup_get(pre)) { + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); + goto repeat; + } + atomic_dec(&grp->refcount); + grp = pre; + } + xa_unlock(&sbi->managed_pslots); + return grp; } static void __erofs_workgroup_free(struct erofs_workgroup *grp) @@ -155,7 +146,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, /* * Note that all cached pages should be unattached - * before deleted from the radix tree. Otherwise some + * before deleted from the XArray. Otherwise some * cached pages could be still attached to the orphan * old workgroup when the new one is available in the tree. */ @@ -169,7 +160,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ - DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp); + DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp); /* * If managed cache is on, last refcount should indicate @@ -182,22 +173,11 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, unsigned long nr_shrink) { - pgoff_t first_index = 0; - void *batch[PAGEVEC_SIZE]; + struct erofs_workgroup *grp; unsigned int freed = 0; + unsigned long index; - int i, found; -repeat: - xa_lock(&sbi->workstn_tree); - - found = radix_tree_gang_lookup(&sbi->workstn_tree, - batch, first_index, PAGEVEC_SIZE); - - for (i = 0; i < found; ++i) { - struct erofs_workgroup *grp = batch[i]; - - first_index = grp->index + 1; - + xa_for_each(&sbi->managed_pslots, index, grp) { /* try to shrink each valid workgroup */ if (!erofs_try_to_release_workgroup(sbi, grp)) continue; @@ -206,10 +186,6 @@ repeat: if (!--nr_shrink) break; } - xa_unlock(&sbi->workstn_tree); - - if (i && nr_shrink) - goto repeat; return freed; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 80e47f07d946..c4b6c9aa87ec 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -67,16 +67,6 @@ static void z_erofs_pcluster_init_once(void *ptr) pcl->compressed_pages[i] = NULL; } -static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl) -{ - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - - atomic_set(&pcl->obj.refcount, 1); - - DBG_BUGON(cl->nr_pages); - DBG_BUGON(cl->vcnt); -} - int __init z_erofs_init_zip_subsystem(void) { pcluster_cachep = kmem_cache_create("erofs_compress", @@ -341,26 +331,19 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { - struct erofs_workgroup *grp; - struct z_erofs_pcluster *pcl; + struct z_erofs_pcluster *pcl = clt->pcl; struct z_erofs_collection *cl; unsigned int length; - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); - if (!grp) - return -ENOENT; - - pcl = container_of(grp, struct z_erofs_pcluster, obj); + /* to avoid unexpected loop formed by corrupted images */ if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } cl = z_erofs_primarycollection(pcl); if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } @@ -368,7 +351,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } } else { @@ -391,7 +373,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) clt->tailpcl = NULL; - clt->pcl = pcl; clt->cl = cl; return 0; } @@ -402,6 +383,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, { struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; + struct erofs_workgroup *grp; int err; /* no available workgroup, let's allocate one */ @@ -409,7 +391,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, if (!pcl) return -ENOMEM; - z_erofs_pcluster_init_always(pcl); + atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | @@ -429,19 +411,29 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, clt->mode = COLLECT_PRIMARY_FOLLOWED; cl = z_erofs_primarycollection(pcl); + + /* must be cleaned before freeing to slab */ + DBG_BUGON(cl->nr_pages); + DBG_BUGON(cl->vcnt); + cl->pageofs = map->m_la & ~PAGE_MASK; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. */ - mutex_trylock(&cl->lock); + DBG_BUGON(!mutex_trylock(&cl->lock)); - err = erofs_register_workgroup(inode->i_sb, &pcl->obj); - if (err) { - mutex_unlock(&cl->lock); - kmem_cache_free(pcluster_cachep, pcl); - return -EAGAIN; + grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto err_out; + } + + if (grp != &pcl->obj) { + clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); + err = -EEXIST; + goto err_out; } /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -450,12 +442,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, clt->pcl = pcl; clt->cl = cl; return 0; + +err_out: + mutex_unlock(&cl->lock); + kmem_cache_free(pcluster_cachep, pcl); + return err; } static int z_erofs_collector_begin(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_workgroup *grp; int ret; DBG_BUGON(clt->cl); @@ -469,21 +467,25 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, return -EINVAL; } -repeat: - ret = z_erofs_lookup_collection(clt, inode, map); - if (ret == -ENOENT) { + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); + if (grp) { + clt->pcl = container_of(grp, struct z_erofs_pcluster, obj); + } else { ret = z_erofs_register_collection(clt, inode, map); - /* someone registered at the same time, give another try */ - if (ret == -EAGAIN) { - cond_resched(); - goto repeat; - } + if (!ret) + goto out; + if (ret != -EEXIST) + return ret; } - if (ret) + ret = z_erofs_lookup_collection(clt, inode, map); + if (ret) { + erofs_workgroup_put(&clt->pcl->obj); return ret; + } +out: z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt);