From 6da1b4b1ab36d80a3994fd4811c8381de10af604 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <darrick.wong@oracle.com> Date: Fri, 22 Jan 2021 16:48:32 -0800 Subject: [PATCH 01/85] xfs: fix an ABBA deadlock in xfs_rename When overlayfs is running on top of xfs and the user unlinks a file in the overlay, overlayfs will create a whiteout inode and ask xfs to "rename" the whiteout file atop the one being unlinked. If the file being unlinked loses its one nlink, we then have to put the inode on the unlinked list. This requires us to grab the AGI buffer of the whiteout inode to take it off the unlinked list (which is where whiteouts are created) and to grab the AGI buffer of the file being deleted. If the whiteout was created in a higher numbered AG than the file being deleted, we'll lock the AGIs in the wrong order and deadlock. Therefore, grab all the AGI locks we think we'll need ahead of time, and in order of increasing AG number per the locking rules. Reported-by: wenli xie <wlxie7296@gmail.com> Fixes: 93597ae8dac0 ("xfs: Fix deadlock between AGI and AGF when target_ip exists in xfs_rename()") Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_dir2.h | 2 -- fs/xfs/libxfs/xfs_dir2_sf.c | 2 +- fs/xfs/xfs_inode.c | 42 ++++++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index e55378640b05..d03e6098ded9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -47,8 +47,6 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); -extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp, - xfs_ino_t inum); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 2463b5d73447..8c4f76bba88b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -1018,7 +1018,7 @@ xfs_dir2_sf_removename( /* * Check whether the sf dir replace operation need more blocks. */ -bool +static bool xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b7352bc4c815..e5dc41b10ebb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3017,7 +3017,7 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; - struct xfs_buf *agibp; + int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); @@ -3130,6 +3130,30 @@ xfs_rename( } } + /* + * Lock the AGI buffers we need to handle bumping the nlink of the + * whiteout inode off the unlinked list and to handle dropping the + * nlink of the target inode. Per locking order rules, do this in + * increasing AG order and before directory block allocation tries to + * grab AGFs because we grab AGIs before AGFs. + * + * The (vfs) caller must ensure that if src is a directory then + * target_ip is either null or an empty directory. + */ + for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { + if (inodes[i] == wip || + (inodes[i] == target_ip && + (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { + struct xfs_buf *bp; + xfs_agnumber_t agno; + + agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); + error = xfs_read_agi(mp, tp, agno, &bp); + if (error) + goto out_trans_cancel; + } + } + /* * Directory entry creation below may acquire the AGF. Remove * the whiteout from the unlinked list first to preserve correct @@ -3182,22 +3206,6 @@ xfs_rename( * In case there is already an entry with the same * name at the destination directory, remove it first. */ - - /* - * Check whether the replace operation will need to allocate - * blocks. This happens when the shortform directory lacks - * space and we have to convert it to a block format directory. - * When more blocks are necessary, we must lock the AGI first - * to preserve locking order (AGI -> AGF). - */ - if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) { - error = xfs_read_agi(mp, tp, - XFS_INO_TO_AGNO(mp, target_ip->i_ino), - &agibp); - if (error) - goto out_trans_cancel; - } - error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) From b9b7e1dc56c5ca8d6fc37c410b054e9f26737d2e Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:10 -0800 Subject: [PATCH 02/85] xfs: Add helper for checking per-inode extent count overflow XFS does not check for possible overflow of per-inode extent counter fields when adding extents to either data or attr fork. For e.g. 1. Insert 5 million xattrs (each having a value size of 255 bytes) and then delete 50% of them in an alternating manner. 2. On a 4k block sized XFS filesystem instance, the above causes 98511 extents to be created in the attr fork of the inode. xfsaild/loop0 2008 [003] 1475.127209: probe:xfs_inode_to_disk: (ffffffffa43fb6b0) if_nextents=98511 i_ino=131 3. The incore inode fork extent counter is a signed 32-bit quantity. However the on-disk extent counter is an unsigned 16-bit quantity and hence cannot hold 98511 extents. 4. The following incorrect value is stored in the attr extent counter, # xfs_db -f -c 'inode 131' -c 'print core.naextents' /dev/loop0 core.naextents = -32561 This commit adds a new helper function (i.e. xfs_iext_count_may_overflow()) to check for overflow of the per-inode data and xattr extent counters. Future patches will use this function to make sure that an FS operation won't cause the extent counter to overflow. Suggested-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_inode_fork.c | 23 +++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.h | 2 ++ 2 files changed, 25 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7575de5cecb1..8d48716547e5 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -23,6 +23,7 @@ #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" #include "xfs_attr_leaf.h" +#include "xfs_types.h" kmem_zone_t *xfs_ifork_zone; @@ -728,3 +729,25 @@ xfs_ifork_verify_local_attr( return 0; } + +int +xfs_iext_count_may_overflow( + struct xfs_inode *ip, + int whichfork, + int nr_to_add) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + uint64_t max_exts; + uint64_t nr_exts; + + if (whichfork == XFS_COW_FORK) + return 0; + + max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + + nr_exts = ifp->if_nextents + nr_to_add; + if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + return -EFBIG; + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index a4953e95c4f3..0beb8e2a00be 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -172,5 +172,7 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip); int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); +int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, + int nr_to_add); #endif /* __XFS_INODE_FORK_H__ */ From 727e1acd297cae15449607d6e2ee39c71216cf1a Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:11 -0800 Subject: [PATCH 03/85] xfs: Check for extent overflow when trivally adding a new extent When adding a new data extent (without modifying an inode's existing extents) the extent count increases only by 1. This commit checks for extent count overflow in such cases. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_bmap.c | 6 ++++++ fs/xfs/libxfs/xfs_inode_fork.h | 6 ++++++ fs/xfs/xfs_bmap_item.c | 7 +++++++ fs/xfs/xfs_bmap_util.c | 5 +++++ fs/xfs/xfs_dquot.c | 8 +++++++- fs/xfs/xfs_iomap.c | 5 +++++ fs/xfs/xfs_rtalloc.c | 5 +++++ 7 files changed, 41 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bc446418e227..32aeacf6f055 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4527,6 +4527,12 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_may_overflow(ip, whichfork, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + xfs_trans_ijoin(tp, ip, 0); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 0beb8e2a00be..7fc2b129a2e7 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -34,6 +34,12 @@ struct xfs_ifork { #define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ #define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +/* + * Worst-case increase in the fork extent count when we're adding a single + * extent to a fork and there's no possibility of splitting an existing mapping. + */ +#define XFS_IEXT_ADD_NOSPLIT_CNT (1) + /* * Fork handling. */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 93e4d8ae6e92..0534304ed0a7 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -508,6 +508,13 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + if (bui_type == XFS_BMAP_MAP) { + error = xfs_iext_count_may_overflow(ip, whichfork, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto err_cancel; + } + count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 7371a7f7c652..db44bfaabe88 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -822,6 +822,11 @@ xfs_alloc_file_space( if (error) goto error1; + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto error0; + xfs_trans_ijoin(tp, ip, 0); error = xfs_bmapi_write(tp, ip, startoffset_fsb, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1d95ed387d66..175f544f7c45 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -314,8 +314,14 @@ xfs_dquot_disk_alloc( return -ESRCH; } - /* Create the block mapping. */ xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + return error; + + /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7b9ff824e82d..32246d90b9cb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -250,6 +250,11 @@ xfs_iomap_write_direct( if (error) goto out_trans_cancel; + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_res_cancel; + xfs_trans_ijoin(tp, ip, 0); /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b4999fb01ff7..161b0e8992ba 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -804,6 +804,11 @@ xfs_growfs_rt_alloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + /* * Allocate blocks to the bitmap file. */ From 85ef08b5a667615bc7be5058259753dc42a7adcd Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:11 -0800 Subject: [PATCH 04/85] xfs: Check for extent overflow when punching a hole The extent mapping the file offset at which a hole has to be inserted will be split into two extents causing extent count to increase by 1. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_inode_fork.h | 7 +++++++ fs/xfs/xfs_bmap_item.c | 15 +++++++++------ fs/xfs/xfs_bmap_util.c | 10 ++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fc2b129a2e7..bcac769a7df6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -40,6 +40,13 @@ struct xfs_ifork { */ #define XFS_IEXT_ADD_NOSPLIT_CNT (1) +/* + * Punching out an extent from the middle of an existing extent can cause the + * extent count to increase by 1. + * i.e. | Old extent | Hole | Old extent | + */ +#define XFS_IEXT_PUNCH_HOLE_CNT (1) + /* * Fork handling. */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 0534304ed0a7..2344757ede63 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -471,6 +471,7 @@ xfs_bui_item_recover( xfs_exntst_t state; unsigned int bui_type; int whichfork; + int iext_delta; int error = 0; if (!xfs_bui_validate(mp, buip)) { @@ -508,12 +509,14 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (bui_type == XFS_BMAP_MAP) { - error = xfs_iext_count_may_overflow(ip, whichfork, - XFS_IEXT_ADD_NOSPLIT_CNT); - if (error) - goto err_cancel; - } + if (bui_type == XFS_BMAP_MAP) + iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; + else + iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; + + error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + if (error) + goto err_cancel; count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index db44bfaabe88..6ac7a6ac2658 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -891,6 +891,11 @@ xfs_unmap_extent( xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_PUNCH_HOLE_CNT); + if (error) + goto out_trans_cancel; + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done); if (error) goto out_trans_cancel; @@ -1168,6 +1173,11 @@ xfs_insert_file_space( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_PUNCH_HOLE_CNT); + if (error) + goto out_trans_cancel; + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at From f5d92749191402c50e32ac83dd9da3b910f5680f Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:12 -0800 Subject: [PATCH 05/85] xfs: Check for extent overflow when adding dir entries Directory entry addition can cause the following, 1. Data block can be added/removed. A new extent can cause extent count to increase by 1. 2. Free disk block can be added/removed. Same behaviour as described above for Data block. 3. Dabtree blocks. XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_inode_fork.h | 13 +++++++++++++ fs/xfs/xfs_inode.c | 10 ++++++++++ fs/xfs/xfs_symlink.c | 5 +++++ 3 files changed, 28 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index bcac769a7df6..ea1a9dd8a763 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -47,6 +47,19 @@ struct xfs_ifork { */ #define XFS_IEXT_PUNCH_HOLE_CNT (1) +/* + * Directory entry addition can cause the following, + * 1. Data block can be added/removed. + * A new extent can cause extent count to increase by 1. + * 2. Free disk block can be added/removed. + * Same behaviour as described above for Data block. + * 3. Dabtree blocks. + * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new + * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. + */ +#define XFS_IEXT_DIR_MANIP_CNT(mp) \ + ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) + /* * Fork handling. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e5dc41b10ebb..3cb41b5d5a26 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1042,6 +1042,11 @@ xfs_create( if (error) goto out_trans_cancel; + error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); + if (error) + goto out_trans_cancel; + /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry @@ -1258,6 +1263,11 @@ xfs_link( xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); + if (error) + goto error_return; + /* * If we are using project inheritance, we only allow hard link * creation in our tree when the project IDs are the same; else diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 1f43fd7f3209..0b8136a32484 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -220,6 +220,11 @@ xfs_symlink( if (error) goto out_trans_cancel; + error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); + if (error) + goto out_trans_cancel; + /* * Allocate an inode for the symlink. */ From 0dbc5cb1a91cc8c44b1c75429f5b9351837114fd Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:12 -0800 Subject: [PATCH 06/85] xfs: Check for extent overflow when removing dir entries Directory entry removal must always succeed; Hence XFS does the following during low disk space scenario: 1. Data/Free blocks linger until a future remove operation. 2. Dabtree blocks would be swapped with the last block in the leaf space and then the new last block will be unmapped. This facility is reused during low inode extent count scenario i.e. this commit causes xfs_bmap_del_extent_real() to return -ENOSPC error code so that the above mentioned behaviour is exercised causing no change to the directory's extent count. Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_bmap.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 32aeacf6f055..6c8f17a0e247 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5151,6 +5151,24 @@ xfs_bmap_del_extent_real( /* * Deleting the middle of the extent. */ + + /* + * For directories, -ENOSPC is returned since a directory entry + * remove operation must not fail due to low extent count + * availability. -ENOSPC will be handled by higher layers of XFS + * by letting the corresponding empty Data/Free blocks to linger + * until a future remove operation. Dabtree blocks would be + * swapped with the last block in the leaf space and then the + * new last block will be unmapped. + */ + error = xfs_iext_count_may_overflow(ip, whichfork, 1); + if (error) { + ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && + whichfork == XFS_DATA_FORK); + error = -ENOSPC; + goto done; + } + old = got; got.br_blockcount = del->br_startoff - got.br_startoff; From 02092a2f034fdeabab524ae39c2de86ba9ffa15a Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:13 -0800 Subject: [PATCH 07/85] xfs: Check for extent overflow when renaming dir entries A rename operation is essentially a directory entry remove operation from the perspective of parent directory (i.e. src_dp) of rename's source. Hence the only place where we check for extent count overflow for src_dp is in xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns -ENOSPC when it detects a possible extent count overflow and in response, the higher layers of directory handling code do the following: 1. Data/Free blocks: XFS lets these blocks linger until a future remove operation removes them. 2. Dabtree blocks: XFS swaps the blocks with the last block in the Leaf space and unmaps the last block. For target_dp, there are two cases depending on whether the destination directory entry exists or not. When destination directory entry does not exist (i.e. target_ip == NULL), extent count overflow check is performed only when transaction has a non-zero sized space reservation associated with it. With a zero-sized space reservation, XFS allows a rename operation to continue only when the directory has sufficient free space in its data/leaf/free space blocks to hold the new entry. When destination directory entry exists (i.e. target_ip != NULL), all we need to do is change the inode number associated with the already existing entry. Hence there is no need to perform an extent count overflow check. Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_inode.c | 44 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6c8f17a0e247..8ebe5f13279c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5160,6 +5160,9 @@ xfs_bmap_del_extent_real( * until a future remove operation. Dabtree blocks would be * swapped with the last block in the leaf space and then the * new last block will be unmapped. + * + * The above logic also applies to the source directory entry of + * a rename operation. */ error = xfs_iext_count_may_overflow(ip, whichfork, 1); if (error) { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cb41b5d5a26..8ebd9c64aa48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3116,6 +3116,35 @@ xfs_rename( /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. + * + * Extent count overflow check: + * + * From the perspective of src_dp, a rename operation is essentially a + * directory entry remove operation. Hence the only place where we check + * for extent count overflow for src_dp is in + * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns + * -ENOSPC when it detects a possible extent count overflow and in + * response, the higher layers of directory handling code do the + * following: + * 1. Data/Free blocks: XFS lets these blocks linger until a + * future remove operation removes them. + * 2. Dabtree blocks: XFS swaps the blocks with the last block in the + * Leaf space and unmaps the last block. + * + * For target_dp, there are two cases depending on whether the + * destination directory entry exists or not. + * + * When destination directory entry does not exist (i.e. target_ip == + * NULL), extent count overflow check is performed only when transaction + * has a non-zero sized space reservation associated with it. With a + * zero-sized space reservation, XFS allows a rename operation to + * continue only when the directory has sufficient free space in its + * data/leaf/free space blocks to hold the new entry. + * + * When destination directory entry exists (i.e. target_ip != NULL), all + * we need to do is change the inode number associated with the already + * existing entry. Hence there is no need to perform an extent count + * overflow check. */ if (target_ip == NULL) { /* @@ -3126,6 +3155,12 @@ xfs_rename( error = xfs_dir_canenter(tp, target_dp, target_name); if (error) goto out_trans_cancel; + } else { + error = xfs_iext_count_may_overflow(target_dp, + XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); + if (error) + goto out_trans_cancel; } } else { /* @@ -3291,9 +3326,16 @@ xfs_rename( if (wip) { error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); - } else + } else { + /* + * NOTE: We don't need to check for extent count overflow here + * because the dir remove name code will leave the dir block in + * place if the extent count would overflow. + */ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); + } + if (error) goto out_trans_cancel; From 3a19bb147c72d2e9b77137bf5130b9cfb50a5eef Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:13 -0800 Subject: [PATCH 08/85] xfs: Check for extent overflow when adding/removing xattrs Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to be added. One extra extent for dabtree in case a local attr is large enough to cause a double split. It can also cause extent count to increase proportional to the size of a remote xattr's value. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_attr.c | 13 +++++++++++++ fs/xfs/libxfs/xfs_inode_fork.h | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fd8e6418a0d3..be51e7068dcd 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -396,6 +396,7 @@ xfs_attr_set( struct xfs_trans_res tres; bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; + int rmt_blks = 0; unsigned int total; if (XFS_FORCED_SHUTDOWN(dp->i_mount)) @@ -442,11 +443,15 @@ xfs_attr_set( tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; total = args->total; + + if (!local) + rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); tres = M_RES(mp)->tr_attrrm; total = XFS_ATTRRM_SPACE_RES(mp); + rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } /* @@ -460,6 +465,14 @@ xfs_attr_set( xfs_ilock(dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(args->trans, dp, 0); + + if (args->value || xfs_inode_hasattr(dp)) { + error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error) + goto out_trans_cancel; + } + if (args->value) { unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index ea1a9dd8a763..8d89838e23f8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -60,6 +60,16 @@ struct xfs_ifork { #define XFS_IEXT_DIR_MANIP_CNT(mp) \ ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) +/* + * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to + * be added. One extra extent for dabtree in case a local attr is + * large enough to cause a double split. It can also cause extent + * count to increase proportional to the size of a remote xattr's + * value. + */ +#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \ + (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks)) + /* * Fork handling. */ From c442f3086d5a108b7ff086c8ade1923a8f389db5 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:14 -0800 Subject: [PATCH 09/85] xfs: Check for extent overflow when writing to unwritten extent A write to a sub-interval of an existing unwritten extent causes the original extent to be split into 3 extents i.e. | Unwritten | Real | Unwritten | Hence extent count can increase by 2. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_inode_fork.h | 9 +++++++++ fs/xfs/xfs_iomap.c | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 8d89838e23f8..917e289ad962 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -70,6 +70,15 @@ struct xfs_ifork { #define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \ (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks)) +/* + * A write to a sub-interval of an existing unwritten extent causes the original + * extent to be split into 3 extents + * i.e. | Unwritten | Real | Unwritten | + * Hence extent count can increase by 2. + */ +#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2) + + /* * Fork handling. */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 32246d90b9cb..8f4b27cded20 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -566,6 +566,11 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_WRITE_UNWRITTEN_CNT); + if (error) + goto error_on_bmapi_transaction; + /* * Modify the unwritten extent state of the buffer. */ From 5f1d5bbfb2e674052a9fe542f53678978af20770 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:14 -0800 Subject: [PATCH 10/85] xfs: Check for extent overflow when moving extent from cow to data fork Moving an extent to data fork can cause a sub-interval of an existing extent to be unmapped. This will increase extent count by 1. Mapping in the new extent can increase the extent count by 1 again i.e. | Old extent | New extent | Old extent | Hence number of extents increases by 2. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_inode_fork.h | 9 +++++++++ fs/xfs/xfs_reflink.c | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 917e289ad962..c8f279edc5c1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -79,6 +79,15 @@ struct xfs_ifork { #define XFS_IEXT_WRITE_UNWRITTEN_CNT (2) +/* + * Moving an extent to data fork can cause a sub-interval of an existing extent + * to be unmapped. This will increase extent count by 1. Mapping in the new + * extent can increase the extent count by 1 again i.e. + * | Old extent | New extent | Old extent | + * Hence number of extents increases by 2. + */ +#define XFS_IEXT_REFLINK_END_COW_CNT (2) + /* * Fork handling. */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6fa05fb78189..ca0ac1426d74 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -628,6 +628,11 @@ xfs_reflink_end_cow_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + goto out_cancel; + /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that From ee898d78c3540b44270a5fdffe208d7bbb219d93 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:14 -0800 Subject: [PATCH 11/85] xfs: Check for extent overflow when remapping an extent Remapping an extent involves unmapping the existing extent and mapping in the new extent. When unmapping, an extent containing the entire unmap range can be split into two extents, i.e. | Old extent | hole | Old extent | Hence extent count increases by 1. Mapping in the new extent into the destination file can increase the extent count by 1. Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/xfs_reflink.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ca0ac1426d74..e1c98dbf79e4 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1006,6 +1006,7 @@ xfs_reflink_remap_extent( unsigned int resblks; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); + int iext_delta = 0; int nimaps; int error; @@ -1099,6 +1100,16 @@ xfs_reflink_remap_extent( goto out_cancel; } + if (smap_real) + ++iext_delta; + + if (dmap_written) + ++iext_delta; + + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); + if (error) + goto out_cancel; + if (smap_real) { /* * If the extent we're unmapping is backed by storage (written From bcc561f21f115437a010307420fc43d91be91c66 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:15 -0800 Subject: [PATCH 12/85] xfs: Check for extent overflow when swapping extents Removing an initial range of source/donor file's extent and adding a new extent (from donor/source file) in its place will cause extent count to increase by 1. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_inode_fork.h | 7 +++++++ fs/xfs/xfs_bmap_util.c | 16 ++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index c8f279edc5c1..9e2137cd7372 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -88,6 +88,13 @@ struct xfs_ifork { */ #define XFS_IEXT_REFLINK_END_COW_CNT (2) +/* + * Removing an initial range of source/donor file's extent and adding a new + * extent (from donor/source file) in its place will cause extent count to + * increase by 1. + */ +#define XFS_IEXT_SWAP_RMAP_CNT (1) + /* * Fork handling. */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6ac7a6ac2658..f3f8c48ff5bf 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1399,6 +1399,22 @@ xfs_swap_extent_rmap( irec.br_blockcount); trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + if (xfs_bmap_is_real_extent(&uirec)) { + error = xfs_iext_count_may_overflow(ip, + XFS_DATA_FORK, + XFS_IEXT_SWAP_RMAP_CNT); + if (error) + goto out; + } + + if (xfs_bmap_is_real_extent(&irec)) { + error = xfs_iext_count_may_overflow(tip, + XFS_DATA_FORK, + XFS_IEXT_SWAP_RMAP_CNT); + if (error) + goto out; + } + /* Remove the mapping from the donor file. */ xfs_bmap_unmap_extent(tp, tip, &uirec); From f9fa87169d2bc1bf55ab42bb6085114378c53b86 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:15 -0800 Subject: [PATCH 13/85] xfs: Introduce error injection to reduce maximum inode fork extent count This commit adds XFS_ERRTAG_REDUCE_MAX_IEXTENTS error tag which enables userspace programs to test "Inode fork extent count overflow detection" by reducing maximum possible inode fork extent count to 10. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_errortag.h | 4 +++- fs/xfs/libxfs/xfs_inode_fork.c | 4 ++++ fs/xfs/xfs_error.c | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 53b305dea381..1c56fcceeea6 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -56,7 +56,8 @@ #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 #define XFS_ERRTAG_IUNLINK_FALLBACK 34 #define XFS_ERRTAG_BUF_IOERROR 35 -#define XFS_ERRTAG_MAX 36 +#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 +#define XFS_ERRTAG_MAX 37 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -97,5 +98,6 @@ #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT +#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 8d48716547e5..e080d7e07643 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -24,6 +24,7 @@ #include "xfs_dir2_priv.h" #include "xfs_attr_leaf.h" #include "xfs_types.h" +#include "xfs_errortag.h" kmem_zone_t *xfs_ifork_zone; @@ -745,6 +746,9 @@ xfs_iext_count_may_overflow( max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + max_exts = 10; + nr_exts = ifp->if_nextents + nr_to_add; if (nr_exts < ifp->if_nextents || nr_exts > max_exts) return -EFBIG; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 7f6e20899473..3780b118cc47 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -54,6 +54,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_FORCE_SUMMARY_RECALC, XFS_RANDOM_IUNLINK_FALLBACK, XFS_RANDOM_BUF_IOERROR, + XFS_RANDOM_REDUCE_MAX_IEXTENTS, }; struct xfs_errortag_attr { @@ -164,6 +165,7 @@ XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); +XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -202,6 +204,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(bad_summary), XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), XFS_ERRORTAG_ATTR_LIST(buf_ioerror), + XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), NULL, }; From aff4db57d510082f11194ca915d8101463c92d46 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:16 -0800 Subject: [PATCH 14/85] xfs: Remove duplicate assert statement in xfs_bmap_btalloc() The check for verifying if the allocated extent is from an AG whose index is greater than or equal to that of tp->t_firstblock is already done a couple of statements earlier in the same function. Hence this commit removes the redundant assert statement. Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_bmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8ebe5f13279c..0b15b1ff4bdd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3699,7 +3699,6 @@ xfs_bmap_btalloc( ap->blkno = args.fsbno; if (ap->tp->t_firstblock == NULLFSBLOCK) ap->tp->t_firstblock = args.fsbno; - ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; /* * If the extent size hint is active, we tried to round the From 0961fddfdd3f8ccd6302af2e7718abbaf18c9fff Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:16 -0800 Subject: [PATCH 15/85] xfs: Compute bmap extent alignments in a separate function This commit moves over the code which computes stripe alignment and extent size hint alignment into a separate function. Apart from xfs_bmap_btalloc(), the new function will be used by another function introduced in a future commit. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_bmap.c | 89 +++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0b15b1ff4bdd..8955a0a938d5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3463,13 +3463,59 @@ xfs_bmap_btalloc_accounting( args->len); } +static int +xfs_bmap_compute_alignments( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = args->mp; + xfs_extlen_t align = 0; /* minimum allocation alignment */ + int stripe_align = 0; + int error; + + /* stripe alignment for allocation is determined by mount parameters */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else if (ap->datatype & XFS_ALLOC_USERDATA) + align = xfs_get_extsz_hint(ap->ip); + if (align) { + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 0, ap->eof, 0, ap->conv, + &ap->offset, &ap->length); + ASSERT(!error); + ASSERT(ap->length); + } + + /* apply extent size hints if obtained earlier */ + if (align) { + args->prod = align; + div_u64_rem(ap->offset, args->prod, &args->mod); + if (args->mod) + args->mod = args->prod - args->mod; + } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { + args->prod = 1; + args->mod = 0; + } else { + args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; + div_u64_rem(ap->offset, args->prod, &args->mod); + if (args->mod) + args->mod = args->prod - args->mod; + } + + return stripe_align; +} + STATIC int xfs_bmap_btalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_extlen_t align = 0; /* minimum allocation alignment */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_agnumber_t ag; xfs_alloc_arg_t args; @@ -3489,25 +3535,11 @@ xfs_bmap_btalloc( mp = ap->ip->i_mount; - /* stripe alignment for allocation is determined by mount parameters */ - stripe_align = 0; - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) - stripe_align = mp->m_swidth; - else if (mp->m_dalign) - stripe_align = mp->m_dalign; - - if (ap->flags & XFS_BMAPI_COWFORK) - align = xfs_get_cowextsz_hint(ap->ip); - else if (ap->datatype & XFS_ALLOC_USERDATA) - align = xfs_get_extsz_hint(ap->ip); - if (align) { - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 0, ap->eof, 0, ap->conv, - &ap->offset, &ap->length); - ASSERT(!error); - ASSERT(ap->length); - } + memset(&args, 0, sizeof(args)); + args.tp = ap->tp; + args.mp = mp; + stripe_align = xfs_bmap_compute_alignments(ap, &args); nullfb = ap->tp->t_firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, @@ -3538,9 +3570,6 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; - memset(&args, 0, sizeof(args)); - args.tp = ap->tp; - args.mp = mp; args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; @@ -3571,21 +3600,7 @@ xfs_bmap_btalloc( args.total = ap->total; args.minlen = ap->minlen; } - /* apply extent size hints if obtained earlier */ - if (align) { - args.prod = align; - div_u64_rem(ap->offset, args.prod, &args.mod); - if (args.mod) - args.mod = args.prod - args.mod; - } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { - args.prod = 1; - args.mod = 0; - } else { - args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; - div_u64_rem(ap->offset, args.prod, &args.mod); - if (args.mod) - args.mod = args.prod - args.mod; - } + /* * If we are not low on available data blocks, and the underlying * logical volume manager is a stripe, and the file offset is zero then From 07c72e556299a7fea448912b1330b9ebfd418662 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:17 -0800 Subject: [PATCH 16/85] xfs: Process allocated extent in a separate function This commit moves over the code in xfs_bmap_btalloc() which is responsible for processing an allocated extent to a new function. Apart from xfs_bmap_btalloc(), the new function will be invoked by another function introduced in a future commit. Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_bmap.c | 74 ++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8955a0a938d5..bf53a0b1eff3 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3510,6 +3510,48 @@ xfs_bmap_compute_alignments( return stripe_align; } +static void +xfs_bmap_process_allocated_extent( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_fileoff_t orig_offset, + xfs_extlen_t orig_length) +{ + int nullfb; + + nullfb = ap->tp->t_firstblock == NULLFSBLOCK; + + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(nullfb || + XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <= + XFS_FSB_TO_AGNO(args->mp, args->fsbno)); + + ap->blkno = args->fsbno; + if (nullfb) + ap->tp->t_firstblock = args->fsbno; + ap->length = args->len; + /* + * If the extent size hint is active, we tried to round the + * caller's allocation request offset down to extsz and the + * length up to another extsz boundary. If we found a free + * extent we mapped it in starting at this new offset. If the + * newly mapped space isn't long enough to cover any of the + * range of offsets that was originally requested, move the + * mapping up so that we can fill as much of the caller's + * original request as possible. Free space is apparently + * very fragmented so we're unlikely to be able to satisfy the + * hints anyway. + */ + if (ap->length <= orig_length) + ap->offset = orig_offset; + else if (ap->offset + ap->length < orig_offset + orig_length) + ap->offset = orig_offset + orig_length - ap->length; + xfs_bmap_btalloc_accounting(ap, args); +} + STATIC int xfs_bmap_btalloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ @@ -3702,36 +3744,10 @@ xfs_bmap_btalloc( return error; ap->tp->t_flags |= XFS_TRANS_LOWMODE; } - if (args.fsbno != NULLFSBLOCK) { - /* - * check the allocation happened at the same or higher AG than - * the first block that was allocated. - */ - ASSERT(ap->tp->t_firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock) <= - XFS_FSB_TO_AGNO(mp, args.fsbno)); - ap->blkno = args.fsbno; - if (ap->tp->t_firstblock == NULLFSBLOCK) - ap->tp->t_firstblock = args.fsbno; - ap->length = args.len; - /* - * If the extent size hint is active, we tried to round the - * caller's allocation request offset down to extsz and the - * length up to another extsz boundary. If we found a free - * extent we mapped it in starting at this new offset. If the - * newly mapped space isn't long enough to cover any of the - * range of offsets that was originally requested, move the - * mapping up so that we can fill as much of the caller's - * original request as possible. Free space is apparently - * very fragmented so we're unlikely to be able to satisfy the - * hints anyway. - */ - if (ap->length <= orig_length) - ap->offset = orig_offset; - else if (ap->offset + ap->length < orig_offset + orig_length) - ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, &args); + if (args.fsbno != NULLFSBLOCK) { + xfs_bmap_process_allocated_extent(ap, &args, orig_offset, + orig_length); } else { ap->blkno = NULLFSBLOCK; ap->length = 0; From 301519674699aa9b80a15b2b2165e08532b176e6 Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Fri, 22 Jan 2021 16:48:17 -0800 Subject: [PATCH 17/85] xfs: Introduce error injection to allocate only minlen size extents for files This commit adds XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT error tag which helps userspace test programs to get xfs_bmap_btalloc() to always allocate minlen sized extents. This is required for test programs which need a guarantee that minlen extents allocated for a file do not get merged with their existing neighbours in the inode's BMBT. "Inode fork extent overflow check" for Directories, Xattrs and extension of realtime inodes need this since the file offset at which the extents are being allocated cannot be explicitly controlled from userspace. One way to use this error tag is to, 1. Consume all of the free space by sequentially writing to a file. 2. Punch alternate blocks of the file. This causes CNTBT to contain sufficient number of one block sized extent records. 3. Inject XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT error tag. After step 3, xfs_bmap_btalloc() will issue space allocation requests for minlen sized extents only. ENOSPC error code is returned to userspace when there aren't any "one block sized" extents left in any of the AGs. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/libxfs/xfs_alloc.c | 50 ++++++++++++++ fs/xfs/libxfs/xfs_alloc.h | 3 + fs/xfs/libxfs/xfs_bmap.c | 124 ++++++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_errortag.h | 4 +- fs/xfs/xfs_error.c | 3 + 5 files changed, 159 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7cb9f064ac64..0c623d3c1036 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2474,6 +2474,47 @@ xfs_defer_agfl_block( xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); } +#ifdef DEBUG +/* + * Check if an AGF has a free extent record whose length is equal to + * args->minlen. + */ +STATIC int +xfs_exact_minlen_extent_available( + struct xfs_alloc_arg *args, + struct xfs_buf *agbp, + int *stat) +{ + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, + args->agno, XFS_BTNUM_CNT); + error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); + if (error) + goto out; + + if (*stat == 0) { + error = -EFSCORRUPTED; + goto out; + } + + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); + if (error) + goto out; + + if (*stat == 1 && flen != args->minlen) + *stat = 0; + +out: + xfs_btree_del_cursor(cnt_cur, error); + + return error; +} +#endif + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. @@ -2545,6 +2586,15 @@ xfs_alloc_fix_freelist( if (!xfs_alloc_space_available(args, need, flags)) goto out_agbp_relse; +#ifdef DEBUG + if (args->alloc_minlen_only) { + int stat; + + error = xfs_exact_minlen_extent_available(args, agbp, &stat); + if (error || !stat) + goto out_agbp_relse; + } +#endif /* * Make the freelist shorter if it's too long. * diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6c22b12176b8..a4427c5775c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -75,6 +75,9 @@ typedef struct xfs_alloc_arg { char wasfromfl; /* set if allocation is from freelist */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ +#ifdef DEBUG + bool alloc_minlen_only; /* allocate exact minlen extent */ +#endif } xfs_alloc_arg_t; /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bf53a0b1eff3..2cd24bb06040 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3552,35 +3552,102 @@ xfs_bmap_process_allocated_extent( xfs_bmap_btalloc_accounting(ap, args); } +#ifdef DEBUG +static int +xfs_bmap_exact_minlen_extent_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + int error; + + ASSERT(ap->length); + + if (ap->minlen != 1) { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + + orig_offset = ap->offset; + orig_length = ap->length; + + args.alloc_minlen_only = 1; + + xfs_bmap_compute_alignments(ap, &args); + + if (ap->tp->t_firstblock == NULLFSBLOCK) { + /* + * Unlike the longest extent available in an AG, we don't track + * the length of an AG's shortest extent. + * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and + * hence we can afford to start traversing from the 0th AG since + * we need not be concerned about a drop in performance in + * "debug only" code paths. + */ + ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); + } else { + ap->blkno = ap->tp->t_firstblock; + } + + args.fsbno = ap->blkno; + args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = args.minlen = args.maxlen = ap->minlen; + + args.alignment = 1; + args.minalignslop = 0; + + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.resv = XFS_AG_RESV_NONE; + args.datatype = ap->datatype; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + if (args.fsbno != NULLFSBLOCK) { + xfs_bmap_process_allocated_extent(ap, &args, orig_offset, + orig_length); + } else { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + } + + return 0; +} +#else + +#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) + +#endif + STATIC int xfs_bmap_btalloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) { - xfs_mount_t *mp; /* mount point structure */ - xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t ag; - xfs_alloc_arg_t args; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - xfs_extlen_t blen; - xfs_extlen_t nextminlen = 0; - int nullfb; /* true if ap->firstblock isn't set */ - int isaligned; - int tryagain; - int error; - int stripe_align; + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; + xfs_alloctype_t atype = 0; + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + int stripe_align; ASSERT(ap->length); orig_offset = ap->offset; orig_length = ap->length; - mp = ap->ip->i_mount; - - memset(&args, 0, sizeof(args)); - args.tp = ap->tp; - args.mp = mp; - stripe_align = xfs_bmap_compute_alignments(ap, &args); nullfb = ap->tp->t_firstblock == NULLFSBLOCK; @@ -4113,6 +4180,10 @@ xfs_bmap_alloc_userdata( return xfs_bmap_rtalloc(bma); } + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + return xfs_bmap_exact_minlen_extent_alloc(bma); + return xfs_bmap_btalloc(bma); } @@ -4149,10 +4220,15 @@ xfs_bmapi_allocate( else bma->minlen = 1; - if (bma->flags & XFS_BMAPI_METADATA) - error = xfs_bmap_btalloc(bma); - else + if (bma->flags & XFS_BMAPI_METADATA) { + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + error = xfs_bmap_exact_minlen_extent_alloc(bma); + else + error = xfs_bmap_btalloc(bma); + } else { error = xfs_bmap_alloc_userdata(bma); + } if (error || bma->blkno == NULLFSBLOCK) return error; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 1c56fcceeea6..6ca9084b6934 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -57,7 +57,8 @@ #define XFS_ERRTAG_IUNLINK_FALLBACK 34 #define XFS_ERRTAG_BUF_IOERROR 35 #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 -#define XFS_ERRTAG_MAX 37 +#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 +#define XFS_ERRTAG_MAX 38 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -99,5 +100,6 @@ #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 +#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 3780b118cc47..185b4915b7bf 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -55,6 +55,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_IUNLINK_FALLBACK, XFS_RANDOM_BUF_IOERROR, XFS_RANDOM_REDUCE_MAX_IEXTENTS, + XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, }; struct xfs_errortag_attr { @@ -166,6 +167,7 @@ XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); +XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -205,6 +207,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), XFS_ERRORTAG_ATTR_LIST(buf_ioerror), XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), + XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), NULL, }; From eaf92540a9189851672d33215a34f22ea8d30446 Mon Sep 17 00:00:00 2001 From: Eric Biggers <ebiggers@google.com> Date: Fri, 22 Jan 2021 16:48:18 -0800 Subject: [PATCH 18/85] xfs: remove a stale comment from xfs_file_aio_write_checks() The comment in xfs_file_aio_write_checks() about calling file_modified() after dropping the ilock doesn't make sense, because the code that unconditionally acquires and drops the ilock was removed by commit 467f78992a07 ("xfs: reduce ilock hold times in xfs_file_aio_write_checks"). Remove this outdated comment. Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Eric Biggers <ebiggers@google.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> --- fs/xfs/xfs_file.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b0f93f73837..4927c6653f15 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -389,12 +389,6 @@ restart: } else spin_unlock(&ip->i_flags_lock); - /* - * Updating the timestamps will grab the ilock again from - * xfs_fs_dirty_inode, so we have to call it after dropping the - * lock above. Eventually we should look into a way to avoid - * the pointless lock roundtrip. - */ return file_modified(file); } From 01ea173e103edd5ec41acec65b9261b87e123fc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 22 Jan 2021 16:48:18 -0800 Subject: [PATCH 19/85] xfs: fix up non-directory creation in SGID directories XFS always inherits the SGID bit if it is set on the parent inode, while the generic inode_init_owner does not do this in a few cases where it can create a possible security problem, see commit 0fa3ecd87848 ("Fix up non-directory creation in SGID directories") for details. Switch XFS to use the generic helper for the normal path to fix this, just keeping the simple field inheritance open coded for the case of the non-sgid case with the bsdgrpid mount option. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ebd9c64aa48..e2a1db4cee43 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -775,6 +775,7 @@ xfs_init_new_inode( prid_t prid, struct xfs_inode **ipp) { + struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *ip; unsigned int flags; @@ -804,18 +805,17 @@ xfs_init_new_inode( ASSERT(ip != NULL); inode = VFS_I(ip); - inode->i_mode = mode; set_nlink(inode, nlink); - inode->i_uid = current_fsuid(); inode->i_rdev = rdev; ip->i_d.di_projid = prid; - if (pip && XFS_INHERIT_GID(pip)) { - inode->i_gid = VFS_I(pip)->i_gid; - if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) - inode->i_mode |= S_ISGID; + if (dir && !(dir->i_mode & S_ISGID) && + (mp->m_flags & XFS_MOUNT_GRPID)) { + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + inode->i_mode = mode; } else { - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); } /* From 88a9e03beef22cc5fabea344f54b9a0dfe63de08 Mon Sep 17 00:00:00 2001 From: Yumei Huang <yuhuang@redhat.com> Date: Fri, 22 Jan 2021 16:48:19 -0800 Subject: [PATCH 20/85] xfs: Fix assert failure in xfs_setattr_size() An assert failure is triggered by syzkaller test due to ATTR_KILL_PRIV is not cleared before xfs_setattr_size. As ATTR_KILL_PRIV is not checked/used by xfs_setattr_size, just remove it from the assert. Signed-off-by: Yumei Huang <yuhuang@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 67c8dc9de8aa..f1e21b6cfa48 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -846,7 +846,7 @@ xfs_setattr_size( ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; From 10fb9ac1251fd0daa645c9e6a22270bfc72bd5e8 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:19 -0800 Subject: [PATCH 21/85] xfs: rename xfs_wait_buftarg() to xfs_buftarg_drain() xfs_wait_buftarg() is vaguely named and somewhat overloaded. Its primary purpose is to reclaim all buffers from the provided buffer target LRU. In preparation to refactor xfs_wait_buftarg() into serialization and LRU draining components, rename the function and associated helpers to something more descriptive. This patch has no functional changes with the minor exception of renaming a tracepoint. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_buf.c | 12 ++++++------ fs/xfs/xfs_buf.h | 10 +++++----- fs/xfs/xfs_log.c | 6 +++--- fs/xfs/xfs_mount.c | 4 ++-- fs/xfs/xfs_trace.h | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f8400bbd6473..99ee8b0f499c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,7 @@ static kmem_zone_t *xfs_buf_zone; * pag_buf_lock * lru_lock * - * xfs_buftarg_wait_rele + * xfs_buftarg_drain_rele * lru_lock * b_lock (trylock due to inversion) * @@ -88,7 +88,7 @@ xfs_buf_vmap_len( * because the corresponding decrement is deferred to buffer release. Buffers * can undergo I/O multiple times in a hold-release cycle and per buffer I/O * tracking adds unnecessary overhead. This is used for sychronization purposes - * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * with unmount (see xfs_buftarg_drain()), so all we really need is a count of * in-flight buffers. * * Buffers that are never released (e.g., superblock, iclog buffers) must set @@ -1786,7 +1786,7 @@ __xfs_buf_mark_corrupt( * while freeing all the buffers only held by the LRU. */ static enum lru_status -xfs_buftarg_wait_rele( +xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, @@ -1798,7 +1798,7 @@ xfs_buftarg_wait_rele( if (atomic_read(&bp->b_hold) > 1) { /* need to wait, so skip it this pass */ - trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } if (!spin_trylock(&bp->b_lock)) @@ -1816,7 +1816,7 @@ xfs_buftarg_wait_rele( } void -xfs_wait_buftarg( +xfs_buftarg_drain( struct xfs_buftarg *btp) { LIST_HEAD(dispose); @@ -1841,7 +1841,7 @@ xfs_wait_buftarg( /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { - list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, &dispose, LONG_MAX); while (!list_empty(&dispose)) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5d91a31298a4..d5e31ba205e0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -152,7 +152,7 @@ struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ struct xfs_mount *b_mount; - xfs_buftarg_t *b_target; /* buffer target (device) */ + struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; struct completion b_iowait; /* queue for I/O waiters */ @@ -344,11 +344,11 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, struct dax_device *); +extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *, struct dax_device *); extern void xfs_free_buftarg(struct xfs_buftarg *); -extern void xfs_wait_buftarg(xfs_buftarg_t *); -extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); +extern void xfs_buftarg_drain(struct xfs_buftarg *); +extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fa2d05e65ff1..5ad4d5e78019 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -741,7 +741,7 @@ xfs_log_mount_finish( xfs_log_force(mp, XFS_LOG_SYNC); xfs_ail_push_all_sync(mp->m_ail); } - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_drain(mp->m_ddev_targp); if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; @@ -936,13 +936,13 @@ xfs_log_quiesce( /* * The superblock buffer is uncached and while xfs_ail_push_all_sync() - * will push it, xfs_wait_buftarg() will not wait for it. Further, + * will push it, xfs_buftarg_drain() will not wait for it. Further, * xfs_buf_iowait() cannot be used because it was pushed with the * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for * the IO to complete. */ xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_drain(mp->m_ddev_targp); xfs_buf_lock(mp->m_sb_bp); xfs_buf_unlock(mp->m_sb_bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7110507a2b6b..29a553f0877d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1023,8 +1023,8 @@ xfs_mountfs( xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_drain(mp->m_logdev_targp); + xfs_buftarg_drain(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); out_free_dir: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5a263ae3d4f0..e2e0092c331d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -358,7 +358,7 @@ DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); -DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); +DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); /* not really buffer traces, but the buf provides useful information */ From 8321ddb2fa2964bffbc61400894a47dc3462323f Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:20 -0800 Subject: [PATCH 22/85] xfs: don't drain buffer lru on freeze and read-only remount xfs_buftarg_drain() is called from xfs_log_quiesce() to ensure the buffer cache is reclaimed during unmount. xfs_log_quiesce() is also called from xfs_quiesce_attr(), however, which means that cache state is completely drained for filesystem freeze and read-only remount. While technically harmless, this is unnecessarily heavyweight. Both freeze and read-only mounts allow reads and thus allow population of the buffer cache. Therefore, the transitional sequence in either case really only needs to quiesce outstanding writes to return the filesystem in a generally read-only state. Additionally, some users have reported that attempts to freeze a filesystem concurrent with a read-heavy workload causes the freeze process to stall for a significant amount of time. This occurs because, as mentioned above, the read workload repopulates the buffer LRU while the freeze task attempts to drain it. To improve this situation, replace the drain in xfs_log_quiesce() with a buffer I/O quiesce and lift the drain into the unmount path. This removes buffer LRU reclaim from freeze and read-only [re]mount, but ensures the LRU is still drained before the filesystem unmounts. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_buf.c | 20 +++++++++++++++----- fs/xfs/xfs_buf.h | 1 + fs/xfs/xfs_log.c | 6 ++++-- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 99ee8b0f499c..f6e5235df7c9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1815,14 +1815,13 @@ xfs_buftarg_drain_rele( return LRU_REMOVED; } +/* + * Wait for outstanding I/O on the buftarg to complete. + */ void -xfs_buftarg_drain( +xfs_buftarg_wait( struct xfs_buftarg *btp) { - LIST_HEAD(dispose); - int loop = 0; - bool write_fail = false; - /* * First wait on the buftarg I/O count for all in-flight buffers to be * released. This is critical as new buffers do not make the LRU until @@ -1838,6 +1837,17 @@ xfs_buftarg_drain( while (percpu_counter_sum(&btp->bt_io_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); +} + +void +xfs_buftarg_drain( + struct xfs_buftarg *btp) +{ + LIST_HEAD(dispose); + int loop = 0; + bool write_fail = false; + + xfs_buftarg_wait(btp); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d5e31ba205e0..459ca34f26f5 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -347,6 +347,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, struct block_device *, struct dax_device *); extern void xfs_free_buftarg(struct xfs_buftarg *); +extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5ad4d5e78019..46ea4017fcec 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -936,13 +936,13 @@ xfs_log_quiesce( /* * The superblock buffer is uncached and while xfs_ail_push_all_sync() - * will push it, xfs_buftarg_drain() will not wait for it. Further, + * will push it, xfs_buftarg_wait() will not wait for it. Further, * xfs_buf_iowait() cannot be used because it was pushed with the * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for * the IO to complete. */ xfs_ail_push_all_sync(mp->m_ail); - xfs_buftarg_drain(mp->m_ddev_targp); + xfs_buftarg_wait(mp->m_ddev_targp); xfs_buf_lock(mp->m_sb_bp); xfs_buf_unlock(mp->m_sb_bp); @@ -962,6 +962,8 @@ xfs_log_unmount( { xfs_log_quiesce(mp); + xfs_buftarg_drain(mp->m_ddev_targp); + xfs_trans_ail_destroy(mp); xfs_sysfs_del(&mp->m_log->l_kobj); From 8aa921a95335d0a8c8e2be35a44467e7c91ec3e4 Mon Sep 17 00:00:00 2001 From: Jeffrey Mitchell <jeffrey.mitchell@starlab.io> Date: Fri, 22 Jan 2021 16:48:20 -0800 Subject: [PATCH 23/85] xfs: set inode size after creating symlink When XFS creates a new symlink, it writes its size to disk but not to the VFS inode. This causes i_size_read() to return 0 for that symlink until it is re-read from disk, for example when the system is rebooted. I found this inconsistency while protecting directories with eCryptFS. The command "stat path/to/symlink/in/ecryptfs" will report "Size: 0" if the symlink was created after the last reboot on an XFS root. Call i_size_write() in xfs_symlink() Signed-off-by: Jeffrey Mitchell <jeffrey.mitchell@starlab.io> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_symlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 0b8136a32484..7f96649e918a 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -305,6 +305,7 @@ xfs_symlink( } ASSERT(pathlen == 0); } + i_size_write(VFS_I(ip), ip->i_d.di_size); /* * Create the directory entry for the symlink. From 50d25484bebe94320c49dd1347d3330c7063bbdb Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:20 -0800 Subject: [PATCH 24/85] xfs: sync lazy sb accounting on quiesce of read-only mounts xfs_log_sbcount() syncs the superblock specifically to accumulate the in-core percpu superblock counters and commit them to disk. This is required to maintain filesystem consistency across quiesce (freeze, read-only mount/remount) or unmount when lazy superblock accounting is enabled because individual transactions do not update the superblock directly. This mechanism works as expected for writable mounts, but xfs_log_sbcount() skips the update for read-only mounts. Read-only mounts otherwise still allow log recovery and write out an unmount record during log quiesce. If a read-only mount performs log recovery, it can modify the in-core superblock counters and write an unmount record when the filesystem unmounts without ever syncing the in-core counters. This leaves the filesystem with a clean log but in an inconsistent state with regard to lazy sb counters. Update xfs_log_sbcount() to use the same logic xfs_log_unmount_write() uses to determine when to write an unmount record. This ensures that lazy accounting is always synced before the log is cleaned. Refactor this logic into a new helper to distinguish between a writable filesystem and a writable log. Specifically, the log is writable unless the filesystem is mounted with the norecovery mount option, the underlying log device is read-only, or the filesystem is shutdown. Drop the freeze state check because the update is already allowed during the freezing process and no context calls this function on an already frozen fs. Also, retain the shutdown check in xfs_log_unmount_write() to catch the case where the preceding log force might have triggered a shutdown. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Gao Xiang <hsiangkao@redhat.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Bill O'Donnell <billodo@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_log.c | 28 ++++++++++++++++++++-------- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_mount.c | 3 +-- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46ea4017fcec..62bcdaa07dc9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -347,6 +347,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) tic->t_res_num++; } +bool +xfs_log_writable( + struct xfs_mount *mp) +{ + /* + * Never write to the log on norecovery mounts, if the block device is + * read-only, or if the filesystem is shutdown. Read-only mounts still + * allow internal writes for log recovery and unmount purposes, so don't + * restrict that case here. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return false; + if (xfs_readonly_buftarg(mp->m_log->l_targ)) + return false; + if (XFS_FORCED_SHUTDOWN(mp)) + return false; + return true; +} + /* * Replenish the byte reservation required by moving the grant write head. */ @@ -886,15 +905,8 @@ xfs_log_unmount_write( { struct xlog *log = mp->m_log; - /* - * Don't write out unmount record on norecovery mounts or ro devices. - * Or, if we are doing a forced umount (typically because of IO errors). - */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_targ)) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (!xfs_log_writable(mp)) return; - } xfs_log_force(mp, XFS_LOG_SYNC); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 58c3fcbec94a..98c913da7587 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -127,6 +127,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); +bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 29a553f0877d..8c24e1ee63ec 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1176,8 +1176,7 @@ xfs_fs_writable( int xfs_log_sbcount(xfs_mount_t *mp) { - /* allow this to proceed during the freeze sequence... */ - if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) + if (!xfs_log_writable(mp)) return 0; /* From 37444fc4cc398266fe0f71a9c0925620d44fb76a Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:21 -0800 Subject: [PATCH 25/85] xfs: lift writable fs check up into log worker task The log covering helper checks whether the filesystem is writable to determine whether to cover the log. The helper is currently only called from the background log worker. In preparation to reuse the helper from freezing contexts, lift the check into xfs_log_worker(). Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_log.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 62bcdaa07dc9..3ede7ad431c0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1051,14 +1051,12 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -static int -xfs_log_need_covered(xfs_mount_t *mp) +static bool +xfs_log_need_covered( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - int needed = 0; - - if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) - return 0; + struct xlog *log = mp->m_log; + bool needed = false; if (!xlog_cil_empty(log)) return 0; @@ -1076,14 +1074,14 @@ xfs_log_need_covered(xfs_mount_t *mp) if (!xlog_iclogs_empty(log)) break; - needed = 1; + needed = true; if (log->l_covered_state == XLOG_STATE_COVER_NEED) log->l_covered_state = XLOG_STATE_COVER_DONE; else log->l_covered_state = XLOG_STATE_COVER_DONE2; break; default: - needed = 1; + needed = true; break; } spin_unlock(&log->l_icloglock); @@ -1273,7 +1271,7 @@ xfs_log_worker( struct xfs_mount *mp = log->l_mp; /* dgc: errors ignored - not fatal and nowhere to report them */ - if (xfs_log_need_covered(mp)) { + if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) { /* * Dump a transaction into the log that contains no real change. * This is needed to stamp the current tail LSN into the log From 9e54ee0fc9ef88ee255dc9770b291d047b38643c Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:21 -0800 Subject: [PATCH 26/85] xfs: separate log cleaning from log quiesce Log quiesce is currently associated with cleaning the log, which is accomplished by writing an unmount record as the last step of the quiesce sequence. The quiesce codepath is a bit convoluted in this regard due to how it is reused from various contexts. In preparation to create separate log cleaning and log covering interfaces, lift the write of the unmount record into a new cleaning helper and call that wherever xfs_log_quiesce() is currently invoked. No functional changes. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_log.c | 8 +++++++- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_super.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3ede7ad431c0..d02e14054956 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -957,7 +957,13 @@ xfs_log_quiesce( xfs_buftarg_wait(mp->m_ddev_targp); xfs_buf_lock(mp->m_sb_bp); xfs_buf_unlock(mp->m_sb_bp); +} +void +xfs_log_clean( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); xfs_log_unmount_write(mp); } @@ -972,7 +978,7 @@ void xfs_log_unmount( struct xfs_mount *mp) { - xfs_log_quiesce(mp); + xfs_log_clean(mp); xfs_buftarg_drain(mp->m_ddev_targp); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 98c913da7587..b0400589f824 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -139,6 +139,7 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_quiesce(struct xfs_mount *mp); +void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 813be879a5e5..09d956e30fd8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -897,7 +897,7 @@ xfs_quiesce_attr( if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - xfs_log_quiesce(mp); + xfs_log_clean(mp); } /* From 303591a0a9473fc4842984080fdb619188426bad Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:22 -0800 Subject: [PATCH 27/85] xfs: cover the log during log quiesce The log quiesce mechanism historically terminates by marking the log clean with an unmount record. The primary objective is to indicate that log recovery is no longer required after the quiesce has flushed all in-core changes and written back filesystem metadata. While this is perfectly fine, it is somewhat hacky as currently used in certain contexts. For example, filesystem freeze quiesces (i.e. cleans) the log and immediately redirties it with a dummy superblock transaction to ensure that log recovery runs in the event of a crash. While this functions correctly, cleaning the log from freeze context is clearly superfluous given the current redirtying behavior. Instead, the desired behavior can be achieved by simply covering the log. This effectively retires all on-disk log items from the active range of the log by issuing two synchronous and sequential dummy superblock update transactions that serve to update the on-disk log head and tail. The subtle difference is that the log technically remains dirty due to the lack of an unmount record, though recovery is effectively a no-op due to the content of the checkpoints being clean (i.e. the unmodified on-disk superblock). Log covering currently runs in the background and only triggers once the filesystem and log has idled. The purpose of the background mechanism is to prevent log recovery from replaying the most recently logged items long after those items may have been written back. In the quiesce path, the log has been deliberately idled by forcing the log and pushing the AIL until empty in a context where no further mutable filesystem operations are allowed. Therefore, we can cover the log as the final step in the log quiesce codepath to reflect that all previously active items have been successfully written back. This facilitates selective log covering from certain contexts (i.e. freeze) that only seek to quiesce, but not necessarily clean the log. Note that as a side effect of this change, log covering now occurs when cleaning the log as well. This is harmless, facilitates subsequent cleanups, and is mostly temporary as various operations switch to use explicit log covering. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> --- fs/xfs/xfs_log.c | 49 +++++++++++++++++++++++++++++++++++++++++++++--- fs/xfs/xfs_log.h | 2 +- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d02e14054956..e941faea8e98 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -91,6 +91,9 @@ STATIC int xlog_iclogs_empty( struct xlog *log); +static int +xfs_log_cover(struct xfs_mount *); + static void xlog_grant_sub_space( struct xlog *log, @@ -936,10 +939,9 @@ xfs_log_unmount_write( * To do this, we first need to shut down the background log work so it is not * trying to cover the log as we clean up. We then need to unpin all objects in * the log so we can then flush them out. Once they have completed their IO and - * run the callbacks removing themselves from the AIL, we can write the unmount - * record. + * run the callbacks removing themselves from the AIL, we can cover the log. */ -void +int xfs_log_quiesce( struct xfs_mount *mp) { @@ -957,6 +959,8 @@ xfs_log_quiesce( xfs_buftarg_wait(mp->m_ddev_targp); xfs_buf_lock(mp->m_sb_bp); xfs_buf_unlock(mp->m_sb_bp); + + return xfs_log_cover(mp); } void @@ -1094,6 +1098,45 @@ xfs_log_need_covered( return needed; } +/* + * Explicitly cover the log. This is similar to background log covering but + * intended for usage in quiesce codepaths. The caller is responsible to ensure + * the log is idle and suitable for covering. The CIL, iclog buffers and AIL + * must all be empty. + */ +static int +xfs_log_cover( + struct xfs_mount *mp) +{ + struct xlog *log = mp->m_log; + int error = 0; + + ASSERT((xlog_cil_empty(log) && xlog_iclogs_empty(log) && + !xfs_ail_min_lsn(log->l_ailp)) || + XFS_FORCED_SHUTDOWN(mp)); + + if (!xfs_log_writable(mp)) + return 0; + + /* + * To cover the log, commit the superblock twice (at most) in + * independent checkpoints. The first serves as a reference for the + * tail pointer. The sync transaction and AIL push empties the AIL and + * updates the in-core tail to the LSN of the first checkpoint. The + * second commit updates the on-disk tail with the in-core LSN, + * covering the log. Push the AIL one more time to leave it empty, as + * we found it. + */ + while (xfs_log_need_covered(mp)) { + error = xfs_sync_sb(mp, true); + if (error) + break; + xfs_ail_push_all_sync(mp->m_ail); + } + + return error; +} + /* * We may be holding the log iclog lock upon entering this routine. */ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index b0400589f824..044e02cb8921 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -138,7 +138,7 @@ void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_quiesce(struct xfs_mount *mp); +int xfs_log_quiesce(struct xfs_mount *mp); void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); From b0eb9e1182668b0e9cf81dbf38041cfb8c12887f Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:22 -0800 Subject: [PATCH 28/85] xfs: don't reset log idle state on covering checkpoints Now that log covering occurs on quiesce, we'd like to reuse the underlying superblock sync for final superblock updates. This includes things like lazy superblock counter updates, log feature incompat bits in the future, etc. One quirk to this approach is that once the log is in the IDLE (i.e. already covered) state, any subsequent log write resets the state back to NEED. This means that a final superblock sync to an already covered log requires two more sb syncs to return the log back to IDLE again. For example, if a lazy superblock enabled filesystem is mount cycled without any modifications, the unmount path syncs the superblock once and writes an unmount record. With the desired log quiesce covering behavior, we sync the superblock three times at unmount time: once for the lazy superblock counter update and twice more to cover the log. By contrast, if the log is active or only partially covered at unmount time, a final superblock sync would doubly serve as the one or two remaining syncs required to cover the log. This duplicate covering sequence is unnecessary because the filesystem remains consistent if a crash occurs at any point. The superblock will either be recovered in the event of a crash or written back before the log is quiesced and potentially cleaned with an unmount record. Update the log covering state machine to remain in the IDLE state if additional covering checkpoints pass through the log. This facilitates final superblock updates (such as lazy superblock counters) via a single sb sync without losing covered status. This provides some consistency with the active and partially covered cases and also avoids harmless, but spurious checkpoints when quiescing the log. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> --- fs/xfs/xfs_log.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e941faea8e98..a613f008b95f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2599,12 +2599,15 @@ xlog_covered_state( int iclogs_changed) { /* - * We usually go to NEED. But we go to NEED2 if the changed indicates we - * are done writing the dummy record. If we are done with the second - * dummy recored (DONE2), then we go to IDLE. + * We go to NEED for any non-covering writes. We go to NEED2 if we just + * wrote the first covering record (DONE). We go to IDLE if we just + * wrote the second covering record (DONE2) and remain in IDLE until a + * non-covering write occurs. */ switch (prev_state) { case XLOG_STATE_COVER_IDLE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: break; From f46e5a174655fd0bdb73008f6a4967d9c706f691 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:23 -0800 Subject: [PATCH 29/85] xfs: fold sbcount quiesce logging into log covering xfs_log_sbcount() calls xfs_sync_sb() to sync superblock counters to disk when lazy superblock accounting is enabled. This occurs on unmount, freeze, and read-only (re)mount and ensures the final values are calculated and persisted to disk before each form of quiesce completes. Now that log covering occurs in all of these contexts and uses the same xfs_sync_sb() mechanism to update log state, there is no need to log the superblock separately for any reason. Update the log quiesce path to sync the superblock at least once for any mount where lazy superblock accounting is enabled. If the log is already covered, it will remain in the covered state. Otherwise, the next sync as part of the normal covering sequence will carry the associated superblock update with it. Remove xfs_log_sbcount() now that it is no longer needed. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> --- fs/xfs/xfs_log.c | 20 ++++++++++++++++++-- fs/xfs/xfs_mount.c | 31 ------------------------------- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 8 -------- 4 files changed, 18 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a613f008b95f..58699881c100 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1110,6 +1110,7 @@ xfs_log_cover( { struct xlog *log = mp->m_log; int error = 0; + bool need_covered; ASSERT((xlog_cil_empty(log) && xlog_iclogs_empty(log) && !xfs_ail_min_lsn(log->l_ailp)) || @@ -1118,6 +1119,21 @@ xfs_log_cover( if (!xfs_log_writable(mp)) return 0; + /* + * xfs_log_need_covered() is not idempotent because it progresses the + * state machine if the log requires covering. Therefore, we must call + * this function once and use the result until we've issued an sb sync. + * Do so first to make that abundantly clear. + * + * Fall into the covering sequence if the log needs covering or the + * mount has lazy superblock accounting to sync to disk. The sb sync + * used for covering accumulates the in-core counters, so covering + * handles this for us. + */ + need_covered = xfs_log_need_covered(mp); + if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + /* * To cover the log, commit the superblock twice (at most) in * independent checkpoints. The first serves as a reference for the @@ -1127,12 +1143,12 @@ xfs_log_cover( * covering the log. Push the AIL one more time to leave it empty, as * we found it. */ - while (xfs_log_need_covered(mp)) { + do { error = xfs_sync_sb(mp, true); if (error) break; xfs_ail_push_all_sync(mp->m_ail); - } + } while (xfs_log_need_covered(mp)); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 8c24e1ee63ec..b621b09899e5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1124,12 +1124,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "Unable to update superblock counters. " - "Freespace may not be correct on next mount."); - - xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); @@ -1164,31 +1158,6 @@ xfs_fs_writable( return true; } -/* - * xfs_log_sbcount - * - * Sync the superblock counters to disk. - * - * Note this code can be called during the process of freezing, so we use the - * transaction allocator that does not block when the transaction subsystem is - * in its frozen state. - */ -int -xfs_log_sbcount(xfs_mount_t *mp) -{ - if (!xfs_log_writable(mp)) - return 0; - - /* - * we don't need to do this if we are updating the superblock - * counters on every modification. - */ - if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) - return 0; - - return xfs_sync_sb(mp, true); -} - /* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dfa429b77ee2..452ca7654dc5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -399,7 +399,6 @@ int xfs_buf_hash_init(xfs_perag_t *pag); void xfs_buf_hash_destroy(xfs_perag_t *pag); extern void xfs_uuid_table_free(void); -extern int xfs_log_sbcount(xfs_mount_t *); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 09d956e30fd8..75ada867c665 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -884,19 +884,11 @@ void xfs_quiesce_attr( struct xfs_mount *mp) { - int error = 0; - cancel_delayed_work_sync(&mp->m_log->l_work); /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); xfs_log_clean(mp); } From 5232b9315034e45dba43b164aca3d5228948d05b Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:23 -0800 Subject: [PATCH 30/85] xfs: remove duplicate wq cancel and log force from attr quiesce These two calls are repeated at the beginning of xfs_log_quiesce(). Drop them from xfs_quiesce_attr(). Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> --- fs/xfs/xfs_super.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 75ada867c665..8fc9044131fc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -884,11 +884,6 @@ void xfs_quiesce_attr( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_log->l_work); - - /* force the log to unpin objects from the now complete transactions */ - xfs_log_force(mp, XFS_LOG_SYNC); - xfs_log_clean(mp); } From ea2064da4592723d7b96235ca9bba4091a7458e3 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:24 -0800 Subject: [PATCH 31/85] xfs: remove xfs_quiesce_attr() xfs_quiesce_attr() is now a wrapper for xfs_log_clean(). Remove it and call xfs_log_clean() directly. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> --- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_super.c | 24 ++---------------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b621b09899e5..53b8ccab7235 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -946,7 +946,7 @@ xfs_mountfs( */ if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == XFS_MOUNT_RDONLY) { - xfs_quiesce_attr(mp); + xfs_log_clean(mp); } /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8fc9044131fc..aedf622d221b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -867,26 +867,6 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } -/* - * Trigger writeback of all the dirty metadata in the file system. - * - * This ensures that the metadata is written to their location on disk rather - * than just existing in transactions in the log. This means after a quiesce - * there is no log replay required to write the inodes to disk - this is the - * primary difference between a sync and a quiesce. - * - * We cancel log work early here to ensure all transactions the log worker may - * run have finished before we clean up and log the superblock and write an - * unmount record. The unfreeze process is responsible for restarting the log - * worker correctly. - */ -void -xfs_quiesce_attr( - struct xfs_mount *mp) -{ - xfs_log_clean(mp); -} - /* * Second stage of a freeze. The data is already frozen so we only * need to take care of the metadata. Once that's done sync the superblock @@ -909,7 +889,7 @@ xfs_fs_freeze( flags = memalloc_nofs_save(); xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); - xfs_quiesce_attr(mp); + xfs_log_clean(mp); ret = xfs_sync_sb(mp, true); memalloc_nofs_restore(flags); return ret; @@ -1752,7 +1732,7 @@ xfs_remount_ro( */ xfs_save_resvblks(mp); - xfs_quiesce_attr(mp); + xfs_log_clean(mp); mp->m_flags |= XFS_MOUNT_RDONLY; return 0; From 5b0ad7c2a52d4fdfec86a2c29096701783f46719 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Fri, 22 Jan 2021 16:48:24 -0800 Subject: [PATCH 32/85] xfs: cover the log on freeze instead of cleaning it Filesystem freeze cleans the log and immediately redirties it so log recovery runs if a crash occurs after the filesystem is frozen. Now that log quiesce covers the log, there is no need to clean the log and redirty it to trigger log recovery because covering has the same effect. Update xfs_fs_freeze() to quiesce (and thus cover) the log. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> --- fs/xfs/xfs_super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aedf622d221b..aed74a3fc787 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -889,8 +889,7 @@ xfs_fs_freeze( flags = memalloc_nofs_save(); xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); - xfs_log_clean(mp); - ret = xfs_sync_sb(mp, true); + ret = xfs_log_quiesce(mp); memalloc_nofs_restore(flags); return ret; } From f22c7f87777361f94aa17f746fbadfa499248dc8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 22 Jan 2021 16:48:25 -0800 Subject: [PATCH 33/85] xfs: refactor xfs_file_fsync Factor out the log syncing logic into two helpers to make the code easier to read and more maintainable. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com> --- fs/xfs/xfs_file.c | 81 +++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4927c6653f15..9441c257d86c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -118,6 +118,54 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } +static xfs_lsn_t +xfs_fsync_lsn( + struct xfs_inode *ip, + bool datasync) +{ + if (!xfs_ipincount(ip)) + return 0; + if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + return 0; + return ip->i_itemp->ili_last_lsn; +} + +/* + * All metadata updates are logged, which means that we just have to flush the + * log up to the latest LSN that touched the inode. + * + * If we have concurrent fsync/fdatasync() calls, we need them to all block on + * the log force before we clear the ili_fsync_fields field. This ensures that + * we don't get a racing sync operation that does not wait for the metadata to + * hit the journal before returning. If we race with clearing ili_fsync_fields, + * then all that will happen is the log force will do nothing as the lsn will + * already be on disk. We can't race with setting ili_fsync_fields because that + * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock + * shared until after the ili_fsync_fields is cleared. + */ +static int +xfs_fsync_flush_log( + struct xfs_inode *ip, + bool datasync, + int *log_flushed) +{ + int error = 0; + xfs_lsn_t lsn; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + lsn = xfs_fsync_lsn(ip, datasync); + if (lsn) { + error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, + log_flushed); + + spin_lock(&ip->i_itemp->ili_lock); + ip->i_itemp->ili_fsync_fields = 0; + spin_unlock(&ip->i_itemp->ili_lock); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + STATIC int xfs_file_fsync( struct file *file, @@ -125,13 +173,10 @@ xfs_file_fsync( loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_inode *ip = XFS_I(file->f_mapping->host); struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; - xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -155,33 +200,7 @@ xfs_file_fsync( else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); - /* - * All metadata updates are logged, which means that we just have to - * flush the log up to the latest LSN that touched the inode. If we have - * concurrent fsync/fdatasync() calls, we need them to all block on the - * log force before we clear the ili_fsync_fields field. This ensures - * that we don't get a racing sync operation that does not wait for the - * metadata to hit the journal before returning. If we race with - * clearing the ili_fsync_fields, then all that will happen is the log - * force will do nothing as the lsn will already be on disk. We can't - * race with setting ili_fsync_fields because that is done under - * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared - * until after the ili_fsync_fields is cleared. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (!datasync || - (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = iip->ili_last_lsn; - } - - if (lsn) { - error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields = 0; - spin_unlock(&iip->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); + error = xfs_fsync_flush_log(ip, datasync, &log_flushed); /* * If we only have a single device, and the log force about was From ae29e4220fd3047b5442e7e8db8027d7745093f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Fri, 22 Jan 2021 16:48:25 -0800 Subject: [PATCH 34/85] xfs: reduce ilock acquisitions in xfs_file_fsync If the inode is not pinned by the time fsync is called we don't need the ilock to protect against concurrent clearing of ili_fsync_fields as the inode won't need a log flush or clearing of these fields. Not taking the iolock allows for full concurrency of fsync and thus O_DSYNC completions with io_uring/aio write submissions. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com> --- fs/xfs/xfs_file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9441c257d86c..48f6f89a38a9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -200,7 +200,14 @@ xfs_file_fsync( else if (mp->m_logdev_targp != mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); - error = xfs_fsync_flush_log(ip, datasync, &log_flushed); + /* + * Any inode that has dirty modifications in the log is pinned. The + * racy check here for a pinned inode while not catch modifications + * that happen concurrently to the fsync call, but fsync semantics + * only require to sync previously completed I/O. + */ + if (xfs_ipincount(ip)) + error = xfs_fsync_flush_log(ip, datasync, &log_flushed); /* * If we only have a single device, and the log force about was From 4533fc631547213bd03fbdf0a96dd8eb6807d3a7 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Tue, 26 Jan 2021 19:14:55 -0800 Subject: [PATCH 35/85] xfs: fix unused log variable in xfs_log_cover() The log variable is only used in kernels with asserts enabled. Remove it and open code the dereference to avoid unused variable warnings. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_log.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 58699881c100..d8b814227734 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1108,12 +1108,11 @@ static int xfs_log_cover( struct xfs_mount *mp) { - struct xlog *log = mp->m_log; int error = 0; bool need_covered; - ASSERT((xlog_cil_empty(log) && xlog_iclogs_empty(log) && - !xfs_ail_min_lsn(log->l_ailp)) || + ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) && + !xfs_ail_min_lsn(mp->m_log->l_ailp)) || XFS_FORCED_SHUTDOWN(mp)); if (!xfs_log_writable(mp)) From 560ab6c0d12ebccabb83638abe23a7875b946f9a Mon Sep 17 00:00:00 2001 From: Chandan Babu R <chandanrlinux@gmail.com> Date: Wed, 27 Jan 2021 08:35:14 -0800 Subject: [PATCH 36/85] xfs: Fix 'set but not used' warning in xfs_bmap_compute_alignments() With both CONFIG_XFS_DEBUG and CONFIG_XFS_WARN disabled, the only reference to local variable "error" in xfs_bmap_compute_alignments() gets eliminated during pre-processing stage of the compilation process. This causes the compiler to generate a "set but not used" warning. Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_bmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2cd24bb06040..7ea1dbbe3d0b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3471,7 +3471,6 @@ xfs_bmap_compute_alignments( struct xfs_mount *mp = args->mp; xfs_extlen_t align = 0; /* minimum allocation alignment */ int stripe_align = 0; - int error; /* stripe alignment for allocation is determined by mount parameters */ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) @@ -3484,10 +3483,10 @@ xfs_bmap_compute_alignments( else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); if (align) { - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 0, ap->eof, 0, ap->conv, - &ap->offset, &ap->length); - ASSERT(!error); + if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, + ap->eof, 0, ap->conv, &ap->offset, + &ap->length)) + ASSERT(0); ASSERT(ap->length); } From 1aecf3734a95f3c167d1495550ca57556d33f7ec Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 29 Jan 2021 19:06:10 -0800 Subject: [PATCH 37/85] xfs: fix chown leaking delalloc quota blocks when fssetxattr fails While refactoring the quota code to create a function to allocate inode change transactions, I noticed that xfs_qm_vop_chown_reserve does more than just make reservations: it also *modifies* the incore counts directly to handle the owner id change for the delalloc blocks. I then observed that the fssetxattr code continues validating input arguments after making the quota reservation but before dirtying the transaction. If the routine decides to error out, it fails to undo the accounting switch! This leads to incorrect quota reservation and failure down the line. We can fix this by making the reservation function do only that -- for the new dquot, it reserves ondisk and delalloc blocks to the transaction, and the old dquot hangs on to its incore reservation for now. Once we actually switch the dquots, we can then update the incore reservations because we've dirtied the transaction and it's too late to turn back now. No fixes tag because this has been broken since the start of git. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_qm.c | 92 +++++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c134eb4aeaa8..c2e4d3a27469 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1785,6 +1785,29 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + /* + * Back when we made quota reservations for the chown, we reserved the + * ondisk blocks + delalloc blocks with the new dquot. Now that we've + * switched the dquots, decrease the new dquot's block reservation + * (having already bumped up the real counter) so that we don't have + * any reservation to give back when we commit. + */ + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, + -ip->i_delayed_blks); + + /* + * Give the incore reservation for delalloc blocks back to the old + * dquot. We don't normally handle delalloc quota reservations + * transactionally, so just lock the dquot and subtract from the + * reservation. Dirty the transaction because it's too late to turn + * back now. + */ + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_dqlock(prevdq); + ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); + prevdq->q_blk.reserved -= ip->i_delayed_blks; + xfs_dqunlock(prevdq); + /* * Take an extra reference, because the inode is going to keep * this dquot pointer even after the trans_commit. @@ -1807,84 +1830,39 @@ xfs_qm_vop_chown_reserve( uint flags) { struct xfs_mount *mp = ip->i_mount; - uint64_t delblks; unsigned int blkflags; - struct xfs_dquot *udq_unres = NULL; - struct xfs_dquot *gdq_unres = NULL; - struct xfs_dquot *pdq_unres = NULL; struct xfs_dquot *udq_delblks = NULL; struct xfs_dquot *gdq_delblks = NULL; struct xfs_dquot *pdq_delblks = NULL; - int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - delblks = ip->i_delayed_blks; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != udqp->q_id) { + i_uid_read(VFS_I(ip)) != udqp->q_id) udq_delblks = udqp; - /* - * If there are delayed allocation blocks, then we have to - * unreserve those from the old dquot, and add them to the - * new dquot. - */ - if (delblks) { - ASSERT(ip->i_udquot); - udq_unres = ip->i_udquot; - } - } + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != gdqp->q_id) { + i_gid_read(VFS_I(ip)) != gdqp->q_id) gdq_delblks = gdqp; - if (delblks) { - ASSERT(ip->i_gdquot); - gdq_unres = ip->i_gdquot; - } - } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != pdqp->q_id) { + ip->i_d.di_projid != pdqp->q_id) pdq_delblks = pdqp; - if (delblks) { - ASSERT(ip->i_pdquot); - pdq_unres = ip->i_pdquot; - } - } - - error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - udq_delblks, gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks, 1, flags | blkflags); - if (error) - return error; /* - * Do the delayed blks reservations/unreservations now. Since, these - * are done without the help of a transaction, if a reservation fails - * its previous reservations won't be automatically undone by trans - * code. So, we have to do it manually here. + * Reserve enough quota to handle blocks on disk and reserved for a + * delayed allocation. We'll actually transfer the delalloc + * reservation between dquots at chown time, even though that part is + * only semi-transactional. */ - if (delblks) { - /* - * Do the reservations first. Unreservation can't fail. - */ - ASSERT(udq_delblks || gdq_delblks || pdq_delblks); - ASSERT(udq_unres || gdq_unres || pdq_unres); - error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - udq_delblks, gdq_delblks, pdq_delblks, - (xfs_qcnt_t)delblks, 0, flags | blkflags); - if (error) - return error; - xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - udq_unres, gdq_unres, pdq_unres, - -((xfs_qcnt_t)delblks), 0, blkflags); - } - - return 0; + return xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, + gdq_delblks, pdq_delblks, + ip->i_d.di_nblocks + ip->i_delayed_blks, + 1, blkflags | flags); } int From b8055ed6779d675e30f019ba3b7141848a4d6558 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Thu, 28 Jan 2021 10:56:38 -0800 Subject: [PATCH 38/85] xfs: reduce quota reservation when doing a dax unwritten extent conversion In commit 3b0fe47805802, we reduced the free space requirement to perform a pre-write unwritten extent conversion on an S_DAX file. Since we're not actually allocating any space, the logic goes, we only need enough reservation to handle shape changes in the bmbt. The same logic should have been applied to quota -- we're not allocating any space, so we only need to reserve enough quota to handle the bmbt shape changes. Fixes: 3b0fe4780580 ("xfs: Don't use reserved blocks for data blocks with DAX") Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 8f4b27cded20..b05cfeb09301 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -236,7 +236,7 @@ xfs_iomap_write_direct( bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { tflags |= XFS_TRANS_RESERVE; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, From 4abe21ad67a7b9dc6844f55e91a6e3ef81879d42 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:33 -0800 Subject: [PATCH 39/85] xfs: clean up quota reservation callsites Convert a few xfs_trans_*reserve* callsites that are open-coding other convenience functions. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_bmap.c | 3 +-- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_reflink.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 7ea1dbbe3d0b..c730288b5981 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4937,8 +4937,7 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -((long)del->br_blockcount), 0, + error = xfs_trans_unreserve_quota_nblks(NULL, ip, del->br_blockcount, 0, isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f3f8c48ff5bf..792809debaaa 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -884,8 +884,8 @@ xfs_unmap_extent( } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, - ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e1c98dbf79e4..183142fd0961 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -508,8 +508,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del); /* Remove the quota reservation */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -(long)del.br_blockcount, 0, + error = xfs_trans_unreserve_quota_nblks(NULL, ip, + del.br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); if (error) break; From 8554650003b8a66f3dd357692ab73101d088d938 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:34 -0800 Subject: [PATCH 40/85] xfs: create convenience wrappers for incore quota block reservations Create a couple of convenience wrappers for creating and deleting quota block reservations against future changes. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_bmap.c | 10 ++++------ fs/xfs/xfs_quota.h | 19 +++++++++++++++++++ fs/xfs/xfs_reflink.c | 5 ++--- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c730288b5981..94d582a9587d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4097,8 +4097,7 @@ xfs_bmapi_reserve_delalloc( * blocks. This number gets adjusted later. We return if we haven't * allocated blocks already inside this loop. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_reserve_blkres(ip, alen); if (error) return error; @@ -4144,8 +4143,7 @@ out_unreserve_blocks: xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + xfs_quota_unreserve_blkres(ip, alen); return error; } @@ -4937,8 +4935,8 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - error = xfs_trans_unreserve_quota_nblks(NULL, ip, del->br_blockcount, 0, - isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ASSERT(!isrt); + error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); if (error) return error; ip->i_delayed_blks -= del->br_blockcount; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5a62398940d0..1d1a1634ea29 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -108,6 +108,12 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); +static inline int +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, + XFS_QMOPT_RES_REGBLKS); +} #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -136,6 +142,13 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, { return 0; } + +static inline int +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return 0; +} + #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -157,6 +170,12 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) +static inline int +xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_quota_reserve_blkres(ip, -blocks); +} + extern int xfs_mount_reset_sbqflags(struct xfs_mount *); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 183142fd0961..bea64ed5a57f 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -508,9 +508,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del); /* Remove the quota reservation */ - error = xfs_trans_unreserve_quota_nblks(NULL, ip, - del.br_blockcount, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_unreserve_blkres(ip, + del.br_blockcount); if (error) break; } else { From 35b1101099e85af74a46b8e36f4d1fdac0367ffd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Tue, 26 Jan 2021 17:23:30 -0800 Subject: [PATCH 41/85] xfs: remove xfs_trans_unreserve_quota_nblks completely xfs_trans_cancel will release all the quota resources that were reserved on behalf of the transaction, so get rid of the explicit unreserve step. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_bmap_util.c | 11 ++++------- fs/xfs/xfs_iomap.c | 6 ++---- fs/xfs/xfs_quota.h | 2 -- fs/xfs/xfs_reflink.c | 5 +---- 4 files changed, 7 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 792809debaaa..ae2d98af693c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -820,12 +820,12 @@ xfs_alloc_file_space( error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; + goto error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) - goto error0; + goto error; xfs_trans_ijoin(tp, ip, 0); @@ -833,7 +833,7 @@ xfs_alloc_file_space( allocatesize_fsb, alloc_type, 0, imapp, &nimaps); if (error) - goto error0; + goto error; /* * Complete the transaction @@ -856,10 +856,7 @@ xfs_alloc_file_space( return error; -error0: /* unlock inode, unreserve quota blocks, cancel trans */ - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ +error: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b05cfeb09301..b046eadd3b21 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -253,7 +253,7 @@ xfs_iomap_write_direct( error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) - goto out_res_cancel; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); @@ -265,7 +265,7 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, imap, &nimaps); if (error) - goto out_res_cancel; + goto out_trans_cancel; /* * Complete the transaction @@ -289,8 +289,6 @@ out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; -out_res_cancel: - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 1d1a1634ea29..31d0de899cc4 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -164,8 +164,6 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) #define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ -#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ - xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) #define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index bea64ed5a57f..15435229bc1f 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -411,7 +411,7 @@ xfs_reflink_allocate_cow( XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, &nimaps); if (error) - goto out_unreserve; + goto out_trans_cancel; xfs_inode_set_cowblocks_tag(ip); error = xfs_trans_commit(tp); @@ -436,9 +436,6 @@ convert: trace_xfs_reflink_convert_cow(ip, cmap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); -out_unreserve: - xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, - XFS_QMOPT_RES_REGBLKS); out_trans_cancel: xfs_trans_cancel(tp); return error; From ad4a74739708e193c21245dae908ff50f72ff207 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:34 -0800 Subject: [PATCH 42/85] xfs: clean up icreate quota reservation calls Create a proper helper so that inode creation calls can reserve quota with a dedicated function. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> --- fs/xfs/xfs_inode.c | 6 ++---- fs/xfs/xfs_quota.h | 14 ++++++++++---- fs/xfs/xfs_symlink.c | 3 +-- fs/xfs/xfs_trans_dquot.c | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e2a1db4cee43..4bbd2fb628f7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1037,8 +1037,7 @@ xfs_create( /* * Reserve disk quota and the inode. */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); if (error) goto out_trans_cancel; @@ -1169,8 +1168,7 @@ xfs_create_tmpfile( if (error) goto out_release_inode; - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 31d0de899cc4..919c3a924821 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -86,6 +86,9 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); +int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, + struct xfs_dquot *udqp, struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, int64_t dblocks); extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, @@ -149,6 +152,13 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) return 0; } +static inline int +xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks) +{ + return 0; +} + #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -164,10 +174,6 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) #define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ -#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ - xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ - f | XFS_QMOPT_RES_REGBLKS) - static inline int xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 7f96649e918a..d5dee8f409b2 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -215,8 +215,7 @@ xfs_symlink( /* * Reserve disk quota : blocks and inode. */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 28b8ac701919..22aa875b84f7 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -804,6 +804,24 @@ xfs_trans_reserve_quota_nblks( nblks, ninos, flags); } +/* Change the quota reservations for an inode creation activity. */ +int +xfs_trans_reserve_quota_icreate( + struct xfs_trans *tp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + int64_t dblocks) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, + dblocks, 1, XFS_QMOPT_RES_REGBLKS); +} + /* * This routine is called to allocate a quotaoff log item. */ From 7ac6eb46c9f32d3e6ae37943191cd744ffa1ef33 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Mon, 25 Jan 2021 11:18:00 -0800 Subject: [PATCH 43/85] xfs: fix up build warnings when quotas are disabled Fix some build warnings on gcc 10.2 when quotas are disabled. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_quota.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 919c3a924821..03235c184aab 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -130,7 +130,7 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, @@ -166,8 +166,8 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) -#define xfs_qm_dqrele(d) -#define xfs_qm_statvfs(ip, s) +#define xfs_qm_dqrele(d) do { (d) = (d); } while(0) +#define xfs_qm_statvfs(ip, s) do { } while(0) #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) From 02b7ee4eb613240d2bb3f6a67723f94ceda19eb6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Tue, 26 Jan 2021 17:20:42 -0800 Subject: [PATCH 44/85] xfs: reserve data and rt quota at the same time Modify xfs_trans_reserve_quota_nblks so that we can reserve data and realtime blocks from the dquot at the same time. This change has the theoretical side effect that for allocations to realtime files we will reserve from the dquot both the number of rtblocks being allocated and the number of bmbt blocks that might be needed to add the mapping. However, since the mount code disables quota if it finds a realtime device, this should not result in any behavior changes. Now that we've moved the inode creation callers away from using the _nblks function, we can repurpose the (now unused) ninos argument for realtime blocks, so make that change. This also replaces the flags argument with a boolean parameter to force the reservation since we don't need to distinguish between data and rt quota reservations any more, and the only flag being passed in was FORCE_RES. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_attr.c | 6 +----- fs/xfs/libxfs/xfs_bmap.c | 4 +--- fs/xfs/xfs_bmap_util.c | 24 +++++++++++------------- fs/xfs/xfs_iomap.c | 26 +++++++++++++------------- fs/xfs/xfs_quota.h | 10 +++++----- fs/xfs/xfs_reflink.c | 6 ++---- fs/xfs/xfs_trans_dquot.c | 40 +++++++++++++++++++++++++++------------- 7 files changed, 60 insertions(+), 56 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index be51e7068dcd..e05dc0bc4a8f 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -474,12 +474,8 @@ xfs_attr_set( } if (args->value) { - unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; - - if (rsvd) - quota_flags |= XFS_QMOPT_FORCE_RES; error = xfs_trans_reserve_quota_nblks(args->trans, dp, - args->total, 0, quota_flags); + args->total, 0, rsvd); if (error) goto out_trans_cancel; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 94d582a9587d..6e6734398f0d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1085,9 +1085,7 @@ xfs_bmap_add_attrfork( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd); if (error) goto trans_cancel; if (XFS_IFORK_Q(ip)) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ae2d98af693c..ef8f7055af77 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -727,11 +727,10 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; int nimaps; - int quota_flag; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - uint qblocks, resblks, resrtextents; + uint resblks, resrtextents; int error; trace_xfs_alloc_file_space(ip); @@ -761,6 +760,7 @@ xfs_alloc_file_space( */ while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; + unsigned int dblocks, rblocks; /* * Determine space reservations for data/realtime. @@ -790,20 +790,19 @@ xfs_alloc_file_space( */ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = resblks; + resrtextents = resblks; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resblks; } else { - resrtextents = 0; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); - quota_flag = XFS_QMOPT_RES_REGBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + rblocks = 0; } /* * Allocate and setup the transaction. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, resrtextents, 0, &tp); /* @@ -817,8 +816,8 @@ xfs_alloc_file_space( break; } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, - 0, quota_flag); + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + false); if (error) goto error; @@ -881,8 +880,7 @@ xfs_unmap_extent( } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b046eadd3b21..fba7303d8ed6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -194,25 +194,25 @@ xfs_iomap_write_direct( struct xfs_trans *tp; xfs_filblks_t resaligned; int nimaps; - int quota_flag; - uint qblocks, resblks; + unsigned int dblocks, rblocks; unsigned int resrtextents = 0; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; - uint tflags = 0; + int tflags = 0; + bool force = false; ASSERT(count_fsb > 0); resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, xfs_get_extsz_hint(ip)); if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - resrtextents = qblocks = resaligned; + resrtextents = resaligned; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; } else { - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - quota_flag = XFS_QMOPT_RES_REGBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; } error = xfs_qm_dqattach(ip); @@ -235,18 +235,19 @@ xfs_iomap_write_direct( if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { + force = true; tflags |= XFS_TRANS_RESERVE; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, resrtextents, tflags, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); if (error) goto out_trans_cancel; @@ -559,8 +560,7 @@ xfs_iomap_write_unwritten( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, true); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 03235c184aab..6ddc4b358ede 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -81,8 +81,8 @@ extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); -extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, - struct xfs_inode *, int64_t, long, uint); +int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, + int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); @@ -114,8 +114,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); static inline int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) { - return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, - XFS_QMOPT_RES_REGBLKS); + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } #else static inline int @@ -134,7 +133,8 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, - struct xfs_inode *ip, int64_t nblks, long ninos, uint flags) + struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, + bool force) { return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 15435229bc1f..0778b5810c26 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -398,8 +398,7 @@ xfs_reflink_allocate_cow( goto convert; } - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); if (error) goto out_trans_cancel; @@ -1090,8 +1089,7 @@ xfs_reflink_remap_extent( if (!smap_real && dmap_written) qres += dmap->br_blockcount; if (qres > 0) { - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, false); if (error) goto out_cancel; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 22aa875b84f7..a1a72b7900c5 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -780,28 +780,42 @@ int xfs_trans_reserve_quota_nblks( struct xfs_trans *tp, struct xfs_inode *ip, - int64_t nblks, - long ninos, - uint flags) + int64_t dblocks, + int64_t rblocks, + bool force) { struct xfs_mount *mp = ip->i_mount; + unsigned int qflags = 0; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS || - (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS); - /* - * Reserve nblks against these dquots, with trans as the mediator. - */ - return xfs_trans_reserve_quota_bydquots(tp, mp, - ip->i_udquot, ip->i_gdquot, - ip->i_pdquot, - nblks, ninos, flags); + if (force) + qflags |= XFS_QMOPT_FORCE_RES; + + /* Reserve data device quota against the inode's dquots. */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, dblocks, 0, + XFS_QMOPT_RES_REGBLKS | qflags); + if (error) + return error; + + /* Do the same but for realtime blocks. */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, rblocks, 0, + XFS_QMOPT_RES_RTBLKS | qflags); + if (error) { + xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, -dblocks, 0, + XFS_QMOPT_RES_REGBLKS); + return error; + } + + return 0; } /* Change the quota reservations for an inode creation activity. */ From 3a1af6c317d0a55524f39079183be107be4c1f39 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Tue, 26 Jan 2021 16:33:29 -0800 Subject: [PATCH 45/85] xfs: refactor common transaction/inode/quota allocation idiom Create a new helper xfs_trans_alloc_inode that allocates a transaction, locks and joins an inode to it, and then reserves the appropriate amount of quota against that transction. Then replace all the open-coded idioms with a single call to this helper. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_attr.c | 11 +-------- fs/xfs/libxfs/xfs_bmap.c | 10 ++------- fs/xfs/xfs_bmap_util.c | 14 +++--------- fs/xfs/xfs_iomap.c | 11 ++------- fs/xfs/xfs_trans.c | 48 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 3 +++ 6 files changed, 59 insertions(+), 38 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e05dc0bc4a8f..cb95bc77fe59 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -458,14 +458,10 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &tres, total, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &args->trans); + error = xfs_trans_alloc_inode(dp, &tres, total, rsvd, &args->trans); if (error) return error; - xfs_ilock(dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(args->trans, dp, 0); - if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); @@ -474,11 +470,6 @@ xfs_attr_set( } if (args->value) { - error = xfs_trans_reserve_quota_nblks(args->trans, dp, - args->total, 0, rsvd); - if (error) - goto out_trans_cancel; - error = xfs_has_attr(args); if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) goto out_trans_cancel; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6e6734398f0d..be6661645b59 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1079,19 +1079,13 @@ xfs_bmap_add_attrfork( blks = XFS_ADDAFORK_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, + rsvd, &tp); if (error) return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd); - if (error) - goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ef8f7055af77..c5687ae437dc 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -873,18 +873,10 @@ xfs_unmap_extent( uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + false, &tp); if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(tp, ip, 0); + return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_PUNCH_HOLE_CNT); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fba7303d8ed6..ac91c971342d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -552,18 +552,11 @@ xfs_iomap_write_unwritten( * here as we might be asked to write out the same inode that we * complete here and might deadlock on the iolock. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + true, &tp); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, true); - if (error) - goto error_on_bmapi_transaction; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e72730f85af1..156b9ed8534f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_defer.h" +#include "xfs_inode.h" kmem_zone_t *xfs_trans_zone; @@ -1024,3 +1025,50 @@ xfs_trans_roll( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; return xfs_trans_reserve(*tpp, &tres, 0, 0); } + +/* + * Allocate an transaction, lock and join the inode to it, and reserve quota. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The caller is responsible for + * releasing ILOCK_EXCL if a new transaction is returned. + */ +int +xfs_trans_alloc_inode( + struct xfs_inode *ip, + struct xfs_trans_res *resv, + unsigned int dblocks, + bool force, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + int error; + + error = xfs_trans_alloc(mp, resv, dblocks, 0, + force ? XFS_TRANS_RESERVE : 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, 0, force); + if (error) + goto out_cancel; + + *tpp = tp; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 084658946cc8..aa50be244432 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -268,4 +268,7 @@ xfs_trans_item_relog( return lip->li_ops->iop_relog(lip, tp); } +int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, + unsigned int dblocks, bool force, struct xfs_trans **tpp); + #endif /* __XFS_TRANS_H__ */ From 3de4eb106fcc97f086b78bd17a0c3529691e8259 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Tue, 26 Jan 2021 16:44:07 -0800 Subject: [PATCH 46/85] xfs: allow reservation of rtblocks with xfs_trans_alloc_inode Make it so that we can reserve rt blocks with the xfs_trans_alloc_inode wrapper function, then convert a few more callsites. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/xfs_bmap_util.c | 29 +++++------------------------ fs/xfs/xfs_iomap.c | 22 +++++----------------- fs/xfs/xfs_trans.c | 6 ++++-- fs/xfs/xfs_trans.h | 3 ++- 6 files changed, 18 insertions(+), 46 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index cb95bc77fe59..472b3039eabb 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -458,7 +458,7 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc_inode(dp, &tres, total, rsvd, &args->trans); + error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be6661645b59..e0905ad171f0 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1079,7 +1079,7 @@ xfs_bmap_add_attrfork( blks = XFS_ADDAFORK_SPACE_RES(mp); - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, rsvd, &tp); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c5687ae437dc..e7d68318e6a5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -730,7 +730,6 @@ xfs_alloc_file_space( int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - uint resblks, resrtextents; int error; trace_xfs_alloc_file_space(ip); @@ -760,7 +759,7 @@ xfs_alloc_file_space( */ while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; - unsigned int dblocks, rblocks; + unsigned int dblocks, rblocks, resblks; /* * Determine space reservations for data/realtime. @@ -790,8 +789,6 @@ xfs_alloc_file_space( */ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = resblks; - resrtextents /= mp->m_sb.sb_rextsize; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resblks; } else { @@ -802,32 +799,16 @@ xfs_alloc_file_space( /* * Allocate and setup the transaction. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, - resrtextents, 0, &tp); - - /* - * Check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, - false); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + dblocks, rblocks, false, &tp); if (error) - goto error; + break; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto error; - xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, 0, imapp, &nimaps); @@ -873,7 +854,7 @@ xfs_unmap_extent( uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error; - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, false, &tp); if (error) return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ac91c971342d..fe2bbd9b6fdb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -195,19 +195,15 @@ xfs_iomap_write_direct( xfs_filblks_t resaligned; int nimaps; unsigned int dblocks, rblocks; - unsigned int resrtextents = 0; + bool force = false; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; - int tflags = 0; - bool force = false; ASSERT(count_fsb > 0); resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, xfs_get_extsz_hint(ip)); if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - resrtextents = resaligned; - resrtextents /= mp->m_sb.sb_rextsize; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resaligned; } else { @@ -236,28 +232,20 @@ xfs_iomap_write_direct( bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; - tflags |= XFS_TRANS_RESERVE; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, dblocks, resrtextents, - tflags, &tp); + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, force, &tp); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); - if (error) - goto out_trans_cancel; - error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - /* * From this point onwards we overwrite the imap pointer that the * caller gave to us. @@ -553,7 +541,7 @@ xfs_iomap_write_unwritten( * complete here and might deadlock on the iolock. */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, - true, &tp); + 0, true, &tp); if (error) return error; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 156b9ed8534f..151f274eee43 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1038,6 +1038,7 @@ xfs_trans_alloc_inode( struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, + unsigned int rblocks, bool force, struct xfs_trans **tpp) { @@ -1045,7 +1046,8 @@ xfs_trans_alloc_inode( struct xfs_mount *mp = ip->i_mount; int error; - error = xfs_trans_alloc(mp, resv, dblocks, 0, + error = xfs_trans_alloc(mp, resv, dblocks, + rblocks / mp->m_sb.sb_rextsize, force ? XFS_TRANS_RESERVE : 0, &tp); if (error) return error; @@ -1060,7 +1062,7 @@ xfs_trans_alloc_inode( goto out_cancel; } - error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, 0, force); + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); if (error) goto out_cancel; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index aa50be244432..52bbd7e6a552 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -269,6 +269,7 @@ xfs_trans_item_relog( } int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, - unsigned int dblocks, bool force, struct xfs_trans **tpp); + unsigned int dblocks, unsigned int rblocks, bool force, + struct xfs_trans **tpp); #endif /* __XFS_TRANS_H__ */ From f273387b048543f2b8b2d809cc65fca28e7788a1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Wed, 27 Jan 2021 10:07:27 -0800 Subject: [PATCH 47/85] xfs: refactor reflink functions to use xfs_trans_alloc_inode The two remaining callers of xfs_trans_reserve_quota_nblks are in the reflink code. These conversions aren't as uniform as the previous conversions, so call that out in a separate patch. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_iomap.c | 3 ++- fs/xfs/xfs_reflink.c | 53 ++++++++++++++++++-------------------------- 2 files changed, 23 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fe2bbd9b6fdb..70c341658c01 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -831,7 +831,8 @@ out_found_cow: return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); out_unlock: - xfs_iunlock(ip, lockmode); + if (lockmode) + xfs_iunlock(ip, lockmode); return error; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0778b5810c26..27f875fa7a0d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -376,16 +376,14 @@ xfs_reflink_allocate_cow( resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + *lockmode = 0; + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, + false, &tp); if (error) return error; - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_trans_cancel; + *lockmode = XFS_ILOCK_EXCL; /* * Check for an overlapping extent again now that we dropped the ilock. @@ -398,12 +396,6 @@ xfs_reflink_allocate_cow( goto convert; } - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, false); - if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(tp, ip, 0); - /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, @@ -997,7 +989,7 @@ xfs_reflink_remap_extent( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_off_t newlen; - int64_t qres, qdelta; + int64_t qdelta = 0; unsigned int resblks; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); @@ -1005,15 +997,22 @@ xfs_reflink_remap_extent( int nimaps; int error; - /* Start a rolling transaction to switch the mappings */ + /* + * Start a rolling transaction to switch the mappings. + * + * Adding a written extent to the extent map can cause a bmbt split, + * and removing a mapped extent from the extent can cause a bmbt split. + * The two operations cannot both cause a split since they operate on + * the same index in the bmap btree, so we only need a reservation for + * one bmbt split if either thing is happening. However, we haven't + * locked the inode yet, so we reserve assuming this is the case. + */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, + false, &tp); if (error) goto out; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * Read what's currently mapped in the destination file into smap. * If smap isn't a hole, we will have to remove it before we can add @@ -1061,15 +1060,9 @@ xfs_reflink_remap_extent( } /* - * Compute quota reservation if we think the quota block counter for + * Increase quota reservation if we think the quota block counter for * this file could increase. * - * Adding a written extent to the extent map can cause a bmbt split, - * and removing a mapped extent from the extent can cause a bmbt split. - * The two operations cannot both cause a split since they operate on - * the same index in the bmap btree, so we only need a reservation for - * one bmbt split if either thing is happening. - * * If we are mapping a written extent into the file, we need to have * enough quota block count reservation to handle the blocks in that * extent. We log only the delta to the quota block counts, so if the @@ -1083,13 +1076,9 @@ xfs_reflink_remap_extent( * before we started. That should have removed all the delalloc * reservations, but we code defensively. */ - qres = qdelta = 0; - if (smap_real || dmap_written) - qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - if (!smap_real && dmap_written) - qres += dmap->br_blockcount; - if (qres > 0) { - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, false); + if (!smap_real && dmap_written) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + dmap->br_blockcount, 0, false); if (error) goto out_cancel; } From f2f7b9ff62a28928f6fe2bd55cdb4d4b02ab7477 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Wed, 27 Jan 2021 12:07:57 -0800 Subject: [PATCH 48/85] xfs: refactor inode creation transaction/inode/quota allocation idiom For file creation, create a new helper xfs_trans_alloc_icreate that allocates a transaction and reserves the appropriate amount of quota against that transction. Replace all the open-coded idioms with a single call to this helper so that we can contain the retry loops in the next patchset. This changes the locking behavior for non-tempfile creation slightly, in that we now make the quota reservation without holding the directory ILOCK. While the dquots chosen for inode creation are based on the directory state at a given point in time, the directory ILOCK was released as soon as the dquot references are picked up. Hence it was never necessary to hold the directory ILOCK for the quota reservation. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_inode.c | 28 ++++++++++------------------ fs/xfs/xfs_symlink.c | 14 ++++---------- fs/xfs/xfs_trans.c | 33 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 6 ++++++ 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4bbd2fb628f7..636ac13b1df2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1022,25 +1022,20 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, + resblks, &tp); } if (error) - goto out_release_inode; + goto out_release_dquots; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - /* - * Reserve disk quota and the inode. - */ - error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); - if (error) - goto out_trans_cancel; - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) @@ -1120,7 +1115,7 @@ xfs_create( xfs_finish_inode_setup(ip); xfs_irele(ip); } - + out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -1164,13 +1159,10 @@ xfs_create_tmpfile( resblks = XFS_IALLOC_SPACE_RES(mp); tres = &M_RES(mp)->tr_create_tmpfile; - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); if (error) - goto out_release_inode; - - error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); - if (error) - goto out_trans_cancel; + goto out_release_dquots; error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) @@ -1213,7 +1205,7 @@ xfs_create_tmpfile( xfs_finish_inode_setup(ip); xfs_irele(ip); } - + out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index d5dee8f409b2..8565663b16cd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -197,9 +197,10 @@ xfs_symlink( fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp, + pdqp, resblks, &tp); if (error) - goto out_release_inode; + goto out_release_dquots; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -212,13 +213,6 @@ xfs_symlink( goto out_trans_cancel; } - /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, resblks); - if (error) - goto out_trans_cancel; - error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) @@ -347,7 +341,7 @@ out_release_inode: xfs_finish_inode_setup(ip); xfs_irele(ip); } - +out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 151f274eee43..6c68635cc6ac 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -21,6 +21,8 @@ #include "xfs_error.h" #include "xfs_defer.h" #include "xfs_inode.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" kmem_zone_t *xfs_trans_zone; @@ -1074,3 +1076,34 @@ out_cancel: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +/* + * Allocate an transaction in preparation for inode creation by reserving quota + * against the given dquots. Callers are not required to hold any inode locks. + */ +int +xfs_trans_alloc_icreate( + struct xfs_mount *mp, + struct xfs_trans_res *resv, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + unsigned int dblocks, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); + if (error) + return error; + + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 52bbd7e6a552..04c132c55e9b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -268,8 +268,14 @@ xfs_trans_item_relog( return lip->li_ops->iop_relog(lip, tp); } +struct xfs_dquot; + int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, + struct xfs_dquot *udqp, struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, unsigned int dblocks, + struct xfs_trans **tpp); #endif /* __XFS_TRANS_H__ */ From 7317a03df703f7cdae3ae9e9635a0ef45849fe09 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 29 Jan 2021 11:32:09 -0800 Subject: [PATCH 49/85] xfs: refactor inode ownership change transaction/inode/quota allocation idiom For file ownership (uid, gid, prid) changes, create a new helper xfs_trans_alloc_ichange that allocates a transaction and reserves the appropriate amount of quota against that transction in preparation for a change of user, group, or project id. Replace all the open-coded idioms with a single call to this helper so that we can contain the retry loops in the next patchset. This changes the locking behavior for ichange transactions slightly. Since tr_ichange does not have a permanent reservation and cannot roll, we pass XFS_ILOCK_EXCL to ijoin so that the inode will be unlocked automatically at commit time. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_ioctl.c | 29 ++++++++-------------- fs/xfs/xfs_iops.c | 26 ++----------------- fs/xfs/xfs_trans.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 3 +++ 4 files changed, 77 insertions(+), 43 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3fbd98f61ea5..78ee201eb7cb 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1275,24 +1275,23 @@ xfs_ioctl_setattr_prepare_dax( */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_dquot *pdqp) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS; if (mp->m_flags & XFS_MOUNT_RDONLY) - goto out_unlock; + goto out_error; error = -EIO; if (XFS_FORCED_SHUTDOWN(mp)) - goto out_unlock; + goto out_error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, + capable(CAP_FOWNER), &tp); if (error) - goto out_unlock; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + goto out_error; /* * CAP_FOWNER overrides the following restrictions: @@ -1312,7 +1311,7 @@ xfs_ioctl_setattr_get_trans( out_cancel: xfs_trans_cancel(tp); -out_unlock: +out_error: return ERR_PTR(error); } @@ -1462,20 +1461,12 @@ xfs_ioctl_setattr( xfs_ioctl_setattr_prepare_dax(ip, fa); - tp = xfs_ioctl_setattr_get_trans(ip); + tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { code = PTR_ERR(tp); goto error_free_dquots; } - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - ip->i_d.di_projid != fa->fsx_projid) { - code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp, - capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); - if (code) /* out of quota */ - goto error_trans_cancel; - } - xfs_fill_fsxattr(ip, false, &old_fa); code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); if (code) @@ -1608,7 +1599,7 @@ xfs_ioc_setxflags( xfs_ioctl_setattr_prepare_dax(ip, &fa); - tp = xfs_ioctl_setattr_get_trans(ip); + tp = xfs_ioctl_setattr_get_trans(ip, NULL); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f1e21b6cfa48..00369502fe25 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -700,13 +700,11 @@ xfs_setattr_nonsize( return error; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, + capable(CAP_FOWNER), &tp); if (error) goto out_dqrele; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -722,21 +720,6 @@ xfs_setattr_nonsize( gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - /* - * Do a quota reservation only if uid/gid is actually - * going to change. - */ - if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || - (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { - ASSERT(tp); - error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - NULL, capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (error) /* out of quota */ - goto out_cancel; - } - /* * CAP_FSETID overrides the following restrictions: * @@ -786,8 +769,6 @@ xfs_setattr_nonsize( xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* * Release any dquot(s) the inode had kept before chown. */ @@ -814,9 +795,6 @@ xfs_setattr_nonsize( return 0; -out_cancel: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 6c68635cc6ac..60672b5545c9 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1107,3 +1107,65 @@ xfs_trans_alloc_icreate( *tpp = tp; return 0; } + +/* + * Allocate an transaction, lock and join the inode to it, and reserve quota + * in preparation for inode attribute changes that include uid, gid, or prid + * changes. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCK will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_ichange( + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + bool force, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + /* + * For each quota type, skip quota reservations if the inode's dquots + * now match the ones that came from the caller, or the caller didn't + * pass one in. + */ + if (udqp == ip->i_udquot) + udqp = NULL; + if (gdqp == ip->i_gdquot) + gdqp = NULL; + if (pdqp == ip->i_pdquot) + pdqp = NULL; + if (udqp || gdqp || pdqp) { + error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, pdqp, + force ? XFS_QMOPT_FORCE_RES : 0); + if (error) + goto out_cancel; + } + + *tpp = tp; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 04c132c55e9b..8b03fbfe9a1b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -277,5 +277,8 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, struct xfs_trans **tpp); +int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, + struct xfs_trans **tpp); #endif /* __XFS_TRANS_H__ */ From 5c615f0feb9a559abd08da0842d6fcfee105b7e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Mon, 1 Feb 2021 10:38:51 -0800 Subject: [PATCH 50/85] xfs: remove xfs_qm_vop_chown_reserve Now that the only caller of this function is xfs_trans_alloc_ichange, just open-code the meat of _chown_reserve in that caller. Drop the (redundant) [ugp]id checks because xfs has a 1:1 relationship between quota ids and incore dquots. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_qm.c | 48 ---------------------------------------------- fs/xfs/xfs_quota.h | 4 ---- fs/xfs/xfs_trans.c | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c2e4d3a27469..742d1413e2d0 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1817,54 +1817,6 @@ xfs_qm_vop_chown( return prevdq; } -/* - * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). - */ -int -xfs_qm_vop_chown_reserve( - struct xfs_trans *tp, - struct xfs_inode *ip, - struct xfs_dquot *udqp, - struct xfs_dquot *gdqp, - struct xfs_dquot *pdqp, - uint flags) -{ - struct xfs_mount *mp = ip->i_mount; - unsigned int blkflags; - struct xfs_dquot *udq_delblks = NULL; - struct xfs_dquot *gdq_delblks = NULL; - struct xfs_dquot *pdq_delblks = NULL; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - blkflags = XFS_IS_REALTIME_INODE(ip) ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; - - if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != udqp->q_id) - udq_delblks = udqp; - - if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != gdqp->q_id) - gdq_delblks = gdqp; - - if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != pdqp->q_id) - pdq_delblks = pdqp; - - /* - * Reserve enough quota to handle blocks on disk and reserved for a - * delayed allocation. We'll actually transfer the delalloc - * reservation between dquots at chown time, even though that part is - * only semi-transactional. - */ - return xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, - gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks + ip->i_delayed_blks, - 1, blkflags | flags); -} - int xfs_qm_vop_rename_dqattach( struct xfs_inode **i_tab) diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 6ddc4b358ede..d00d01302545 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -98,9 +98,6 @@ extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); -extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *, - struct xfs_dquot *, uint); extern int xfs_qm_dqattach(struct xfs_inode *); extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); @@ -162,7 +159,6 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) -#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) #define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 60672b5545c9..29dca1bc4c1a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1156,8 +1156,20 @@ xfs_trans_alloc_ichange( if (pdqp == ip->i_pdquot) pdqp = NULL; if (udqp || gdqp || pdqp) { - error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, pdqp, - force ? XFS_QMOPT_FORCE_RES : 0); + unsigned int qflags = XFS_QMOPT_RES_REGBLKS; + + if (force) + qflags |= XFS_QMOPT_FORCE_RES; + + /* + * Reserve enough quota to handle blocks on disk and reserved + * for a delayed allocation. We'll actually transfer the + * delalloc reservation between dquots at chown time, even + * though that part is only semi-transactional. + */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, + pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks, + 1, qflags); if (error) goto out_cancel; } From fea7aae6cecfed1b6a520cc527d297df8801b999 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 29 Jan 2021 17:14:33 -0800 Subject: [PATCH 51/85] xfs: rename code to error in xfs_ioctl_setattr Rename the 'code' variable to 'error' to follow the naming convention of most other functions in xfs. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_ioctl.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 78ee201eb7cb..38ee66d999d8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1435,13 +1435,13 @@ xfs_ioctl_setattr( struct xfs_trans *tp; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; - int code; + int error; trace_xfs_ioctl_setattr(ip); - code = xfs_ioctl_setattr_check_projid(ip, fa); - if (code) - return code; + error = xfs_ioctl_setattr_check_projid(ip, fa); + if (error) + return error; /* * If disk quotas is on, we make sure that the dquots do exist on disk, @@ -1452,36 +1452,36 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); - if (code) - return code; + if (error) + return error; } xfs_ioctl_setattr_prepare_dax(ip, fa); tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { - code = PTR_ERR(tp); + error = PTR_ERR(tp); goto error_free_dquots; } xfs_fill_fsxattr(ip, false, &old_fa); - code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); - if (code) + error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); + if (error) goto error_trans_cancel; - code = xfs_ioctl_setattr_check_extsize(ip, fa); - if (code) + error = xfs_ioctl_setattr_check_extsize(ip, fa); + if (error) goto error_trans_cancel; - code = xfs_ioctl_setattr_check_cowextsize(ip, fa); - if (code) + error = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (error) goto error_trans_cancel; - code = xfs_ioctl_setattr_xflags(tp, ip, fa); - if (code) + error = xfs_ioctl_setattr_xflags(tp, ip, fa); + if (error) goto error_trans_cancel; /* @@ -1521,7 +1521,7 @@ xfs_ioctl_setattr( else ip->i_d.di_cowextsize = 0; - code = xfs_trans_commit(tp); + error = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. @@ -1529,13 +1529,13 @@ xfs_ioctl_setattr( xfs_qm_dqrele(olddquot); xfs_qm_dqrele(pdqp); - return code; + return error; error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(pdqp); - return code; + return error; } STATIC int From 2a4bdfa8558ca2904dc17b83497dc82aa7fc05e9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 29 Jan 2021 15:44:32 -0800 Subject: [PATCH 52/85] xfs: shut down the filesystem if we screw up quota reservation If we ever screw up the quota reservations enough to trip the assertions, something's wrong with the quota code. Shut down the filesystem when this happens, because this is corruption. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_trans_dquot.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index a1a72b7900c5..48e09ea30ee5 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -16,6 +16,7 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_error.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -691,9 +692,11 @@ xfs_trans_dqresv( nblks); xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); - ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); - ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); + + if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) || + XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) || + XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) + goto error_corrupt; xfs_dqunlock(dqp); return 0; @@ -703,6 +706,10 @@ error_return: if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; +error_corrupt: + xfs_dqunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; } From a636b1d1cf73804e385990c975e33cf06c032b64 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:34 -0800 Subject: [PATCH 53/85] xfs: trigger all block gc scans when low on quota space The functions to run an eof/cowblocks scan to try to reduce quota usage are kind of a mess -- the logic repeatedly initializes an eofb structure and there are logic bugs in the code that result in the cowblocks scan never actually happening. Replace all three functions with a single function that fills out an eofb and runs both eof and cowblocks scans. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_file.c | 15 ++++++--------- fs/xfs/xfs_icache.c | 46 ++++++++++++++++----------------------------- fs/xfs/xfs_icache.h | 4 ++-- 3 files changed, 24 insertions(+), 41 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 48f6f89a38a9..b55f1eb8f1c8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -672,7 +672,7 @@ xfs_file_buffered_aio_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; + bool cleared_space = false; int iolock; if (iocb->ki_flags & IOCB_NOWAIT) @@ -704,19 +704,16 @@ write_retry: * also behaves as a filter to prevent too many eofblocks scans from * running at the same time. */ - if (ret == -EDQUOT && !enospc) { + if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - enospc = xfs_inode_free_quota_eofblocks(ip); - if (enospc) - goto write_retry; - enospc = xfs_inode_free_quota_cowblocks(ip); - if (enospc) + cleared_space = xfs_inode_free_quota_blocks(ip); + if (cleared_space) goto write_retry; iolock = 0; - } else if (ret == -ENOSPC && !enospc) { + } else if (ret == -ENOSPC && !cleared_space) { struct xfs_eofblocks eofb = {0}; - enospc = 1; + cleared_space = true; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index deb99300d171..c71eb15e3835 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1397,33 +1397,31 @@ xfs_icache_free_eofblocks( } /* - * Run eofblocks scans on the quotas applicable to the inode. For inodes with - * multiple quotas, we don't know exactly which quota caused an allocation + * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes + * with multiple quotas, we don't know exactly which quota caused an allocation * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. */ -static int -__xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip, - int (*execute)(struct xfs_mount *mp, - struct xfs_eofblocks *eofb)) +bool +xfs_inode_free_quota_blocks( + struct xfs_inode *ip) { - int scan = 0; - struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + bool do_work = false; /* * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_uid = VFS_I(ip)->i_uid; eofb.eof_flags |= XFS_EOF_FLAGS_UID; - scan = 1; + do_work = true; } } @@ -1432,21 +1430,16 @@ __xfs_inode_free_quota_eofblocks( if (dq && xfs_dquot_lowsp(dq)) { eofb.eof_gid = VFS_I(ip)->i_gid; eofb.eof_flags |= XFS_EOF_FLAGS_GID; - scan = 1; + do_work = true; } } - if (scan) - execute(ip->i_mount, &eofb); + if (!do_work) + return false; - return scan; -} - -int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) -{ - return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); + return true; } static inline unsigned long @@ -1646,13 +1639,6 @@ xfs_icache_free_cowblocks( XFS_ICI_COWBLOCKS_TAG); } -int -xfs_inode_free_quota_cowblocks( - struct xfs_inode *ip) -{ - return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); -} - void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3a4c8b382cd0..3f7ddbca8638 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,17 +54,17 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +bool xfs_inode_free_quota_blocks(struct xfs_inode *ip); + void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); -int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *); void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); -int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *); From f41a0716f4b08678a73173d71ff3f409b996df2d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:35 -0800 Subject: [PATCH 54/85] xfs: don't stall cowblocks scan if we can't take locks Don't stall the cowblocks scan on a locked inode if we possibly can. We'd much rather the background scanner keep moving. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_icache.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c71eb15e3835..89f9e692fde7 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1605,17 +1605,31 @@ xfs_inode_free_cowblocks( void *args) { struct xfs_eofblocks *eofb = args; + bool wait; int ret = 0; + wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + if (!xfs_prep_free_cowblocks(ip)) return 0; if (!xfs_inode_matches_eofb(ip, eofb)) return 0; - /* Free the CoW blocks */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (wait) + return -EAGAIN; + return 0; + } + if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { + if (wait) + ret = -EAGAIN; + goto out_iolock; + } /* * Check again, nobody else should be able to dirty blocks or change @@ -1625,6 +1639,7 @@ xfs_inode_free_cowblocks( ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); +out_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; From 9a537de3b009d95cfb048b7cbfe9bdb0f655596e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:35 -0800 Subject: [PATCH 55/85] xfs: xfs_inode_free_quota_blocks should scan project quota Buffered writers who have run out of quota reservation call xfs_inode_free_quota_blocks to try to free any space reservations that might reduce the quota usage. Unfortunately, the buffered write path treats "out of project quota" the same as "out of overall space" so this function has never supported scanning for space that might ease an "out of project quota" condition. We're about to start using this function for cases where we actually /can/ tell if we're out of project quota, so add in this functionality. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_icache.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 89f9e692fde7..10c1a0dee17d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1434,6 +1434,15 @@ xfs_inode_free_quota_blocks( } } + if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_prid = ip->i_d.di_projid; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; + } + } + if (!do_work) return false; From 3d4feec00673d34fbbfe0277d2e0ed1f51d20cb2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:36 -0800 Subject: [PATCH 56/85] xfs: move and rename xfs_inode_free_quota_blocks to avoid conflicts Move this function further down in the file so that later cleanups won't have to declare static functions. Change the name because we're about to rework all the code that performs garbage collection of speculatively allocated file blocks. No functional changes. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_icache.c | 110 ++++++++++++++++++++++---------------------- fs/xfs/xfs_icache.h | 2 +- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b55f1eb8f1c8..eade63d53be5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -706,7 +706,7 @@ write_retry: */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - cleared_space = xfs_inode_free_quota_blocks(ip); + cleared_space = xfs_blockgc_free_quota(ip); if (cleared_space) goto write_retry; iolock = 0; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 10c1a0dee17d..aba901d5637b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1396,61 +1396,6 @@ xfs_icache_free_eofblocks( XFS_ICI_EOFBLOCKS_TAG); } -/* - * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes - * with multiple quotas, we don't know exactly which quota caused an allocation - * failure. We make a best effort by including each quota under low free space - * conditions (less than 1% free space) in the scan. - */ -bool -xfs_inode_free_quota_blocks( - struct xfs_inode *ip) -{ - struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; - bool do_work = false; - - /* - * Run a sync scan to increase effectiveness and use the union filter to - * cover all applicable quotas in a single scan. - */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; - - if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_uid = VFS_I(ip)->i_uid; - eofb.eof_flags |= XFS_EOF_FLAGS_UID; - do_work = true; - } - } - - if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_gid = VFS_I(ip)->i_gid; - eofb.eof_flags |= XFS_EOF_FLAGS_GID; - do_work = true; - } - } - - if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_prid = ip->i_d.di_projid; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; - do_work = true; - } - } - - if (!do_work) - return false; - - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); - return true; -} - static inline unsigned long xfs_iflag_for_tag( int tag) @@ -1699,3 +1644,58 @@ xfs_start_block_reaping( xfs_queue_eofblocks(mp); xfs_queue_cowblocks(mp); } + +/* + * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes + * with multiple quotas, we don't know exactly which quota caused an allocation + * failure. We make a best effort by including each quota under low free space + * conditions (less than 1% free space) in the scan. + */ +bool +xfs_blockgc_free_quota( + struct xfs_inode *ip) +{ + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + bool do_work = false; + + /* + * Run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. + */ + eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_uid = VFS_I(ip)->i_uid; + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + do_work = true; + } + } + + if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_gid = VFS_I(ip)->i_gid; + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + do_work = true; + } + } + + if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_prid = ip->i_d.di_projid; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; + } + } + + if (!do_work) + return false; + + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + xfs_icache_free_cowblocks(ip->i_mount, &eofb); + return true; +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3f7ddbca8638..21b726a05b0d 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,7 +54,7 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -bool xfs_inode_free_quota_blocks(struct xfs_inode *ip); +bool xfs_blockgc_free_quota(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); From 111068f80eac00173816c2e822c52c316b650df3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:36 -0800 Subject: [PATCH 57/85] xfs: pass flags and return gc errors from xfs_blockgc_free_quota Change the signature of xfs_blockgc_free_quota in preparation for the next few patches. Callers can now pass EOF_FLAGS into the function to control scan parameters; and the function will now pass back any corruption errors seen while scanning, though for our retry loops we'll just try again unconditionally. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_file.c | 10 +++++----- fs/xfs/xfs_icache.c | 26 +++++++++++++++++--------- fs/xfs/xfs_icache.h | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index eade63d53be5..8546dbf6c5ac 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -702,14 +702,14 @@ write_retry: * metadata space. This reduces the chances that the eofblocks scan * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this * also behaves as a filter to prevent too many eofblocks scans from - * running at the same time. + * running at the same time. Use a synchronous scan to increase the + * effectiveness of the scan. */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - cleared_space = xfs_blockgc_free_quota(ip); - if (cleared_space) - goto write_retry; - iolock = 0; + xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC); + cleared_space = true; + goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { struct xfs_eofblocks eofb = {0}; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index aba901d5637b..4a074aa12b52 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1650,20 +1650,26 @@ xfs_start_block_reaping( * with multiple quotas, we don't know exactly which quota caused an allocation * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. + * + * Callers must not hold any inode's ILOCK. If requesting a synchronous scan + * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or + * MMAPLOCK. */ -bool +int xfs_blockgc_free_quota( - struct xfs_inode *ip) + struct xfs_inode *ip, + unsigned int eof_flags) { struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; bool do_work = false; + int error; /* - * Run a sync scan to increase effectiveness and use the union filter to - * cover all applicable quotas in a single scan. + * Run a scan to free blocks using the union filter to cover all + * applicable quotas in a single scan. */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_SYNC; + eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); @@ -1693,9 +1699,11 @@ xfs_blockgc_free_quota( } if (!do_work) - return false; + return 0; - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); - return true; + error = xfs_icache_free_eofblocks(ip->i_mount, &eofb); + if (error) + return error; + + return xfs_icache_free_cowblocks(ip->i_mount, &eofb); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 21b726a05b0d..d64ea8f5c589 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,7 +54,7 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -bool xfs_blockgc_free_quota(struct xfs_inode *ip); +int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); From 4ca74205685ee3a72ab7fe475f51cc26dea36509 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Wed, 27 Jan 2021 10:40:00 -0800 Subject: [PATCH 58/85] xfs: try worst case space reservation upfront in xfs_reflink_remap_extent Now that we've converted xfs_reflink_remap_extent to use the new xfs_trans_alloc_inode API, we can focus on its slightly unusual behavior with regard to quota reservations. Since it's valid to remap written blocks into a hole, we must be able to increase the quota count by the number of blocks in the mapping. However, the incore space reservation process requires us to supply an asymptotic guess before we can gain exclusive access to resources. We'd like to reserve all the quota we need up front, but we also don't want to fail a written -> allocated remap operation unnecessarily. The solution is to make the remap_extents function call the transaction allocation function twice. The first time we ask to reserve enough space and quota to handle the absolute worst case situation, but if that fails, we can fall back to the old strategy: ask for the bare minimum space reservation upfront and increase the quota reservation later if we need to. Later in this patchset we change the transaction and quota code to try to reclaim space if we cannot reserve free space or quota. Restructuring the remap_extent function in this manner means that if the fallback increase fails, we can pass that back to the caller knowing that the transaction allocation already tried freeing space. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_reflink.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 27f875fa7a0d..086866f6e71f 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -991,6 +991,7 @@ xfs_reflink_remap_extent( xfs_off_t newlen; int64_t qdelta = 0; unsigned int resblks; + bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); int iext_delta = 0; @@ -1006,10 +1007,26 @@ xfs_reflink_remap_extent( * the same index in the bmap btree, so we only need a reservation for * one bmbt split if either thing is happening. However, we haven't * locked the inode yet, so we reserve assuming this is the case. + * + * The first allocation call tries to reserve enough space to handle + * mapping dmap into a sparse part of the file plus the bmbt split. We + * haven't locked the inode or read the existing mapping yet, so we do + * not know for sure that we need the space. This should succeed most + * of the time. + * + * If the first attempt fails, try again but reserving only enough + * space to handle a bmbt split. This is the hard minimum requirement, + * and we revisit quota reservations later when we know more about what + * we're remapping. */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, - false, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + resblks + dmap->br_blockcount, 0, false, &tp); + if (error == -EDQUOT || error == -ENOSPC) { + quota_reserved = false; + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + resblks, 0, false, &tp); + } if (error) goto out; @@ -1076,7 +1093,7 @@ xfs_reflink_remap_extent( * before we started. That should have removed all the delalloc * reservations, but we code defensively. */ - if (!smap_real && dmap_written) { + if (!quota_reserved && !smap_real && dmap_written) { error = xfs_trans_reserve_quota_nblks(tp, ip, dmap->br_blockcount, 0, false); if (error) From 766aabd59929cd05fc1a249f376e4395bed93d30 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:37 -0800 Subject: [PATCH 59/85] xfs: flush eof/cowblocks if we can't reserve quota for file blocks If a fs modification (data write, reflink, xattr set, fallocate, etc.) is unable to reserve enough quota to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_reflink.c | 5 +++++ fs/xfs/xfs_trans.c | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 086866f6e71f..725c7d8e4438 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1092,6 +1092,11 @@ xfs_reflink_remap_extent( * count. This is suboptimal, but the VFS flushed the dest range * before we started. That should have removed all the delalloc * reservations, but we code defensively. + * + * xfs_trans_alloc_inode above already tried to grab an even larger + * quota reservation, and kicked off a blockgc scan if it couldn't. + * If we can't get a potentially smaller quota reservation now, we're + * done. */ if (!quota_reserved && !smap_real && dmap_written) { error = xfs_trans_reserve_quota_nblks(tp, ip, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 29dca1bc4c1a..4071bbed2d48 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -23,6 +23,7 @@ #include "xfs_inode.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_icache.h" kmem_zone_t *xfs_trans_zone; @@ -1046,8 +1047,10 @@ xfs_trans_alloc_inode( { struct xfs_trans *tp; struct xfs_mount *mp = ip->i_mount; + bool retried = false; int error; +retry: error = xfs_trans_alloc(mp, resv, dblocks, rblocks / mp->m_sb.sb_rextsize, force ? XFS_TRANS_RESERVE : 0, &tp); @@ -1065,6 +1068,13 @@ xfs_trans_alloc_inode( } error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_blockgc_free_quota(ip, 0); + retried = true; + goto retry; + } if (error) goto out_cancel; From c237dd7c709432611a7642ca10c2a0c8c48ea313 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:37 -0800 Subject: [PATCH 60/85] xfs: flush eof/cowblocks if we can't reserve quota for inode creation If an inode creation is unable to reserve enough quota to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_icache.c | 68 +++++++++++++++++++++++++-------------------- fs/xfs/xfs_icache.h | 3 ++ fs/xfs/xfs_trans.c | 8 ++++++ 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4a074aa12b52..df9533d6bc16 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1646,64 +1646,72 @@ xfs_start_block_reaping( } /* - * Run cow/eofblocks scans on the quotas applicable to the inode. For inodes - * with multiple quotas, we don't know exactly which quota caused an allocation - * failure. We make a best effort by including each quota under low free space - * conditions (less than 1% free space) in the scan. + * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which + * quota caused an allocation failure, so we make a best effort by including + * each quota under low free space conditions (less than 1% free space) in the + * scan. * * Callers must not hold any inode's ILOCK. If requesting a synchronous scan * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or * MMAPLOCK. */ int -xfs_blockgc_free_quota( - struct xfs_inode *ip, +xfs_blockgc_free_dquots( + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, unsigned int eof_flags) { struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; bool do_work = false; int error; + if (!udqp && !gdqp && !pdqp) + return 0; + /* * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. */ eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; - if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_uid = VFS_I(ip)->i_uid; - eofb.eof_flags |= XFS_EOF_FLAGS_UID; - do_work = true; - } + if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { + eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + do_work = true; } - if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_gid = VFS_I(ip)->i_gid; - eofb.eof_flags |= XFS_EOF_FLAGS_GID; - do_work = true; - } + if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { + eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + do_work = true; } - if (XFS_IS_PQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_PROJ); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_prid = ip->i_d.di_projid; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; - do_work = true; - } + if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { + eofb.eof_prid = pdqp->q_id; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; } if (!do_work) return 0; - error = xfs_icache_free_eofblocks(ip->i_mount, &eofb); + error = xfs_icache_free_eofblocks(mp, &eofb); if (error) return error; - return xfs_icache_free_cowblocks(ip->i_mount, &eofb); + return xfs_icache_free_cowblocks(mp, &eofb); +} + +/* Run cow/eofblocks scans on the quotas attached to the inode. */ +int +xfs_blockgc_free_quota( + struct xfs_inode *ip, + unsigned int eof_flags) +{ + return xfs_blockgc_free_dquots(ip->i_mount, + xfs_inode_dquot(ip, XFS_DQTYPE_USER), + xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), + xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d64ea8f5c589..5f7d7c192d1e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -54,6 +54,9 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + unsigned int eof_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 4071bbed2d48..b29434199079 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1102,13 +1102,21 @@ xfs_trans_alloc_icreate( struct xfs_trans **tpp) { struct xfs_trans *tp; + bool retried = false; int error; +retry: error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); if (error) return error; error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } if (error) { xfs_trans_cancel(tp); return error; From 758303d1449965819661048e9e31f32d61888f70 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:38 -0800 Subject: [PATCH 61/85] xfs: flush eof/cowblocks if we can't reserve quota for chown If a file user, group, or project change is unable to reserve enough quota to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_trans.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index b29434199079..a23400f59e7d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1138,16 +1138,21 @@ retry: int xfs_trans_alloc_ichange( struct xfs_inode *ip, - struct xfs_dquot *udqp, - struct xfs_dquot *gdqp, - struct xfs_dquot *pdqp, + struct xfs_dquot *new_udqp, + struct xfs_dquot *new_gdqp, + struct xfs_dquot *new_pdqp, bool force, struct xfs_trans **tpp) { struct xfs_trans *tp; struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; + bool retried = false; int error; +retry: error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) return error; @@ -1165,14 +1170,12 @@ xfs_trans_alloc_ichange( /* * For each quota type, skip quota reservations if the inode's dquots * now match the ones that came from the caller, or the caller didn't - * pass one in. + * pass one in. The inode's dquots can change if we drop the ILOCK to + * perform a blockgc scan, so we must preserve the caller's arguments. */ - if (udqp == ip->i_udquot) - udqp = NULL; - if (gdqp == ip->i_gdquot) - gdqp = NULL; - if (pdqp == ip->i_pdquot) - pdqp = NULL; + udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL; + gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL; + pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL; if (udqp || gdqp || pdqp) { unsigned int qflags = XFS_QMOPT_RES_REGBLKS; @@ -1188,6 +1191,12 @@ xfs_trans_alloc_ichange( error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks, 1, qflags); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } if (error) goto out_cancel; } From 38899f8099945559662e6a6e355b9059088e3b34 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:38 -0800 Subject: [PATCH 62/85] xfs: add a tracepoint for blockgc scans Add some tracepoints so that we can observe when the speculative preallocation garbage collector runs. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_ioctl.c | 2 ++ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 38ee66d999d8..3ddefb685ac6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2339,6 +2339,8 @@ xfs_file_ioctl( if (error) return error; + trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); + sb_start_write(mp->m_super); error = xfs_icache_free_eofblocks(mp, &keofb); sb_end_write(mp->m_super); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 120398a37c2a..9b8d703dc9fd 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -29,6 +29,7 @@ #include "xfs_filestream.h" #include "xfs_fsmap.h" #include "xfs_btree_staging.h" +#include "xfs_icache.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e2e0092c331d..965873026116 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,6 +37,7 @@ struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; +struct xfs_eofblocks; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -3888,6 +3889,46 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \ DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); +DECLARE_EVENT_CLASS(xfs_eofblocks_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, + unsigned long caller_ip), + TP_ARGS(mp, eofb, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__u32, flags) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(prid_t, prid) + __field(__u64, min_file_size) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = eofb ? eofb->eof_flags : 0; + __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns, + eofb->eof_uid) : 0; + __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns, + eofb->eof_gid) : 0; + __entry->prid = eofb ? eofb->eof_prid : 0; + __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags, + __entry->uid, + __entry->gid, + __entry->prid, + __entry->min_file_size, + (char *)__entry->caller_ip) +); +#define DEFINE_EOFBLOCKS_EVENT(name) \ +DEFINE_EVENT(xfs_eofblocks_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \ + unsigned long caller_ip), \ + TP_ARGS(mp, eofb, caller_ip)) +DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH From 85c5b27075ba0389855d9f46ff1b1d5c34a44c94 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:39 -0800 Subject: [PATCH 63/85] xfs: refactor xfs_icache_free_{eof,cow}blocks call sites In anticipation of more restructuring of the eof/cowblocks gc code, refactor calling of those two functions into a single internal helper function, then present a new standard interface to purge speculative block preallocations and start shifting higher level code to use that. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_file.c | 3 +-- fs/xfs/xfs_icache.c | 39 +++++++++++++++++++++++++++++++++------ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8546dbf6c5ac..38528e59030e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -718,8 +718,7 @@ write_retry: xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); + xfs_blockgc_free_space(ip->i_mount, &eofb); goto write_retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index df9533d6bc16..0d81330a0fd3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1645,6 +1645,38 @@ xfs_start_block_reaping( xfs_queue_cowblocks(mp); } +/* Scan all incore inodes for block preallocations that we can remove. */ +static inline int +xfs_blockgc_scan( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int error; + + error = xfs_icache_free_eofblocks(mp, eofb); + if (error) + return error; + + error = xfs_icache_free_cowblocks(mp, eofb); + if (error) + return error; + + return 0; +} + +/* + * Try to free space in the filesystem by purging eofblocks and cowblocks. + */ +int +xfs_blockgc_free_space( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); + + return xfs_blockgc_scan(mp, eofb); +} + /* * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which * quota caused an allocation failure, so we make a best effort by including @@ -1665,7 +1697,6 @@ xfs_blockgc_free_dquots( { struct xfs_eofblocks eofb = {0}; bool do_work = false; - int error; if (!udqp && !gdqp && !pdqp) return 0; @@ -1697,11 +1728,7 @@ xfs_blockgc_free_dquots( if (!do_work) return 0; - error = xfs_icache_free_eofblocks(mp, &eofb); - if (error) - return error; - - return xfs_icache_free_cowblocks(mp, &eofb); + return xfs_blockgc_free_space(mp, &eofb); } /* Run cow/eofblocks scans on the quotas attached to the inode. */ diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 5f7d7c192d1e..f7dc8d1c91e5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -58,6 +58,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int eof_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); +int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 965873026116..404a00ea9d9e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3928,6 +3928,7 @@ DEFINE_EVENT(xfs_eofblocks_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, eofb, caller_ip)) DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); +DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space); #endif /* _TRACE_XFS_H */ From a1a7d05a05765eec042942a5c360e909c0dd0131 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:39 -0800 Subject: [PATCH 64/85] xfs: flush speculative space allocations when we run out of space If a fs modification (creation, file write, reflink, etc.) is unable to reserve enough space to handle the modification, try clearing whatever space the filesystem might have been hanging onto in the hopes of speeding up the filesystem. The flushing behavior will become particularly important when we add deferred inode inactivation because that will increase the amount of space that isn't actively tied to user data. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_trans.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a23400f59e7d..44f72c09c203 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -289,6 +289,17 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK; error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error == -ENOSPC) { + /* + * We weren't able to reserve enough space for the transaction. + * Flush the other speculative space allocations to free space. + * Do not perform a synchronous scan because callers can hold + * other locks. + */ + error = xfs_blockgc_free_space(mp, NULL); + if (!error) + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + } if (error) { xfs_trans_cancel(tp); return error; From f83d436aef5def77b318effc14809fdc57092588 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:41 -0800 Subject: [PATCH 65/85] xfs: increase the default parallelism levels of pwork clients Increase the parallelism level for pwork clients to the workqueue defaults so that we can take advantage of computers with a lot of CPUs and a lot of hardware. On fast systems this will speed up quotacheck by a large factor, and the following posteof/cowblocks cleanup series will use the functionality presented in this patch to run garbage collection as quickly as possible. We do this by switching the pwork workqueue to unbounded, since the current user (quotacheck) runs lengthy scans for each work item and we don't care about dispatching the work on a warm cpu cache or anything like that. Also set WQ_SYSFS so that we can monitor where the wq is running. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> --- Documentation/admin-guide/xfs.rst | 38 +++++++++++++++++++++++++++++++ fs/xfs/xfs_iwalk.c | 5 +--- fs/xfs/xfs_pwork.c | 25 ++++---------------- fs/xfs/xfs_pwork.h | 4 +--- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 86de8a1ad91c..b00b1eece9de 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -495,3 +495,41 @@ the class and error context. For example, the default values for "metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults to "fail immediately" behaviour. This is done because ENODEV is a fatal, unrecoverable error no matter how many times the metadata IO is retried. + +Workqueue Concurrency +===================== + +XFS uses kernel workqueues to parallelize metadata update processes. This +enables it to take advantage of storage hardware that can service many IO +operations simultaneously. This interface exposes internal implementation +details of XFS, and as such is explicitly not part of any userspace API/ABI +guarantee the kernel may give userspace. These are undocumented features of +the generic workqueue implementation XFS uses for concurrency, and they are +provided here purely for diagnostic and tuning purposes and may change at any +time in the future. + +The control knobs for a filesystem's workqueues are organized by task at hand +and the short name of the data device. They all can be found in: + + /sys/bus/workqueue/devices/${task}!${device} + +================ =========== + Task Description +================ =========== + xfs_iwalk-$pid Inode scans of the entire filesystem. Currently limited to + mount time quotacheck. +================ =========== + +For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be +found in /sys/bus/workqueue/devices/xfs_iwalk-1111!nvme0n1/. + +The interesting knobs for XFS workqueues are as follows: + +============ =========== + Knob Description +============ =========== + max_active Maximum number of background threads that can be started to + run the work. + cpumask CPUs upon which the threads are allowed to run. + nice Relative priority of scheduling the threads. These are the + same nice levels that can be applied to userspace processes. diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index eae3aff9bc97..c4a340f1f1e1 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -618,15 +618,12 @@ xfs_iwalk_threaded( { struct xfs_pwork_ctl pctl; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - unsigned int nr_threads; int error; ASSERT(agno < mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); - nr_threads = xfs_pwork_guess_datadev_parallelism(mp); - error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", - nr_threads); + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk"); if (error) return error; diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c index b03333f1c84a..c283b801cc5d 100644 --- a/fs/xfs/xfs_pwork.c +++ b/fs/xfs/xfs_pwork.c @@ -61,16 +61,18 @@ xfs_pwork_init( struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, xfs_pwork_work_fn work_fn, - const char *tag, - unsigned int nr_threads) + const char *tag) { + unsigned int nr_threads = 0; + #ifdef DEBUG if (xfs_globals.pwork_threads >= 0) nr_threads = xfs_globals.pwork_threads; #endif trace_xfs_pwork_init(mp, nr_threads, current->pid); - pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, + pctl->wq = alloc_workqueue("%s-%d", + WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag, current->pid); if (!pctl->wq) return -ENOMEM; @@ -117,20 +119,3 @@ xfs_pwork_poll( atomic_read(&pctl->nr_work) == 0, HZ) == 0) touch_softlockup_watchdog(); } - -/* - * Return the amount of parallelism that the data device can handle, or 0 for - * no limit. - */ -unsigned int -xfs_pwork_guess_datadev_parallelism( - struct xfs_mount *mp) -{ - struct xfs_buftarg *btp = mp->m_ddev_targp; - - /* - * For now we'll go with the most conservative setting possible, - * which is two threads for an SSD and 1 thread everywhere else. - */ - return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1; -} diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h index 8133124cf3bb..c0ef81fc85dd 100644 --- a/fs/xfs/xfs_pwork.h +++ b/fs/xfs/xfs_pwork.h @@ -51,11 +51,9 @@ xfs_pwork_want_abort( } int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, - xfs_pwork_work_fn work_fn, const char *tag, - unsigned int nr_threads); + xfs_pwork_work_fn work_fn, const char *tag); void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); void xfs_pwork_poll(struct xfs_pwork_ctl *pctl); -unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); #endif /* __XFS_PWORK_H__ */ From 05a302a17062ca73dc91b508cf2a0b25724db15d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:42 -0800 Subject: [PATCH 66/85] xfs: set WQ_SYSFS on all workqueues in debug mode When CONFIG_XFS_DEBUG=y, set WQ_SYSFS on all workqueues that we create so that we (developers) have a means to monitor cpu affinity and whatnot for background workers. In the next patchset we'll expose knobs for more of the workqueues publicly and document it, but not now. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> --- fs/xfs/xfs_log.c | 5 +++-- fs/xfs/xfs_mru_cache.c | 2 +- fs/xfs/xfs_super.c | 23 ++++++++++++++--------- fs/xfs/xfs_super.h | 6 ++++++ 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d8b814227734..9aa30e7cd314 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1492,8 +1492,9 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, - mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | + WQ_HIGHPRI), + 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index a06661dac5be..34c3b16f834f 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -294,7 +294,7 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aed74a3fc787..8959561351ca 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -495,33 +495,37 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), 0, mp->m_super->s_id); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_eofblocks_workqueue) goto out_destroy_reclaim; - mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, - mp->m_super->s_id); + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", + XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_eofb; @@ -2085,11 +2089,12 @@ xfs_init_workqueues(void) * max_active value for this workqueue. */ xfs_alloc_wq = alloc_workqueue("xfsalloc", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); if (!xfs_alloc_wq) return -ENOMEM; - xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0); + xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND), + 0); if (!xfs_discard_wq) goto out_free_alloc_wq; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index b552cf6d3379..1ca484b8357f 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -75,6 +75,12 @@ extern void xfs_qm_exit(void); XFS_ASSERT_FATAL_STRING \ XFS_DBG_STRING /* DBG must be last */ +#ifdef DEBUG +# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags)) +#else +# define XFS_WQFLAGS(wqflags) (wqflags) +#endif + struct xfs_inode; struct xfs_mount; struct xfs_buftarg; From f9296569837c3fd66ae32717b0f8f5a26758b4b7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:39 -0800 Subject: [PATCH 67/85] xfs: relocate the eofb/cowb workqueue functions Move the xfs_{eof,cow}blocks_worker and xfs_queue_{eof,cow}blocks functions further down in the file so that the cleanups in the next patches won't have to pre-declare static functions. No functional changes. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 126 ++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0d81330a0fd3..ae0c14df0f9b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -915,69 +915,6 @@ xfs_inode_walk( return last_error; } -/* - * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'speculative_prealloc_lifetime' tunable (5m by default). - */ -void -xfs_queue_eofblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_eofblocks_work, - msecs_to_jiffies(xfs_eofb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_eofblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_eofblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_icache_free_eofblocks(mp, NULL); - sb_end_write(mp->m_super); - - xfs_queue_eofblocks(mp); -} - -/* - * Background scanning to trim preallocated CoW space. This is queued - * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). - * (We'll just piggyback on the post-EOF prealloc space workqueue.) - */ -void -xfs_queue_cowblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_cowblocks_work, - msecs_to_jiffies(xfs_cowb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_cowblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_cowblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_icache_free_cowblocks(mp, NULL); - sb_end_write(mp->m_super); - - xfs_queue_cowblocks(mp); -} - /* * Grab the inode for reclaim exclusively. * @@ -1396,6 +1333,37 @@ xfs_icache_free_eofblocks( XFS_ICI_EOFBLOCKS_TAG); } +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'speculative_prealloc_lifetime' tunable (5m by default). + */ +void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; + xfs_icache_free_eofblocks(mp, NULL); + sb_end_write(mp->m_super); + + xfs_queue_eofblocks(mp); +} + static inline unsigned long xfs_iflag_for_tag( int tag) @@ -1608,6 +1576,38 @@ xfs_icache_free_cowblocks( XFS_ICI_COWBLOCKS_TAG); } +/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) + */ +void +xfs_queue_cowblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_cowblocks_work, + msecs_to_jiffies(xfs_cowb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_cowblocks_work); + + if (!sb_start_write_trylock(mp->m_super)) + return; + xfs_icache_free_cowblocks(mp, NULL); + sb_end_write(mp->m_super); + + xfs_queue_cowblocks(mp); +} + void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) From 0461a320e33a16405ac3c165463837e028a42680 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:40 -0800 Subject: [PATCH 68/85] xfs: hide xfs_icache_free_eofblocks Change the one remaining caller of xfs_icache_free_eofblocks to use our new combined blockgc scan function instead, since we will soon be combining the two scans. This introduces a slight behavior change, since the XFS_IOC_FREE_EOFBLOCKS now clears out speculative CoW reservations in addition to post-eof blocks. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_icache.h | 1 - fs/xfs/xfs_ioctl.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index ae0c14df0f9b..d1a849053e68 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,7 +1324,7 @@ xfs_inode_free_eofblocks( return ret; } -int +static int xfs_icache_free_eofblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index f7dc8d1c91e5..b8c2f23aa829 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,7 +62,6 @@ int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3ddefb685ac6..78f965425fa6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2342,7 +2342,7 @@ xfs_file_ioctl( trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); sb_start_write(mp->m_super); - error = xfs_icache_free_eofblocks(mp, &keofb); + error = xfs_blockgc_free_space(mp, &keofb); sb_end_write(mp->m_super); return error; } From b943c0cd5615233ae4cea66666725a9bf2edccca Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:40 -0800 Subject: [PATCH 69/85] xfs: hide xfs_icache_free_cowblocks Change the one remaining caller of xfs_icache_free_cowblocks to use our new combined blockgc scan function instead, since we will soon be combining the two scans. This introduces a slight behavior change, since a readonly remount now clears out post-EOF preallocations and not just CoW staging extents. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_icache.h | 1 - fs/xfs/xfs_super.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d1a849053e68..45146e016975 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1567,7 +1567,7 @@ out_iolock: return ret; } -int +static int xfs_icache_free_cowblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index b8c2f23aa829..de7604576e9e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -67,7 +67,6 @@ void xfs_queue_eofblocks(struct xfs_mount *); void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8959561351ca..e8f71714d737 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1713,7 +1713,7 @@ xfs_remount_ro( xfs_stop_block_reaping(mp); /* Get rid of any leftover CoW reservations... */ - error = xfs_icache_free_cowblocks(mp, NULL); + error = xfs_blockgc_free_space(mp, NULL); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; From 865ac8e253c97423c41e22ce615615eb006fc52e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:41 -0800 Subject: [PATCH 70/85] xfs: remove trivial eof/cowblocks functions Get rid of these trivial helpers. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 45146e016975..705f16193596 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,15 +1324,6 @@ xfs_inode_free_eofblocks( return ret; } -static int -xfs_icache_free_eofblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) -{ - return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_EOFBLOCKS_TAG); -} - /* * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). @@ -1358,7 +1349,8 @@ xfs_eofblocks_worker( if (!sb_start_write_trylock(mp->m_super)) return; - xfs_icache_free_eofblocks(mp, NULL); + xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, NULL, + XFS_ICI_EOFBLOCKS_TAG); sb_end_write(mp->m_super); xfs_queue_eofblocks(mp); @@ -1567,15 +1559,6 @@ out_iolock: return ret; } -static int -xfs_icache_free_cowblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) -{ - return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_COWBLOCKS_TAG); -} - /* * Background scanning to trim preallocated CoW space. This is queued * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). @@ -1602,7 +1585,8 @@ xfs_cowblocks_worker( if (!sb_start_write_trylock(mp->m_super)) return; - xfs_icache_free_cowblocks(mp, NULL); + xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, NULL, + XFS_ICI_COWBLOCKS_TAG); sb_end_write(mp->m_super); xfs_queue_cowblocks(mp); @@ -1653,11 +1637,13 @@ xfs_blockgc_scan( { int error; - error = xfs_icache_free_eofblocks(mp, eofb); + error = xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, + XFS_ICI_EOFBLOCKS_TAG); if (error) return error; - error = xfs_icache_free_cowblocks(mp, eofb); + error = xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, + XFS_ICI_COWBLOCKS_TAG); if (error) return error; From ce2d3bbe06473fa76eb9dad21529f9cc48408000 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:43 -0800 Subject: [PATCH 71/85] xfs: consolidate incore inode radix tree posteof/cowblocks tags The clearing of posteof blocks and cowblocks serve the same purpose: removing speculative block preallocations from inactive files. We don't need to burn two radix tree tags on this, so combine them into one. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 114 +++++++++++++++++++++----------------------- fs/xfs/xfs_icache.h | 4 +- fs/xfs/xfs_trace.h | 6 +-- 3 files changed, 58 insertions(+), 66 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 705f16193596..34cc84fc7391 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1291,6 +1291,9 @@ xfs_inode_free_eofblocks( wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) + return 0; + if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1333,7 +1336,7 @@ xfs_queue_eofblocks( struct xfs_mount *mp) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) queue_delayed_work(mp->m_eofblocks_workqueue, &mp->m_eofblocks_work, msecs_to_jiffies(xfs_eofb_secs * 1000)); @@ -1350,67 +1353,54 @@ xfs_eofblocks_worker( if (!sb_start_write_trylock(mp->m_super)) return; xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, NULL, - XFS_ICI_EOFBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); sb_end_write(mp->m_super); xfs_queue_eofblocks(mp); } -static inline unsigned long -xfs_iflag_for_tag( - int tag) -{ - switch (tag) { - case XFS_ICI_EOFBLOCKS_TAG: - return XFS_IEOFBLOCKS; - case XFS_ICI_COWBLOCKS_TAG: - return XFS_ICOWBLOCKS; - default: - ASSERT(0); - return 0; - } -} - static void -__xfs_inode_set_blocks_tag( - xfs_inode_t *ip, - void (*execute)(struct xfs_mount *mp), - void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, - int error, unsigned long caller_ip), - int tag) +xfs_blockgc_set_iflag( + struct xfs_inode *ip, + void (*execute)(struct xfs_mount *mp), + unsigned long iflag) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - int tagged; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); /* * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & xfs_iflag_for_tag(tag)) + if (ip->i_flags & iflag) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= xfs_iflag_for_tag(tag); + ip->i_flags |= iflag; spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - tagged = radix_tree_tagged(&pag->pag_ici_root, tag); + tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); if (!tagged) { - /* propagate the eofblocks tag up into the perag radix tree */ + /* propagate the blockgc tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - tag); + XFS_ICI_BLOCKGC_TAG); spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ execute(ip->i_mount); - set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, + _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1422,38 +1412,43 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, - trace_xfs_perag_set_eofblocks, - XFS_ICI_EOFBLOCKS_TAG); + return xfs_blockgc_set_iflag(ip, xfs_queue_eofblocks, XFS_IEOFBLOCKS); } static void -__xfs_inode_clear_blocks_tag( - xfs_inode_t *ip, - void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, - int error, unsigned long caller_ip), - int tag) +xfs_blockgc_clear_iflag( + struct xfs_inode *ip, + unsigned long iflag) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + bool clear_tag; + + ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~xfs_iflag_for_tag(tag); + ip->i_flags &= ~iflag; + clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; spin_unlock(&ip->i_flags_lock); + if (!clear_tag) + return; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); - if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { - /* clear the eofblocks tag from the perag radix tree */ + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { + /* clear the blockgc tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - tag); + XFS_ICI_BLOCKGC_TAG); spin_unlock(&ip->i_mount->m_perag_lock); - clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, + _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1465,8 +1460,7 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_blocks_tag(ip, - trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); + return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); } /* @@ -1524,6 +1518,9 @@ xfs_inode_free_cowblocks( wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) + return 0; + if (!xfs_prep_free_cowblocks(ip)) return 0; @@ -1569,7 +1566,7 @@ xfs_queue_cowblocks( struct xfs_mount *mp) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) queue_delayed_work(mp->m_eofblocks_workqueue, &mp->m_cowblocks_work, msecs_to_jiffies(xfs_cowb_secs * 1000)); @@ -1586,7 +1583,7 @@ xfs_cowblocks_worker( if (!sb_start_write_trylock(mp->m_super)) return; xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, NULL, - XFS_ICI_COWBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); sb_end_write(mp->m_super); xfs_queue_cowblocks(mp); @@ -1597,9 +1594,7 @@ xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, - trace_xfs_perag_set_cowblocks, - XFS_ICI_COWBLOCKS_TAG); + return xfs_blockgc_set_iflag(ip, xfs_queue_cowblocks, XFS_ICOWBLOCKS); } void @@ -1607,8 +1602,7 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_blocks_tag(ip, - trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); + return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); } /* Disable post-EOF and CoW block auto-reclamation. */ @@ -1638,12 +1632,12 @@ xfs_blockgc_scan( int error; error = xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_EOFBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); if (error) return error; error = xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_COWBLOCKS_TAG); + XFS_ICI_BLOCKGC_TAG); if (error) return error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index de7604576e9e..ab7107370be4 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -23,8 +23,8 @@ struct xfs_eofblocks { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_walk */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ -#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ +/* Inode has speculative preallocations (posteof or cow) to clean. */ +#define XFS_ICI_BLOCKGC_TAG 1 /* * Flags for xfs_iget() diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 404a00ea9d9e..63ecbc6b1a76 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -155,10 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), From 9669f51de5c0c93e79257f690d1feaf16ebc179b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:43 -0800 Subject: [PATCH 72/85] xfs: consolidate the eofblocks and cowblocks workers Remove the separate cowblocks work items and knob so that we can control and run everything from a single blockgc work queue. Note that the speculative_prealloc_lifetime sysfs knob retains its historical name even though the functions move to prefix xfs_blockgc_*. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_globals.c | 7 ++-- fs/xfs/xfs_icache.c | 96 ++++++++++++++------------------------------ fs/xfs/xfs_icache.h | 6 +-- fs/xfs/xfs_linux.h | 3 +- fs/xfs/xfs_mount.h | 6 +-- fs/xfs/xfs_super.c | 11 +++-- fs/xfs/xfs_sysctl.c | 15 ++----- fs/xfs/xfs_sysctl.h | 3 +- 8 files changed, 48 insertions(+), 99 deletions(-) diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index fa55ab8b8d80..f62fa652c2fd 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -8,8 +8,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer and cowb_timer, which - * are measured in seconds. + * 100ths of a second) with the exception of blockgc_timer, which is measured + * in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -28,8 +28,7 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, - .eofb_timer = { 1, 300, 3600*24}, - .cowb_timer = { 1, 1800, 3600*24}, + .blockgc_timer = { 1, 300, 3600*24}, }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 34cc84fc7391..c6ef4d14fb8d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1328,41 +1328,24 @@ xfs_inode_free_eofblocks( } /* - * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'speculative_prealloc_lifetime' tunable (5m by default). + * Background scanning to trim preallocated space. This is queued based on the + * 'speculative_prealloc_lifetime' tunable (5m by default). */ -void -xfs_queue_eofblocks( - struct xfs_mount *mp) +static inline void +xfs_blockgc_queue( + struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_eofblocks_work, - msecs_to_jiffies(xfs_eofb_secs * 1000)); + queue_delayed_work(mp->m_blockgc_workqueue, + &mp->m_blockgc_work, + msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); } -void -xfs_eofblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_eofblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, NULL, - XFS_ICI_BLOCKGC_TAG); - sb_end_write(mp->m_super); - - xfs_queue_eofblocks(mp); -} - static void xfs_blockgc_set_iflag( struct xfs_inode *ip, - void (*execute)(struct xfs_mount *mp), unsigned long iflag) { struct xfs_mount *mp = ip->i_mount; @@ -1397,7 +1380,7 @@ xfs_blockgc_set_iflag( spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - execute(ip->i_mount); + xfs_blockgc_queue(ip->i_mount); trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -1412,7 +1395,7 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return xfs_blockgc_set_iflag(ip, xfs_queue_eofblocks, XFS_IEOFBLOCKS); + return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); } static void @@ -1556,45 +1539,12 @@ out_iolock: return ret; } -/* - * Background scanning to trim preallocated CoW space. This is queued - * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). - * (We'll just piggyback on the post-EOF prealloc space workqueue.) - */ -void -xfs_queue_cowblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_cowblocks_work, - msecs_to_jiffies(xfs_cowb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_cowblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_cowblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, NULL, - XFS_ICI_BLOCKGC_TAG); - sb_end_write(mp->m_super); - - xfs_queue_cowblocks(mp); -} - void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return xfs_blockgc_set_iflag(ip, xfs_queue_cowblocks, XFS_ICOWBLOCKS); + return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); } void @@ -1610,8 +1560,7 @@ void xfs_stop_block_reaping( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_eofblocks_work); - cancel_delayed_work_sync(&mp->m_cowblocks_work); + cancel_delayed_work_sync(&mp->m_blockgc_work); } /* Enable post-EOF and CoW block auto-reclamation. */ @@ -1619,8 +1568,7 @@ void xfs_start_block_reaping( struct xfs_mount *mp) { - xfs_queue_eofblocks(mp); - xfs_queue_cowblocks(mp); + xfs_blockgc_queue(mp); } /* Scan all incore inodes for block preallocations that we can remove. */ @@ -1644,6 +1592,24 @@ xfs_blockgc_scan( return 0; } +/* Background worker that trims preallocated space. */ +void +xfs_blockgc_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_blockgc_work); + int error; + + if (!sb_start_write_trylock(mp->m_super)) + return; + error = xfs_blockgc_scan(mp, NULL); + if (error) + xfs_info(mp, "preallocation gc worker failed, err=%d", error); + sb_end_write(mp->m_super); + xfs_blockgc_queue(mp); +} + /* * Try to free space in the filesystem by purging eofblocks and cowblocks. */ diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index ab7107370be4..5c57476287f6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,13 +62,11 @@ int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -void xfs_eofblocks_worker(struct work_struct *); -void xfs_queue_eofblocks(struct xfs_mount *); void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); -void xfs_cowblocks_worker(struct work_struct *); -void xfs_queue_cowblocks(struct xfs_mount *); + +void xfs_blockgc_worker(struct work_struct *work); int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 5b7a1e201559..af6be9b9ccdf 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -98,8 +98,7 @@ typedef __u32 xfs_nlink_t; #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val -#define xfs_eofb_secs xfs_params.eofb_timer.val -#define xfs_cowb_secs xfs_params.cowb_timer.val +#define xfs_blockgc_secs xfs_params.blockgc_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_set_flags_nested(sp, f) \ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 452ca7654dc5..316e0d79cc40 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -93,7 +93,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_blockgc_workqueue; struct workqueue_struct *m_sync_workqueue; int m_bsize; /* fs logical block size */ @@ -177,9 +177,7 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_eofblocks_work; /* background eof blocks - trimming */ - struct delayed_work m_cowblocks_work; /* background cow blocks + struct delayed_work m_blockgc_work; /* background prealloc blocks trimming */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e8f71714d737..471592e8dba6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -518,10 +518,10 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil; - mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); - if (!mp->m_eofblocks_workqueue) + if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", @@ -532,7 +532,7 @@ xfs_init_mount_workqueues( return 0; out_destroy_eofb: - destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_blockgc_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -550,7 +550,7 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_blockgc_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -1842,8 +1842,7 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); - INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); + INIT_DELAYED_WORK(&mp->m_blockgc_work, xfs_blockgc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index fac9de7ee6d0..145e06c47744 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -194,21 +194,12 @@ static struct ctl_table xfs_table[] = { }, { .procname = "speculative_prealloc_lifetime", - .data = &xfs_params.eofb_timer.val, + .data = &xfs_params.blockgc_timer.val, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.eofb_timer.min, - .extra2 = &xfs_params.eofb_timer.max, - }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.cowb_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.cowb_timer.min, - .extra2 = &xfs_params.cowb_timer.max, + .extra1 = &xfs_params.blockgc_timer.min, + .extra2 = &xfs_params.blockgc_timer.max, }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 8abf4640f1d5..7692e76ead33 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -35,8 +35,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ - xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ - xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ + xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */ } xfs_param_t; /* From 419567534e16eb553e7c19eecaa4d03cbc6693be Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:43 -0800 Subject: [PATCH 73/85] xfs: only walk the incore inode tree once per blockgc scan Perform background block preallocation gc scans more efficiently by walking the incore inode tree once. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c6ef4d14fb8d..0c0d5779fea0 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1571,21 +1571,19 @@ xfs_start_block_reaping( xfs_blockgc_queue(mp); } -/* Scan all incore inodes for block preallocations that we can remove. */ -static inline int -xfs_blockgc_scan( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) +/* Scan one incore inode for block preallocations that we can remove. */ +static int +xfs_blockgc_scan_inode( + struct xfs_inode *ip, + void *args) { int error; - error = xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_BLOCKGC_TAG); + error = xfs_inode_free_eofblocks(ip, args); if (error) return error; - error = xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_BLOCKGC_TAG); + error = xfs_inode_free_cowblocks(ip, args); if (error) return error; @@ -1603,7 +1601,8 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_blockgc_scan(mp, NULL); + error = xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, NULL, + XFS_ICI_BLOCKGC_TAG); if (error) xfs_info(mp, "preallocation gc worker failed, err=%d", error); sb_end_write(mp->m_super); @@ -1620,7 +1619,8 @@ xfs_blockgc_free_space( { trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); - return xfs_blockgc_scan(mp, eofb); + return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, + XFS_ICI_BLOCKGC_TAG); } /* From c9a6526fe7ae64528d924c6f255af15312211432 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:44 -0800 Subject: [PATCH 74/85] xfs: rename block gc start and stop functions Shorten the names of the two functions that start and stop block preallocation garbage collection and move them up to the other blockgc functions. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/scrub/common.c | 4 ++-- fs/xfs/xfs_icache.c | 4 ++-- fs/xfs/xfs_icache.h | 4 ++-- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_super.c | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8ea6d4aa3f55..53456f3de881 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -888,7 +888,7 @@ xchk_stop_reaping( struct xfs_scrub *sc) { sc->flags |= XCHK_REAPING_DISABLED; - xfs_stop_block_reaping(sc->mp); + xfs_blockgc_stop(sc->mp); } /* Restart background reaping of resources. */ @@ -896,6 +896,6 @@ void xchk_start_reaping( struct xfs_scrub *sc) { - xfs_start_block_reaping(sc->mp); + xfs_blockgc_start(sc->mp); sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0c0d5779fea0..35c735514565 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1557,7 +1557,7 @@ xfs_inode_clear_cowblocks_tag( /* Disable post-EOF and CoW block auto-reclamation. */ void -xfs_stop_block_reaping( +xfs_blockgc_stop( struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_blockgc_work); @@ -1565,7 +1565,7 @@ xfs_stop_block_reaping( /* Enable post-EOF and CoW block auto-reclamation. */ void -xfs_start_block_reaping( +xfs_blockgc_start( struct xfs_mount *mp) { xfs_blockgc_queue(mp); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 5c57476287f6..d1fddb152420 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -75,7 +75,7 @@ int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); -void xfs_stop_block_reaping(struct xfs_mount *mp); -void xfs_start_block_reaping(struct xfs_mount *mp); +void xfs_blockgc_stop(struct xfs_mount *mp); +void xfs_blockgc_start(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 53b8ccab7235..be9ce114527f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1054,7 +1054,7 @@ xfs_unmountfs( uint64_t resblks; int error; - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 471592e8dba6..ea942089d074 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -891,7 +891,7 @@ xfs_fs_freeze( * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); xfs_save_resvblks(mp); ret = xfs_log_quiesce(mp); memalloc_nofs_restore(flags); @@ -906,7 +906,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_start_block_reaping(mp); + xfs_blockgc_start(mp); return 0; } @@ -1690,7 +1690,7 @@ xfs_remount_rw( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_start_block_reaping(mp); + xfs_blockgc_start(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1710,7 +1710,7 @@ xfs_remount_ro( * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. */ - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); /* Get rid of any leftover CoW reservations... */ error = xfs_blockgc_free_space(mp, NULL); From 894ecacf0f27fd1701c34f2946148b7f017bf984 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Fri, 22 Jan 2021 16:48:44 -0800 Subject: [PATCH 75/85] xfs: parallelize block preallocation garbage collection Split the block preallocation garbage collection work into per-AG work items so that we can take advantage of parallelization. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 42 ++++++++++++++++++++++++++++++------------ fs/xfs/xfs_mount.c | 3 +++ fs/xfs/xfs_mount.h | 5 +++-- fs/xfs/xfs_super.c | 4 ++-- 4 files changed, 38 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 35c735514565..460fa7b3a31c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1333,12 +1333,12 @@ xfs_inode_free_eofblocks( */ static inline void xfs_blockgc_queue( - struct xfs_mount *mp) + struct xfs_perag *pag) { rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(mp->m_blockgc_workqueue, - &mp->m_blockgc_work, + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) + queue_delayed_work(pag->pag_mount->m_blockgc_workqueue, + &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); } @@ -1380,7 +1380,7 @@ xfs_blockgc_set_iflag( spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - xfs_blockgc_queue(ip->i_mount); + xfs_blockgc_queue(pag); trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -1555,12 +1555,24 @@ xfs_inode_clear_cowblocks_tag( return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); } +#define for_each_perag_tag(mp, next_agno, pag, tag) \ + for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) + + /* Disable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_stop( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_blockgc_work); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + cancel_delayed_work_sync(&pag->pag_blockgc_work); } /* Enable post-EOF and CoW block auto-reclamation. */ @@ -1568,7 +1580,11 @@ void xfs_blockgc_start( struct xfs_mount *mp) { - xfs_blockgc_queue(mp); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + xfs_blockgc_queue(pag); } /* Scan one incore inode for block preallocations that we can remove. */ @@ -1595,18 +1611,20 @@ void xfs_blockgc_worker( struct work_struct *work) { - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_blockgc_work); + struct xfs_perag *pag = container_of(to_delayed_work(work), + struct xfs_perag, pag_blockgc_work); + struct xfs_mount *mp = pag->pag_mount; int error; if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, NULL, + error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, XFS_ICI_BLOCKGC_TAG); if (error) - xfs_info(mp, "preallocation gc worker failed, err=%d", error); + xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", + pag->pag_agno, error); sb_end_write(mp->m_super); - xfs_blockgc_queue(mp); + xfs_blockgc_queue(pag); } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index be9ce114527f..52370d0a3f43 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -126,6 +126,7 @@ __xfs_free_perag( { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -146,6 +147,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -201,6 +203,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); error = xfs_buf_hash_init(pag); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 316e0d79cc40..659ad95fe3e0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -177,8 +177,6 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_blockgc_work; /* background prealloc blocks - trimming */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; @@ -367,6 +365,9 @@ typedef struct xfs_perag { /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; + /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + /* reference count */ uint8_t pagf_refcount_level; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ea942089d074..2b04818627e9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -35,6 +35,7 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" +#include "xfs_pwork.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -519,7 +520,7 @@ xfs_init_mount_workqueues( goto out_destroy_cil; mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim; @@ -1842,7 +1843,6 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_blockgc_work, xfs_blockgc_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log From 47bd6d3457fb96d287278027aed8a78d14f1d32d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Mon, 25 Jan 2021 16:39:01 -0800 Subject: [PATCH 76/85] xfs: expose the blockgc workqueue knobs publicly Expose the workqueue sysfs knobs for the speculative preallocation gc workers on all kernels, and update the sysadmin information. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- Documentation/admin-guide/xfs.rst | 3 +++ fs/xfs/xfs_super.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index b00b1eece9de..d2064a52811b 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -518,6 +518,9 @@ and the short name of the data device. They all can be found in: ================ =========== xfs_iwalk-$pid Inode scans of the entire filesystem. Currently limited to mount time quotacheck. + xfs-blockgc Background garbage collection of disk space that have been + speculatively allocated beyond EOF or for staging copy on + write operations. ================ =========== For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2b04818627e9..21b1d034aca3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -520,7 +520,7 @@ xfs_init_mount_workqueues( goto out_destroy_cil; mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", - XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), + WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, 0, mp->m_super->s_id); if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim; From 0fa4a10a2f5f96a06373ea81f8cd5f97c5cc264f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Mon, 25 Jan 2021 21:09:49 -0800 Subject: [PATCH 77/85] xfs: don't bounce the iolock between free_{eof,cow}blocks Since xfs_inode_free_eofblocks and xfs_inode_free_cowblocks are now internal static functions, we can save ourselves a cycling of the iolock by passing the lock state out to xfs_blockgc_scan_inode and letting it do all the unlocking. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_icache.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 460fa7b3a31c..1d7720a0c068 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1283,11 +1283,11 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - void *args) + void *args, + unsigned int *lockflags) { struct xfs_eofblocks *eofb = args; bool wait; - int ret; wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); @@ -1320,11 +1320,9 @@ xfs_inode_free_eofblocks( return -EAGAIN; return 0; } + *lockflags |= XFS_IOLOCK_EXCL; - ret = xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - - return ret; + return xfs_free_eofblocks(ip); } /* @@ -1493,7 +1491,8 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - void *args) + void *args, + unsigned int *lockflags) { struct xfs_eofblocks *eofb = args; bool wait; @@ -1514,16 +1513,20 @@ xfs_inode_free_cowblocks( * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. */ - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (!(*lockflags & XFS_IOLOCK_EXCL) && + !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (wait) return -EAGAIN; return 0; } + *lockflags |= XFS_IOLOCK_EXCL; + if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { if (wait) - ret = -EAGAIN; - goto out_iolock; + return -EAGAIN; + return 0; } + *lockflags |= XFS_MMAPLOCK_EXCL; /* * Check again, nobody else should be able to dirty blocks or change @@ -1531,11 +1534,6 @@ xfs_inode_free_cowblocks( */ if (xfs_prep_free_cowblocks(ip)) ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); - - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); -out_iolock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; } @@ -1593,17 +1591,18 @@ xfs_blockgc_scan_inode( struct xfs_inode *ip, void *args) { + unsigned int lockflags = 0; int error; - error = xfs_inode_free_eofblocks(ip, args); + error = xfs_inode_free_eofblocks(ip, args, &lockflags); if (error) - return error; + goto unlock; - error = xfs_inode_free_cowblocks(ip, args); - if (error) - return error; - - return 0; + error = xfs_inode_free_cowblocks(ip, args, &lockflags); +unlock: + if (lockflags) + xfs_iunlock(ip, lockflags); + return error; } /* Background worker that trims preallocated space. */ From bc41fa5321f93ecbabec177f888451cfc17ad66d Mon Sep 17 00:00:00 2001 From: Zorro Lang <zlang@redhat.com> Date: Tue, 2 Feb 2021 12:11:53 -0800 Subject: [PATCH 78/85] libxfs: expose inobtcount in xfs geometry As xfs supports the feature of inode btree block counters now, expose this feature flag in xfs geometry, for userspace can check if the inobtcnt is enabled or not. Signed-off-by: Zorro Lang <zlang@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/libxfs/xfs_fs.h | 1 + fs/xfs/libxfs/xfs_sb.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2a2e3cfd94f0..6fad140d4c8e 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -250,6 +250,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ +#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index bbda117e5d85..60e6d255e5e2 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1138,6 +1138,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; if (xfs_sb_version_hasbigtime(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; + if (xfs_sb_version_hasinobtcounts(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; if (xfs_sb_version_hassector(sbp)) geo->logsectsize = sbp->sb_logsectsize; else From ce5e1062e2539c7f7d311548494ea2705184c784 Mon Sep 17 00:00:00 2001 From: Gao Xiang <hsiangkao@redhat.com> Date: Tue, 2 Feb 2021 18:24:06 -0800 Subject: [PATCH 79/85] xfs: rename `new' to `delta' in xfs_growfs_data_private() It actually means the delta block count of growfs. Rename it in order to make it clear. Also introduce nb_div to avoid reusing `delta`. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Gao Xiang <hsiangkao@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_fsops.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 959ce91a3755..62600d78bbf1 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -32,8 +32,8 @@ xfs_growfs_data_private( int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; - xfs_rfsblock_t nb, nb_mod; - xfs_rfsblock_t new; + xfs_rfsblock_t nb, nb_div, nb_mod; + xfs_rfsblock_t delta; xfs_agnumber_t oagcount; xfs_trans_t *tp; struct aghdr_init_data id = {}; @@ -50,16 +50,16 @@ xfs_growfs_data_private( return error; xfs_buf_relse(bp); - new = nb; /* use new as a temporary here */ - nb_mod = do_div(new, mp->m_sb.sb_agblocks); - nagcount = new + (nb_mod != 0); + nb_div = nb; + nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); + nagcount = nb_div + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return -EINVAL; } - new = nb - mp->m_sb.sb_dblocks; + delta = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ @@ -89,7 +89,7 @@ xfs_growfs_data_private( INIT_LIST_HEAD(&id.buffer_list); for (id.agno = nagcount - 1; id.agno >= oagcount; - id.agno--, new -= id.agsize) { + id.agno--, delta -= id.agsize) { if (id.agno == nagcount - 1) id.agsize = nb - @@ -110,8 +110,8 @@ xfs_growfs_data_private( xfs_trans_agblocks_delta(tp, id.nfree); /* If there are new blocks in the old last AG, extend it. */ - if (new) { - error = xfs_ag_extend_space(mp, tp, &id, new); + if (delta) { + error = xfs_ag_extend_space(mp, tp, &id, delta); if (error) goto out_trans_cancel; } @@ -143,7 +143,7 @@ xfs_growfs_data_private( * If we expanded the last AG, free the per-AG reservation * so we can reinitialize it with the new size. */ - if (new) { + if (delta) { struct xfs_perag *pag; pag = xfs_perag_get(mp, id.agno); From 07aabd9c4a881276cf9b7b2d3a7f1d14dd832ed0 Mon Sep 17 00:00:00 2001 From: Gao Xiang <hsiangkao@redhat.com> Date: Tue, 2 Feb 2021 18:24:06 -0800 Subject: [PATCH 80/85] xfs: get rid of xfs_growfs_{data,log}_t Such usage isn't encouraged by the kernel coding style. Leave the definitions alone in case of userspace users. Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: Gao Xiang <hsiangkao@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_fsops.c | 12 ++++++------ fs/xfs/xfs_fsops.h | 4 ++-- fs/xfs/xfs_ioctl.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 62600d78bbf1..a2a407039227 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -25,8 +25,8 @@ */ static int xfs_growfs_data_private( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_data_t *in) /* growfs data input struct */ + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_data *in) /* growfs data input struct */ { struct xfs_buf *bp; int error; @@ -35,7 +35,7 @@ xfs_growfs_data_private( xfs_rfsblock_t nb, nb_div, nb_mod; xfs_rfsblock_t delta; xfs_agnumber_t oagcount; - xfs_trans_t *tp; + struct xfs_trans *tp; struct aghdr_init_data id = {}; nb = in->newblocks; @@ -170,8 +170,8 @@ out_trans_cancel: static int xfs_growfs_log_private( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_log_t *in) /* growfs log input struct */ + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_log *in) /* growfs log input struct */ { xfs_extlen_t nb; @@ -268,7 +268,7 @@ out_error: int xfs_growfs_log( xfs_mount_t *mp, - xfs_growfs_log_t *in) + struct xfs_growfs_log *in) { int error; diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 92869f6ec8d3..2cffe51a31e8 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,8 +6,8 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); -extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); +extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 78f965425fa6..248083ea0276 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -2251,7 +2251,7 @@ xfs_file_ioctl( } case XFS_IOC_FSGROWFSDATA: { - xfs_growfs_data_t in; + struct xfs_growfs_data in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; @@ -2265,7 +2265,7 @@ xfs_file_ioctl( } case XFS_IOC_FSGROWFSLOG: { - xfs_growfs_log_t in; + struct xfs_growfs_log in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; From 45068063efb7dd0a8d115c106aa05d9ab0946257 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Tue, 2 Feb 2021 11:13:58 -0800 Subject: [PATCH 81/85] xfs: fix incorrect root dquot corruption error when switching group/project quota types While writing up a regression test for broken behavior when a chprojid request fails, I noticed that we were logging corruption notices about the root dquot of the group/project quota file at mount time when testing V4 filesystems. In commit afeda6000b0c, I was trying to improve ondisk dquot validation by making sure that when we load an ondisk dquot into memory on behalf of an incore dquot, the dquot id and type matches. Unfortunately, I forgot that V4 filesystems only have two quota files, and can switch that file between group and project quota types at mount time. When we perform that switch, we'll try to load the default quota limits from the root dquot prior to running quotacheck and log a corruption error when the types don't match. This is inconsequential because quotacheck will reset the second quota file as part of doing the switch, but we shouldn't leave scary messages in the kernel log. Fixes: afeda6000b0c ("xfs: validate ondisk/incore dquot flags") Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com> --- fs/xfs/xfs_dquot.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 175f544f7c45..bd8379b98374 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -506,6 +506,42 @@ xfs_dquot_alloc( return dqp; } +/* Check the ondisk dquot's id and type match what the incore dquot expects. */ +static bool +xfs_dquot_check_type( + struct xfs_dquot *dqp, + struct xfs_disk_dquot *ddqp) +{ + uint8_t ddqp_type; + uint8_t dqp_type; + + ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK; + dqp_type = xfs_dquot_type(dqp); + + if (be32_to_cpu(ddqp->d_id) != dqp->q_id) + return false; + + /* + * V5 filesystems always expect an exact type match. V4 filesystems + * expect an exact match for user dquots and for non-root group and + * project dquots. + */ + if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) || + dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) + return ddqp_type == dqp_type; + + /* + * V4 filesystems support either group or project quotas, but not both + * at the same time. The non-user quota file can be switched between + * group and project quota uses depending on the mount options, which + * means that we can encounter the other type when we try to load quota + * defaults. Quotacheck will soon reset the the entire quota file + * (including the root dquot) anyway, but don't log scary corruption + * reports to dmesg. + */ + return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ; +} + /* Copy the in-core quota fields in from the on-disk buffer. */ STATIC int xfs_dquot_from_disk( @@ -518,8 +554,7 @@ xfs_dquot_from_disk( * Ensure that we got the type and ID we were looking for. * Everything else was checked by the dquot buffer verifier. */ - if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || - be32_to_cpu(ddqp->d_id) != dqp->q_id) { + if (!xfs_dquot_check_type(dqp, ddqp)) { xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, quota %u", __this_address, dqp->q_id); From 8e8794b919884e0a14991651a69c8a5735570d31 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" <djwong@kernel.org> Date: Mon, 8 Feb 2021 09:29:34 -0800 Subject: [PATCH 82/85] xfs: fix rst syntax error in admin guide Tables are supposed to have a matching line of "===" to signal the end of a table. The rst compiler gets grouchy if it encounters EOF instead, so fix this warning. Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Eric Sandeen <sandeen@redhat.com> Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> --- Documentation/admin-guide/xfs.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index d2064a52811b..6178153d3320 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -536,3 +536,4 @@ The interesting knobs for XFS workqueues are as follows: cpumask CPUs upon which the threads are allowed to run. nice Relative priority of scheduling the threads. These are the same nice levels that can be applied to userspace processes. +============ =========== From e4826691cc7e5458bcb659935d0092bcf3f08c20 Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Wed, 10 Feb 2021 17:27:20 -0800 Subject: [PATCH 83/85] xfs: restore shutdown check in mapped write fault path XFS triggers an iomap warning in the write fault path due to a !PageUptodate() page if a write fault happens to occur on a page that recently failed writeback. The iomap writeback error handling code can clear the Uptodate flag if no portion of the page is submitted for I/O. This is reproduced by fstest generic/019, which combines various forms of I/O with simulated disk failures that inevitably lead to filesystem shutdown (which then unconditionally fails page writeback). This is a regression introduced by commit f150b4234397 ("xfs: split the iomap ops for buffered vs direct writes") due to the removal of a shutdown check and explicit error return in the ->iomap_begin() path used by the write fault path. The explicit error return historically translated to a SIGBUS, but now carries on with iomap processing where it complains about the unexpected state. Restore the shutdown check to xfs_buffered_write_iomap_begin() to restore historical behavior. Fixes: f150b4234397 ("xfs: split the iomap ops for buffered vs direct writes") Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Eric Sandeen <sandeen@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_iomap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 70c341658c01..6594f572096e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -860,6 +860,9 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, From 8646b982baf7d389a140ca3974974a4cbbc3f171 Mon Sep 17 00:00:00 2001 From: kernel test robot <lkp@intel.com> Date: Wed, 10 Feb 2021 17:27:31 -0800 Subject: [PATCH 84/85] xfs: fix boolreturn.cocci warnings fs/xfs/xfs_log.c:1062:9-10: WARNING: return of 0/1 in function 'xfs_log_need_covered' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Fixes: 37444fc4cc39 ("xfs: lift writable fs check up into log worker task") CC: Brian Foster <bfoster@redhat.com> Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: kernel test robot <lkp@intel.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9aa30e7cd314..06041834daa3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1069,7 +1069,7 @@ xfs_log_need_covered( bool needed = false; if (!xlog_cil_empty(log)) - return 0; + return false; spin_lock(&log->l_icloglock); switch (log->l_covered_state) { From 1cd738b13ae9b29e03d6149f0246c61f76e81fcf Mon Sep 17 00:00:00 2001 From: Brian Foster <bfoster@redhat.com> Date: Thu, 11 Feb 2021 08:46:38 -0800 Subject: [PATCH 85/85] xfs: consider shutdown in bmapbt cursor delete assert The assert in xfs_btree_del_cursor() checks that the bmapbt block allocation field has been handled correctly before the cursor is freed. This field is used for accurate calculation of indirect block reservation requirements (for delayed allocations), for example. generic/019 reproduces a scenario where this assert fails because the filesystem has shutdown while in the middle of a bmbt record insertion. This occurs after a bmbt block has been allocated via the cursor but before the higher level bmap function (i.e. xfs_bmap_add_extent_hole_real()) completes and resets the field. Update the assert to accommodate the transient state if the filesystem has shutdown. While here, clean up the indentation and comments in the function. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org> --- fs/xfs/libxfs/xfs_btree.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c4d7a9241dc3..b56ff451adce 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -353,20 +353,17 @@ xfs_btree_free_block( */ void xfs_btree_del_cursor( - xfs_btree_cur_t *cur, /* btree cursor */ - int error) /* del because of error */ + struct xfs_btree_cur *cur, /* btree cursor */ + int error) /* del because of error */ { - int i; /* btree level */ + int i; /* btree level */ /* - * Clear the buffer pointers, and release the buffers. - * If we're doing this in the face of an error, we - * need to make sure to inspect all of the entries - * in the bc_bufs array for buffers to be unlocked. - * This is because some of the btree code works from - * level n down to 0, and if we get an error along - * the way we won't have initialized all the entries - * down to 0. + * Clear the buffer pointers and release the buffers. If we're doing + * this because of an error, inspect all of the entries in the bc_bufs + * array for buffers to be unlocked. This is because some of the btree + * code works from level n down to 0, and if we get an error along the + * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_bufs[i]) @@ -374,17 +371,11 @@ xfs_btree_del_cursor( else if (!error) break; } - /* - * Can't free a bmap cursor without having dealt with the - * allocated indirect blocks' accounting. - */ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_ino.allocated == 0); - /* - * Free the cursor. - */ + + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || + XFS_FORCED_SHUTDOWN(cur->bc_mp)); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) - kmem_free((void *)cur->bc_ops); + kmem_free(cur->bc_ops); kmem_cache_free(xfs_btree_cur_zone, cur); }