From 5f7fd75086203a8a4dd3e518976e52bcf24e8b22 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 2 Jun 2021 14:54:09 -0700 Subject: [PATCH 1/4] xfs: sort variable alphabetically to avoid repeated declaration Variable 'xfs_agf_buf_ops', 'xfs_agi_buf_ops', 'xfs_dquot_buf_ops' and 'xfs_symlink_buf_ops' are declared twice, so sort these variables alphabetically and remove the repeated declaration. Cc: "Darrick J. Wong" Signed-off-by: Shaokun Zhang Reviewed-by: Carlos Maiolino Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_shared.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 782fdd08f759..25c4cab58851 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -22,30 +22,26 @@ struct xfs_inode; * Buffer verifier operations are widely used, including userspace tools */ extern const struct xfs_buf_ops xfs_agf_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_bnobt_buf_ops; -extern const struct xfs_buf_ops xfs_cntbt_buf_ops; -extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; -extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_da3_node_buf_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_finobt_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_rtbuf_ops; /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); From 9673261c32dc2f30863b803374b726a72d16b07c Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 2 Jun 2021 14:56:29 -0700 Subject: [PATCH 2/4] xfs: Remove redundant assignment to busy Variable busy is set to false, but this value is never read as it is overwritten or not used later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: fs/xfs/libxfs/xfs_alloc.c:1679:2: warning: Value stored to 'busy' is never read [clang-analyzer-deadcode.DeadStores]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 82b7cbb1f24f..ae46fe64cc4f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1676,7 +1676,6 @@ restart: cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; - busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. From 5a981e4ea8ff8062e7c7ea8fc4a1565e4820a08b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 14:58:59 -0700 Subject: [PATCH 3/4] xfs: mark xfs_bmap_set_attrforkoff static xfs_bmap_set_attrforkoff is only used inside of xfs_bmap.c, so mark it static. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_bmap.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a3e0e6f672d6..7eb6b28a4c30 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1028,7 +1028,7 @@ xfs_bmap_add_attrfork_local( /* * Set an inode attr fork offset based on the format of the data fork. */ -int +static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, int size, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f9a390ecfb1d..67641f669918 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -187,7 +187,6 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); -int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, From 977ec4ddf0b75b30afa443cf71ae80e20f501b15 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 2 Jun 2021 15:00:38 -0700 Subject: [PATCH 4/4] xfs: don't take a spinlock unconditionally in the DIO fastpath Because this happens at high thread counts on high IOPS devices doing mixed read/write AIO-DIO to a single file at about a million iops: 64.09% 0.21% [kernel] [k] io_submit_one - 63.87% io_submit_one - 44.33% aio_write - 42.70% xfs_file_write_iter - 41.32% xfs_file_dio_write_aligned - 25.51% xfs_file_write_checks - 21.60% _raw_spin_lock - 21.59% do_raw_spin_lock - 19.70% __pv_queued_spin_lock_slowpath This also happens of the IO completion IO path: 22.89% 0.69% [kernel] [k] xfs_dio_write_end_io - 22.49% xfs_dio_write_end_io - 21.79% _raw_spin_lock - 20.97% do_raw_spin_lock - 20.10% __pv_queued_spin_lock_slowpath IOWs, fio is burning ~14 whole CPUs on this spin lock. So, do an unlocked check against inode size first, then if we are at/beyond EOF, take the spinlock and recheck. This makes the spinlock disappear from the overwrite fastpath. I'd like to report that fixing this makes things go faster. It doesn't - it just exposes the the XFS_ILOCK as the next severe contention point doing extent mapping lookups, and that now burns all the 14 CPUs this spinlock was burning. Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 396ef36dcd0a..c068dcd414f4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -384,21 +384,30 @@ restart: } goto restart; } + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which implies - * having to redo all checks before. + * write. If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies having to + * redo all checks before. * - * We need to serialise against EOF updates that occur in IO - * completions here. We want to make sure that nobody is changing the - * size while we do this check until we have placed an IO barrier (i.e. - * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. - * The spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value - * and hence be able to correctly determine if we need to run zeroing. + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while we + * do this check until we have placed an IO barrier (i.e. hold the + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + * + * We can do an unlocked check here safely as IO completion can only + * extend EOF. Truncate is locked out at this point, so the EOF can + * not move backwards, only forwards. Hence we only need to take the + * slow path and spin locks when we are at or beyond the current EOF. */ + if (iocb->ki_pos <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); isize = i_size_read(inode); if (iocb->ki_pos > isize) { @@ -426,7 +435,7 @@ restart: drained_dio = true; goto restart; } - + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, NULL, &xfs_buffered_write_iomap_ops); @@ -435,6 +444,7 @@ restart: } else spin_unlock(&ip->i_flags_lock); +out: return file_modified(file); } @@ -500,7 +510,17 @@ xfs_dio_write_end_io( * other IO completions here to update the EOF. Failing to serialise * here can result in EOF moving backwards and Bad Things Happen when * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. */ + if (offset + size <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size);