From 6fc277c7c935c7e1fdee23e82da988d9d3cb6bef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Apr 2021 13:48:27 -0700 Subject: [PATCH 01/11] xfs: rename xfs_ictimestamp_t Rename xfs_ictimestamp_t to xfs_log_timestamp_t as it is a type used for logging timestamps with no relationship to the in-core inode. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_log_format.h | 10 +++++----- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_inode_item_recover.c | 2 +- fs/xfs/xfs_ondisk.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8bd00da6d2a4..5900772d678a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -368,7 +368,7 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef uint64_t xfs_ictimestamp_t; +typedef uint64_t xfs_log_timestamp_t; /* Legacy timestamp encoding format. */ struct xfs_legacy_ictimestamp { @@ -393,9 +393,9 @@ struct xfs_log_dinode { uint16_t di_projid_hi; /* higher part of owner's project id */ uint8_t di_pad[6]; /* unused, zeroed space */ uint16_t di_flushiter; /* incremented on flush */ - xfs_ictimestamp_t di_atime; /* time last accessed */ - xfs_ictimestamp_t di_mtime; /* time last modified */ - xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_log_timestamp_t di_atime; /* time last accessed */ + xfs_log_timestamp_t di_mtime; /* time last modified */ + xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -420,7 +420,7 @@ struct xfs_log_dinode { uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ - xfs_ictimestamp_t di_crtime; /* time created */ + xfs_log_timestamp_t di_crtime; /* time created */ xfs_ino_t di_ino; /* inode number */ uuid_t di_uuid; /* UUID of the filesystem */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c1b32680f71c..6cc4ca15209c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -299,13 +299,13 @@ xfs_inode_item_format_attr_fork( * Convert an incore timestamp to a log timestamp. Note that the log format * specifies host endian format! */ -static inline xfs_ictimestamp_t +static inline xfs_log_timestamp_t xfs_inode_to_log_dinode_ts( struct xfs_inode *ip, const struct timespec64 tv) { struct xfs_legacy_ictimestamp *lits; - xfs_ictimestamp_t its; + xfs_log_timestamp_t its; if (xfs_inode_has_bigtime(ip)) return xfs_inode_encode_bigtime(tv); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index cb44f7653f03..9b877de2ce5e 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -125,7 +125,7 @@ static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) static inline xfs_timestamp_t xfs_log_dinode_to_disk_ts( struct xfs_log_dinode *from, - const xfs_ictimestamp_t its) + const xfs_log_timestamp_t its) { struct xfs_legacy_timestamp *lts; struct xfs_legacy_ictimestamp *lits; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0aa87c210104..66b541b7bb64 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -126,7 +126,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); - XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); From 732de7dbdbd30df40a6d260a8da6fc5262039439 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Apr 2021 13:48:27 -0700 Subject: [PATCH 02/11] xfs: rename struct xfs_legacy_ictimestamp Rename struct xfs_legacy_ictimestamp to struct xfs_log_legacy_timestamp as it is a type used for logging timestamps with no relationship to the in-core inode. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_log_format.h | 2 +- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_inode_item_recover.c | 4 ++-- fs/xfs/xfs_ondisk.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 5900772d678a..3e15ea29fb8d 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -371,7 +371,7 @@ static inline int xfs_ilog_fdata(int w) typedef uint64_t xfs_log_timestamp_t; /* Legacy timestamp encoding format. */ -struct xfs_legacy_ictimestamp { +struct xfs_log_legacy_timestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ }; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6cc4ca15209c..6764d12342da 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -304,13 +304,13 @@ xfs_inode_to_log_dinode_ts( struct xfs_inode *ip, const struct timespec64 tv) { - struct xfs_legacy_ictimestamp *lits; + struct xfs_log_legacy_timestamp *lits; xfs_log_timestamp_t its; if (xfs_inode_has_bigtime(ip)) return xfs_inode_encode_bigtime(tv); - lits = (struct xfs_legacy_ictimestamp *)&its; + lits = (struct xfs_log_legacy_timestamp *)&its; lits->t_sec = tv.tv_sec; lits->t_nsec = tv.tv_nsec; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 9b877de2ce5e..7b79518b6c20 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -128,14 +128,14 @@ xfs_log_dinode_to_disk_ts( const xfs_log_timestamp_t its) { struct xfs_legacy_timestamp *lts; - struct xfs_legacy_ictimestamp *lits; + struct xfs_log_legacy_timestamp *lits; xfs_timestamp_t ts; if (xfs_log_dinode_has_bigtime(from)) return cpu_to_be64(its); lts = (struct xfs_legacy_timestamp *)&ts; - lits = (struct xfs_legacy_ictimestamp *)&its; + lits = (struct xfs_log_legacy_timestamp *)&its; lts->t_sec = cpu_to_be32(lits->t_sec); lts->t_nsec = cpu_to_be32(lits->t_nsec); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 66b541b7bb64..25991923c1a8 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -127,7 +127,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_log_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); From 1aec7c3d05670b92b7339b19999009a93808efb9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Apr 2021 16:02:00 -0700 Subject: [PATCH 03/11] xfs: remove obsolete AGF counter debugging In commit f8f2835a9cf3 we changed the behavior of XFS to use EFIs to remove blocks from an overfilled AGFL because there were complaints about transaction overruns that stemmed from trying to free multiple blocks in a single transaction. Unfortunately, that commit missed a subtlety in the debug-mode transaction accounting when a realtime volume is attached. If a realtime file undergoes a data fork mapping change such that realtime extents are allocated (or freed) in the same transaction that a data device block is also allocated (or freed), we can trip a debugging assertion. This can happen (for example) if a realtime extent is allocated and it is necessary to reshape the bmbt to hold the new mapping. When we go to allocate a bmbt block from an AG, the first thing the data device block allocator does is ensure that the freelist is the proper length. If the freelist is too long, it will trim the freelist to the proper length. In debug mode, trimming the freelist calls xfs_trans_agflist_delta() to record the decrement in the AG free list count. Prior to f8f28 we would put the free block back in the free space btrees in the same transaction, which calls xfs_trans_agblocks_delta() to record the increment in the AG free block count. Since AGFL blocks are included in the global free block count (fdblocks), there is no corresponding fdblocks update, so the AGFL free satisfies the following condition in xfs_trans_apply_sb_deltas: /* * Check that superblock mods match the mods made to AGF counters. */ ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta)); The comparison here used to be: (X + 0) == ((X+1) + -1 + 0), where X is the number blocks that were allocated. After commit f8f28 we defer the block freeing to the next chained transaction, which means that the calls to xfs_trans_agflist_delta and xfs_trans_agblocks_delta occur in separate transactions. The (first) transaction that shortens the free list trips on the comparison, which has now become: (X + 0) == ((X) + -1 + 0) because we haven't freed the AGFL block yet; we've only logged an intention to free it. When the second transaction (the deferred free) commits, it will evaluate the expression as: (0 + 0) == (1 + 0 + 0) and trip over that in turn. At this point, the astute reader may note that the two commits tagged by this patch have been in the kernel for a long time but haven't generated any bug reports. How is it that the author became aware of this bug? This originally surfaced as an intermittent failure when I was testing realtime rmap, but a different bug report by Zorro Lang reveals the same assertion occuring on !lazysbcount filesystems. The common factor to both reports (and why this problem wasn't previously reported) becomes apparent if we consider when xfs_trans_apply_sb_deltas is called by __xfs_trans_commit(): if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp); With a modern lazysbcount filesystem, transactions update only the percpu counters, so they don't need to set XFS_TRANS_SB_DIRTY, hence xfs_trans_apply_sb_deltas is rarely called. However, updates to the count of free realtime extents are not part of lazysbcount, so XFS_TRANS_SB_DIRTY will be set on transactions adding or removing data fork mappings to realtime files; similarly, XFS_TRANS_SB_DIRTY is always set on !lazysbcount filesystems. Dave mentioned in response to an earlier version of this patch: "IIUC, what you are saying is that this debug code is simply not exercised in normal testing and hasn't been for the past decade? And it still won't be exercised on anything other than realtime device testing? "...it was debugging code from 1994 that was largely turned into dead code when lazysbcounters were introduced in 2007. Hence I'm not sure it holds any value anymore." This debugging code isn't especially helpful - you can modify the flcount on one AG and the freeblks of another AG, and it won't trigger. Add the fact that nobody noticed for a decade, and let's just get rid of it (and start testing realtime :P). This bug was found by running generic/051 on either a V4 filesystem lacking lazysbcount; or a V5 filesystem with a realtime volume. Cc: bfoster@redhat.com, zlang@redhat.com Fixes: f8f2835a9cf3 ("xfs: defer agfl block frees when dfops is available") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.c | 3 --- fs/xfs/libxfs/xfs_alloc_btree.c | 2 -- fs/xfs/libxfs/xfs_rmap_btree.c | 2 -- fs/xfs/xfs_fsops.c | 2 -- fs/xfs/xfs_trans.c | 7 ------- fs/xfs/xfs_trans.h | 15 --------------- 6 files changed, 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index aaa19101bb2a..f52b9e4a03f9 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -718,7 +718,6 @@ xfs_alloc_update_counters( agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); - xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { xfs_buf_mark_corrupt(agbp); @@ -2739,7 +2738,6 @@ xfs_alloc_get_freelist( pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); - xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; @@ -2846,7 +2844,6 @@ xfs_alloc_put_freelist( pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); - xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8e01231b308e..dbe302d1cb8d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -73,7 +73,6 @@ xfs_allocbt_alloc_block( xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); - xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); *stat = 1; @@ -97,7 +96,6 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index beb81c84a937..9f5bcbd834c3 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -103,7 +103,6 @@ xfs_rmapbt_alloc_block( xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); - xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -136,7 +135,6 @@ xfs_rmapbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); pag = cur->bc_ag.agbp->b_pag; xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b33c894b6cf3..be9cf88d2ad7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -69,8 +69,6 @@ xfs_resizefs_init_new_ags( if (error) return error; - xfs_trans_agblocks_delta(tp, id->nfree); - if (delta) { *lastag_extended = true; error = xfs_ag_extend_space(mp, tp, id, delta); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bcc978011869..2b46b4f713d1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -487,13 +487,6 @@ xfs_trans_apply_sb_deltas( bp = xfs_trans_getsb(tp); sbp = bp->b_addr; - /* - * Check that superblock mods match the mods made to AGF counters. - */ - ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == - (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + - tp->t_ag_btree_delta)); - /* * Only update the superblock counters if we are logging them */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9dd745cf77c9..ee42d98d9011 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -140,11 +140,6 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#if defined(DEBUG) || defined(XFS_WARN) - int64_t t_ag_freeblks_delta; /* debugging counter */ - int64_t t_ag_flist_delta; /* debugging counter */ - int64_t t_ag_btree_delta; /* debugging counter */ -#endif int64_t t_dblocks_delta;/* superblock dblocks change */ int64_t t_agcount_delta;/* superblock agcount change */ int64_t t_imaxpct_delta;/* superblock imaxpct change */ @@ -165,16 +160,6 @@ typedef struct xfs_trans { */ #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#if defined(DEBUG) || defined(XFS_WARN) -#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) -#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) -#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) -#else -#define xfs_trans_agblocks_delta(tp, d) -#define xfs_trans_agflist_delta(tp, d) -#define xfs_trans_agbtree_delta(tp, d) -#endif - /* * XFS transaction mechanism exported interfaces. */ From e6c01077ec2d28fe8b6e0bc79eddea8d788f6ea3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Apr 2021 16:02:01 -0700 Subject: [PATCH 04/11] xfs: don't check agf_btreeblks on pre-lazysbcount filesystems The AGF free space btree block counter wasn't added until the lazysbcount feature was added to XFS midway through the life of the V4 format, so ignore the field when checking. Online AGF repair requires rmapbt, so it doesn't need the feature check. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 7 ++++++- fs/xfs/scrub/fscounters.c | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 749faa17f8e2..7a2f9b5f2db5 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -416,6 +416,10 @@ xchk_agf_xref_btreeblks( xfs_agblock_t btreeblks; int error; + /* agf_btreeblks didn't exist before lazysbcount */ + if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + return; + /* Check agf_rmap_blocks; set up for agf_btreeblks check */ if (sc->sa.rmap_cur) { error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks); @@ -581,7 +585,8 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) && + pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); xfs_perag_put(pag); diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 7b4386c78fbf..318b81c0f90d 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -182,7 +182,8 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - fsc->fdblocks += pag->pagf_btreeblks; + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + fsc->fdblocks += pag->pagf_btreeblks; /* * Per-AG reservations are taken out of the incore counters, From 6543990a168acf366f4b6174d7bd46ba15a8a2a6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 26 Apr 2021 18:28:31 -0700 Subject: [PATCH 05/11] xfs: update superblock counters correctly for !lazysbcount Keep the mount superblock counters up to date for !lazysbcount filesystems so that when we log the superblock they do not need updating in any way because they are already correct. It's found by what Zorro reported: 1. mkfs.xfs -f -l lazy-count=0 -m crc=0 $dev 2. mount $dev $mnt 3. fsstress -d $mnt -p 100 -n 1000 (maybe need more or less io load) 4. umount $mnt 5. xfs_repair -n $dev and I've seen no problem with this patch. Signed-off-by: Dave Chinner Reported-by: Zorro Lang Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_sb.c | 16 +++++++++++++--- fs/xfs/xfs_trans.c | 3 +++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 60e6d255e5e2..dfbbcbd448c1 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -926,9 +926,19 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp); - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + /* + * Lazy sb counters don't update the in-core superblock so do that now. + * If this is at unmount, the counters will be exactly correct, but at + * any other time they will only be ballpark correct because of + * reservations that have been taken out percpu counters. If we have an + * unclean shutdown, this will be corrected by log recovery rebuilding + * the counters from the AGF block counts. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) { + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2b46b4f713d1..586f2992b789 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -622,6 +622,9 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; + mp->m_sb.sb_icount += idelta; + mp->m_sb.sb_ifree += ifreedelta; mp->m_sb.sb_frextents += rtxdelta; mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; From e147a756ab263f9d10eafd08b79b9fac1b08e56c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 26 Apr 2021 19:06:58 -0700 Subject: [PATCH 06/11] xfs: count free space btree blocks when scrubbing pre-lazysbcount fses Since agf_btreeblks didn't exist before the lazysbcount feature, the fs summary count scrubber needs to walk the free space btrees to determine the amount of space being used by those btrees. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Gao Xiang --- fs/xfs/scrub/fscounters.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 318b81c0f90d..f1d1a8c58853 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -13,6 +13,7 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" +#include "xfs_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -143,6 +144,35 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* Count free space btree blocks manually for pre-lazysbcount filesystems. */ +static int +xchk_fscount_btreeblks( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc, + xfs_agnumber_t agno) +{ + xfs_extlen_t blocks; + int error; + + error = xchk_ag_init(sc, agno, &sc->sa); + if (error) + return error; + + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + +out_free: + xchk_ag_free(sc, &sc->sa); + return error; +} + /* * Calculate what the global in-core counters ought to be from the incore * per-AG structure. Callers can compare this to the actual in-core counters @@ -182,8 +212,15 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) { fsc->fdblocks += pag->pagf_btreeblks; + } else { + error = xchk_fscount_btreeblks(sc, fsc, agno); + if (error) { + xfs_perag_put(pag); + break; + } + } /* * Per-AG reservations are taken out of the incore counters, From 2675ad3890db93e58f2264d07c2d1f615ec5adf7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 28 Apr 2021 15:05:41 -0700 Subject: [PATCH 07/11] xfs: unconditionally read all AGFs on mounts with perag reservation perag reservation is enabled at mount time on a per AG basis. The upcoming change to set aside allocbt blocks from block reservation requires a populated allocbt counter as soon as possible after mount to be fully effective against large perag reservations. Therefore as a preparation step, initialize the pagf on all mounts where at least one reservation is active. Note that this already occurs to some degree on most default format filesystems as reservation requirement calculations already depend on the AGF or AGI, depending on the reservation type. Signed-off-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag_resv.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 6c5f8d10589c..e32a1833d523 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -253,7 +253,8 @@ xfs_ag_resv_init( xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; - int error = 0; + int error = 0, error2; + bool has_resv = false; /* Create the metadata reservation. */ if (pag->pag_meta_resv.ar_asked == 0) { @@ -291,6 +292,8 @@ xfs_ag_resv_init( if (error) goto out; } + if (ask) + has_resv = true; } /* Create the RMAPBT metadata reservation */ @@ -304,19 +307,28 @@ xfs_ag_resv_init( error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); if (error) goto out; + if (ask) + has_resv = true; } -#ifdef DEBUG - /* need to read in the AGF for the ASSERT below to work */ - error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0); - if (error) - return error; - - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); -#endif out: + /* + * Initialize the pagf if we have at least one active reservation on the + * AG. This may have occurred already via reservation calculation, but + * fall back to an explicit init to ensure the in-core allocbt usage + * counters are initialized as soon as possible. This is important + * because filesystems with large perag reservations are susceptible to + * free space reservation problems that the allocbt counter is used to + * address. + */ + if (has_resv) { + error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); + if (error2) + return error2; + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); + } return error; } From 16eaab839a9273ed156ebfccbd40c15d1e72f3d8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 28 Apr 2021 15:05:50 -0700 Subject: [PATCH 08/11] xfs: introduce in-core global counter of allocbt blocks Introduce an in-core counter to track the sum of all allocbt blocks used by the filesystem. This value is currently tracked per-ag via the ->agf_btreeblks field in the AGF, which also happens to include rmapbt blocks. A global, in-core count of allocbt blocks is required to identify the subset of global ->m_fdblocks that consists of unavailable blocks currently used for allocation btrees. To support this calculation at block reservation time, construct a similar global counter for allocbt blocks, populate it on first read of each AGF and update it as allocbt blocks are used and released. Signed-off-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 14 ++++++++++++++ fs/xfs/libxfs/xfs_alloc_btree.c | 2 ++ fs/xfs/xfs_mount.h | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f52b9e4a03f9..82b7cbb1f24f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3033,6 +3033,7 @@ xfs_alloc_read_agf( struct xfs_agf *agf; /* ag freelist header */ struct xfs_perag *pag; /* per allocation group data */ int error; + int allocbt_blks; trace_xfs_alloc_read_agf(mp, agno); @@ -3063,6 +3064,19 @@ xfs_alloc_read_agf( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); pag->pagf_init = 1; pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + + /* + * Update the in-core allocbt counter. Filter out the rmapbt + * subset of the btreeblks counter because the rmapbt is managed + * by perag reservation. Subtract one for the rmapbt root block + * because the rmap counter includes it while the btreeblks + * counter only tracks non-root blocks. + */ + allocbt_blks = pag->pagf_btreeblks; + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; + if (allocbt_blks > 0) + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); } #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index dbe302d1cb8d..a43e4c50e69b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -71,6 +71,7 @@ xfs_allocbt_alloc_block( return 0; } + atomic64_inc(&cur->bc_mp->m_allocbt_blks); xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); new->s = cpu_to_be32(bno); @@ -94,6 +95,7 @@ xfs_allocbt_free_block( if (error) return error; + atomic64_dec(&cur->bc_mp->m_allocbt_blks); xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 81829d19596e..bb67274ee23f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -170,6 +170,12 @@ typedef struct xfs_mount { * extents or anything related to the rt device. */ struct percpu_counter m_delalloc_blks; + /* + * Global count of allocation btree blocks in use across all AGs. Only + * used when perag reservation is enabled. Helps prevent block + * reservation from attempting to reserve allocation btree blocks. + */ + atomic64_t m_allocbt_blks; struct radix_tree_root m_perag_tree; /* per-ag accounting info */ spinlock_t m_perag_lock; /* lock for m_perag_tree */ From fd43cf600cf61c66ae0a1021aca2f636115c7fcb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 28 Apr 2021 15:06:05 -0700 Subject: [PATCH 09/11] xfs: set aside allocation btree blocks from block reservation The blocks used for allocation btrees (bnobt and countbt) are technically considered free space. This is because as free space is used, allocbt blocks are removed and naturally become available for traditional allocation. However, this means that a significant portion of free space may consist of in-use btree blocks if free space is severely fragmented. On large filesystems with large perag reservations, this can lead to a rare but nasty condition where a significant amount of physical free space is available, but the majority of actual usable blocks consist of in-use allocbt blocks. We have a record of a (~12TB, 32 AG) filesystem with multiple AGs in a state with ~2.5GB or so free blocks tracked across ~300 total allocbt blocks, but effectively at 100% full because the the free space is entirely consumed by refcountbt perag reservation. Such a large perag reservation is by design on large filesystems. The problem is that because the free space is so fragmented, this AG contributes the 300 or so allocbt blocks to the global counters as free space. If this pattern repeats across enough AGs, the filesystem lands in a state where global block reservation can outrun physical block availability. For example, a streaming buffered write on the affected filesystem continues to allow delayed allocation beyond the point where writeback starts to fail due to physical block allocation failures. The expected behavior is for the delalloc block reservation to fail gracefully with -ENOSPC before physical block allocation failure is a possibility. To address this problem, set aside in-use allocbt blocks at reservation time and thus ensure they cannot be reserved until truly available for physical allocation. This allows alloc btree metadata to continue to reside in free space, but dynamically adjusts reservation availability based on internal state. Note that the logic requires that the allocbt counter is fully populated at reservation time before it is fully effective. We currently rely on the mount time AGF scan in the perag reservation initialization code for this dependency on filesystems where it's most important (i.e. with active perag reservations). Signed-off-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_mount.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index cb1e2c4702c3..bdfee1943796 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1188,6 +1188,7 @@ xfs_mod_fdblocks( int64_t lcounter; long long res_used; s32 batch; + uint64_t set_aside; if (delta > 0) { /* @@ -1227,8 +1228,20 @@ xfs_mod_fdblocks( else batch = XFS_FDBLOCKS_BATCH; + /* + * Set aside allocbt blocks because these blocks are tracked as free + * space but not available for allocation. Technically this means that a + * single reservation cannot consume all remaining free space, but the + * ratio of allocbt blocks to usable free blocks should be rather small. + * The tradeoff without this is that filesystems that maintain high + * perag block reservations can over reserve physical block availability + * and fail physical allocation, which leads to much more serious + * problems (i.e. transaction abort, pagecache discards, etc.) than + * slightly premature -ENOSPC. + */ + set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, + if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; From d4f74e162d238ce00a640af5f0611c3f51dad70e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 28 Apr 2021 22:41:39 -0700 Subject: [PATCH 10/11] xfs: fix xfs_reflink_unshare usage of filemap_write_and_wait_range The final parameter of filemap_write_and_wait_range is the end of the range to flush, not the length of the range to flush. Fixes: 46afb0628b86 ("xfs: only flush the unshared range in xfs_reflink_unshare") Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Brian Foster --- fs/xfs/xfs_reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4dd4af6ac2ef..060695d6d56a 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1522,7 +1522,8 @@ xfs_reflink_unshare( if (error) goto out; - error = filemap_write_and_wait_range(inode->i_mapping, offset, len); + error = filemap_write_and_wait_range(inode->i_mapping, offset, + offset + len - 1); if (error) goto out; From 8e9800f9f2b89e1efe2a5993361fae4d618a6c26 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 29 Apr 2021 14:39:33 -0700 Subject: [PATCH 11/11] xfs: don't allow log writes if the data device is readonly While running generic/050 with an external log, I observed this warning in dmesg: Trying to write to read-only block-device sda4 (partno 4) WARNING: CPU: 2 PID: 215677 at block/blk-core.c:704 submit_bio_checks+0x256/0x510 Call Trace: submit_bio_noacct+0x2c/0x430 _xfs_buf_ioapply+0x283/0x3c0 [xfs] __xfs_buf_submit+0x6a/0x210 [xfs] xfs_buf_delwri_submit_buffers+0xf8/0x270 [xfs] xfsaild+0x2db/0xc50 [xfs] kthread+0x14b/0x170 I think this happened because we tried to cover the log after a readonly mount, and the AIL tried to write the primary superblock to the data device. The test marks the data device readonly, but it doesn't do the same to the external log device. Therefore, XFS thinks that the log is writable, even though AIL writes whine to dmesg because the data device is read only. Fix this by amending xfs_log_writable to prevent writes when the AIL can't possible write anything into the filesystem. Note: As for the external log or the rt devices being readonly-- xfs_blkdev_get will complain about that if we aren't doing a norecovery mount. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_log.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 06041834daa3..c19a82adea1e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -355,13 +355,15 @@ xfs_log_writable( struct xfs_mount *mp) { /* - * Never write to the log on norecovery mounts, if the block device is - * read-only, or if the filesystem is shutdown. Read-only mounts still - * allow internal writes for log recovery and unmount purposes, so don't - * restrict that case here. + * Do not write to the log on norecovery mounts, if the data or log + * devices are read-only, or if the filesystem is shutdown. Read-only + * mounts allow internal writes for log recovery and unmount purposes, + * so don't restrict that case. */ if (mp->m_flags & XFS_MOUNT_NORECOVERY) return false; + if (xfs_readonly_buftarg(mp->m_ddev_targp)) + return false; if (xfs_readonly_buftarg(mp->m_log->l_targ)) return false; if (XFS_FORCED_SHUTDOWN(mp))