forked from Minki/linux
1aec7c3d05
In commitf8f2835a9c
we changed the behavior of XFS to use EFIs to remove blocks from an overfilled AGFL because there were complaints about transaction overruns that stemmed from trying to free multiple blocks in a single transaction. Unfortunately, that commit missed a subtlety in the debug-mode transaction accounting when a realtime volume is attached. If a realtime file undergoes a data fork mapping change such that realtime extents are allocated (or freed) in the same transaction that a data device block is also allocated (or freed), we can trip a debugging assertion. This can happen (for example) if a realtime extent is allocated and it is necessary to reshape the bmbt to hold the new mapping. When we go to allocate a bmbt block from an AG, the first thing the data device block allocator does is ensure that the freelist is the proper length. If the freelist is too long, it will trim the freelist to the proper length. In debug mode, trimming the freelist calls xfs_trans_agflist_delta() to record the decrement in the AG free list count. Prior to f8f28 we would put the free block back in the free space btrees in the same transaction, which calls xfs_trans_agblocks_delta() to record the increment in the AG free block count. Since AGFL blocks are included in the global free block count (fdblocks), there is no corresponding fdblocks update, so the AGFL free satisfies the following condition in xfs_trans_apply_sb_deltas: /* * Check that superblock mods match the mods made to AGF counters. */ ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta)); The comparison here used to be: (X + 0) == ((X+1) + -1 + 0), where X is the number blocks that were allocated. After commit f8f28 we defer the block freeing to the next chained transaction, which means that the calls to xfs_trans_agflist_delta and xfs_trans_agblocks_delta occur in separate transactions. The (first) transaction that shortens the free list trips on the comparison, which has now become: (X + 0) == ((X) + -1 + 0) because we haven't freed the AGFL block yet; we've only logged an intention to free it. When the second transaction (the deferred free) commits, it will evaluate the expression as: (0 + 0) == (1 + 0 + 0) and trip over that in turn. At this point, the astute reader may note that the two commits tagged by this patch have been in the kernel for a long time but haven't generated any bug reports. How is it that the author became aware of this bug? This originally surfaced as an intermittent failure when I was testing realtime rmap, but a different bug report by Zorro Lang reveals the same assertion occuring on !lazysbcount filesystems. The common factor to both reports (and why this problem wasn't previously reported) becomes apparent if we consider when xfs_trans_apply_sb_deltas is called by __xfs_trans_commit(): if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp); With a modern lazysbcount filesystem, transactions update only the percpu counters, so they don't need to set XFS_TRANS_SB_DIRTY, hence xfs_trans_apply_sb_deltas is rarely called. However, updates to the count of free realtime extents are not part of lazysbcount, so XFS_TRANS_SB_DIRTY will be set on transactions adding or removing data fork mappings to realtime files; similarly, XFS_TRANS_SB_DIRTY is always set on !lazysbcount filesystems. Dave mentioned in response to an earlier version of this patch: "IIUC, what you are saying is that this debug code is simply not exercised in normal testing and hasn't been for the past decade? And it still won't be exercised on anything other than realtime device testing? "...it was debugging code from 1994 that was largely turned into dead code when lazysbcounters were introduced in 2007. Hence I'm not sure it holds any value anymore." This debugging code isn't especially helpful - you can modify the flcount on one AG and the freeblks of another AG, and it won't trigger. Add the fact that nobody noticed for a decade, and let's just get rid of it (and start testing realtime :P). This bug was found by running generic/051 on either a V4 filesystem lacking lazysbcount; or a V5 filesystem with a realtime volume. Cc: bfoster@redhat.com, zlang@redhat.com Fixes:f8f2835a9c
("xfs: defer agfl block frees when dfops is available") Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Brian Foster <bfoster@redhat.com>
300 lines
9.7 KiB
C
300 lines
9.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
|
|
* All Rights Reserved.
|
|
*/
|
|
#ifndef __XFS_TRANS_H__
|
|
#define __XFS_TRANS_H__
|
|
|
|
/* kernel only transaction subsystem defines */
|
|
|
|
struct xfs_buf;
|
|
struct xfs_buftarg;
|
|
struct xfs_efd_log_item;
|
|
struct xfs_efi_log_item;
|
|
struct xfs_inode;
|
|
struct xfs_item_ops;
|
|
struct xfs_log_iovec;
|
|
struct xfs_mount;
|
|
struct xfs_trans;
|
|
struct xfs_trans_res;
|
|
struct xfs_dquot_acct;
|
|
struct xfs_rud_log_item;
|
|
struct xfs_rui_log_item;
|
|
struct xfs_btree_cur;
|
|
struct xfs_cui_log_item;
|
|
struct xfs_cud_log_item;
|
|
struct xfs_bui_log_item;
|
|
struct xfs_bud_log_item;
|
|
|
|
struct xfs_log_item {
|
|
struct list_head li_ail; /* AIL pointers */
|
|
struct list_head li_trans; /* transaction list */
|
|
xfs_lsn_t li_lsn; /* last on-disk lsn */
|
|
struct xfs_mount *li_mountp; /* ptr to fs mount */
|
|
struct xfs_ail *li_ailp; /* ptr to AIL */
|
|
uint li_type; /* item type */
|
|
unsigned long li_flags; /* misc flags */
|
|
struct xfs_buf *li_buf; /* real buffer pointer */
|
|
struct list_head li_bio_list; /* buffer item list */
|
|
const struct xfs_item_ops *li_ops; /* function list */
|
|
|
|
/* delayed logging */
|
|
struct list_head li_cil; /* CIL pointers */
|
|
struct xfs_log_vec *li_lv; /* active log vector */
|
|
struct xfs_log_vec *li_lv_shadow; /* standby vector */
|
|
xfs_lsn_t li_seq; /* CIL commit seq */
|
|
};
|
|
|
|
/*
|
|
* li_flags use the (set/test/clear)_bit atomic interfaces because updates can
|
|
* race with each other and we don't want to have to use the AIL lock to
|
|
* serialise all updates.
|
|
*/
|
|
#define XFS_LI_IN_AIL 0
|
|
#define XFS_LI_ABORTED 1
|
|
#define XFS_LI_FAILED 2
|
|
#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
|
|
|
|
#define XFS_LI_FLAGS \
|
|
{ (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
|
|
{ (1 << XFS_LI_ABORTED), "ABORTED" }, \
|
|
{ (1 << XFS_LI_FAILED), "FAILED" }, \
|
|
{ (1 << XFS_LI_DIRTY), "DIRTY" }
|
|
|
|
struct xfs_item_ops {
|
|
unsigned flags;
|
|
void (*iop_size)(struct xfs_log_item *, int *, int *);
|
|
void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
|
|
void (*iop_pin)(struct xfs_log_item *);
|
|
void (*iop_unpin)(struct xfs_log_item *, int remove);
|
|
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
|
|
void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
|
|
void (*iop_release)(struct xfs_log_item *);
|
|
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
|
|
int (*iop_recover)(struct xfs_log_item *lip,
|
|
struct list_head *capture_list);
|
|
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
|
|
struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
|
|
struct xfs_trans *tp);
|
|
};
|
|
|
|
/* Is this log item a deferred action intent? */
|
|
static inline bool
|
|
xlog_item_is_intent(struct xfs_log_item *lip)
|
|
{
|
|
return lip->li_ops->iop_recover != NULL &&
|
|
lip->li_ops->iop_match != NULL;
|
|
}
|
|
|
|
/* Is this a log intent-done item? */
|
|
static inline bool
|
|
xlog_item_is_intent_done(struct xfs_log_item *lip)
|
|
{
|
|
return lip->li_ops->iop_unpin == NULL &&
|
|
lip->li_ops->iop_push == NULL;
|
|
}
|
|
|
|
/*
|
|
* Release the log item as soon as committed. This is for items just logging
|
|
* intents that never need to be written back in place.
|
|
*/
|
|
#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
|
|
|
|
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
|
|
int type, const struct xfs_item_ops *ops);
|
|
|
|
/*
|
|
* Return values for the iop_push() routines.
|
|
*/
|
|
#define XFS_ITEM_SUCCESS 0
|
|
#define XFS_ITEM_PINNED 1
|
|
#define XFS_ITEM_LOCKED 2
|
|
#define XFS_ITEM_FLUSHING 3
|
|
|
|
/*
|
|
* Deferred operation item relogging limits.
|
|
*/
|
|
#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
|
|
#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
|
|
|
|
/*
|
|
* This is the structure maintained for every active transaction.
|
|
*/
|
|
typedef struct xfs_trans {
|
|
unsigned int t_magic; /* magic number */
|
|
unsigned int t_log_res; /* amt of log space resvd */
|
|
unsigned int t_log_count; /* count for perm log res */
|
|
unsigned int t_blk_res; /* # of blocks resvd */
|
|
unsigned int t_blk_res_used; /* # of resvd blocks used */
|
|
unsigned int t_rtx_res; /* # of rt extents resvd */
|
|
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
|
|
unsigned int t_flags; /* misc flags */
|
|
xfs_fsblock_t t_firstblock; /* first block allocated */
|
|
struct xlog_ticket *t_ticket; /* log mgr ticket */
|
|
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
|
|
struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
|
|
int64_t t_icount_delta; /* superblock icount change */
|
|
int64_t t_ifree_delta; /* superblock ifree change */
|
|
int64_t t_fdblocks_delta; /* superblock fdblocks chg */
|
|
int64_t t_res_fdblocks_delta; /* on-disk only chg */
|
|
int64_t t_frextents_delta;/* superblock freextents chg*/
|
|
int64_t t_res_frextents_delta; /* on-disk only chg */
|
|
int64_t t_dblocks_delta;/* superblock dblocks change */
|
|
int64_t t_agcount_delta;/* superblock agcount change */
|
|
int64_t t_imaxpct_delta;/* superblock imaxpct change */
|
|
int64_t t_rextsize_delta;/* superblock rextsize chg */
|
|
int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
|
|
int64_t t_rblocks_delta;/* superblock rblocks change */
|
|
int64_t t_rextents_delta;/* superblocks rextents chg */
|
|
int64_t t_rextslog_delta;/* superblocks rextslog chg */
|
|
struct list_head t_items; /* log item descriptors */
|
|
struct list_head t_busy; /* list of busy extents */
|
|
struct list_head t_dfops; /* deferred operations */
|
|
unsigned long t_pflags; /* saved process flags state */
|
|
} xfs_trans_t;
|
|
|
|
/*
|
|
* XFS transaction mechanism exported interfaces that are
|
|
* actually macros.
|
|
*/
|
|
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
|
|
|
|
/*
|
|
* XFS transaction mechanism exported interfaces.
|
|
*/
|
|
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
|
|
uint blocks, uint rtextents, uint flags,
|
|
struct xfs_trans **tpp);
|
|
int xfs_trans_alloc_empty(struct xfs_mount *mp,
|
|
struct xfs_trans **tpp);
|
|
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
|
|
|
|
int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
|
|
struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp);
|
|
|
|
static inline int
|
|
xfs_trans_get_buf(
|
|
struct xfs_trans *tp,
|
|
struct xfs_buftarg *target,
|
|
xfs_daddr_t blkno,
|
|
int numblks,
|
|
uint flags,
|
|
struct xfs_buf **bpp)
|
|
{
|
|
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
|
|
return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp);
|
|
}
|
|
|
|
int xfs_trans_read_buf_map(struct xfs_mount *mp,
|
|
struct xfs_trans *tp,
|
|
struct xfs_buftarg *target,
|
|
struct xfs_buf_map *map, int nmaps,
|
|
xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp,
|
|
const struct xfs_buf_ops *ops);
|
|
|
|
static inline int
|
|
xfs_trans_read_buf(
|
|
struct xfs_mount *mp,
|
|
struct xfs_trans *tp,
|
|
struct xfs_buftarg *target,
|
|
xfs_daddr_t blkno,
|
|
int numblks,
|
|
xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp,
|
|
const struct xfs_buf_ops *ops)
|
|
{
|
|
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
|
|
return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
|
|
flags, bpp, ops);
|
|
}
|
|
|
|
struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
|
|
|
|
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
|
|
bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
|
|
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
|
|
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
|
|
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
|
|
uint);
|
|
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
|
|
bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
|
|
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
|
|
|
|
int xfs_trans_commit(struct xfs_trans *);
|
|
int xfs_trans_roll(struct xfs_trans **);
|
|
int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
|
|
void xfs_trans_cancel(xfs_trans_t *);
|
|
int xfs_trans_ail_init(struct xfs_mount *);
|
|
void xfs_trans_ail_destroy(struct xfs_mount *);
|
|
|
|
void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
|
|
enum xfs_blft);
|
|
void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
|
|
struct xfs_buf *src_bp);
|
|
|
|
extern kmem_zone_t *xfs_trans_zone;
|
|
|
|
static inline struct xfs_log_item *
|
|
xfs_trans_item_relog(
|
|
struct xfs_log_item *lip,
|
|
struct xfs_trans *tp)
|
|
{
|
|
return lip->li_ops->iop_relog(lip, tp);
|
|
}
|
|
|
|
struct xfs_dquot;
|
|
|
|
int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
|
|
unsigned int dblocks, unsigned int rblocks, bool force,
|
|
struct xfs_trans **tpp);
|
|
int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
|
|
struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
|
|
struct xfs_dquot *pdqp, unsigned int dblocks,
|
|
struct xfs_trans **tpp);
|
|
int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
|
|
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
|
|
struct xfs_trans **tpp);
|
|
|
|
static inline void
|
|
xfs_trans_set_context(
|
|
struct xfs_trans *tp)
|
|
{
|
|
ASSERT(current->journal_info == NULL);
|
|
tp->t_pflags = memalloc_nofs_save();
|
|
current->journal_info = tp;
|
|
}
|
|
|
|
static inline void
|
|
xfs_trans_clear_context(
|
|
struct xfs_trans *tp)
|
|
{
|
|
if (current->journal_info == tp) {
|
|
memalloc_nofs_restore(tp->t_pflags);
|
|
current->journal_info = NULL;
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
xfs_trans_switch_context(
|
|
struct xfs_trans *old_tp,
|
|
struct xfs_trans *new_tp)
|
|
{
|
|
ASSERT(current->journal_info == old_tp);
|
|
new_tp->t_pflags = old_tp->t_pflags;
|
|
old_tp->t_pflags = 0;
|
|
current->journal_info = new_tp;
|
|
}
|
|
|
|
#endif /* __XFS_TRANS_H__ */
|