linux/fs/xfs/libxfs/xfs_defer.h

132 lines
4.1 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
#ifndef __XFS_DEFER_H__
#define __XFS_DEFER_H__
struct xfs_btree_cur;
struct xfs_defer_op_type;
xfs: proper replay of deferred ops queued during log recovery When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 509955823cc9c ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation. For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops. This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations cause subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it. Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-09-26 00:39:37 +00:00
struct xfs_defer_capture;
/*
* Header for deferred operation list.
*/
enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_BMAP,
XFS_DEFER_OPS_TYPE_REFCOUNT,
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
xfs: defer agfl block frees when dfops is available The AGFL fixup code executes before every block allocation/free and rectifies the AGFL based on the current, dynamic allocation requirements of the fs. The AGFL must hold a minimum number of blocks to satisfy a worst case split of the free space btrees caused by the impending allocation operation. The AGFL is also updated to maintain the implicit requirement for a minimum number of free slots to satisfy a worst case join of the free space btrees. Since the AGFL caches individual blocks, AGFL reduction typically involves multiple, single block frees. We've had reports of transaction overrun problems during certain workloads that boil down to AGFL reduction freeing multiple blocks and consuming more space in the log than was reserved for the transaction. Since the objective of freeing AGFL blocks is to ensure free AGFL free slots are available for the upcoming allocation, one way to address this problem is to release surplus blocks from the AGFL immediately but defer the free of those blocks (similar to how file-mapped blocks are unmapped from the file in one transaction and freed via a deferred operation) until the transaction is rolled. This turns AGFL reduction into an operation with predictable log reservation consumption. Add the capability to defer AGFL block frees when a deferred ops list is available to the AGFL fixup code. Add a dfops pointer to the transaction to carry dfops through various contexts to the allocator context. Deferring AGFL frees is conditional behavior based on whether the transaction pointer is populated. The long term objective is to reuse the transaction pointer to clean up all unrelated callchains that pass dfops on the stack along with a transaction and in doing so, consistently defer AGFL blocks from the allocator. A bit of customization is required to handle deferred completion processing because AGFL blocks are accounted against a per-ag reservation pool and AGFL blocks are not inserted into the extent busy list when freed (they are inserted when used and released back to the AGFL). Reuse the majority of the existing deferred extent free infrastructure and customize it appropriately to handle AGFL blocks. Note that this patch only adds infrastructure. It does not change behavior because no callers have been updated to pass ->t_agfl_dfops into the allocation code. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-05-08 00:38:47 +00:00
XFS_DEFER_OPS_TYPE_AGFL_FREE,
XFS_DEFER_OPS_TYPE_ATTR,
XFS_DEFER_OPS_TYPE_MAX,
};
/*
* Save a log intent item and a list of extents, so that we can replay
* whatever action had to happen to the extent list and file the log done
* item.
*/
struct xfs_defer_pending {
struct list_head dfp_list; /* pending items */
struct list_head dfp_work; /* work items */
struct xfs_log_item *dfp_intent; /* log intent item */
struct xfs_log_item *dfp_done; /* log done item */
unsigned int dfp_count; /* # extent items */
enum xfs_defer_ops_type dfp_type;
};
void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
struct list_head *h);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
int xfs_defer_finish(struct xfs_trans **tp);
void xfs_defer_cancel(struct xfs_trans *);
void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
struct list_head *items, unsigned int count, bool sort);
void (*abort_intent)(struct xfs_log_item *intent);
struct xfs_log_item *(*create_done)(struct xfs_trans *tp,
struct xfs_log_item *intent, unsigned int count);
int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done,
struct list_head *item, struct xfs_btree_cur **state);
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
void (*cancel_item)(struct list_head *item);
unsigned int max_items;
};
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
xfs: Set up infrastructure for log attribute replay Currently attributes are modified directly across one or more transactions. But they are not logged or replayed in the event of an error. The goal of log attr replay is to enable logging and replaying of attribute operations using the existing delayed operations infrastructure. This will later enable the attributes to become part of larger multi part operations that also must first be recorded to the log. This is mostly of interest in the scheme of parent pointers which would need to maintain an attribute containing parent inode information any time an inode is moved, created, or removed. Parent pointers would then be of interest to any feature that would need to quickly derive an inode path from the mount point. Online scrub, nfs lookups and fs grow or shrink operations are all features that could take advantage of this. This patch adds two new log item types for setting or removing attributes as deferred operations. The xfs_attri_log_item will log an intent to set or remove an attribute. The corresponding xfs_attrd_log_item holds a reference to the xfs_attri_log_item and is freed once the transaction is done. Both log items use a generic xfs_attr_log_format structure that contains the attribute name, value, flags, inode, and an op_flag that indicates if the operations is a set or remove. [dchinner: added extra little bits needed for intent whiteouts] Signed-off-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-04 02:41:02 +00:00
extern const struct xfs_defer_op_type xfs_attr_defer_type;
/*
* Deferred operation item relogging limits.
*/
#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
/* Resources that must be held across a transaction roll. */
struct xfs_defer_resources {
/* held buffers */
struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS];
/* inodes with no unlock flags */
struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES];
/* number of held buffers */
unsigned short dr_bufs;
/* bitmap of ordered buffers */
unsigned short dr_ordered;
/* number of held inodes */
unsigned short dr_inos;
};
xfs: proper replay of deferred ops queued during log recovery When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 509955823cc9c ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation. For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops. This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations cause subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it. Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-09-26 00:39:37 +00:00
/*
* This structure enables a dfops user to detach the chain of deferred
* operations from a transaction so that they can be continued later.
*/
struct xfs_defer_capture {
/* List of other capture structures. */
struct list_head dfc_list;
/* Deferred ops state saved from the transaction. */
struct list_head dfc_dfops;
unsigned int dfc_tpflags;
/* Block reservations for the data and rt devices. */
unsigned int dfc_blkres;
unsigned int dfc_rtxres;
/* Log reservation saved from the transaction. */
unsigned int dfc_logres;
struct xfs_defer_resources dfc_held;
xfs: proper replay of deferred ops queued during log recovery When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 509955823cc9c ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation. For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops. This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations cause subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it. Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-09-26 00:39:37 +00:00
};
/*
* Functions to capture a chain of deferred operations and continue them later.
* This doesn't normally happen except log recovery.
*/
xfs: proper replay of deferred ops queued during log recovery When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 509955823cc9c ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation. For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops. This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations cause subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it. Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
2020-09-26 00:39:37 +00:00
int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
struct xfs_defer_resources *dres);
void xfs_defer_ops_capture_free(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
int __init xfs_defer_init_item_caches(void);
void xfs_defer_destroy_item_caches(void);
#endif /* __XFS_DEFER_H__ */