b27ce0da60
Back when I wrote commit a03297a0ca, I had thought that we'd be doing users a favor by only marking inodes dontcache at the end of a scrub operation, and only if there's only one reference to that inode. This was more or less true back when I_DONTCACHE was an XFS iflag and the only thing it did was change the outcome of xfs_fs_drop_inode to 1. (Note: If there are dentries pointing to the inode when scrub finishes, the inode will have a positive i_count and will stay around in cache until dentry reclaim.)

But now we have d_mark_dontcache, which causes the inode *and* the dentries attached to it all to be marked I_DONTCACHE, which means that we drop the dentries ASAP, which drops the inode ASAP.

This is bad if scrub found problems with the inode, because now it can be scheduled for inactivation, which can cause inodegc to trip on it and shut down the filesystem.

Even if the inode isn't bad, this is still suboptimal because phases 3-7 each initiate inode scans. Dropping the inode immediately during phase 3 is silly because phase 5 will reload it and drop it immediately, etc. It's fine to mark the inodes dontcache, but if there have been accesses to the file that set up dentries, we should keep them.

I validated this by setting up ftrace to capture xfs_iget_recycle* tracepoints and ran xfs/285 for 30 seconds. With the current djwong-wtf branch I saw ~30,000 recycle events. I then dropped the d_mark_dontcache calls and set XFS_IGET_DONTCACHE, and the recycle events dropped to ~5,000 per 30 seconds.

Therefore, grab the inode with XFS_IGET_DONTCACHE, which only has the effect of setting I_DONTCACHE for cache misses. Remove the d_mark_dontcache call that can happen in xchk_irele.

Fixes: a03297a0ca ("xfs: manage inode DONTCACHE status at irele time")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
320 lines · 9.7 KiB · C
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#ifndef __XFS_SCRUB_SCRUB_H__
#define __XFS_SCRUB_SCRUB_H__

struct xfs_scrub;

struct xchk_relax {
	unsigned long	next_resched;
	unsigned int	resched_nr;
	bool		interruptible;
};

/* Yield to the scheduler at most 10x per second. */
#define XCHK_RELAX_NEXT	(jiffies + (HZ / 10))

#define INIT_XCHK_RELAX \
	(struct xchk_relax){ \
		.next_resched	= XCHK_RELAX_NEXT, \
		.resched_nr	= 0, \
		.interruptible	= true, \
	}
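
/*
 * Illustrative use (a hedged sketch; the real call sites live in scrub.c):
 * scrub setup resets the throttling state with a plain assignment of the
 * compound literal, e.g.
 *
 *	sc->relax = INIT_XCHK_RELAX;
 */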

/*
 * Relax during a scrub operation and exit if there's a fatal signal pending.
 *
 * If preemption is disabled, we need to yield to the scheduler every now and
 * then so that we don't run afoul of the soft lockup watchdog or RCU stall
 * detector.  cond_resched calls are somewhat expensive (~5ns) so we want to
 * ratelimit this to 10x per second.  Amortize the cost of the other checks by
 * only doing it once every 100 calls.
 */
static inline int xchk_maybe_relax(struct xchk_relax *widget)
{
	/* Amortize the cost of scheduling and checking signals. */
	if (likely(++widget->resched_nr < 100))
		return 0;
	widget->resched_nr = 0;

	if (unlikely(widget->next_resched <= jiffies)) {
		cond_resched();
		widget->next_resched = XCHK_RELAX_NEXT;
	}

	if (widget->interruptible && fatal_signal_pending(current))
		return -EINTR;

	return 0;
}

/*
 * Standard flags for allocating memory within scrub.  NOFS context is
 * configured by the process allocation scope.  Scrub and repair must be able
 * to back out gracefully if there isn't enough memory.  Force-cast to avoid
 * complaints from static checkers.
 */
#define XCHK_GFP_FLAGS	((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
					 __GFP_RETRY_MAYFAIL))
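
/*
 * Illustrative allocation (hedged sketch; "xchk_foo" is a stand-in for
 * whatever scrubber-private state is needed, not a real structure):
 *
 *	sc->buf = kvmalloc(sizeof(struct xchk_foo), XCHK_GFP_FLAGS);
 *	if (!sc->buf)
 *		return -ENOMEM;
 */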

/*
 * For opening files by handle for fsck operations, we don't trust the inumber
 * or the allocation state; therefore, perform an untrusted lookup.  We don't
 * want these inodes to pollute the cache, so mark them for immediate removal.
 */
#define XCHK_IGET_FLAGS	(XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE)

/* Type info and names for the scrub types. */
enum xchk_type {
	ST_NONE = 1,	/* disabled */
	ST_PERAG,	/* per-AG metadata */
	ST_FS,		/* per-FS metadata */
	ST_INODE,	/* per-inode metadata */
};

struct xchk_meta_ops {
	/* Acquire whatever resources are needed for the operation. */
	int		(*setup)(struct xfs_scrub *sc);

	/* Examine metadata for errors. */
	int		(*scrub)(struct xfs_scrub *);

	/* Repair or optimize the metadata. */
	int		(*repair)(struct xfs_scrub *);

	/*
	 * Re-scrub the metadata we repaired, in case there's extra work that
	 * we need to do to check our repair work.  If this is NULL, we'll use
	 * the ->scrub function pointer, assuming that the regular scrub is
	 * sufficient.
	 */
	int		(*repair_eval)(struct xfs_scrub *sc);

	/* Decide if we even have this piece of metadata. */
	bool		(*has)(struct xfs_mount *);

	/* type describing required/allowed inputs */
	enum xchk_type	type;
};
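
/*
 * Illustrative dispatch entry (hedged sketch; the real meta_scrub_ops
 * table lives in scrub.c and its exact contents may differ):
 *
 *	static const struct xchk_meta_ops meta_scrub_ops[] = {
 *		[XFS_SCRUB_TYPE_AGF] = {
 *			.type	= ST_PERAG,
 *			.setup	= xchk_setup_fs,
 *			.scrub	= xchk_agf,
 *		},
 *	};
 */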

/* Buffer pointers and btree cursors for an entire AG. */
struct xchk_ag {
	struct xfs_perag	*pag;

	/* AG btree roots */
	struct xfs_buf		*agf_bp;
	struct xfs_buf		*agi_bp;

	/* AG btrees */
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_btree_cur	*ino_cur;
	struct xfs_btree_cur	*fino_cur;
	struct xfs_btree_cur	*rmap_cur;
	struct xfs_btree_cur	*refc_cur;
};

struct xfs_scrub {
	/* General scrub state. */
	struct xfs_mount		*mp;
	struct xfs_scrub_metadata	*sm;
	const struct xchk_meta_ops	*ops;
	struct xfs_trans		*tp;

	/* File that scrub was called with. */
	struct file			*file;

	/*
	 * File that is undergoing the scrub operation.  This can differ from
	 * the file that scrub was called with if we're checking file-based fs
	 * metadata (e.g. rt bitmaps) or if we're doing a scrub-by-handle for
	 * something that can't be opened directly (e.g. symlinks).
	 */
	struct xfs_inode		*ip;

	/* Kernel memory buffer used by scrubbers; freed at teardown. */
	void				*buf;

	/*
	 * Clean up resources owned by whatever is in the buffer.  Cleanup can
	 * be deferred with this hook as a means for scrub functions to pass
	 * data to repair functions.  This function must not free the buffer
	 * itself.
	 */
	void				(*buf_cleanup)(void *buf);

	/* xfile used by the scrubbers; freed at teardown. */
	struct xfile			*xfile;

	/* buffer target for in-memory btrees; also freed at teardown. */
	struct xfs_buftarg		*xmbtp;

	/* Lock flags for @ip. */
	uint				ilock_flags;

	/* The orphanage, for stashing files that have lost their parent. */
	uint				orphanage_ilock_flags;
	struct xfs_inode		*orphanage;

	/* A temporary file on this filesystem, for staging new metadata. */
	struct xfs_inode		*tempip;
	uint				temp_ilock_flags;

	/* See the XCHK/XREP state flags below. */
	unsigned int			flags;

	/*
	 * The XFS_SICK_* flags that correspond to the metadata being scrubbed
	 * or repaired.  We will use this mask to update the in-core fs health
	 * status with whatever we find.
	 */
	unsigned int			sick_mask;

	/* next time we want to cond_resched() */
	struct xchk_relax		relax;

	/* State tracking for single-AG operations. */
	struct xchk_ag			sa;
};

/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
#define XCHK_TRY_HARDER		(1U << 0)  /* can't get resources, try again */
#define XCHK_HAVE_FREEZE_PROT	(1U << 1)  /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN	(1U << 2)  /* defer ops draining enabled */
#define XCHK_NEED_DRAIN		(1U << 3)  /* scrub needs to drain defer ops */
#define XCHK_FSGATES_QUOTA	(1U << 4)  /* quota live update enabled */
#define XCHK_FSGATES_DIRENTS	(1U << 5)  /* directory live update enabled */
#define XCHK_FSGATES_RMAP	(1U << 6)  /* rmapbt live update enabled */
#define XREP_FSGATES_EXCHANGE_RANGE	(1U << 29) /* uses file content exchange */
#define XREP_RESET_PERAG_RESV	(1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED	(1U << 31) /* checking our repair work */
/*
|
|
* The XCHK_FSGATES* flags reflect functionality in the main filesystem that
|
|
* are only enabled for this particular online fsck. When not in use, the
|
|
* features are gated off via dynamic code patching, which is why the state
|
|
* must be enabled during scrub setup and can only be torn down afterwards.
|
|
*/
|
|
#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \
|
|
XCHK_FSGATES_QUOTA | \
|
|
XCHK_FSGATES_DIRENTS | \
|
|
XCHK_FSGATES_RMAP)
|
|
|
|
/*
|
|
* The sole XREP_FSGATES* flag reflects a log intent item that is protected
|
|
* by a log-incompat feature flag. No code patching in use here.
|
|
*/
|
|
#define XREP_FSGATES_ALL (XREP_FSGATES_EXCHANGE_RANGE)
|
|
|
|
struct xfs_scrub_subord {
|
|
struct xfs_scrub sc;
|
|
struct xfs_scrub *parent_sc;
|
|
unsigned int old_smtype;
|
|
unsigned int old_smflags;
|
|
};
|
|
|
|
struct xfs_scrub_subord *xchk_scrub_create_subord(struct xfs_scrub *sc,
|
|
unsigned int subtype);
|
|
void xchk_scrub_free_subord(struct xfs_scrub_subord *sub);
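
/*
 * Illustrative pairing (hedged sketch, modeled on the scrub.c callers):
 * run one metadata type as a subordinate of an existing scrub context,
 * then tear it down:
 *
 *	sub = xchk_scrub_create_subord(sc, XFS_SCRUB_TYPE_BMBTD);
 *	error = sub->sc.ops->scrub(&sub->sc);
 *	xchk_scrub_free_subord(sub);
 */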

/*
 * We /could/ terminate a scrub/repair operation early.  If we're not
 * in a good place to continue (fatal signal, etc.) then bail out.
 * Note that we're careful not to make any judgements about *error.
 */
static inline bool
xchk_should_terminate(
	struct xfs_scrub	*sc,
	int			*error)
{
	if (xchk_maybe_relax(&sc->relax)) {
		if (*error == 0)
			*error = -EINTR;
		return true;
	}
	return false;
}
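
/*
 * Typical call pattern in a scan loop (illustrative, not from this file):
 *
 *	while (!done) {
 *		if (xchk_should_terminate(sc, &error))
 *			break;
 *		...
 *	}
 *	return error;
 */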

/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
int xchk_agf(struct xfs_scrub *sc);
int xchk_agfl(struct xfs_scrub *sc);
int xchk_agi(struct xfs_scrub *sc);
int xchk_allocbt(struct xfs_scrub *sc);
int xchk_iallocbt(struct xfs_scrub *sc);
int xchk_rmapbt(struct xfs_scrub *sc);
int xchk_refcountbt(struct xfs_scrub *sc);
int xchk_inode(struct xfs_scrub *sc);
int xchk_bmap_data(struct xfs_scrub *sc);
int xchk_bmap_attr(struct xfs_scrub *sc);
int xchk_bmap_cow(struct xfs_scrub *sc);
int xchk_directory(struct xfs_scrub *sc);
int xchk_xattr(struct xfs_scrub *sc);
int xchk_symlink(struct xfs_scrub *sc);
int xchk_parent(struct xfs_scrub *sc);
int xchk_dirtree(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
#else
static inline int
xchk_rtbitmap(struct xfs_scrub *sc)
{
	return -ENOENT;
}
static inline int
xchk_rtsummary(struct xfs_scrub *sc)
{
	return -ENOENT;
}
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
int xchk_quotacheck(struct xfs_scrub *sc);
#else
static inline int
xchk_quota(struct xfs_scrub *sc)
{
	return -ENOENT;
}
static inline int
xchk_quotacheck(struct xfs_scrub *sc)
{
	return -ENOENT;
}
#endif
int xchk_fscounters(struct xfs_scrub *sc);
int xchk_nlinks(struct xfs_scrub *sc);

/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
		xfs_extlen_t len);
void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
		xfs_extlen_t len);
void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
		xfs_extlen_t len);
void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
		xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno,
		xfs_extlen_t len);
void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
		xfs_extlen_t len);
void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno,
		xfs_extlen_t len);
void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
		xfs_extlen_t len);
#ifdef CONFIG_XFS_RT
void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
		xfs_extlen_t len);
#else
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif

#endif /* __XFS_SCRUB_SCRUB_H__ */