linux/fs/xfs/xfs_mount.c

1253 lines
33 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
#include "xfs_icache.h"
#include "xfs_sysfs.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_reflink.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
2019-12-11 21:19:06 +00:00
#include "xfs_trace.h"
#include "xfs_ag.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
void
xfs_uuid_table_free(void)
{
if (xfs_uuid_table_size == 0)
return;
kmem_free(xfs_uuid_table);
xfs_uuid_table = NULL;
xfs_uuid_table_size = 0;
}
/*
* See if the UUID is unique among mounted XFS filesystems.
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
*/
STATIC int
xfs_uuid_mount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
int hole, i;
/* Publish UUID in struct super_block */
uuid_copy(&mp->m_super->s_uuid, uuid);
if (mp->m_flags & XFS_MOUNT_NOUUID)
return 0;
if (uuid_is_null(uuid)) {
xfs_warn(mp, "Filesystem has null UUID - can't mount");
return -EINVAL;
}
mutex_lock(&xfs_uuid_table_mutex);
for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
if (uuid_is_null(&xfs_uuid_table[i])) {
hole = i;
continue;
}
if (uuid_equal(uuid, &xfs_uuid_table[i]))
goto out_duplicate;
}
if (hole < 0) {
xfs_uuid_table = krealloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
GFP_KERNEL | __GFP_NOFAIL);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
mutex_unlock(&xfs_uuid_table_mutex);
return 0;
out_duplicate:
mutex_unlock(&xfs_uuid_table_mutex);
xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
return -EINVAL;
}
STATIC void
xfs_uuid_unmount(
struct xfs_mount *mp)
{
uuid_t *uuid = &mp->m_sb.sb_uuid;
int i;
if (mp->m_flags & XFS_MOUNT_NOUUID)
return;
mutex_lock(&xfs_uuid_table_mutex);
for (i = 0; i < xfs_uuid_table_size; i++) {
if (uuid_is_null(&xfs_uuid_table[i]))
continue;
if (!uuid_equal(uuid, &xfs_uuid_table[i]))
continue;
memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
break;
}
ASSERT(i < xfs_uuid_table_size);
mutex_unlock(&xfs_uuid_table_mutex);
}
/*
* Check size of device based on the (data/realtime) block count.
* Note: this check is used by the growfs code as well as mount.
*/
int
xfs_sb_validate_fsb_count(
xfs_sb_t *sbp,
uint64_t nblocks)
{
ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
ASSERT(sbp->sb_blocklog >= BBSHIFT);
/* Limited by ULONG_MAX of page cache index */
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
return -EFBIG;
return 0;
}
/*
* xfs_readsb
*
* Does the initial read of the superblock.
*/
int
xfs_readsb(
struct xfs_mount *mp,
int flags)
{
unsigned int sector_size;
struct xfs_buf *bp;
struct xfs_sb *sbp = &mp->m_sb;
int error;
int loud = !(flags & XFS_MFSI_QUIET);
const struct xfs_buf_ops *buf_ops;
ASSERT(mp->m_sb_bp == NULL);
ASSERT(mp->m_ddev_targp != NULL);
/*
* For the initial read, we must guess at the sector
* size based on the block device. It's enough to
* get the sb_sectsize out of the superblock and
* then reread with the proper length.
* We don't verify it yet, because it may not be complete.
*/
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
buf_ops = NULL;
/*
* Allocate a (locked) buffer to hold the superblock. This will be kept
* around at all times to optimize access to the superblock. Therefore,
* set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
* elevated.
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
BTOBB(sector_size), XBF_NO_IOACCT, &bp,
buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
/* bad CRC means corrupted metadata */
if (error == -EFSBADCRC)
error = -EFSCORRUPTED;
return error;
}
/*
* Initialize the mount structure from the superblock.
*/
xfs_sb_from_disk(sbp, bp->b_addr);
/*
* If we haven't validated the superblock, do so now before we try
* to check the sector size and reread the superblock appropriately.
*/
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
if (loud)
xfs_warn(mp, "Invalid superblock magic number");
error = -EINVAL;
goto release_buf;
}
/*
* We must be able to do sector-sized and sector-aligned IO.
*/
if (sector_size > sbp->sb_sectsize) {
if (loud)
xfs_warn(mp, "device supports %u byte sectors (not %u)",
sector_size, sbp->sb_sectsize);
error = -ENOSYS;
goto release_buf;
}
if (buf_ops == NULL) {
/*
* Re-read the superblock so the buffer is correctly sized,
* and properly verified.
*/
xfs_buf_relse(bp);
sector_size = sbp->sb_sectsize;
buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
goto reread;
}
xfs_reinit_percpu_counters(mp);
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
mp->m_sb_bp = bp;
xfs_buf_unlock(bp);
return 0;
release_buf:
xfs_buf_relse(bp);
return error;
}
2019-12-11 21:19:06 +00:00
/*
* If the sunit/swidth change would move the precomputed root inode value, we
* must reject the ondisk change because repair will stumble over that.
* However, we allow the mount to proceed because we never rejected this
* combination before. Returns true to update the sb, false otherwise.
*/
static inline int
xfs_check_new_dalign(
struct xfs_mount *mp,
int new_dalign,
bool *update_sb)
{
struct xfs_sb *sbp = &mp->m_sb;
xfs_ino_t calc_ino;
calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
if (sbp->sb_rootino == calc_ino) {
*update_sb = true;
return 0;
}
xfs_warn(mp,
"Cannot change stripe alignment; would require moving root inode.");
/*
* XXX: Next time we add a new incompat feature, this should start
* returning -EINVAL to fail the mount. Until then, spit out a warning
* that we're ignoring the administrator's instructions.
*/
xfs_warn(mp, "Skipping superblock stripe alignment update.");
*update_sb = false;
return 0;
}
/*
* If we were provided with new sunit/swidth values as mount options, make sure
* that they pass basic alignment and superblock feature checks, and convert
* them into the same units (FSB) that everything else expects. This step
* /must/ be done before computing the inode geometry.
*/
STATIC int
xfs_validate_new_dalign(
struct xfs_mount *mp)
{
if (mp->m_dalign == 0)
return 0;
/*
* If stripe unit and stripe width are not multiples
* of the fs blocksize turn off alignment.
*/
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
xfs_warn(mp,
"alignment check failed: sunit/swidth vs. blocksize(%d)",
mp->m_sb.sb_blocksize);
return -EINVAL;
} else {
/*
* Convert the stripe unit and width to FSBs.
*/
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
xfs_warn(mp,
"alignment check failed: sunit/swidth vs. agsize(%d)",
mp->m_sb.sb_agblocks);
return -EINVAL;
} else if (mp->m_dalign) {
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
xfs: Don't keep silent if sunit/swidth can not be changed via mount As per the mount man page, sunit and swidth can be changed via mount options. For XFS, on the face of it, those options seems works if the specified alignments is properly, e.g. # mount -o sunit=4096,swidth=8192 /dev/sdb1 /mnt # mount | grep sdb1 /dev/sdb1 on /mnt type xfs (rw,sunit=4096,swidth=8192) However, neither sunit nor swidth is shown from the xfs_info output. # xfs_info /mnt meta-data=/dev/sdb1 isize=256 agcount=4, agsize=262144 blks = sectsz=512 attr=2 data = bsize=4096 blocks=1048576, imaxpct=25 = sunit=0 swidth=0 blks ^^^^^^^^^^^^^^^^^^^^^^^^^^ naming =version 2 bsize=4096 ascii-ci=0 log =internal bsize=4096 blocks=2560, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 The reason is that the alignment can only be changed if the relevant super block is already configured with alignments, otherwise, the given value is silently ignored. With this fix, the attempt to mount a storage without strip alignment setup on a super block will get an error with a warning in syslog to indicate the true cause, e.g. # mount -o sunit=4096,swidth=8192 /dev/sdb1 /mnt mount: wrong fs type, bad option, bad superblock on /dev/sdb1, missing codepage or helper program, or other error In some cases useful info is found in syslog - try dmesg | tail or so ....... XFS (sdb1): cannot change alignment: superblock does not support data alignment Signed-off-by: Jie Liu <jeff.liu@oracle.com> Cc: Mark Tinguely <tinguely@sgi.com> Cc: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-05-02 11:27:53 +00:00
} else {
xfs_warn(mp,
"alignment check failed: sunit(%d) less than bsize(%d)",
mp->m_dalign, mp->m_sb.sb_blocksize);
return -EINVAL;
}
}
if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
xfs_warn(mp,
"cannot change alignment: superblock does not support data alignment");
return -EINVAL;
}
return 0;
}
/* Update alignment values based on mount options and sb values. */
STATIC int
xfs_update_alignment(
struct xfs_mount *mp)
{
struct xfs_sb *sbp = &mp->m_sb;
if (mp->m_dalign) {
2019-12-11 21:19:06 +00:00
bool update_sb;
int error;
if (sbp->sb_unit == mp->m_dalign &&
sbp->sb_width == mp->m_swidth)
return 0;
2019-12-11 21:19:06 +00:00
error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
if (error || !update_sb)
return error;
sbp->sb_unit = mp->m_dalign;
sbp->sb_width = mp->m_swidth;
mp->m_update_sb = true;
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
mp->m_dalign = sbp->sb_unit;
mp->m_swidth = sbp->sb_width;
}
return 0;
}
xfs: dynamic speculative EOF preallocation Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option of a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG. Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need it to reduce fragementation. For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088 and for 16 concurrent 16GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208 Because it is hard to take back specualtive preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size іs reduced according to this table (based on 4k block size): freespace max prealloc size >5% full extent (8GB) 4-5% 2GB (8GB >> 2) 3-4% 1GB (8GB >> 3) 2-3% 512MB (8GB >> 4) 1-2% 256MB (8GB >> 5) <1% 128MB (8GB >> 6) This should reduce the amount of space held in speculative preallocation for such cases. The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged. Signed-off-by: Dave Chinner <dchinner@redhat.com>
2011-01-04 00:35:03 +00:00
/*
* precalculate the low space thresholds for dynamic speculative preallocation.
*/
void
xfs_set_low_space_thresholds(
struct xfs_mount *mp)
{
int i;
for (i = 0; i < XFS_LOWSP_MAX; i++) {
uint64_t space = mp->m_sb.sb_dblocks;
xfs: dynamic speculative EOF preallocation Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option of a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG. Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need it to reduce fragementation. For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088 and for 16 concurrent 16GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208 Because it is hard to take back specualtive preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size іs reduced according to this table (based on 4k block size): freespace max prealloc size >5% full extent (8GB) 4-5% 2GB (8GB >> 2) 3-4% 1GB (8GB >> 3) 2-3% 512MB (8GB >> 4) 1-2% 256MB (8GB >> 5) <1% 128MB (8GB >> 6) This should reduce the amount of space held in speculative preallocation for such cases. The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged. Signed-off-by: Dave Chinner <dchinner@redhat.com>
2011-01-04 00:35:03 +00:00
do_div(space, 100);
mp->m_low_space[i] = space * (i + 1);
}
}
/*
* Check that the data (and log if separate) is an ok size.
*/
STATIC int
xfs_check_sizes(
struct xfs_mount *mp)
{
struct xfs_buf *bp;
xfs_daddr_t d;
int error;
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
xfs_warn(mp, "filesystem size mismatch detected");
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
if (error) {
xfs_warn(mp, "last sector read failed");
return error;
}
xfs_buf_relse(bp);
if (mp->m_logdev_targp == mp->m_ddev_targp)
return 0;
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
xfs_warn(mp, "log size mismatch detected");
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error) {
xfs_warn(mp, "log device read failed");
return error;
}
xfs_buf_relse(bp);
return 0;
}
/*
* Clear the quotaflags in memory and in the superblock.
*/
int
xfs_mount_reset_sbqflags(
struct xfs_mount *mp)
{
mp->m_qflags = 0;
/* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
if (mp->m_sb.sb_qflags == 0)
return 0;
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_qflags = 0;
spin_unlock(&mp->m_sb_lock);
if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
return 0;
return xfs_sync_sb(mp, false);
}
uint64_t
xfs_default_resblks(xfs_mount_t *mp)
{
uint64_t resblks;
/*
* We default to 5% or 8192 fsbs of space reserved, whichever is
* smaller. This is intended to cover concurrent allocation
* transactions when we initially hit enospc. These each require a 4
* block reservation. Hence by default we cover roughly 2000 concurrent
* allocation reservations.
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
resblks = min_t(uint64_t, resblks, 8192);
return resblks;
}
/* Ensure the summary counts are correct. */
STATIC int
xfs_check_summary_counts(
struct xfs_mount *mp)
{
/*
* The AG0 superblock verifier rejects in-progress filesystems,
* so we should never see the flag set this far into mounting.
*/
if (mp->m_sb.sb_inprogress) {
xfs_err(mp, "sb_inprogress set after log recovery??");
WARN_ON(1);
return -EFSCORRUPTED;
}
/*
* Now the log is mounted, we know if it was an unclean shutdown or
* not. If it was, with the first phase of recovery has completed, we
* have consistent AG blocks on disk. We have not recovered EFIs yet,
* but they are recovered transactionally in the second recovery phase
* later.
*
* If the log was clean when we mounted, we can check the summary
* counters. If any of them are obviously incorrect, we can recompute
* them from the AGF headers in the next step.
*/
if (XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
!xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
/*
* We can safely re-initialise incore superblock counters from the
* per-ag data. These may not be correct if the filesystem was not
* cleanly unmounted, so we waited for recovery to finish before doing
* this.
*
* If the filesystem was cleanly unmounted or the previous check did
* not flag anything weird, then we can trust the values in the
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
!xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
return 0;
return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
}
/*
* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
* internal inode structures can be sitting in the CIL and AIL at this point,
* so we need to unpin them, write them back and/or reclaim them before unmount
xfs: per-cpu deferred inode inactivation queues Move inode inactivation to background work contexts so that it no longer runs in the context that releases the final reference to an inode. This will allow process work that ends up blocking on inactivation to continue doing work while the filesytem processes the inactivation in the background. A typical demonstration of this is unlinking an inode with lots of extents. The extents are removed during inactivation, so this blocks the process that unlinked the inode from the directory structure. By moving the inactivation to the background process, the userspace applicaiton can keep working (e.g. unlinking the next inode in the directory) while the inactivation work on the previous inode is done by a different CPU. The implementation of the queue is relatively simple. We use a per-cpu lockless linked list (llist) to queue inodes for inactivation without requiring serialisation mechanisms, and a work item to allow the queue to be processed by a CPU bound worker thread. We also keep a count of the queue depth so that we can trigger work after a number of deferred inactivations have been queued. The use of a bound workqueue with a single work depth allows the workqueue to run one work item per CPU. We queue the work item on the CPU we are currently running on, and so this essentially gives us affine per-cpu worker threads for the per-cpu queues. THis maintains the effective CPU affinity that occurs within XFS at the AG level due to all objects in a directory being local to an AG. Hence inactivation work tends to run on the same CPU that last accessed all the objects that inactivation accesses and this maintains hot CPU caches for unlink workloads. A depth of 32 inodes was chosen to match the number of inodes in an inode cluster buffer. This hopefully allows sequential allocation/unlink behaviours to defering inactivation of all the inodes in a single cluster buffer at a time, further helping maintain hot CPU and buffer cache accesses while running inactivations. A hard per-cpu queue throttle of 256 inode has been set to avoid runaway queuing when inodes that take a long to time inactivate are being processed. For example, when unlinking inodes with large numbers of extents that can take a lot of processing to free. Signed-off-by: Dave Chinner <dchinner@redhat.com> [djwong: tweak comments and tracepoints, convert opflags to state bits] Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-06 18:05:39 +00:00
* can proceed. In other words, callers are required to have inactivated all
* inodes.
*
* An inode cluster that has been freed can have its buffer still pinned in
* memory because the transaction is still sitting in a iclog. The stale inodes
* on that buffer will be pinned to the buffer until the transaction hits the
* disk and the callbacks run. Pushing the AIL will skip the stale inodes and
* may never see the pinned buffer, so nothing will push out the iclog and
* unpin the buffer.
*
* Hence we need to force the log to unpin everything first. However, log
* forces don't wait for the discards they issue to complete, so we have to
* explicitly wait for them to complete here as well.
*
* Then we can tell the world we are unmounting so that error handling knows
* that the filesystem is going away and we should error out anything that we
* have been retrying in the background. This will prevent never-ending
* retries in AIL pushing from hanging the unmount.
*
* Finally, we can push the AIL to clean all the remaining dirty objects, then
* reclaim the remaining inodes that are still in memory at this point in time.
*/
static void
xfs_unmount_flush_inodes(
struct xfs_mount *mp)
{
xfs_log_force(mp, XFS_LOG_SYNC);
xfs_extent_busy_wait_all(mp);
flush_workqueue(xfs_discard_wq);
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_ail_push_all_sync(mp->m_ail);
xfs: per-cpu deferred inode inactivation queues Move inode inactivation to background work contexts so that it no longer runs in the context that releases the final reference to an inode. This will allow process work that ends up blocking on inactivation to continue doing work while the filesytem processes the inactivation in the background. A typical demonstration of this is unlinking an inode with lots of extents. The extents are removed during inactivation, so this blocks the process that unlinked the inode from the directory structure. By moving the inactivation to the background process, the userspace applicaiton can keep working (e.g. unlinking the next inode in the directory) while the inactivation work on the previous inode is done by a different CPU. The implementation of the queue is relatively simple. We use a per-cpu lockless linked list (llist) to queue inodes for inactivation without requiring serialisation mechanisms, and a work item to allow the queue to be processed by a CPU bound worker thread. We also keep a count of the queue depth so that we can trigger work after a number of deferred inactivations have been queued. The use of a bound workqueue with a single work depth allows the workqueue to run one work item per CPU. We queue the work item on the CPU we are currently running on, and so this essentially gives us affine per-cpu worker threads for the per-cpu queues. THis maintains the effective CPU affinity that occurs within XFS at the AG level due to all objects in a directory being local to an AG. Hence inactivation work tends to run on the same CPU that last accessed all the objects that inactivation accesses and this maintains hot CPU caches for unlink workloads. A depth of 32 inodes was chosen to match the number of inodes in an inode cluster buffer. This hopefully allows sequential allocation/unlink behaviours to defering inactivation of all the inodes in a single cluster buffer at a time, further helping maintain hot CPU and buffer cache accesses while running inactivations. A hard per-cpu queue throttle of 256 inode has been set to avoid runaway queuing when inodes that take a long to time inactivate are being processed. For example, when unlinking inodes with large numbers of extents that can take a lot of processing to free. Signed-off-by: Dave Chinner <dchinner@redhat.com> [djwong: tweak comments and tracepoints, convert opflags to state bits] Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-06 18:05:39 +00:00
xfs_inodegc_stop(mp);
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp);
xfs_health_unmount(mp);
}
static void
xfs_mount_setup_inode_geom(
struct xfs_mount *mp)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));
xfs_ialloc_setup_geometry(mp);
}
/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
* - if we're a 32-bit kernel, do a size check on the superblock
* so we don't mount terabyte filesystems
* - init mount struct realtime fields
* - allocate inode hash table for fs
* - init directory manager
* - perform recovery and init the log manager
*/
int
xfs_mountfs(
struct xfs_mount *mp)
{
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
xfs_sb_mount_common(mp, sbp);
/*
* Check for a mismatched features2 values. Older kernels read & wrote
* into the wrong sb offset for sb_features2 on some platforms due to
* xfs_sb_t not being 64bit size aligned when sb_features2 was added,
* which made older superblock reading/writing routines swap it as a
* 64-bit value.
*
* For backwards compatibility, we make both slots equal.
*
* If we detect a mismatched field, we OR the set bits into the existing
* features2 field in case it has already been modified; we don't want
* to lose any features. We then update the bad location with the ORed
* value so that older kernels will see any features2 flags. The
* superblock writeback code ensures the new sb_features2 is copied to
* sb_bad_features2 before it is logged or written to disk.
*/
if (xfs_sb_has_mismatched_features2(sbp)) {
xfs_warn(mp, "correcting sb_features alignment problem");
sbp->sb_features2 |= sbp->sb_bad_features2;
mp->m_update_sb = true;
/*
* Re-check for ATTR2 in case it was found in bad_features2
* slot.
*/
if (xfs_sb_version_hasattr2(&mp->m_sb) &&
!(mp->m_flags & XFS_MOUNT_NOATTR2))
mp->m_flags |= XFS_MOUNT_ATTR2;
}
if (xfs_sb_version_hasattr2(&mp->m_sb) &&
(mp->m_flags & XFS_MOUNT_NOATTR2)) {
xfs_sb_version_removeattr2(&mp->m_sb);
mp->m_update_sb = true;
/* update sb_versionnum for the clearing of the morebits */
if (!sbp->sb_features2)
mp->m_update_sb = true;
}
/* always use v2 inodes by default now */
if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
mp->m_update_sb = true;
}
/*
* If we were given new sunit/swidth options, do some basic validation
* checks and convert the incore dalign and swidth values to the
* same units (FSB) that everything else uses. This /must/ happen
* before computing the inode geometry.
*/
error = xfs_validate_new_dalign(mp);
if (error)
goto out;
xfs_alloc_compute_maxlevels(mp);
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
xfs_mount_setup_inode_geom(mp);
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
* is NOT aligned turn off m_dalign since allocator alignment is within
* an ag, therefore ag has to be aligned at stripe boundary. Note that
* we must compute the free space and rmap btree geometry before doing
* this.
*/
error = xfs_update_alignment(mp);
if (error)
goto out;
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
goto out;
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
error = xfs_error_sysfs_init(mp);
if (error)
goto out_del_stats;
error = xfs_errortag_init(mp);
if (error)
goto out_remove_error_sysfs;
error = xfs_uuid_mount(mp);
if (error)
goto out_remove_errortag;
/*
* Update the preferred write size based on the information from the
* on-disk superblock.
*/
mp->m_allocsize_log =
max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
xfs: dynamic speculative EOF preallocation Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option of a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG. Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need it to reduce fragementation. For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088 and for 16 concurrent 16GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208 Because it is hard to take back specualtive preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size іs reduced according to this table (based on 4k block size): freespace max prealloc size >5% full extent (8GB) 4-5% 2GB (8GB >> 2) 3-4% 1GB (8GB >> 3) 2-3% 512MB (8GB >> 4) 1-2% 256MB (8GB >> 5) <1% 128MB (8GB >> 6) This should reduce the amount of space held in speculative preallocation for such cases. The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged. Signed-off-by: Dave Chinner <dchinner@redhat.com>
2011-01-04 00:35:03 +00:00
/* set the low space thresholds for dynamic preallocation */
xfs_set_low_space_thresholds(mp);
/*
* If enabled, sparse inode chunk alignment is expected to match the
* cluster size. Full inode chunk alignment must match the chunk size,
* but that is checked on sb read verification...
*/
if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
mp->m_sb.sb_spino_align !=
XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
xfs_warn(mp,
"Sparse inode block alignment (%u) must match cluster size (%llu).",
mp->m_sb.sb_spino_align,
XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
error = -EINVAL;
goto out_remove_uuid;
}
/*
* Check that the data (and log if separate) is an ok size.
*/
error = xfs_check_sizes(mp);
if (error)
goto out_remove_uuid;
/*
* Initialize realtime fields in the mount structure
*/
error = xfs_rtmount_init(mp);
if (error) {
xfs_warn(mp, "RT mount failed");
goto out_remove_uuid;
}
/*
* Copies the low order bits of the timestamp and the randomly
* set "sequence" number out of a UUID.
*/
mp->m_fixedfsid[0] =
(get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
get_unaligned_be16(&sbp->sb_uuid.b[4]);
mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
error = xfs_da_mount(mp);
if (error) {
xfs_warn(mp, "Failed dir/attr init: %d", error);
goto out_remove_uuid;
}
/*
* Initialize the precomputed transaction reservations values.
*/
xfs_trans_init(mp);
/*
* Allocate and initialize the per-ag data.
*/
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 11:47:44 +00:00
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
goto out_free_dir;
xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>
2010-01-11 11:47:44 +00:00
}
if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
xfs_warn(mp, "no log defined");
error = -EFSCORRUPTED;
goto out_free_perag;
}
/*
* Log's mount-time initialization. The first part of recovery can place
* some items on the AIL, to be handled when recovery is finished or
* cancelled.
*/
error = xfs_log_mount(mp, mp->m_logdev_targp,
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
if (error) {
xfs_warn(mp, "log mount failed");
goto out_fail_wait;
}
/* Make sure the summary counts are ok. */
error = xfs_check_summary_counts(mp);
if (error)
goto out_log_dealloc;
xfs: per-cpu deferred inode inactivation queues Move inode inactivation to background work contexts so that it no longer runs in the context that releases the final reference to an inode. This will allow process work that ends up blocking on inactivation to continue doing work while the filesytem processes the inactivation in the background. A typical demonstration of this is unlinking an inode with lots of extents. The extents are removed during inactivation, so this blocks the process that unlinked the inode from the directory structure. By moving the inactivation to the background process, the userspace applicaiton can keep working (e.g. unlinking the next inode in the directory) while the inactivation work on the previous inode is done by a different CPU. The implementation of the queue is relatively simple. We use a per-cpu lockless linked list (llist) to queue inodes for inactivation without requiring serialisation mechanisms, and a work item to allow the queue to be processed by a CPU bound worker thread. We also keep a count of the queue depth so that we can trigger work after a number of deferred inactivations have been queued. The use of a bound workqueue with a single work depth allows the workqueue to run one work item per CPU. We queue the work item on the CPU we are currently running on, and so this essentially gives us affine per-cpu worker threads for the per-cpu queues. THis maintains the effective CPU affinity that occurs within XFS at the AG level due to all objects in a directory being local to an AG. Hence inactivation work tends to run on the same CPU that last accessed all the objects that inactivation accesses and this maintains hot CPU caches for unlink workloads. A depth of 32 inodes was chosen to match the number of inodes in an inode cluster buffer. This hopefully allows sequential allocation/unlink behaviours to defering inactivation of all the inodes in a single cluster buffer at a time, further helping maintain hot CPU and buffer cache accesses while running inactivations. A hard per-cpu queue throttle of 256 inode has been set to avoid runaway queuing when inodes that take a long to time inactivate are being processed. For example, when unlinking inodes with large numbers of extents that can take a lot of processing to free. Signed-off-by: Dave Chinner <dchinner@redhat.com> [djwong: tweak comments and tracepoints, convert opflags to state bits] Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-06 18:05:39 +00:00
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
/*
* Get and sanity-check the root inode.
* Save the pointer to it in the mount structure.
*/
error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED,
XFS_ILOCK_EXCL, &rip);
if (error) {
xfs_warn(mp,
"Failed to read root inode 0x%llx, error %d",
sbp->sb_rootino, -error);
goto out_log_dealloc;
}
ASSERT(rip != NULL);
if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
xfs_warn(mp, "corrupted root inode %llu: not a directory",
(unsigned long long)rip->i_ino);
xfs_iunlock(rip, XFS_ILOCK_EXCL);
error = -EFSCORRUPTED;
goto out_rele_rip;
}
mp->m_rootip = rip; /* save it */
xfs_iunlock(rip, XFS_ILOCK_EXCL);
/*
* Initialize realtime inode pointers in the mount structure
*/
error = xfs_rtmount_inodes(mp);
if (error) {
/*
* Free up the root inode.
*/
xfs_warn(mp, "failed to read RT inodes");
goto out_rele_rip;
}
/*
* If this is a read-only mount defer the superblock updates until
* the next remount into writeable mode. Otherwise we would never
* perform the update e.g. for the root filesystem.
*/
if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
error = xfs_sync_sb(mp, false);
if (error) {
xfs_warn(mp, "failed to write sb changes");
goto out_rtunmount;
}
}
/*
* Initialise the XFS quota management subsystem for this mount
*/
if (XFS_IS_QUOTA_ON(mp)) {
error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
if (error)
goto out_rtunmount;
} else {
/*
* If a file system had quotas running earlier, but decided to
* mount without -o uquota/pquota/gquota options, revoke the
* quotachecked license.
*/
if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
xfs_notice(mp, "resetting quota flags");
error = xfs_mount_reset_sbqflags(mp);
if (error)
goto out_rtunmount;
}
}
/*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
* read in. Temporarily create per-AG space reservations for metadata
* btree shape changes because space freeing transactions (for inode
* inactivation) require the per-AG reservation in lieu of reserving
* blocks.
*/
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error == -ENOSPC)
xfs_warn(mp,
"ENOSPC reserving per-AG metadata pool, log recovery may fail.");
error = xfs_log_mount_finish(mp);
xfs_fs_unreserve_ag_blocks(mp);
if (error) {
xfs_warn(mp, "log mount finish failed");
goto out_rtunmount;
}
xfs: quiesce the filesystem after recovery on readonly mount Recently we've had a number of reports where log recovery on a v5 filesystem has reported corruptions that looked to be caused by recovery being re-run over the top of an already-recovered metadata. This has uncovered a bug in recovery (fixed elsewhere) but the vector that caused this was largely unknown. A kdump test started tripping over this problem - the system would be crashed, the kdump kernel and environment would boot and dump the kernel core image, and then the system would reboot. After reboot, the root filesystem was triggering log recovery and corruptions were being detected. The metadumps indicated the above log recovery issue. What is happening is that the kdump kernel and environment is mounting the root device read-only to find the binaries needed to do it's work. The result of this is that it is running log recovery. However, because there were unlinked files and EFIs to be processed by recovery, the completion of phase 1 of log recovery could not mark the log clean. And because it's a read-only mount, the unmount process does not write records to the log to mark it clean, either. Hence on the next mount of the filesystem, log recovery was run again across all the metadata that had already been recovered and this is what triggered corruption warnings. To avoid this problem, we need to ensure that a read-only mount always updates the log when it completes the second phase of recovery. We already handle this sort of issue with rw->ro remount transitions, so the solution is as simple as quiescing the filesystem at the appropriate time during the mount process. This results in the log being marked clean so the mount behaviour recorded in the logs on repeated RO mounts will change (i.e. log recovery will no longer be run on every mount until a RW mount is done). This is a user visible change in behaviour, but it is harmless. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-25 22:21:44 +00:00
/*
* Now the log is fully replayed, we can transition to full read-only
* mode for read-only mounts. This will sync all the metadata and clean
* the log so that the recovery we just performed does not have to be
* replayed again on the next mount.
*
* We use the same quiesce mechanism as the rw->ro remount, as they are
* semantically identical operations.
*/
if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
XFS_MOUNT_RDONLY) {
xfs_log_clean(mp);
xfs: quiesce the filesystem after recovery on readonly mount Recently we've had a number of reports where log recovery on a v5 filesystem has reported corruptions that looked to be caused by recovery being re-run over the top of an already-recovered metadata. This has uncovered a bug in recovery (fixed elsewhere) but the vector that caused this was largely unknown. A kdump test started tripping over this problem - the system would be crashed, the kdump kernel and environment would boot and dump the kernel core image, and then the system would reboot. After reboot, the root filesystem was triggering log recovery and corruptions were being detected. The metadumps indicated the above log recovery issue. What is happening is that the kdump kernel and environment is mounting the root device read-only to find the binaries needed to do it's work. The result of this is that it is running log recovery. However, because there were unlinked files and EFIs to be processed by recovery, the completion of phase 1 of log recovery could not mark the log clean. And because it's a read-only mount, the unmount process does not write records to the log to mark it clean, either. Hence on the next mount of the filesystem, log recovery was run again across all the metadata that had already been recovered and this is what triggered corruption warnings. To avoid this problem, we need to ensure that a read-only mount always updates the log when it completes the second phase of recovery. We already handle this sort of issue with rw->ro remount transitions, so the solution is as simple as quiescing the filesystem at the appropriate time during the mount process. This results in the log being marked clean so the mount behaviour recorded in the logs on repeated RO mounts will change (i.e. log recovery will no longer be run on every mount until a RW mount is done). This is a user visible change in behaviour, but it is harmless. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-25 22:21:44 +00:00
}
/*
* Complete the quota initialisation, post-log-replay component.
*/
if (quotamount) {
ASSERT(mp->m_qflags == 0);
mp->m_qflags = quotaflags;
xfs_qm_mount_quotas(mp);
}
/*
* Now we are mounted, reserve a small amount of unused space for
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
* are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
resblks = xfs_default_resblks(mp);
error = xfs_reserve_blocks(mp, &resblks, NULL);
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
/* Recover any CoW blocks that never got remapped. */
error = xfs_reflink_recover_cow(mp);
if (error) {
xfs_err(mp,
"Error %d recovering leftover CoW allocations.", error);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
goto out_quota;
}
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
}
return 0;
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
xfs_irele(rip);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
xfs: per-cpu deferred inode inactivation queues Move inode inactivation to background work contexts so that it no longer runs in the context that releases the final reference to an inode. This will allow process work that ends up blocking on inactivation to continue doing work while the filesytem processes the inactivation in the background. A typical demonstration of this is unlinking an inode with lots of extents. The extents are removed during inactivation, so this blocks the process that unlinked the inode from the directory structure. By moving the inactivation to the background process, the userspace applicaiton can keep working (e.g. unlinking the next inode in the directory) while the inactivation work on the previous inode is done by a different CPU. The implementation of the queue is relatively simple. We use a per-cpu lockless linked list (llist) to queue inodes for inactivation without requiring serialisation mechanisms, and a work item to allow the queue to be processed by a CPU bound worker thread. We also keep a count of the queue depth so that we can trigger work after a number of deferred inactivations have been queued. The use of a bound workqueue with a single work depth allows the workqueue to run one work item per CPU. We queue the work item on the CPU we are currently running on, and so this essentially gives us affine per-cpu worker threads for the per-cpu queues. THis maintains the effective CPU affinity that occurs within XFS at the AG level due to all objects in a directory being local to an AG. Hence inactivation work tends to run on the same CPU that last accessed all the objects that inactivation accesses and this maintains hot CPU caches for unlink workloads. A depth of 32 inodes was chosen to match the number of inodes in an inode cluster buffer. This hopefully allows sequential allocation/unlink behaviours to defering inactivation of all the inodes in a single cluster buffer at a time, further helping maintain hot CPU and buffer cache accesses while running inactivations. A hard per-cpu queue throttle of 256 inode has been set to avoid runaway queuing when inodes that take a long to time inactivate are being processed. For example, when unlinking inodes with large numbers of extents that can take a lot of processing to free. Signed-off-by: Dave Chinner <dchinner@redhat.com> [djwong: tweak comments and tracepoints, convert opflags to state bits] Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-06 18:05:39 +00:00
/*
* Inactivate all inodes that might still be in memory after a log
* intent recovery failure so that reclaim can free them. Metadata
* inodes and the root directory shouldn't need inactivation, but the
* mount failed for some reason, so pull down all the state and flee.
*/
xfs_inodegc_flush(mp);
/*
* Flush all inode reclamation work and flush the log.
* We have to do this /after/ rtunmount and qm_unmount because those
* two will have scheduled delayed reclaim for the rt/quota inodes.
*
* This is slightly different from the unmountfs call sequence
* because we could be tearing down a partially set up mount. In
* particular, if log_mount_finish fails we bail out without calling
* qm_unmount_quotas and therefore rely on qm_unmount to release the
* quota inodes.
*/
xfs_unmount_flush_inodes(mp);
out_log_dealloc:
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
xfs_buftarg_drain(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
out_free_dir:
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
out_remove_errortag:
xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
out_del_stats:
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
/*
* This flushes out the inodes,dquots and the superblock, unmounts the
* log and makes sure that incore structures are freed.
*/
void
xfs_unmountfs(
struct xfs_mount *mp)
{
uint64_t resblks;
int error;
xfs: per-cpu deferred inode inactivation queues Move inode inactivation to background work contexts so that it no longer runs in the context that releases the final reference to an inode. This will allow process work that ends up blocking on inactivation to continue doing work while the filesytem processes the inactivation in the background. A typical demonstration of this is unlinking an inode with lots of extents. The extents are removed during inactivation, so this blocks the process that unlinked the inode from the directory structure. By moving the inactivation to the background process, the userspace applicaiton can keep working (e.g. unlinking the next inode in the directory) while the inactivation work on the previous inode is done by a different CPU. The implementation of the queue is relatively simple. We use a per-cpu lockless linked list (llist) to queue inodes for inactivation without requiring serialisation mechanisms, and a work item to allow the queue to be processed by a CPU bound worker thread. We also keep a count of the queue depth so that we can trigger work after a number of deferred inactivations have been queued. The use of a bound workqueue with a single work depth allows the workqueue to run one work item per CPU. We queue the work item on the CPU we are currently running on, and so this essentially gives us affine per-cpu worker threads for the per-cpu queues. THis maintains the effective CPU affinity that occurs within XFS at the AG level due to all objects in a directory being local to an AG. Hence inactivation work tends to run on the same CPU that last accessed all the objects that inactivation accesses and this maintains hot CPU caches for unlink workloads. A depth of 32 inodes was chosen to match the number of inodes in an inode cluster buffer. This hopefully allows sequential allocation/unlink behaviours to defering inactivation of all the inodes in a single cluster buffer at a time, further helping maintain hot CPU and buffer cache accesses while running inactivations. A hard per-cpu queue throttle of 256 inode has been set to avoid runaway queuing when inodes that take a long to time inactivate are being processed. For example, when unlinking inodes with large numbers of extents that can take a lot of processing to free. Signed-off-by: Dave Chinner <dchinner@redhat.com> [djwong: tweak comments and tracepoints, convert opflags to state bits] Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-06 18:05:39 +00:00
/*
* Perform all on-disk metadata updates required to inactivate inodes
* that the VFS evicted earlier in the unmount process. Freeing inodes
* and discarding CoW fork preallocations can cause shape changes to
* the free inode and refcount btrees, respectively, so we must finish
* this before we discard the metadata space reservations. Metadata
* inodes and the root directory do not require inactivation.
*/
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
xfs_unmount_flush_inodes(mp);
xfs_qm_unmount(mp);
/*
* Unreserve any blocks we have so that when we unmount we don't account
* the reserved free space as used. This is really only necessary for
* lazy superblock counting because it trusts the incore superblock
* counters to be absolutely correct on clean unmount.
*
* We don't bother correcting this elsewhere for lazy superblock
* counting because on mount of an unclean filesystem we reconstruct the
* correct counter value and this is irrelevant.
*
* For non-lazy counter filesystems, this doesn't matter at all because
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
resblks = 0;
error = xfs_reserve_blocks(mp, &resblks, NULL);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
xfs_free_perag(mp);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
/*
* Determine whether modifications can proceed. The caller specifies the minimum
* freeze level for which modifications should not be allowed. This allows
* certain operations to proceed while the freeze sequence is in progress, if
* necessary.
*/
bool
xfs_fs_writable(
struct xfs_mount *mp,
int level)
[XFS] Lazy Superblock Counters When we have a couple of hundred transactions on the fly at once, they all typically modify the on disk superblock in some way. create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify free block counts. When these counts are modified in a transaction, they must eventually lock the superblock buffer and apply the mods. The buffer then remains locked until the transaction is committed into the incore log buffer. The result of this is that with enough transactions on the fly the incore superblock buffer becomes a bottleneck. The result of contention on the incore superblock buffer is that transaction rates fall - the more pressure that is put on the superblock buffer, the slower things go. The key to removing the contention is to not require the superblock fields in question to be locked. We do that by not marking the superblock dirty in the transaction. IOWs, we modify the incore superblock but do not modify the cached superblock buffer. In short, we do not log superblock modifications to critical fields in the superblock on every transaction. In fact we only do it just before we write the superblock to disk every sync period or just before unmount. This creates an interesting problem - if we don't log or write out the fields in every transaction, then how do the values get recovered after a crash? the answer is simple - we keep enough duplicate, logged information in other structures that we can reconstruct the correct count after log recovery has been performed. It is the AGF and AGI structures that contain the duplicate information; after recovery, we walk every AGI and AGF and sum their individual counters to get the correct value, and we do a transaction into the log to correct them. An optimisation of this is that if we have a clean unmount record, we know the value in the superblock is correct, so we can avoid the summation walk under normal conditions and so mount/recovery times do not change under normal operation. One wrinkle that was discovered during development was that the blocks used in the freespace btrees are never accounted for in the AGF counters. This was once a valid optimisation to make; when the filesystem is full, the free space btrees are empty and consume no space. Hence when it matters, the "accounting" is correct. But that means the when we do the AGF summations, we would not have a correct count and xfs_check would complain. Hence a new counter was added to track the number of blocks used by the free space btrees. This is an *on-disk format change*. As a result of this, lazy superblock counters are a mkfs option and at the moment on linux there is no way to convert an old filesystem. This is possible - xfs_db can be used to twiddle the right bits and then xfs_repair will do the format conversion for you. Similarly, you can convert backwards as well. At some point we'll add functionality to xfs_admin to do the bit twiddling easily.... SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28652a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 05:26:31 +00:00
{
ASSERT(level > SB_UNFROZEN);
if ((mp->m_super->s_writers.frozen >= level) ||
XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
return false;
return true;
[XFS] Lazy Superblock Counters When we have a couple of hundred transactions on the fly at once, they all typically modify the on disk superblock in some way. create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify free block counts. When these counts are modified in a transaction, they must eventually lock the superblock buffer and apply the mods. The buffer then remains locked until the transaction is committed into the incore log buffer. The result of this is that with enough transactions on the fly the incore superblock buffer becomes a bottleneck. The result of contention on the incore superblock buffer is that transaction rates fall - the more pressure that is put on the superblock buffer, the slower things go. The key to removing the contention is to not require the superblock fields in question to be locked. We do that by not marking the superblock dirty in the transaction. IOWs, we modify the incore superblock but do not modify the cached superblock buffer. In short, we do not log superblock modifications to critical fields in the superblock on every transaction. In fact we only do it just before we write the superblock to disk every sync period or just before unmount. This creates an interesting problem - if we don't log or write out the fields in every transaction, then how do the values get recovered after a crash? the answer is simple - we keep enough duplicate, logged information in other structures that we can reconstruct the correct count after log recovery has been performed. It is the AGF and AGI structures that contain the duplicate information; after recovery, we walk every AGI and AGF and sum their individual counters to get the correct value, and we do a transaction into the log to correct them. An optimisation of this is that if we have a clean unmount record, we know the value in the superblock is correct, so we can avoid the summation walk under normal conditions and so mount/recovery times do not change under normal operation. One wrinkle that was discovered during development was that the blocks used in the freespace btrees are never accounted for in the AGF counters. This was once a valid optimisation to make; when the filesystem is full, the free space btrees are empty and consume no space. Hence when it matters, the "accounting" is correct. But that means the when we do the AGF summations, we would not have a correct count and xfs_check would complain. Hence a new counter was added to track the number of blocks used by the free space btrees. This is an *on-disk format change*. As a result of this, lazy superblock counters are a mkfs option and at the moment on linux there is no way to convert an old filesystem. This is possible - xfs_db can be used to twiddle the right bits and then xfs_repair will do the format conversion for you. Similarly, you can convert backwards as well. At some point we'll add functionality to xfs_admin to do the bit twiddling easily.... SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28652a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 05:26:31 +00:00
}
int
xfs_mod_fdblocks(
struct xfs_mount *mp,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
s32 batch;
xfs: set aside allocation btree blocks from block reservation The blocks used for allocation btrees (bnobt and countbt) are technically considered free space. This is because as free space is used, allocbt blocks are removed and naturally become available for traditional allocation. However, this means that a significant portion of free space may consist of in-use btree blocks if free space is severely fragmented. On large filesystems with large perag reservations, this can lead to a rare but nasty condition where a significant amount of physical free space is available, but the majority of actual usable blocks consist of in-use allocbt blocks. We have a record of a (~12TB, 32 AG) filesystem with multiple AGs in a state with ~2.5GB or so free blocks tracked across ~300 total allocbt blocks, but effectively at 100% full because the the free space is entirely consumed by refcountbt perag reservation. Such a large perag reservation is by design on large filesystems. The problem is that because the free space is so fragmented, this AG contributes the 300 or so allocbt blocks to the global counters as free space. If this pattern repeats across enough AGs, the filesystem lands in a state where global block reservation can outrun physical block availability. For example, a streaming buffered write on the affected filesystem continues to allow delayed allocation beyond the point where writeback starts to fail due to physical block allocation failures. The expected behavior is for the delalloc block reservation to fail gracefully with -ENOSPC before physical block allocation failure is a possibility. To address this problem, set aside in-use allocbt blocks at reservation time and thus ensure they cannot be reserved until truly available for physical allocation. This allows alloc btree metadata to continue to reside in free space, but dynamically adjusts reservation availability based on internal state. Note that the logic requires that the allocbt counter is fully populated at reservation time before it is fully effective. We currently rely on the mount time AGF scan in the perag reservation initialization code for this dependency on filesystems where it's most important (i.e. with active perag reservations). Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-04-28 22:06:05 +00:00
uint64_t set_aside;
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
if (likely(mp->m_resblks == mp->m_resblks_avail)) {
percpu_counter_add(&mp->m_fdblocks, delta);
return 0;
}
spin_lock(&mp->m_sb_lock);
res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
if (res_used > delta) {
mp->m_resblks_avail += delta;
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
percpu_counter_add(&mp->m_fdblocks, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
}
/*
* Taking blocks away, need to be more accurate the closer we
* are to zero.
*
* If the counter has a value of less than 2 * max batch size,
* then make everything serialise as we are real close to
* ENOSPC.
*/
if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
batch = XFS_FDBLOCKS_BATCH;
xfs: set aside allocation btree blocks from block reservation The blocks used for allocation btrees (bnobt and countbt) are technically considered free space. This is because as free space is used, allocbt blocks are removed and naturally become available for traditional allocation. However, this means that a significant portion of free space may consist of in-use btree blocks if free space is severely fragmented. On large filesystems with large perag reservations, this can lead to a rare but nasty condition where a significant amount of physical free space is available, but the majority of actual usable blocks consist of in-use allocbt blocks. We have a record of a (~12TB, 32 AG) filesystem with multiple AGs in a state with ~2.5GB or so free blocks tracked across ~300 total allocbt blocks, but effectively at 100% full because the the free space is entirely consumed by refcountbt perag reservation. Such a large perag reservation is by design on large filesystems. The problem is that because the free space is so fragmented, this AG contributes the 300 or so allocbt blocks to the global counters as free space. If this pattern repeats across enough AGs, the filesystem lands in a state where global block reservation can outrun physical block availability. For example, a streaming buffered write on the affected filesystem continues to allow delayed allocation beyond the point where writeback starts to fail due to physical block allocation failures. The expected behavior is for the delalloc block reservation to fail gracefully with -ENOSPC before physical block allocation failure is a possibility. To address this problem, set aside in-use allocbt blocks at reservation time and thus ensure they cannot be reserved until truly available for physical allocation. This allows alloc btree metadata to continue to reside in free space, but dynamically adjusts reservation availability based on internal state. Note that the logic requires that the allocbt counter is fully populated at reservation time before it is fully effective. We currently rely on the mount time AGF scan in the perag reservation initialization code for this dependency on filesystems where it's most important (i.e. with active perag reservations). Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-04-28 22:06:05 +00:00
/*
* Set aside allocbt blocks because these blocks are tracked as free
* space but not available for allocation. Technically this means that a
* single reservation cannot consume all remaining free space, but the
* ratio of allocbt blocks to usable free blocks should be rather small.
* The tradeoff without this is that filesystems that maintain high
* perag block reservations can over reserve physical block availability
* and fail physical allocation, which leads to much more serious
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
xfs: set aside allocation btree blocks from block reservation The blocks used for allocation btrees (bnobt and countbt) are technically considered free space. This is because as free space is used, allocbt blocks are removed and naturally become available for traditional allocation. However, this means that a significant portion of free space may consist of in-use btree blocks if free space is severely fragmented. On large filesystems with large perag reservations, this can lead to a rare but nasty condition where a significant amount of physical free space is available, but the majority of actual usable blocks consist of in-use allocbt blocks. We have a record of a (~12TB, 32 AG) filesystem with multiple AGs in a state with ~2.5GB or so free blocks tracked across ~300 total allocbt blocks, but effectively at 100% full because the the free space is entirely consumed by refcountbt perag reservation. Such a large perag reservation is by design on large filesystems. The problem is that because the free space is so fragmented, this AG contributes the 300 or so allocbt blocks to the global counters as free space. If this pattern repeats across enough AGs, the filesystem lands in a state where global block reservation can outrun physical block availability. For example, a streaming buffered write on the affected filesystem continues to allow delayed allocation beyond the point where writeback starts to fail due to physical block allocation failures. The expected behavior is for the delalloc block reservation to fail gracefully with -ENOSPC before physical block allocation failure is a possibility. To address this problem, set aside in-use allocbt blocks at reservation time and thus ensure they cannot be reserved until truly available for physical allocation. This allows alloc btree metadata to continue to reside in free space, but dynamically adjusts reservation availability based on internal state. Note that the logic requires that the allocbt counter is fully populated at reservation time before it is fully effective. We currently rely on the mount time AGF scan in the perag reservation initialization code for this dependency on filesystems where it's most important (i.e. with active perag reservations). Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-04-28 22:06:05 +00:00
if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
}
/*
* lock up the sb for dipping into reserves before releasing the space
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(&mp->m_fdblocks, -delta);
if (!rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
spin_unlock(&mp->m_sb_lock);
return 0;
}
xfs_warn_once(mp,
"Reserve blocks depleted! Consider increasing reserve pool size.");
fdblocks_enospc:
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
}
int
xfs_mod_frextents(
struct xfs_mount *mp,
int64_t delta)
{
int64_t lcounter;
int ret = 0;
spin_lock(&mp->m_sb_lock);
lcounter = mp->m_sb.sb_frextents + delta;
if (lcounter < 0)
ret = -ENOSPC;
else
mp->m_sb.sb_frextents = lcounter;
spin_unlock(&mp->m_sb_lock);
return ret;
}
/*
* Used to free the superblock along various error paths.
*/
void
xfs_freesb(
struct xfs_mount *mp)
{
struct xfs_buf *bp = mp->m_sb_bp;
xfs_buf_lock(bp);
mp->m_sb_bp = NULL;
xfs_buf_relse(bp);
}
/*
* If the underlying (data/log/rt) device is readonly, there are some
* operations that cannot proceed.
*/
int
xfs_dev_is_read_only(
struct xfs_mount *mp,
char *message)
{
if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
xfs_readonly_buftarg(mp->m_logdev_targp) ||
(mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
xfs_notice(mp, "%s required on read-only device.", message);
xfs_notice(mp, "write access unavailable, cannot proceed.");
return -EROFS;
}
return 0;
}
/* Force the summary counters to be recalculated at next mount. */
void
xfs_force_summary_recalc(
struct xfs_mount *mp)
{
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
return;
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
}
/*
* Update the in-core delayed block counter.
*
* We prefer to update the counter without having to take a spinlock for every
* counter update (i.e. batching). Each change to delayed allocation
* reservations can change can easily exceed the default percpu counter
* batching, so we use a larger batch factor here.
*
* Note that we don't currently have any callers requiring fast summation
* (e.g. percpu_counter_read) so we can use a big batch value here.
*/
#define XFS_DELALLOC_BATCH (4096)
void
xfs_mod_delalloc(
struct xfs_mount *mp,
int64_t delta)
{
percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
XFS_DELALLOC_BATCH);
}