linux/fs/xfs/xfs_inode_item.c

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_error.h"
#include <linux/iversion.h>
kmem_zone_t *xfs_ili_zone; /* inode log item zone */
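
/* Convert from a generic log item back to the containing inode log item. */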
static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
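
/*
 * Calculate how many iovecs and how much space the dirty parts of the data
 * fork will need in the log, based on the fork format and the ili_fields
 * dirty flags. The sizes are worst-case estimates.
 */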
STATIC void
xfs_inode_item_data_fork_size(
struct xfs_inode_log_item *iip,
int *nvecs,
int *nbytes)
{
struct xfs_inode *ip = iip->ili_inode;
switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
/* worst case, doesn't subtract delalloc extents */
*nbytes += XFS_IFORK_DSIZE(ip);
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
*nbytes += ip->i_df.if_broot_bytes;
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
*nbytes += roundup(ip->i_df.if_bytes, 4);
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_DEV:
break;
default:
ASSERT(0);
break;
}
}
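
/*
 * As above, but calculate the worst-case log space needed for the attribute
 * fork, keyed off the XFS_ILOG_A* dirty flags.
 */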
STATIC void
xfs_inode_item_attr_fork_size(
struct xfs_inode_log_item *iip,
int *nvecs,
int *nbytes)
{
struct xfs_inode *ip = iip->ili_inode;
switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
ip->i_afp->if_nextents > 0 &&
ip->i_afp->if_bytes > 0) {
/* worst case, doesn't subtract unused space */
*nbytes += XFS_IFORK_ASIZE(ip);
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
ip->i_afp->if_broot_bytes > 0) {
*nbytes += ip->i_afp->if_broot_bytes;
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
*nbytes += roundup(ip->i_afp->if_bytes, 4);
*nvecs += 1;
}
break;
default:
ASSERT(0);
break;
}
}
/*
* This returns the number of iovecs needed to log the given inode item.
*
* We need one iovec for the inode log format structure, one for the
* inode core, and possibly one for the inode data/extents/b-tree root
* and one for the inode attribute data/extents/b-tree root.
*/
STATIC void
xfs_inode_item_size(
struct xfs_log_item *lip,
int *nvecs,
int *nbytes)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
if (XFS_IFORK_Q(ip))
xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
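
/*
 * Format the dirty parts of the data fork into the log vector. Dirty flags
 * that cannot apply to the current fork format are cleared, and the extent
 * list, btree root or inline data is only copied when there is actually
 * something to log for it.
 */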
STATIC void
xfs_inode_item_format_data_fork(
struct xfs_inode_log_item *iip,
struct xfs_inode_log_format *ilf,
struct xfs_log_vec *lv,
struct xfs_log_iovec **vecp)
{
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
struct xfs_bmbt_rec *p;
ASSERT(xfs_iext_count(&ip->i_df) > 0);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
xlog_finish_iovec(lv, *vecp, data_bytes);
ASSERT(data_bytes <= ip->i_df.if_bytes);
ilf->ilf_dsize = data_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DEXT;
}
break;
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
ASSERT(ip->i_df.if_broot != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
ip->i_df.if_broot,
ip->i_df.if_broot_bytes);
ilf->ilf_dsize = ip->i_df.if_broot_bytes;
ilf->ilf_size++;
} else {
ASSERT(!(iip->ili_fields &
XFS_ILOG_DBROOT));
iip->ili_fields &= ~XFS_ILOG_DBROOT;
}
break;
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
/*
* Round if_bytes up to a word boundary.
* The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_d.di_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
ip->i_df.if_u1.if_data, data_bytes);
ilf->ilf_dsize = (unsigned)data_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
}
break;
case XFS_DINODE_FMT_DEV:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
if (iip->ili_fields & XFS_ILOG_DEV)
ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
break;
default:
ASSERT(0);
break;
}
}
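
/*
 * As above, but format the attribute fork state into the log vector using
 * the XFS_ILOG_A* dirty flags.
 */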
STATIC void
xfs_inode_item_format_attr_fork(
struct xfs_inode_log_item *iip,
struct xfs_inode_log_format *ilf,
struct xfs_log_vec *lv,
struct xfs_log_iovec **vecp)
{
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
ip->i_afp->if_nextents > 0 &&
ip->i_afp->if_bytes > 0) {
struct xfs_bmbt_rec *p;
ASSERT(xfs_iext_count(ip->i_afp) ==
ip->i_afp->if_nextents);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
xlog_finish_iovec(lv, *vecp, data_bytes);
ilf->ilf_asize = data_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_AEXT;
}
break;
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
ip->i_afp->if_broot_bytes > 0) {
ASSERT(ip->i_afp->if_broot != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
ip->i_afp->if_broot,
ip->i_afp->if_broot_bytes);
ilf->ilf_asize = ip->i_afp->if_broot_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ABROOT;
}
break;
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
/*
* Round if_bytes up to a word boundary.
* The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
ip->i_afp->if_u1.if_data,
data_bytes);
ilf->ilf_asize = (unsigned)data_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
}
break;
default:
ASSERT(0);
break;
}
}
/*
* Convert an incore timestamp to a log timestamp. Note that the log format
* specifies host endian format!
*/
static inline xfs_ictimestamp_t
xfs_inode_to_log_dinode_ts(
struct xfs_inode *ip,
const struct timespec64 tv)
{
struct xfs_legacy_ictimestamp *lits;
xfs_ictimestamp_t its;
if (xfs_inode_has_bigtime(ip))
return xfs_inode_encode_bigtime(tv);
lits = (struct xfs_legacy_ictimestamp *)&its;
lits->t_sec = tv.tv_sec;
lits->t_nsec = tv.tv_nsec;
return its;
}
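
/*
 * Fill the on-disk format log dinode from the incore inode. Timestamps,
 * ownership and other VFS-maintained state are read from the VFS inode;
 * the remaining fields come from the XFS incore inode.
 */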
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
struct xfs_log_dinode *to,
xfs_lsn_t lsn)
{
struct xfs_icdinode *from = &ip->i_d;
struct inode *inode = VFS_I(ip);
to->di_magic = XFS_DINODE_MAGIC;
to->di_format = xfs_ifork_format(&ip->i_df);
to->di_uid = i_uid_read(inode);
to->di_gid = i_gid_read(inode);
to->di_projid_lo = from->di_projid & 0xffff;
to->di_projid_hi = from->di_projid >> 16;
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
to->di_size = from->di_size;
to->di_nblocks = from->di_nblocks;
to->di_extsize = from->di_extsize;
to->di_nextents = xfs_ifork_nextents(&ip->i_df);
to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = from->di_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_dmevmask = from->di_dmevmask;
to->di_dmstate = from->di_dmstate;
to->di_flags = from->di_flags;
/* log a dummy value to ensure log structure is fully initialised */
to->di_next_unlinked = NULLAGINO;
if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime);
to->di_flags2 = from->di_flags2;
to->di_cowextsize = from->di_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
to->di_version = 2;
to->di_flushiter = from->di_flushiter;
}
}
/*
* Format the inode core. Current timestamp data is only in the VFS inode
* fields, so we need to grab them from there. Hence rather than just copying
* the XFS inode core structure, format the fields directly into the iovec.
*/
static void
xfs_inode_item_format_core(
struct xfs_inode *ip,
struct xfs_log_vec *lv,
struct xfs_log_iovec **vecp)
{
struct xfs_log_dinode *dic;
dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
}
/*
* This is called to fill in the vector of log iovecs for the given inode
* log item. It fills the first item with an inode log format structure,
* the second with the on-disk inode structure, and a possible third and/or
* fourth with the inode data/extents/b-tree root and inode attributes
* data/extents/b-tree root.
*
* Note: Always use the 64 bit inode log format structure so we don't
* leave an uninitialised hole in the format item on 64 bit systems. Log
* recovery on 32 bit systems handles this just fine, so there's no reason
* not to use and initialise the properly padded structure all the time.
*/
STATIC void
xfs_inode_item_format(
struct xfs_log_item *lip,
struct xfs_log_vec *lv)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
struct xfs_log_iovec *vecp = NULL;
struct xfs_inode_log_format *ilf;
ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
ilf->ilf_type = XFS_LI_INODE;
ilf->ilf_ino = ip->i_ino;
ilf->ilf_blkno = ip->i_imap.im_blkno;
ilf->ilf_len = ip->i_imap.im_len;
ilf->ilf_boffset = ip->i_imap.im_boffset;
ilf->ilf_fields = XFS_ILOG_CORE;
ilf->ilf_size = 2; /* format + core */
/*
* make sure we don't leak uninitialised data into the log in the case
* when we don't log every field in the inode.
*/
ilf->ilf_dsize = 0;
ilf->ilf_asize = 0;
ilf->ilf_pad = 0;
memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u));
xlog_finish_iovec(lv, vecp, sizeof(*ilf));
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
if (XFS_IFORK_Q(ip)) {
xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
} else {
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
}
/* update the format with the exact fields we actually logged */
ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
}
/*
* This is called to pin the inode associated with the inode log
* item in memory so it cannot be written out.
*/
STATIC void
xfs_inode_item_pin(
struct xfs_log_item *lip)
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
atomic_inc(&ip->i_pincount);
}
/*
* This is called to unpin the inode associated with the inode log
* item which was previously pinned with a call to xfs_inode_item_pin().
*
* Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
*
* Note that unpin can race with inode cluster buffer freeing, which marks
* the buffer stale. In that case, flush completions are run from the buffer unpin call,
* which may happen before the inode is unpinned. If we lose the race, there
* will be no buffer attached to the log item, but the inode will be marked
* XFS_ISTALE.
*/
STATIC void
xfs_inode_item_unpin(
struct xfs_log_item *lip,
int remove)
{
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
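
/*
 * The AIL push handler for inode items: try to lock the backing inode
 * cluster buffer, flush the dirty inodes attached to it and queue the
 * buffer for delwri submission. Returns PINNED, FLUSHING or LOCKED when
 * the inode or buffer state prevents flushing right now.
 */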
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
struct list_head *buffer_list)
__releases(&lip->li_ailp->ail_lock)
__acquires(&lip->li_ailp->ail_lock)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
struct xfs_buf *bp = lip->li_buf;
uint rval = XFS_ITEM_SUCCESS;
int error;
ASSERT(iip->ili_item.li_buf);
if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) ||
(ip->i_flags & XFS_ISTALE))
return XFS_ITEM_PINNED;
if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
spin_unlock(&lip->li_ailp->ail_lock);
/*
* We need to hold a reference for flushing the cluster buffer as it may
* fail the buffer without IO submission, in which case we need the
* reference for that completion; otherwise we don't hold a reference for
* IO until we queue the buffer for delwri submission.
*/
xfs_buf_hold(bp);
error = xfs_iflush_cluster(bp);
if (!error) {
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
} else {
/*
* Release the buffer if we were unable to flush anything. On
* any other error, the buffer has already been released.
*/
if (error == -EAGAIN)
xfs_buf_relse(bp);
rval = XFS_ITEM_LOCKED;
}
spin_lock(&lip->li_ailp->ail_lock);
return rval;
}
/*
* Unlock the inode associated with the inode log item.
*/
STATIC void
xfs_inode_item_release(
struct xfs_log_item *lip)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
unsigned short lock_flags;
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
if (lock_flags)
xfs_iunlock(ip, lock_flags);
}
/*
* This is called to find out where the oldest active copy of the inode log
* item in the on disk log resides now that the last log write of it completed
* at the given lsn. Since we always re-log all dirty data in an inode, the
* latest copy in the on disk log is the only one that matters. Therefore,
* simply return the given lsn.
*
* If the inode has been marked stale because the cluster is being freed, we
* don't want to (re-)insert this inode into the AIL. There is a race condition
* where the cluster buffer may be unpinned before the inode is inserted into
* the AIL during transaction committed processing. If the buffer is unpinned
* before the inode item has been committed and inserted, then it is possible
* for the buffer to be written and the IO to complete before the inode is inserted
* into the AIL. In that case, we'd be inserting a clean, stale inode into the
* AIL which will never get removed. It will, however, get reclaimed which
* triggers an assert in xfs_inode_free() complaining about freeing an inode
* still in the AIL.
*
* To avoid this, just unpin the inode directly and return a LSN of -1 so the
* transaction committed code knows that it does not need to do any further
* processing on the item.
*/
STATIC xfs_lsn_t
xfs_inode_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
if (xfs_iflags_test(ip, XFS_ISTALE)) {
xfs_inode_item_unpin(lip, 0);
return -1;
}
return lsn;
}
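
/*
 * Record the LSN at which the transaction committed, then release the
 * inode via the same path used for transaction cancellation.
 */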
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_lsn_t commit_lsn)
{
INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
return xfs_inode_item_release(lip);
}
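/*
 * The commit lsn recorded above is what fsync/fdatasync later use to force
 * the log. A hedged sketch of the consumer side (the real logic lives in
 * xfs_file_fsync() and also consults ili_fsync_fields before deciding
 * whether a log force is needed at all):
 *
 *	if (xfs_ipincount(ip))
 *		lsn = ip->i_itemp->ili_last_lsn;
 *	if (lsn)
 *		error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 */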
static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
.iop_unpin = xfs_inode_item_unpin,
.iop_release = xfs_inode_item_release,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
.iop_committing = xfs_inode_item_committing,
};
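/*
 * Rough lifecycle of these callbacks (an illustrative sketch only; the
 * exact ordering is determined by the CIL and AIL machinery):
 *
 *	xfs_trans_commit()
 *	   .iop_size/.iop_format	copy the dirty regions into the CIL
 *	   .iop_pin			pin the inode until checkpoint IO
 *	   .iop_committing		record the commit lsn and unlock
 *	checkpoint IO completion
 *	   .iop_unpin			drop the pin
 *	   .iop_committed		return the lsn for AIL insertion
 *	AIL pushing
 *	   .iop_push			flush the inode to its cluster buffer
 *	transaction cancel
 *	   .iop_release			unlock without committing
 */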
/*
* Initialize the inode log item for a newly allocated (in-core) inode.
*/
void
xfs_inode_item_init(
struct xfs_inode *ip,
struct xfs_mount *mp)
{
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
GFP_KERNEL | __GFP_NOFAIL);
iip->ili_inode = ip;
spin_lock_init(&iip->ili_lock);
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
&xfs_inode_item_ops);
}
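/*
 * Callers create the log item lazily when an inode is first joined to a
 * transaction. A hedged sketch of the expected call pattern (see
 * xfs_trans_ijoin() for the real call site):
 *
 *	if (ip->i_itemp == NULL)
 *		xfs_inode_item_init(ip, ip->i_mount);
 *	xfs_trans_add_item(tp, &ip->i_itemp->ili_item);
 */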
/*
* Free the inode log item and any memory hanging off of it.
*/
void
xfs_inode_item_destroy(
struct xfs_inode *ip)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
ASSERT(iip->ili_item.li_buf == NULL);
ip->i_itemp = NULL;
kmem_free(iip->ili_item.li_lv_shadow);
kmem_cache_free(xfs_ili_zone, iip);
}
/*
* We only want to pull the item from the AIL if it is actually there
* and its location in the log has not changed since we started the
* flush. Thus, we only bother if the inode's lsn has not changed.
*/
static void
xfs_iflush_ail_updates(
struct xfs_ail *ailp,
struct list_head *list)
{
struct xfs_log_item *lip;
xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(lip, list, li_bio_list) {
xfs_lsn_t lsn;
clear_bit(XFS_LI_FAILED, &lip->li_flags);
if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
continue;
lsn = xfs_ail_delete_one(ailp, lip);
if (!tail_lsn && lsn)
tail_lsn = lsn;
}
xfs_ail_update_finish(ailp, tail_lsn);
}
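/*
 * For comparison, the non-batched equivalent would take and drop the AIL
 * lock once per inode, roughly (illustrative only):
 *
 *	spin_lock(&ailp->ail_lock);
 *	lsn = xfs_ail_delete_one(ailp, lip);
 *	xfs_ail_update_finish(ailp, lsn);	(drops ail_lock)
 *
 * Batching all deletions under a single lock round trip with one tail
 * update at the end is the point of the open coding above.
 */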
/*
 * Walk the list of inodes that have completed their IOs. If they are clean,
 * remove them from the list and dissociate them from the buffer. Inodes that
 * are still dirty remain linked to the buffer and on the list. Caller must
 * handle them appropriately.
 */
static void
xfs_iflush_finish(
struct xfs_buf *bp,
struct list_head *list)
{
struct xfs_log_item *lip, *n;
list_for_each_entry_safe(lip, n, list, li_bio_list) {
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
bool drop_buffer = false;
spin_lock(&iip->ili_lock);
/*
* Remove the reference to the cluster buffer if the inode is
* clean in memory and drop the buffer reference once we've
* dropped the locks we hold.
*/
ASSERT(iip->ili_item.li_buf == bp);
if (!iip->ili_fields) {
iip->ili_item.li_buf = NULL;
list_del_init(&lip->li_bio_list);
drop_buffer = true;
}
iip->ili_last_fields = 0;
iip->ili_flush_lsn = 0;
spin_unlock(&iip->ili_lock);
xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
if (drop_buffer)
xfs_buf_rele(bp);
}
}
/*
* Inode buffer IO completion routine. It is responsible for removing inodes
* attached to the buffer from the AIL if they have not been re-logged and
* completing the inode flush.
*/
void
xfs_buf_inode_iodone(
struct xfs_buf *bp)
{
struct xfs_log_item *lip, *n;
LIST_HEAD(flushed_inodes);
LIST_HEAD(ail_updates);
/*
* Pull the attached inodes from the buffer one at a time and take the
* appropriate action on them.
*/
list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
xfs_iflush_abort(iip->ili_inode);
continue;
}
if (!iip->ili_last_fields)
continue;
/* Do an unlocked check for needing the AIL lock. */
if (iip->ili_flush_lsn == lip->li_lsn ||
test_bit(XFS_LI_FAILED, &lip->li_flags))
list_move_tail(&lip->li_bio_list, &ail_updates);
else
list_move_tail(&lip->li_bio_list, &flushed_inodes);
}
if (!list_empty(&ail_updates)) {
xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
list_splice_tail(&ail_updates, &flushed_inodes);
}
xfs_iflush_finish(bp, &flushed_inodes);
if (!list_empty(&flushed_inodes))
list_splice_tail(&flushed_inodes, &bp->b_li_list);
}
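/*
 * Dispatch summary for the completion handler above (sketch):
 *
 *	XFS_ISTALE set			abort the flush; the inode is never
 *					written back
 *	ili_last_fields == 0		not part of this flush, leave it
 *					attached to the buffer
 *	flush_lsn == li_lsn or FAILED	clean at its AIL position, remove it
 *					from the AIL then finish the flush
 *	otherwise			relogged since the flush started, so
 *					just finish the flush state
 */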
void
xfs_buf_inode_io_fail(
struct xfs_buf *bp)
{
struct xfs_log_item *lip;
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
set_bit(XFS_LI_FAILED, &lip->li_flags);
}
/*
 * This is the inode flushing abort routine. It is called when the flush is
 * being aborted, e.g. for stale inodes at buffer IO completion or when the
 * filesystem is shutting down, to clean up the inode state. It is
 * responsible for removing the inode item from the AIL if it has not been
 * re-logged and clearing the inode's flush state.
 */
void
xfs_iflush_abort(
struct xfs_inode *ip)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct xfs_buf *bp = NULL;
if (iip) {
/*
* Clear the failed bit before removing the item from the AIL so
* xfs_trans_ail_delete() doesn't try to clear and release the
* buffer attached to the log item before we are done with it.
*/
clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*
* Clear the inode logging fields so no more flushes are
* attempted.
*/
spin_lock(&iip->ili_lock);
iip->ili_last_fields = 0;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
bp = iip->ili_item.li_buf;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
spin_unlock(&iip->ili_lock);
}
xfs_iflags_clear(ip, XFS_IFLUSHING);
if (bp)
xfs_buf_rele(bp);
}
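/*
 * Besides the stale inode case in xfs_buf_inode_iodone() above, the other
 * expected caller is inode reclaim on a shut down filesystem, along these
 * lines (hedged sketch):
 *
 *	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 *		xfs_iunpin_wait(ip);
 *		xfs_iflush_abort(ip);
 *	}
 */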
/*
* convert an xfs_inode_log_format struct from the old 32 bit version
* (which can have different field alignments) to the native 64 bit version
*/
int
xfs_inode_item_format_convert(
struct xfs_log_iovec *buf,
struct xfs_inode_log_format *in_f)
{
struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
if (buf->i_len != sizeof(*in_f32)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
in_f->ilf_type = in_f32->ilf_type;
in_f->ilf_size = in_f32->ilf_size;
in_f->ilf_fields = in_f32->ilf_fields;
in_f->ilf_asize = in_f32->ilf_asize;
in_f->ilf_dsize = in_f32->ilf_dsize;
in_f->ilf_ino = in_f32->ilf_ino;
memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u));
in_f->ilf_blkno = in_f32->ilf_blkno;
in_f->ilf_len = in_f32->ilf_len;
in_f->ilf_boffset = in_f32->ilf_boffset;
return 0;
}
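/*
 * Intended use during log recovery (a hedged sketch; the real logic lives
 * in the inode recovery pass): the old 32 bit format is detected purely by
 * the size of the logged region:
 *
 *	struct xfs_inode_log_format in_f;
 *
 *	if (item->ri_buf[0].i_len == sizeof(in_f))
 *		memcpy(&in_f, item->ri_buf[0].i_addr, sizeof(in_f));
 *	else
 *		error = xfs_inode_item_format_convert(&item->ri_buf[0], &in_f);
 */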