xfs: retain ILOCK during directory updates [v13.2 16/16]

This series changes the directory update code to retain the ILOCK on all
 files involved in a rename until the end of the operation.  The upcoming
 parent pointers patchset applies parent pointers in a separate chained
 update from the actual directory update, which is why it is now
 necessary to keep the ILOCK instead of dropping it after the first
 transaction in the chain.
 
 As a side effect, we no longer need to hold the IOLOCK during an rmapbt
 scan of inodes to serialize the scan with ongoing directory updates.
 
 This has been running on the djcloud for months with no problems.  Enjoy!
 
 Signed-off-by: Darrick J. Wong <djwong@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZh23VAAKCRBKO3ySh0YR
 phqUAP9ACvKuSe7BN1PYvnSTWJ27Kfzy1u9AnMivsKjaWRW2AQEAiYsDs2La+B2m
 Z7pdfAX6U6id5D4F9zGm1nIu08ChCQs=
 =/hm0
 -----END PGP SIGNATURE-----

Merge tag 'retain-ilock-during-dir-ops-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.10-mergeA

xfs: retain ILOCK during directory updates

This series changes the directory update code to retain the ILOCK on all
files involved in a rename until the end of the operation.  The upcoming
parent pointers patchset applies parent pointers in a separate chained
update from the actual directory update, which is why it is now
necessary to keep the ILOCK instead of dropping it after the first
transaction in the chain.

As a side effect, we no longer need to hold the IOLOCK during an rmapbt
scan of inodes to serialize the scan with ongoing directory updates.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'retain-ilock-during-dir-ops-6.10_2024-04-15' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: unlock new repair tempfiles after creation
  xfs: don't pick up IOLOCK during rmapbt repair scan
  xfs: Hold inode locks in xfs_rename
  xfs: Hold inode locks in xfs_trans_alloc_dir
  xfs: Hold inode locks in xfs_ialloc
  xfs: Increase XFS_QM_TRANS_MAXDQS to 5
  xfs: Increase XFS_DEFER_OPS_NR_INODES to 5
This commit is contained in:
Chandan Babu R 2024-04-16 12:53:08 +05:30
commit 9cb5f15d88
13 changed files with 156 additions and 54 deletions

View File

@ -1092,7 +1092,11 @@ xfs_defer_ops_continue(
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
/* Lock the captured resources to the new transaction. */
if (dfc->dfc_held.dr_inos == 2)
if (dfc->dfc_held.dr_inos > 2) {
xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos);
xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos,
XFS_ILOCK_EXCL);
} else if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)

View File

@ -77,7 +77,13 @@ extern const struct xfs_defer_op_type xfs_exchmaps_defer_type;
/*
* Deferred operation item relogging limits.
*/
#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */
/*
* Rename w/ parent pointers can require up to 5 inodes with deferred ops to
* be joined to the transaction: src_dp, target_dp, src_ip, target_ip, and wip.
* These inodes are locked in sorted order by their inode numbers
*/
#define XFS_DEFER_OPS_NR_INODES 5
#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */
/* Resources that must be held across a transaction roll. */

View File

@ -578,23 +578,9 @@ xrep_rmap_scan_inode(
struct xrep_rmap *rr,
struct xfs_inode *ip)
{
unsigned int lock_mode = 0;
unsigned int lock_mode = xrep_rmap_scan_ilock(ip);
int error;
/*
* Directory updates (create/link/unlink/rename) drop the directory's
* ILOCK before finishing any rmapbt updates associated with directory
* shape changes. For this scan to coordinate correctly with the live
* update hook, we must take the only lock (i_rwsem) that is held all
* the way to dir op completion. This will get fixed by the parent
* pointer patchset.
*/
if (S_ISDIR(VFS_I(ip)->i_mode)) {
lock_mode = XFS_IOLOCK_SHARED;
xfs_ilock(ip, lock_mode);
}
lock_mode |= xrep_rmap_scan_ilock(ip);
/* Check the data fork. */
error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
if (error)

View File

@ -153,6 +153,7 @@ xrep_tempfile_create(
xfs_qm_dqrele(pdqp);
/* Finish setting up the incore / vfs context. */
xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
xfs_setup_iops(sc->tempip);
xfs_finish_inode_setup(sc->tempip);
@ -168,6 +169,7 @@ out_release_inode:
* transactions and deadlocks from xfs_inactive.
*/
if (sc->tempip) {
xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(sc->tempip);
xchk_irele(sc, sc->tempip);
}

View File

@ -1371,6 +1371,47 @@ xfs_dqlock2(
}
}
/*
 * sort() comparator for struct xfs_dqtrx entries: orders them by
 * ascending dquot id so that dquot mutexes are always acquired in a
 * globally consistent order.
 */
static int
xfs_dqtrx_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_dqtrx	*qx = a;
	const struct xfs_dqtrx	*qy = b;

	if (qx->qt_dquot->q_id == qy->qt_dquot->q_id)
		return 0;
	return qx->qt_dquot->q_id > qy->qt_dquot->q_id ? 1 : -1;
}
/*
 * Lock all the dquots referenced by an array of dqtrx structures, in
 * ascending dquot id order, so that concurrent transactions cannot
 * deadlock on the same set of dquots.  NULL qt_dquot entries terminate
 * the array; duplicates are not allowed (debug-asserted below).
 */
void
xfs_dqlockn(
	struct xfs_dqtrx	*q)
{
	unsigned int		i;

	/*
	 * Each dquot after the first is locked with a distinct lockdep
	 * subclass (XFS_QLOCK_NESTED + i - 1), so we can never hold more
	 * dquot locks than lockdep has subclasses to describe them.
	 */
	BUILD_BUG_ON(XFS_QM_TRANS_MAXDQS > MAX_LOCKDEP_SUBCLASSES);

	/* Sort in order of dquot id, do not allow duplicates */
	for (i = 0; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) {
		unsigned int	j;

		/* Debug-only check that no dquot appears twice. */
		for (j = 0; j < i; j++)
			ASSERT(q[i].qt_dquot != q[j].qt_dquot);
	}
	/* Nothing to lock. */
	if (i == 0)
		return;

	/* Sort the i populated entries by dquot id before locking. */
	sort(q, i, sizeof(struct xfs_dqtrx), xfs_dqtrx_cmp, NULL);

	/*
	 * Take the first lock normally, then each subsequent lock with an
	 * increasing lockdep subclass so lockdep accepts the nesting.
	 */
	mutex_lock(&q[0].qt_dquot->q_qlock);
	for (i = 1; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++)
		mutex_lock_nested(&q[i].qt_dquot->q_qlock,
				XFS_QLOCK_NESTED + i - 1);
}
int __init
xfs_qm_init(void)
{

View File

@ -223,6 +223,7 @@ int xfs_qm_dqget_uncached(struct xfs_mount *mp,
void xfs_qm_dqput(struct xfs_dquot *dqp);
void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
void xfs_dqlockn(struct xfs_dqtrx *q);
void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);

View File

@ -418,7 +418,7 @@ xfs_lock_inumorder(
* lock more than one at a time, lockdep will report false positives saying we
* have violated locking orders.
*/
static void
void
xfs_lock_inodes(
struct xfs_inode **ips,
int inodes,
@ -747,6 +747,8 @@ xfs_inode_inherit_flags2(
/*
* Initialise a newly allocated inode and return the in-core inode to the
* caller locked exclusively.
*
* Caller is responsible for unlocking the inode manually upon return
*/
int
xfs_init_new_inode(
@ -873,7 +875,7 @@ xfs_init_new_inode(
/*
* Log the new values stuffed into the inode.
*/
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, flags);
/* now that we have an i_mode we can setup the inode structure */
@ -1101,8 +1103,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
xfs_trans_ijoin(tp, dp, 0);
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
resblks - XFS_IALLOC_SPACE_RES(mp));
@ -1151,6 +1152,8 @@ xfs_create(
xfs_qm_dqrele(pdqp);
*ipp = ip;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return 0;
out_trans_cancel:
@ -1162,6 +1165,7 @@ xfs_create(
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
@ -1247,6 +1251,7 @@ xfs_create_tmpfile(
xfs_qm_dqrele(pdqp);
*ipp = ip;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
out_trans_cancel:
@ -1258,6 +1263,7 @@ xfs_create_tmpfile(
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
@ -1362,10 +1368,15 @@ xfs_link(
if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
xfs_trans_set_sync(tp);
return xfs_trans_commit(tp);
error = xfs_trans_commit(tp);
xfs_iunlock(tdp, XFS_ILOCK_EXCL);
xfs_iunlock(sip, XFS_ILOCK_EXCL);
return error;
error_return:
xfs_trans_cancel(tp);
xfs_iunlock(tdp, XFS_ILOCK_EXCL);
xfs_iunlock(sip, XFS_ILOCK_EXCL);
std_return:
if (error == -ENOSPC && nospace_error)
error = nospace_error;
@ -2775,19 +2786,39 @@ xfs_remove(
error = xfs_trans_commit(tp);
if (error)
goto std_return;
goto out_unlock;
if (is_dir && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return 0;
out_trans_cancel:
xfs_trans_cancel(tp);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
std_return:
return error;
}
/*
 * Drop the ILOCK on every inode in a (sorted) rename inode table,
 * walking the table in reverse so locks are released in the opposite
 * order from which xfs_lock_inodes() took them.  NULL slots and
 * adjacent duplicates (src_dp == target_dp when renaming within one
 * directory) are skipped so no inode is unlocked twice.
 */
static inline void
xfs_iunlock_rename(
	struct xfs_inode	**i_tab,
	int			num_inodes)
{
	int			i;

	for (i = num_inodes - 1; i >= 0; i--) {
		/* Skip duplicate inodes if src and target dps are the same */
		if (!i_tab[i] || (i > 0 && i_tab[i] == i_tab[i - 1]))
			continue;
		xfs_iunlock(i_tab[i], XFS_ILOCK_EXCL);
	}
}
/*
* Enter all inodes for a rename transaction into a sorted array.
*/
@ -2802,7 +2833,7 @@ xfs_sort_for_rename(
struct xfs_inode **i_tab,/* out: sorted array of inodes */
int *num_inodes) /* in/out: inodes in array */
{
int i, j;
int i;
ASSERT(*num_inodes == __XFS_SORT_INODES);
memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
@ -2824,17 +2855,26 @@ xfs_sort_for_rename(
i_tab[i++] = wip;
*num_inodes = i;
xfs_sort_inodes(i_tab, *num_inodes);
}
void
xfs_sort_inodes(
struct xfs_inode **i_tab,
unsigned int num_inodes)
{
int i, j;
ASSERT(num_inodes <= __XFS_SORT_INODES);
/*
* Sort the elements via bubble sort. (Remember, there are at
* most 5 elements to sort, so this is adequate.)
*/
for (i = 0; i < *num_inodes; i++) {
for (j = 1; j < *num_inodes; j++) {
if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
struct xfs_inode *temp = i_tab[j];
i_tab[j] = i_tab[j-1];
i_tab[j-1] = temp;
}
for (i = 0; i < num_inodes; i++) {
for (j = 1; j < num_inodes; j++) {
if (i_tab[j]->i_ino < i_tab[j-1]->i_ino)
swap(i_tab[j], i_tab[j - 1]);
}
}
}
@ -3088,8 +3128,10 @@ retry:
* Attach the dquots to the inodes
*/
error = xfs_qm_vop_rename_dqattach(inodes);
if (error)
goto out_trans_cancel;
if (error) {
xfs_trans_cancel(tp);
goto out_release_wip;
}
/*
* Lock all the participating inodes. Depending upon whether
@ -3100,18 +3142,16 @@ retry:
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
* Join all the inodes to the transaction. From this point on,
* we can rely on either trans_commit or trans_cancel to unlock
* them.
* Join all the inodes to the transaction.
*/
xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_dp, 0);
if (new_parent)
xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, target_dp, 0);
xfs_trans_ijoin(tp, src_ip, 0);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, target_ip, 0);
if (wip)
xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, wip, 0);
/*
* If we are using project inheritance, we only allow renames
@ -3125,10 +3165,13 @@ retry:
}
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE)
return xfs_cross_rename(tp, src_dp, src_name, src_ip,
if (flags & RENAME_EXCHANGE) {
error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
target_dp, target_name, target_ip,
spaceres);
xfs_iunlock_rename(inodes, num_inodes);
return error;
}
/*
* Try to reserve quota to handle an expansion of the target directory.
@ -3142,6 +3185,7 @@ retry:
if (error == -EDQUOT || error == -ENOSPC) {
if (!retried) {
xfs_trans_cancel(tp);
xfs_iunlock_rename(inodes, num_inodes);
xfs_blockgc_free_quota(target_dp, 0);
retried = true;
goto retry;
@ -3368,12 +3412,14 @@ retry:
xfs_dir_update_hook(src_dp, wip, 1, src_name);
error = xfs_finish_rename(tp);
xfs_iunlock_rename(inodes, num_inodes);
if (wip)
xfs_irele(wip);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
xfs_iunlock_rename(inodes, num_inodes);
out_release_wip:
if (wip)
xfs_irele(wip);

View File

@ -627,6 +627,8 @@ int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode);
void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes);
static inline bool
xfs_inode_unlinked_incomplete(

View File

@ -836,8 +836,10 @@ xfs_qm_qino_alloc(
ASSERT(xfs_is_shutdown(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
}
if (need_alloc)
if (need_alloc) {
xfs_iunlock(*ipp, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(*ipp);
}
return error;
}

View File

@ -136,7 +136,7 @@ enum {
XFS_QM_TRANS_PRJ,
XFS_QM_TRANS_DQTYPES
};
#define XFS_QM_TRANS_MAXDQS 2
#define XFS_QM_TRANS_MAXDQS 5
struct xfs_dquot_acct {
struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
};

View File

@ -172,8 +172,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
xfs_trans_ijoin(tp, dp, 0);
/*
* Also attach the dquot(s) to it, if applicable.
@ -215,6 +214,8 @@ xfs_symlink(
xfs_qm_dqrele(pdqp);
*ipp = ip;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return 0;
out_trans_cancel:
@ -226,6 +227,7 @@ out_release_inode:
* transactions and deadlocks from xfs_inactive.
*/
if (ip) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}

View File

@ -1430,6 +1430,8 @@ out_cancel:
* The caller must ensure that the on-disk dquots attached to this inode have
* already been allocated and initialized. The ILOCKs will be dropped when the
* transaction is committed or cancelled.
*
* Caller is responsible for unlocking the inodes manually upon return
*/
int
xfs_trans_alloc_dir(
@ -1460,8 +1462,8 @@ retry:
xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, dp, 0);
xfs_trans_ijoin(tp, ip, 0);
error = xfs_qm_dqattach_locked(dp, false);
if (error) {
@ -1484,6 +1486,9 @@ retry:
if (error == -EDQUOT || error == -ENOSPC) {
if (!retried) {
xfs_trans_cancel(tp);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
if (dp != ip)
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_blockgc_free_quota(dp, 0);
retried = true;
goto retry;

View File

@ -379,24 +379,29 @@ xfs_trans_mod_dquot(
/*
* Given an array of dqtrx structures, lock all the dquots associated and join
* them to the transaction, provided they have been modified. We know that the
* highest number of dquots of one type - usr, grp and prj - involved in a
* transaction is 3 so we don't need to make this very generic.
* them to the transaction, provided they have been modified.
*/
STATIC void
xfs_trans_dqlockedjoin(
struct xfs_trans *tp,
struct xfs_dqtrx *q)
{
unsigned int i;
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
xfs_dqlock(q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
} else {
ASSERT(XFS_QM_TRANS_MAXDQS == 2);
} else if (q[2].qt_dquot == NULL) {
xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
xfs_trans_dqjoin(tp, q[1].qt_dquot);
} else {
xfs_dqlockn(q);
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (q[i].qt_dquot == NULL)
break;
xfs_trans_dqjoin(tp, q[i].qt_dquot);
}
}
}