From 435dcf0787fde20d99ede873cf56dc27c22f2c30 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:49 -0800 Subject: [PATCH 01/70] xfs: never try to scrub more than 64 inodes per inobt record Make sure we never check more than XFS_INODES_PER_CHUNK inodes for any given inobt record since there can be more than one inobt record mapped to an inode cluster. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 882dc56c5c21..fd431682db0b 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -203,7 +203,8 @@ xchk_iallocbt_check_freemask( int error = 0; /* Make sure the freemask matches the inode records. */ - nr_inodes = mp->m_inodes_per_cluster; + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_inodes_per_cluster); for (agino = irec->ir_startino; agino < irec->ir_startino + XFS_INODES_PER_CHUNK; From c050fdfeb575a57ba955427142469b2d2e5f7ac2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:50 -0800 Subject: [PATCH 02/70] xfs: check the ir_startino alignment directly In xchk_iallocbt_rec, check the alignment of ir_startino by converting the inode cluster block alignment into units of inodes instead of the other way around (converting ir_startino to blocks). This prevents us from tripping over off-by-one errors in ir_startino which are obscured by the inode -> block conversion. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 56 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fd431682db0b..1c6fef9b3799 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -265,6 +265,53 @@ xchk_iallocbt_check_freemask( return error; } +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. + * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + /* inobt records must be aligned to cluster and inoalignmnt size. */ + if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } +} + /* Scrub an inobt/finobt record. */ STATIC int xchk_iallocbt_rec( @@ -277,7 +324,6 @@ xchk_iallocbt_rec( uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agino_t agino; - xfs_agblock_t agbno; xfs_extlen_t len; int holecount; int i; @@ -304,11 +350,9 @@ xchk_iallocbt_rec( goto out; } - /* Make sure this record is aligned to cluster and inoalignmnt size. */ - agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); - if ((agbno & (mp->m_cluster_align - 1)) || - (agbno & (mp->m_blocks_per_cluster - 1))) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_iallocbt_rec_alignment(bs, &irec); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; iabt->inodes += irec.ir_count; From 22234c62f98bf06ae21cb71fb5b2aef2ce13326e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:50 -0800 Subject: [PATCH 03/70] xfs: check inobt record alignment on big block filesystems On a big block filesystem, there may be multiple inobt records covering a single inode cluster. These records obviously won't be aligned to cluster alignment rules, and they must cover the entire cluster. Teach scrub to check for these things. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 1c6fef9b3799..b8bfa93fd1a4 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt( struct xchk_iallocbt { /* Number of inodes we see while scanning inobt. */ unsigned long long inodes; + + /* Expected next startino, for big block filesystems. */ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; }; /* @@ -277,6 +283,7 @@ xchk_iallocbt_rec_alignment( struct xfs_inobt_rec_incore *irec) { struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; /* * finobt records have different positioning requirements than inobt @@ -300,6 +307,27 @@ xchk_iallocbt_rec_alignment( return; } + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? + */ + if (irec->ir_startino != iabt->next_startino) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + iabt->next_startino += XFS_INODES_PER_CHUNK; + + /* Are we done with the cluster? */ + if (iabt->next_startino >= iabt->next_cluster_ino) { + iabt->next_startino = NULLAGINO; + iabt->next_cluster_ino = NULLAGINO; + } + return; + } + /* inobt records must be aligned to cluster and inoalignmnt size. */ if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -310,6 +338,17 @@ xchk_iallocbt_rec_alignment( xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return; } + + if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK) + return; + + /* + * If this is the start of an inode cluster that can be mapped by + * multiple inobt records, the next inobt record must follow exactly + * after this one. + */ + iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK; + iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster; } /* Scrub an inobt/finobt record. */ @@ -474,6 +513,8 @@ xchk_iallocbt( struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, }; int error; From a1954242facb01350265183675f1a544e1e1e646 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:51 -0800 Subject: [PATCH 04/70] xfs: hoist inode cluster checks out of loop Hoist the inode cluster checks out of the inobt record check loop into a separate function in preparation for refactoring of that loop. No functional changes here; that's in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 123 +++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 56 deletions(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index b8bfa93fd1a4..0ce793d92995 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -188,19 +188,19 @@ out: return 0; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* Check an inode cluster. */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_cluster( struct xchk_btree *bs, - struct xfs_inobt_rec_incore *irec) + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino) { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; struct xfs_buf *bp; xfs_ino_t fsino; - xfs_agino_t nr_inodes; - xfs_agino_t agino; + unsigned int nr_inodes; xfs_agino_t chunkino; xfs_agino_t clusterino; xfs_agblock_t agbno; @@ -212,60 +212,71 @@ xchk_iallocbt_check_freemask( nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, mp->m_inodes_per_cluster); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + chunkino = agino - irec->ir_startino; + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + + /* Compute the holemask mask for this cluster. */ + for (clusterino = 0, holemask = 0; clusterino < nr_inodes; + clusterino += XFS_INODES_PER_HOLEMASK_BIT) + holemask |= XFS_INOBT_MASK((chunkino + clusterino) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* The whole cluster must be a hole or not a hole. */ + ir_holemask = (irec->ir_holemask & holemask); + if (ir_holemask != holemask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap.im_boffset = 0; + + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &bp, 0, 0); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return 0; + + /* Which inodes are free? */ + for (clusterino = 0; clusterino < nr_inodes; clusterino++) { + error = xchk_iallocbt_check_cluster_freemask(bs, fsino, + chunkino, clusterino, irec, bp); + if (error) + break; + } + + xfs_trans_brelse(bs->cur->bc_tp, bp); + return error; +} + +/* Make sure the free mask is consistent with what the inodes think. */ +STATIC int +xchk_iallocbt_check_freemask( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->cur->bc_mp; + xfs_agino_t agino; + int error = 0; + for (agino = irec->ir_startino; agino < irec->ir_startino + XFS_INODES_PER_CHUNK; agino += mp->m_inodes_per_cluster) { - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / - XFS_INODES_PER_HOLEMASK_BIT); - - /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - continue; - } - - /* If any part of this is a hole, skip it. */ - if (ir_holemask) { - xchk_xref_is_not_owned_by(bs->sc, agbno, - mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); - continue; - } - - xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); - - /* Grab the inode cluster buffer. */ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, - agbno); - imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - imap.im_boffset = 0; - - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, - &dip, &bp, 0, 0); - if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, - &error)) - continue; - - /* Which inodes are free? */ - for (clusterino = 0; clusterino < nr_inodes; clusterino++) { - error = xchk_iallocbt_check_cluster_freemask(bs, - fsino, chunkino, clusterino, irec, bp); - if (error) { - xfs_trans_brelse(bs->cur->bc_tp, bp); - return error; - } - } - - xfs_trans_brelse(bs->cur->bc_tp, bp); + error = xchk_iallocbt_check_cluster(bs, irec, agino); + if (error) + break; } return error; From b9454fe056bda3b208225e4ac76bcc7c912f350a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:51 -0800 Subject: [PATCH 05/70] xfs: clean up the inode cluster checking in the inobt scrub The code to check inobt records against inode clusters is a mess of poorly named variables and unnecessary parameters. Clean the unnecessary inode number parameters out of _check_cluster_freemask in favor of computing them inside the function instead of making the caller do it. In xchk_iallocbt_check_cluster, rename the variables to make it more obvious just what chunk_ino and cluster_ino represent. Add a tracepoint to make it easier to track each inode cluster as we scrub it. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 165 +++++++++++++++++++++++++++--------------- fs/xfs/scrub/trace.h | 45 ++++++++++++ 2 files changed, 152 insertions(+), 58 deletions(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 0ce793d92995..708f6607db71 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -134,41 +134,69 @@ xchk_iallocbt_freecount( return hweight64(freemask); } -/* Check a particular inode with ir_free. */ +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. + * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @cluster_base is the inode offset of the cluster within the @irec. + * @cluster_bp is the cluster buffer. + * @cluster_index is the inode offset within the inode cluster. + */ STATIC int -xchk_iallocbt_check_cluster_freemask( +xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, - xfs_ino_t fsino, - xfs_agino_t chunkino, - xfs_agino_t clusterino, struct xfs_inobt_rec_incore *irec, - struct xfs_buf *bp) + unsigned int cluster_base, + struct xfs_buf *cluster_bp, + unsigned int cluster_index) { - struct xfs_dinode *dip; struct xfs_mount *mp = bs->cur->bc_mp; - bool inode_is_free = false; + struct xfs_dinode *dip; + xfs_ino_t fsino; + xfs_agino_t agino; + unsigned int offset; + bool irec_free; + bool ino_inuse; bool freemask_ok; - bool inuse; - int error = 0; + int error; if (xchk_should_terminate(bs->sc, &error)) return error; - dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + /* + * Given an inobt record, an offset of a cluster within the record, and + * an offset of an inode within a cluster, compute which fs inode we're + * talking about and the offset of that inode within the buffer. + */ + agino = irec->ir_startino + cluster_base + cluster_index; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + offset = cluster_index * mp->m_sb.sb_inodesize; + if (offset >= BBTOB(cluster_bp->b_length)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + goto out; + } + dip = xfs_buf_offset(cluster_bp, offset); + irec_free = (irec->ir_free & XFS_INOBT_MASK(cluster_base + + cluster_index)); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - (dip->di_version >= 3 && - be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } - if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) - inode_is_free = true; - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, - fsino + clusterino, &inuse); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, + &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ - freemask_ok = inode_is_free ^ !!(dip->di_mode); + freemask_ok = irec_free ^ !!(dip->di_mode); if (!bs->sc->try_harder && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { @@ -180,7 +208,7 @@ xchk_iallocbt_check_cluster_freemask( goto out; } else { /* Inode is all there. */ - freemask_ok = inode_is_free ^ inuse; + freemask_ok = irec_free ^ ino_inuse; } if (!freemask_ok) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -188,43 +216,57 @@ out: return 0; } -/* Check an inode cluster. */ +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. + */ STATIC int xchk_iallocbt_check_cluster( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec, - xfs_agino_t agino) + unsigned int cluster_base) { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; - struct xfs_buf *bp; - xfs_ino_t fsino; + struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agino_t chunkino; - xfs_agino_t clusterino; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t agbno; - uint16_t holemask; + unsigned int cluster_index; + uint16_t cluster_mask = 0; uint16_t ir_holemask; int error = 0; - /* Make sure the freemask matches the inode records. */ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, mp->m_inodes_per_cluster); - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / XFS_INODES_PER_HOLEMASK_BIT); + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap.im_boffset = 0; + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { + if (ir_holemask != cluster_mask && ir_holemask != 0) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -241,40 +283,47 @@ xchk_iallocbt_check_cluster( &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, agbno); - imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - imap.im_boffset = 0; - - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &bp, 0, 0); + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, + 0, 0); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) - return 0; + return error; - /* Which inodes are free? */ - for (clusterino = 0; clusterino < nr_inodes; clusterino++) { - error = xchk_iallocbt_check_cluster_freemask(bs, fsino, - chunkino, clusterino, irec, bp); + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base, cluster_bp, cluster_index); if (error) break; } - xfs_trans_brelse(bs->cur->bc_tp, bp); + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); return error; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_clusters( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec) { - struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agino_t agino; + unsigned int cluster_base; int error = 0; - for (agino = irec->ir_startino; - agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += mp->m_inodes_per_cluster) { - error = xchk_iallocbt_check_cluster(bs, irec, agino); + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += bs->sc->mp->m_inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); if (error) break; } @@ -415,7 +464,7 @@ xchk_iallocbt_rec( if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) goto out; - goto check_freemask; + goto check_clusters; } /* Check each chunk of a sparse inode cluster. */ @@ -441,8 +490,8 @@ xchk_iallocbt_rec( holecount + irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); -check_freemask: - error = xchk_iallocbt_check_freemask(bs, &irec); +check_clusters: + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 8344b14031ef..3c83e8b3b39c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error, __entry->ret_ip) ); +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) From 4539b8a7807848ad5c9e4d1d4199f34eb88669c5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:52 -0800 Subject: [PATCH 06/70] xfs: scrub big block inode btrees correctly Teach scrub how to handle the case that there are one or more inobt records covering a given inode cluster. This fixes the operation on big block filesystems (e.g. 64k blocks, 512 byte inodes). Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 708f6607db71..1929d79ea6b3 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -162,6 +162,7 @@ xchk_iallocbt_check_cluster_ifree( xfs_ino_t fsino; xfs_agino_t agino; unsigned int offset; + unsigned int cluster_buf_base; bool irec_free; bool ino_inuse; bool freemask_ok; @@ -174,10 +175,17 @@ xchk_iallocbt_check_cluster_ifree( * Given an inobt record, an offset of a cluster within the record, and * an offset of an inode within a cluster, compute which fs inode we're * talking about and the offset of that inode within the buffer. + * + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make cluster_buf_base nonzero. */ agino = irec->ir_startino + cluster_base + cluster_index; fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - offset = cluster_index * mp->m_sb.sb_inodesize; + cluster_buf_base = XFS_INO_TO_OFFSET(mp, irec->ir_startino); + ASSERT(cluster_buf_base == 0 || cluster_base == 0); + offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; if (offset >= BBTOB(cluster_bp->b_length)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; From f9e63342b858332d932414a7b27cad1d5557c84c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:52 -0800 Subject: [PATCH 07/70] xfs: consolidate scrub dinode mapping code into a single function Move all the confusing dinode mapping code that's split between xchk_iallocbt_check_cluster and xchk_iallocbt_check_cluster_ifree into the first function so that it's clearer how we find the dinode for a given inode. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 62 ++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 1929d79ea6b3..2c9dad2b61b1 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -145,24 +145,19 @@ xchk_iallocbt_freecount( * cluster buffer itself); and the index of the inode within the cluster. * * @irec is the inobt record. - * @cluster_base is the inode offset of the cluster within the @irec. - * @cluster_bp is the cluster buffer. - * @cluster_index is the inode offset within the inode cluster. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. */ STATIC int xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec, - unsigned int cluster_base, - struct xfs_buf *cluster_bp, - unsigned int cluster_index) + unsigned int irec_ino, + struct xfs_dinode *dip) { struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_dinode *dip; xfs_ino_t fsino; xfs_agino_t agino; - unsigned int offset; - unsigned int cluster_buf_base; bool irec_free; bool ino_inuse; bool freemask_ok; @@ -172,27 +167,12 @@ xchk_iallocbt_check_cluster_ifree( return error; /* - * Given an inobt record, an offset of a cluster within the record, and - * an offset of an inode within a cluster, compute which fs inode we're - * talking about and the offset of that inode within the buffer. - * - * Be careful about inobt records that don't align with the start of - * the inode buffer when block sizes are large enough to hold multiple - * inode chunks. When this happens, cluster_base will be zero but - * ir_startino can be large enough to make cluster_buf_base nonzero. + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. */ - agino = irec->ir_startino + cluster_base + cluster_index; + agino = irec->ir_startino + irec_ino; fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - cluster_buf_base = XFS_INO_TO_OFFSET(mp, irec->ir_startino); - ASSERT(cluster_buf_base == 0 || cluster_base == 0); - offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; - if (offset >= BBTOB(cluster_bp->b_length)) { - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - goto out; - } - dip = xfs_buf_offset(cluster_bp, offset); - irec_free = (irec->ir_free & XFS_INOBT_MASK(cluster_base + - cluster_index)); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { @@ -262,10 +242,23 @@ xchk_iallocbt_check_cluster( cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / XFS_INODES_PER_HOLEMASK_BIT); + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. + */ ir_holemask = (irec->ir_holemask & cluster_mask); imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - imap.im_boffset = 0; + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino); + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, imap.im_blkno, imap.im_len, cluster_base, nr_inodes, @@ -298,10 +291,19 @@ xchk_iallocbt_check_cluster( /* Check free status of each inode within this cluster. */ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + break; + } + + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); error = xchk_iallocbt_check_cluster_ifree(bs, irec, - cluster_base, cluster_bp, cluster_index); + cluster_base + cluster_index, dip); if (error) break; + imap.im_boffset += mp->m_sb.sb_inodesize; } xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); From 3258cb208caba74258ffdd8bd59972bbda9bfee1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:52 -0800 Subject: [PATCH 08/70] xfs: abort xattr scrub if fatal signals are pending The extended attribute scrubber should abort the "read all attrs" loop if there's a fatal signal pending on the process. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/attr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 81d5e90547a1..9960bc5b5d76 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -82,6 +82,11 @@ xchk_xattr_listent( sx = container_of(context, struct xchk_xattr, context); + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = 1; + return; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ xchk_ino_set_preen(sx->sc, context->dp->i_ino); From f8c1d7023e252df853efbb3566c6d47b148609fe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:53 -0800 Subject: [PATCH 09/70] xfs: scrub should flag dir/attr offsets that aren't mappable with xfs_dablk_t Teach scrub to flag extent maps that exceed the range that can be mapped with a xfs_dablk_t. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_types.c | 11 +++++++++++ fs/xfs/libxfs/xfs_types.h | 1 + fs/xfs/scrub/bmap.c | 27 +++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 3306fc42cfad..451608b5a37b 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -204,3 +204,14 @@ xfs_verify_icount( xfs_icount_range(mp, &min, &max); return icount >= min && icount <= max; } + +/* Sanity-checking of dir/attr block offsets. */ +bool +xfs_verify_dablk( + struct xfs_mount *mp, + xfs_fileoff_t dabno) +{ + xfs_dablk_t max_dablk = -1U; + + return dabno <= max_dablk; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8f02855a019a..704b4f308780 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -188,5 +188,6 @@ bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); +bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e1d11f3223e3..a703cd58a90e 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -281,6 +281,31 @@ xchk_bmap_extent_xref( xchk_ag_free(info->sc, &info->sc->sa); } +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + /* Scrub a single extent record. */ STATIC int xchk_bmap_extent( @@ -305,6 +330,8 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + xchk_bmap_dirattr_extent(ip, info, irec); + /* There should never be a "hole" extent in either extent list. */ if (irec->br_startblock == HOLESTARTBLOCK) xchk_fblock_set_corrupt(info->sc, info->whichfork, From 87c9607df2ff73290dcfe08d22f34687ce0142ce Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:53 -0800 Subject: [PATCH 10/70] xfs: fix off-by-one error in rtbitmap cross-reference Fix an off-by-one error in the realtime bitmap "is used" cross-reference helper function if the realtime extent size is a single block. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/rtbitmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 665d4bbb17cc..dbe115b075f7 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space( startext = fsbno; endext = fsbno + len - 1; do_div(startext, sc->mp->m_sb.sb_rextsize); - if (do_div(endext, sc->mp->m_sb.sb_rextsize)) - endext++; - extcount = endext - startext; + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); From e5d7d51b340aac0f4cc56677eb8d29d4e164c58c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:54 -0800 Subject: [PATCH 11/70] xfs: check directory name validity Check directory entry names for invalid characters. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_dir2.c | 17 +++++++++++++++++ fs/xfs/libxfs/xfs_dir2.h | 1 + fs/xfs/scrub/dir.c | 6 ++++++ 3 files changed, 24 insertions(+) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 229152cd1a24..156ce95c9c45 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -703,3 +703,20 @@ xfs_dir2_shrink_inode( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } + +/* Returns true if the directory entry name is valid. */ +bool +xfs_dir2_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any slashes or nulls here */ + return !memchr(name, '/', length) && !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c3e3f6b813d8..f54244779492 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); +bool xfs_dir2_namecheck(const void *name, size_t length); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index cd3e4d768a18..a38a22785a1a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -129,6 +129,12 @@ xchk_dir_actor( goto out; } + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) From 654805367d982cffdb9979453673aab9c3c96d07 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Feb 2019 09:08:54 -0800 Subject: [PATCH 12/70] xfs: check attribute name validity Check extended attribute entry names for invalid characters. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_attr.c | 17 +++++++++++++++++ fs/xfs/libxfs/xfs_attr.h | 2 +- fs/xfs/scrub/attr.c | 6 ++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 844ed87b1900..2dd9ee2a2e08 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return retval; } + +/* Returns true if the attribute entry name is valid. */ +bool +xfs_attr_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any nulls here */ + return !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index bdf52a333f3f..2297d8467666 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); - +bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 9960bc5b5d76..dce74ec57038 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -93,6 +93,12 @@ xchk_xattr_listent( return; } + /* Does this name make sense? */ + if (!xfs_attr_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + return; + } + args.flags = ATTR_KERNOTIME; if (flags & XFS_ATTR_ROOT) args.flags |= ATTR_ROOT; From e88db81645d36f945d11bd73ffff31dad148eaf8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 1 Feb 2019 09:09:50 -0800 Subject: [PATCH 13/70] xfs: remove duplicated xfs_defer.h Remove duplicated include. Signed-off-by: YueHaibing Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans_bmap.c | 1 - fs/xfs/xfs_trans_extfree.c | 1 - fs/xfs/xfs_trans_refcount.c | 1 - fs/xfs/xfs_trans_rmap.c | 1 - 4 files changed, 4 deletions(-) diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 11cff449d055..e1c7d55b32c3 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -17,7 +17,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_inode.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "bmap update done" diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 0710434eb240..8ee7a3f8bb20 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -18,7 +18,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" -#include "xfs_defer.h" /* * This routine is called to allocate an "extent free done" diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 6c947ff4faf6..8d734728dd1b 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -16,7 +16,6 @@ #include "xfs_refcount_item.h" #include "xfs_alloc.h" #include "xfs_refcount.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "refcount update done" diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index a42890931ecd..5c7936b1be13 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -16,7 +16,6 @@ #include "xfs_rmap_item.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_defer.h" /* Set the map extent flags for this reverse mapping. */ static void From d519da41e2b789f2cbd705cd8cec0bb92be4838d Mon Sep 17 00:00:00 2001 From: Marco Benatto Date: Fri, 1 Feb 2019 09:12:20 -0800 Subject: [PATCH 14/70] xfs: Introduce XFS_PTAG_VERIFIER_ERROR panic mask Currently we have a few PTAGs in place allowing us to transform a filesystem error in a BUG() call. However, we don't have a panic tag for corrupt metadata, so introduce XFS_PTAG_VERIFIER_ERROR so that the administrator can use the fs.xfs.panic_mask sysctl knob to convert any error detected by buffer verifiers into a kernel panic. Signed-off-by: Marco Benatto Reviewed-by: Eric Sandeen [darrick: light editing of commit message] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- Documentation/filesystems/xfs.txt | 3 ++- fs/xfs/xfs_error.c | 3 ++- fs/xfs/xfs_error.h | 1 + fs/xfs/xfs_globals.c | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 9ccfd1bc6201..a5cbb5e0e3db 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -272,7 +272,7 @@ The following sysctls are available for the XFS filesystem: XFS_ERRLEVEL_LOW: 1 XFS_ERRLEVEL_HIGH: 5 - fs.xfs.panic_mask (Min: 0 Default: 0 Max: 255) + fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) Causes certain error conditions to call BUG(). Value is a bitmask; OR together the tags which represent errors which should cause panics: @@ -285,6 +285,7 @@ The following sysctls are available for the XFS filesystem: XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 XFS_PTAG_FSBLOCK_ZERO 0x00000080 + XFS_PTAG_VERIFIER_ERROR 0x00000100 This option is intended for debugging only. diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 9866f542e77b..57a85410a8c6 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -357,7 +357,8 @@ xfs_buf_verifier_error( fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", fa, bp->b_ops->name, bp->b_bn, name); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 246d3e989c6c..602aa7d62b66 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 +#define XFS_PTAG_VERIFIER_ERROR 0x00000100 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5169e84ae382..d0d377384120 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -16,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 255 }, + .panic_mask = { 0, 0, 256 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, From 9f9bc034b84958523689347ee2bdd9c660008e5e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:14:22 -0800 Subject: [PATCH 15/70] xfs: update fork seq counter on data fork changes The sequence counter in the xfs_ifork structure is only updated on COW forks. This is because the counter is currently only used to optimize out repetitive COW fork checks at writeback time. Tweak the extent code to update the seq counter regardless of the fork type in preparation for using this counter on data forks as well. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_iext_tree.c | 13 ++++++------- fs/xfs/libxfs/xfs_inode_fork.h | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 771dd072015d..bc690f2409fa 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -614,16 +614,15 @@ xfs_iext_realloc_root( } /* - * Increment the sequence counter if we are on a COW fork. This allows - * the writeback code to skip looking for a COW extent if the COW fork - * hasn't changed. We use WRITE_ONCE here to ensure the update to the - * sequence counter is seen before the modifications to the extent - * tree itself take effect. + * Increment the sequence counter on extent tree changes. If we are on a COW + * fork, this allows the writeback code to skip looking for a COW extent if the + * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the + * sequence counter is seen before the modifications to the extent tree itself + * take effect. */ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) { - if (state & BMAP_COWFORK) - WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); + WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 60361d2d74a1..00c62ce170d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -14,7 +14,7 @@ struct xfs_dinode; */ struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* cow fork mod counter */ + unsigned int if_seq; /* fork mod counter */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ From d9252d526ba66a8f95ad2830ae1b62825ef3dbd5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:14:23 -0800 Subject: [PATCH 16/70] xfs: validate writeback mapping using data fork seq counter The writeback code caches the current extent mapping across multiple xfs_do_writepage() calls to avoid repeated lookups for sequential pages backed by the same extent. This is known to be slightly racy with extent fork changes in certain difficult to reproduce scenarios. The cached extent is trimmed to within EOF to help avoid the most common vector for this problem via speculative preallocation management, but this is a band-aid that does not address the fundamental problem. Now that we have an xfs_ifork sequence counter mechanism used to facilitate COW writeback, we can use the same mechanism to validate consistency between the data fork and cached writeback mappings. On its face, this is somewhat of a big hammer approach because any change to the data fork invalidates any mapping currently cached by a writeback in progress regardless of whether the data fork change overlaps with the range under writeback. In practice, however, the impact of this approach is minimal in most cases. First, data fork changes (delayed allocations) caused by sustained sequential buffered writes are amortized across speculative preallocations. This means that a cached mapping won't be invalidated by each buffered write of a common file copy workload, but rather only on less frequent allocation events. Second, the extent tree is always entirely in-core so an additional lookup of a usable extent mostly costs a shared ilock cycle and in-memory tree lookup. This means that a cached mapping reval is relatively cheap compared to the I/O itself. Third, spurious invalidations don't impact ioend construction. This means that even if the same extent is revalidated multiple times across multiple writepage instances, we still construct and submit the same size ioend (and bio) if the blocks are physically contiguous. Update struct xfs_writepage_ctx with a new field to hold the sequence number of the data fork associated with the currently cached mapping. Check the wpc seqno against the data fork when the mapping is validated and reestablish the mapping whenever the fork has changed since the mapping was cached. This ensures that writeback always uses a valid extent mapping and thus prevents lost writebacks and stale delalloc block problems. Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 60 ++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_iomap.c | 5 ++-- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d9048bcea49c..5b0256a8a420 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -29,6 +29,7 @@ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; unsigned int io_type; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -301,6 +302,42 @@ xfs_end_bio( xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } +/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->io_type == XFS_IO_COW) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. + */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -315,9 +352,11 @@ xfs_map_blocks( struct xfs_bmbt_irec imap; int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; int error = 0; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + /* * We have to make sure the cached mapping is within EOF to protect * against eofblocks trimming on file release leaving us with a stale @@ -346,17 +385,9 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) @@ -403,9 +434,10 @@ xfs_map_blocks( } /* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,6 +449,7 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (imap.br_startoff > offset_fsb) { @@ -454,7 +487,8 @@ xfs_map_blocks( return 0; allocate_blocks: error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); + whichfork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); if (error) return error; ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 27c93b5f029d..ab69caa685b4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -681,7 +681,7 @@ xfs_iomap_write_allocate( int whichfork, xfs_off_t offset, xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) + unsigned int *seq) { xfs_mount_t *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); @@ -797,8 +797,7 @@ xfs_iomap_write_allocate( if (error) goto error0; - if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); + *seq = READ_ONCE(ifp->if_seq); xfs_iunlock(ip, XFS_ILOCK_EXCL); } From 3b35089807304f208419b5ad9cc3c5f731225cd9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:14:23 -0800 Subject: [PATCH 17/70] xfs: remove superfluous writeback mapping eof trimming Now that the cached writeback mapping is explicitly invalidated on data fork changes, the EOF trimming band-aid is no longer necessary. Remove xfs_trim_extent_eof() as well since it has no other users. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 11 ----------- fs/xfs/libxfs/xfs_bmap.h | 1 - fs/xfs/xfs_aops.c | 15 --------------- 3 files changed, 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 332eefa2700b..4c73927819c2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3685,17 +3685,6 @@ xfs_trim_extent( } } -/* trim extent to within eof */ -void -xfs_trim_extent_eof( - struct xfs_bmbt_irec *irec, - struct xfs_inode *ip) - -{ - xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, - i_size_read(VFS_I(ip)))); -} - /* * Trim the returned map to the required bounds */ diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 09d3ea97cc15..b4ff710d7250 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -181,7 +181,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); -void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 5b0256a8a420..515532f45beb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -357,19 +357,6 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. - */ - xfs_trim_extent_eof(&wpc->imap, ip); - /* * COW fork blocks can overlap data fork blocks even if the blocks * aren't shared. COW I/O always takes precedent, so we must always @@ -482,7 +469,6 @@ xfs_map_blocks( } wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); return 0; allocate_blocks: @@ -494,7 +480,6 @@ allocate_blocks: ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); return 0; } From 627209fbcc2f0d658a5417645859a1d3053ddb59 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:14:23 -0800 Subject: [PATCH 18/70] xfs: create delalloc bmapi wrapper for full extent allocation The writeback delalloc conversion code is racy with respect to changes in the currently cached file mapping. This stems from the fact that the bmapi allocation code requires a file range to allocate and the writeback conversion code assumes the range of the currently cached mapping is still valid with respect to the fork. It may not be valid, however, because the ilock is cycled (potentially multiple times) between the time the cached mapping was populated and the delalloc conversion occurs. To facilitate a solution to this problem, create a new xfs_bmapi_delalloc() wrapper to xfs_bmapi_write() that takes a file (FSB) offset and attempts to allocate whatever delalloc extent backs the offset. Use a new bmapi flag to cause xfs_bmapi_write() to set the range based on the extent backing the bno parameter unless bno lands in a hole. If bno does land in a hole, fall back to the current behavior (which may result in an error or quietly skipping holes in the specified range depending on other parameters). This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 59 +++++++++++++++++++++++++++++++++++++--- fs/xfs/libxfs/xfs_bmap.h | 4 +++ 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4c73927819c2..c629004d9a4c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4286,10 +4286,6 @@ xfs_bmapi_write( goto error0; } - n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) @@ -4299,6 +4295,26 @@ xfs_bmapi_write( bma.total = total; bma.datatype = 0; + /* + * The reval flag means the caller wants to allocate the entire delalloc + * extent backing bno where bno may not necessarily match the startoff. + * Now that we've looked up the extent, reset the range to map based on + * the extent in the file. If we're in a hole, this may be an error so + * don't adjust anything. + */ + if ((flags & XFS_BMAPI_REVALRANGE) && + !eof && bno >= bma.got.br_startoff) { + ASSERT(flags & XFS_BMAPI_DELALLOC); + bno = bma.got.br_startoff; + len = bma.got.br_blockcount; +#ifdef DEBUG + orig_bno = bno; + orig_len = len; +#endif + } + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; @@ -4455,6 +4471,41 @@ error0: return error; } +/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + int whichfork, + struct xfs_bmbt_irec *imap) +{ + int flags = XFS_BMAPI_DELALLOC; + int nimaps = 1; + int error; + int total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, + XFS_DATA_FORK); + + if (whichfork == XFS_COW_FORK) + flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + /* + * The reval flag means to allocate the entire extent; pass a dummy + * length of 1. + */ + flags |= XFS_BMAPI_REVALRANGE; + error = xfs_bmapi_write(tp, ip, offset_fsb, 1, flags, total, imap, + &nimaps); + if (!error && !nimaps) + error = -EFSCORRUPTED; + return error; +} + int xfs_bmapi_remap( struct xfs_trans *tp, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b4ff710d7250..75586d56f7a5 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -107,6 +107,8 @@ struct xfs_extent_free_item /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_REVALRANGE 0x4000 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -227,6 +229,8 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_trans *, struct xfs_inode *, + xfs_fileoff_t, int, struct xfs_bmbt_irec *); static inline void xfs_bmap_add_free( From c2b3164320b51a535d7c7a6acdcee255edbb22cf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:14:24 -0800 Subject: [PATCH 19/70] xfs: use the latest extent at writeback delalloc conversion time The writeback delalloc conversion code is racy with respect to changes in the currently cached file mapping outside of the current page. This is because the ilock is cycled between the time the caller originally looked up the mapping and across each real allocation of the provided file range. This code has collected various hacks over the years to help combat the symptoms of these races (i.e., truncate race detection, allocation into hole detection, etc.), but none address the fundamental problem that the imap may not be valid at allocation time. Rather than continue to use race detection hacks, update writeback delalloc conversion to a model that explicitly converts the delalloc extent backing the current file offset being processed. The current file offset is the only block we can trust to remain once the ilock is dropped because any operation that can remove the block (truncate, hole punch, etc.) must flush and discard pagecache pages first. Modify xfs_iomap_write_allocate() to use the xfs_bmapi_delalloc() mechanism to request allocation of the entire delalloc extent backing the current offset instead of assuming the extent passed by the caller is unchanged. Record the range specified by the caller and apply it to the resulting allocated extent so previous checks by the caller for COW fork overlap are not lost. Finally, overload the bmapi delalloc flag with the range reval flag behavior since this is the only use case for both. This ensures that writeback always picks up the correct and current extent associated with the page, regardless of races with other extent modifying operations. If operating on a data fork and the COW overlap state has changed since the ilock was cycled, the caller revalidates against the COW fork sequence number before using the imap for the next block. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 16 ++-- fs/xfs/libxfs/xfs_bmap.h | 2 - fs/xfs/xfs_iomap.c | 176 +++++++++++++-------------------------- 3 files changed, 67 insertions(+), 127 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c629004d9a4c..f4a65330a2a9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4296,15 +4296,14 @@ xfs_bmapi_write( bma.datatype = 0; /* - * The reval flag means the caller wants to allocate the entire delalloc - * extent backing bno where bno may not necessarily match the startoff. - * Now that we've looked up the extent, reset the range to map based on - * the extent in the file. If we're in a hole, this may be an error so - * don't adjust anything. + * The delalloc flag means the caller wants to allocate the entire + * delalloc extent backing bno where bno may not necessarily match the + * startoff. Now that we've looked up the extent, reset the range to + * map based on the extent in the file. If we're in a hole, this may be + * an error so don't adjust anything. */ - if ((flags & XFS_BMAPI_REVALRANGE) && + if ((flags & XFS_BMAPI_DELALLOC) && !eof && bno >= bma.got.br_startoff) { - ASSERT(flags & XFS_BMAPI_DELALLOC); bno = bma.got.br_startoff; len = bma.got.br_blockcount; #ifdef DEBUG @@ -4495,10 +4494,9 @@ xfs_bmapi_convert_delalloc( flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; /* - * The reval flag means to allocate the entire extent; pass a dummy + * The delalloc flag means to allocate the entire extent; pass a dummy * length of 1. */ - flags |= XFS_BMAPI_REVALRANGE; error = xfs_bmapi_write(tp, ip, offset_fsb, 1, flags, total, imap, &nimaps); if (!error && !nimaps) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 75586d56f7a5..4dc7d1a02b35 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -107,8 +107,6 @@ struct xfs_extent_free_item /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP 0x2000 -#define XFS_BMAPI_REVALRANGE 0x4000 - #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ab69caa685b4..6af1d3ec0a9c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -677,25 +677,19 @@ out_unlock: */ int xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *seq) + struct xfs_inode *ip, + int whichfork, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + unsigned int *seq) { - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_fileoff_t offset_fsb; + xfs_fileoff_t map_start_fsb; + xfs_extlen_t map_count_fsb; + struct xfs_trans *tp; + int error = 0; /* * Make sure that the dquots are there. @@ -704,106 +698,60 @@ xfs_iomap_write_allocate( if (error) return error; + /* + * Store the file range the caller is interested in because it encodes + * state such as potential overlap with COW fork blocks. We must trim + * the allocated extent down to this range to maintain consistency with + * what the caller expects. Revalidation of the range itself is the + * responsibility of the caller. + */ offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; map_start_fsb = imap->br_startoff; + map_count_fsb = imap->br_blockcount; - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + XFS_STATS_ADD(mp, xs_xstrat_bytes, + XFS_FSB_TO_B(mp, imap->br_blockcount)); - while (count_fsb != 0) { + while (true) { /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. + * Allocate in a loop because it may take several attempts to + * allocate real blocks for a contiguous delalloc extent if free + * space is sufficiently fragmented. Note that space for the + * extent and indirect blocks was reserved when the delalloc + * extent was created so there's no need to do so here. */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. - * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; - - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } - - /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - *seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); /* - * See if we were able to allocate an extent that - * covers at least part of the callers request + * ilock was dropped since imap was populated which means it + * might no longer be valid. The current page is held locked so + * nothing could have removed the block backing offset_fsb. + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result in the imap pointer from the + * caller. We'll trim it down to the caller's most recently + * validated range before we return. + */ + error = xfs_bmapi_convert_delalloc(tp, ip, offset_fsb, + whichfork, imap); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto error0; + + *seq = READ_ONCE(ifp->if_seq); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * See if we were able to allocate an extent that covers at + * least part of the callers request. */ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, imap); @@ -812,15 +760,11 @@ xfs_iomap_write_allocate( (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { XFS_STATS_INC(mp, xs_xstrat_quick); + xfs_trim_extent(imap, map_start_fsb, map_count_fsb); + ASSERT(offset_fsb >= imap->br_startoff && + offset_fsb < imap->br_startoff + imap->br_blockcount); return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; } trans_cancel: From 5837f62592ef7c07859a1b1a6e82030d85869f0f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:13 -0800 Subject: [PATCH 20/70] xfs: clean up iunlink functions Fix some indentation issues with the iunlink functions and reorganize the tops of the functions to be identical. No functional changes. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 79 +++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 47 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae667ba74a1c..8a39d80f4863 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1918,26 +1918,24 @@ xfs_inactive( */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_dinode *dip; + struct xfs_buf *agibp; + struct xfs_buf *ibp; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int offset; + int error; ASSERT(VFS_I(ip)->i_mode != 0); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); @@ -1946,9 +1944,6 @@ xfs_iunlink( * Get the index into the agi hash table for the * list this inode will go on. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; ASSERT(agi->agi_unlinked[bucket_index]); ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); @@ -1995,45 +1990,35 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_dinode *dip; + struct xfs_buf *agibp; + struct xfs_buf *ibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + xfs_ino_t next_ino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int offset; + int last_offset = 0; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); /* * Get the index into the agi hash table for the * list this inode will go on. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; if (!xfs_verify_agino(mp, agno, be32_to_cpu(agi->agi_unlinked[bucket_index]))) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, From 7d36c19538d38f9ff6b93d2a3d23ee879b076dc6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:13 -0800 Subject: [PATCH 21/70] xfs: add xfs_verify_agino_or_null helper Add a new helper to check that a per-AG inode pointer is either null or points somewhere valid within that AG. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_buf.c | 3 +-- fs/xfs/libxfs/xfs_types.c | 13 +++++++++++++ fs/xfs/libxfs/xfs_types.h | 2 ++ fs/xfs/scrub/agheader.c | 8 +++----- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09d9c8cfa4a0..82c4374acb4c 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -99,8 +99,7 @@ xfs_inode_buf_verify( unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 451608b5a37b..de310712dd6d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -115,6 +115,19 @@ xfs_verify_agino( return agino >= first && agino <= last; } +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + /* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 704b4f308780..c5a25403b4db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -183,6 +183,8 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 90955ab1e895..9d4e8293d37e 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -864,19 +864,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } From 9a4a5118644e41ac9da7fa7d87ff3b09e61304de Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:14 -0800 Subject: [PATCH 22/70] xfs: refactor AGI unlinked bucket updates Split the AGI unlinked bucket updates into a separate function. No functional changes. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 65 ++++++++++++++++++++++++++++++++-------------- fs/xfs/xfs_trace.h | 26 +++++++++++++++++++ 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8a39d80f4863..f1de80e8b9a6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1906,6 +1906,43 @@ xfs_inactive( xfs_qm_dqdetach(ip); } +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + /* * This is called when the inode's link count goes to 0 or we are creating a * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be @@ -1973,16 +2010,8 @@ xfs_iunlink( xfs_inobp_check(mp, ibp); } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - return 0; + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); } /* @@ -2058,16 +2087,12 @@ xfs_iunlink_remove( } else { xfs_trans_brelse(tp, ibp); } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); + + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + return error; } else { /* * We need to search the list for the inode being freed. diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6fcc893dfc91..c10478e7e49a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3371,6 +3371,32 @@ DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_free_items); +TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH From 86bfd3750fb3070932ae2da4cb5b9ae743c13f99 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:14 -0800 Subject: [PATCH 23/70] xfs: strengthen AGI unlinked inode bucket pointer checks Strengthen our checking of the AGI unlinked pointers when we start to use them for updating the metadata. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f1de80e8b9a6..ba19cfd52b5e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1963,6 +1963,7 @@ xfs_iunlink( struct xfs_dinode *dip; struct xfs_buf *agibp; struct xfs_buf *ibp; + xfs_agino_t next_agino; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; @@ -1978,13 +1979,16 @@ xfs_iunlink( agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { + if (next_agino != NULLAGINO) { /* * There is already another inode in the bucket we need * to add ourselves to. Add us at the front of the list. @@ -2045,17 +2049,17 @@ xfs_iunlink_remove( agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, next_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { + if (next_agino == agino) { /* * We're at the head of the list. Get the inode's on-disk * buffer to see if there is anyone after us on the list. @@ -2097,7 +2101,6 @@ xfs_iunlink_remove( /* * We need to search the list for the inode being freed. */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); last_ibp = NULL; while (next_agino != agino) { struct xfs_imap imap; From f2fc16a3d7c12224d4c19055fef40ca6379b2045 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:15 -0800 Subject: [PATCH 24/70] xfs: refactor inode unlinked pointer update functions Hoist the functions that update an inode's unlinked pointer updates into a helper. No functional changes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 191 +++++++++++++++++++++++---------------------- fs/xfs/xfs_trace.h | 26 ++++++ 2 files changed, 125 insertions(+), 92 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ba19cfd52b5e..eb51fa33f91a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1943,6 +1943,85 @@ xfs_iunlink_update_bucket( return 0; } +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + xfs_inobp_check(mp, ibp); +} + +/* Set an in-core inode's unlinked pointer and return the old value. */ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) + error = -EFSCORRUPTED; + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + /* * This is called when the inode's link count goes to 0 or we are creating a * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be @@ -1960,14 +2039,11 @@ xfs_iunlink( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi; - struct xfs_dinode *dip; struct xfs_buf *agibp; - struct xfs_buf *ibp; xfs_agino_t next_agino; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - int offset; int error; ASSERT(VFS_I(ip)->i_mode != 0); @@ -1989,29 +2065,17 @@ xfs_iunlink( return -EFSCORRUPTED; if (next_agino != NULLAGINO) { + xfs_agino_t old_agino; + /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. + * There is already another inode in the bucket, so point this + * inode to the current head of the list. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); if (error) return error; - - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + ASSERT(old_agino == NULLAGINO); } /* Point the head of the list to point to this inode. */ @@ -2028,9 +2092,7 @@ xfs_iunlink_remove( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi; - struct xfs_dinode *dip; struct xfs_buf *agibp; - struct xfs_buf *ibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; xfs_ino_t next_ino; @@ -2038,8 +2100,6 @@ xfs_iunlink_remove( xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - int offset; - int last_offset = 0; int error; /* Get the agi buffer first. It ensures lock ordering on the list. */ @@ -2063,34 +2123,11 @@ xfs_iunlink_remove( /* * We're at the head of the list. Get the inode's on-disk * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, + &next_agino); + if (error) return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } /* Point the head of the list to the next unlinked inode. */ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, @@ -2098,13 +2135,14 @@ xfs_iunlink_remove( if (error) return error; } else { + struct xfs_imap imap; + xfs_agino_t prev_agino; + /* * We need to search the list for the inode being freed. */ last_ibp = NULL; while (next_agino != agino) { - struct xfs_imap imap; - if (last_ibp) xfs_trans_brelse(tp, last_ibp); @@ -2128,7 +2166,7 @@ xfs_iunlink_remove( return error; } - last_offset = imap.im_boffset; + prev_agino = next_agino; next_agino = be32_to_cpu(last_dip->di_next_unlinked); if (!xfs_verify_agino(mp, agno, next_agino)) { XFS_CORRUPTION_ERROR(__func__, @@ -2142,45 +2180,14 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on the * unlinked list. Pull us from the list. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, + &next_agino); + if (error) return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); } return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c10478e7e49a..fbec8f0e1a9a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3397,6 +3397,32 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __entry->new_ptr) ); +TRACE_EVENT(xfs_iunlink_update_dinode, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->old_ptr, + __entry->new_ptr) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH From 23ffa52cc792813bda35fd7bbaa87df8540d4bcb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:15 -0800 Subject: [PATCH 25/70] xfs: refactor unlinked list search and mapping to a separate function There's a loop that searches an unlinked bucket list to find the inode that points to a given inode. Hoist this into a separate function; later we'll use our iunlink backref cache to bypass the slow list operation. No functional changes. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 134 ++++++++++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 38 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index eb51fa33f91a..d4610eee7172 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2082,6 +2082,97 @@ xfs_iunlink( return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); } +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; + + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. + */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; + } + + return 0; +} + /* * Pull the on-disk inode from the AGI unlinked list. */ @@ -2095,7 +2186,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - xfs_ino_t next_ino; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2138,43 +2228,11 @@ xfs_iunlink_remove( struct xfs_imap imap; xfs_agino_t prev_agino; - /* - * We need to search the list for the inode being freed. - */ - last_ibp = NULL; - while (next_agino != agino) { - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); - - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); - - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } - - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - - prev_agino = next_agino; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } + /* We need to search the list for the inode being freed. */ + error = xfs_iunlink_map_prev(tp, agno, next_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp); + if (error) + return error; /* * Now last_ibp points to the buffer previous to us on the From b1d2a068ea631891ae7a0b19c37ff8a5bf0f6af4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:15 -0800 Subject: [PATCH 26/70] xfs: refactor inode update in iunlink_remove In xfs_iunlink_remove we have two identical calls to xfs_iunlink_update_inode, so move it out of the if statement to simplify the code some more. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d4610eee7172..282316bcce39 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2189,6 +2189,7 @@ xfs_iunlink_remove( xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; + xfs_agino_t head_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; @@ -2202,23 +2203,23 @@ xfs_iunlink_remove( * Get the index into the agi hash table for the list this inode will * go on. Make sure the head pointer isn't garbage. */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, agno, next_agino)) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (next_agino == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - */ - error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, - &next_agino); - if (error) - return error; + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; + if (head_agino == agino) { /* Point the head of the list to the next unlinked inode. */ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, next_agino); @@ -2229,20 +2230,11 @@ xfs_iunlink_remove( xfs_agino_t prev_agino; /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, agno, next_agino, agino, + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, &prev_agino, &imap, &last_dip, &last_ibp); if (error) return error; - /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. - */ - error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, - &next_agino); - if (error) - return error; - /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, last_dip, &imap, next_agino); From 4664c66c91a14019551b9ba8b2998fa3b5f69499 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:16 -0800 Subject: [PATCH 27/70] xfs: add tracepoints for high level iunlink operations Add tracepoints so we can associate high level operations with low level updates. No functional changes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 3 +++ fs/xfs/xfs_trace.h | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 282316bcce39..206bb004c4f2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2047,6 +2047,7 @@ xfs_iunlink( int error; ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); @@ -2193,6 +2194,8 @@ xfs_iunlink_remove( short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; + trace_xfs_iunlink_remove(ip); + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fbec8f0e1a9a..a6e384a642b1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3423,6 +3423,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->new_ptr) ); +DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno %u agino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agino) +) + +#define DEFINE_AGINODE_EVENT(name) \ +DEFINE_EVENT(xfs_ag_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_AGINODE_EVENT(xfs_iunlink); +DEFINE_AGINODE_EVENT(xfs_iunlink_remove); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH From 9b2471797942a5947664818cfe2c6de93b43f37a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Feb 2019 10:37:16 -0800 Subject: [PATCH 28/70] xfs: cache unlinked pointers in an rhashtable Use a rhashtable to cache the unlinked list incore. This should speed up unlinked processing considerably when there are a lot of inodes on the unlinked list because iunlink_remove no longer has to traverse an entire bucket list to find which inode points to the one being removed. The incore list structure records "X.next_unlinked = Y" relations, with the rhashtable using Y to index the records. This makes finding the inode X that points to a inode Y very quick. If our cache fails to find anything we can always fall back on the old method. FWIW this drastically reduces the amount of time it takes to remove inodes from the unlinked list. I wrote a program to open a lot of O_TMPFILE files and then close them in the same order, which takes a very long time if we have to traverse the unlinked lists. With the ptach, I see: + /d/t/tmpfile/tmpfile Opened 193531 files in 6.33s. Closed 193531 files in 5.86s real 0m12.192s user 0m0.064s sys 0m11.619s + cd / + umount /mnt real 0m0.050s user 0m0.004s sys 0m0.030s And without the patch: + /d/t/tmpfile/tmpfile Opened 193588 files in 6.35s. Closed 193588 files in 751.61s real 12m38.853s user 0m0.084s sys 12m34.470s + cd / + umount /mnt real 0m0.086s user 0m0.000s sys 0m0.060s Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_errortag.h | 4 +- fs/xfs/xfs_error.c | 3 + fs/xfs/xfs_inode.c | 290 ++++++++++++++++++++++++++++++++++- fs/xfs/xfs_inode.h | 3 + fs/xfs/xfs_mount.c | 5 + fs/xfs/xfs_mount.h | 7 + fs/xfs/xfs_trace.h | 1 + 7 files changed, 306 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 66077a105cbb..79e6c4fb1d8a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -54,7 +54,8 @@ #define XFS_ERRTAG_BUF_LRU_REF 31 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 -#define XFS_ERRTAG_MAX 34 +#define XFS_ERRTAG_IUNLINK_FALLBACK 34 +#define XFS_ERRTAG_MAX 35 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -93,5 +94,6 @@ #define XFS_RANDOM_BUF_LRU_REF 2 #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 +#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 57a85410a8c6..a1e177f66404 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, + XFS_RANDOM_IUNLINK_FALLBACK, }; struct xfs_errortag_attr { @@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); +XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), + XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), NULL, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 206bb004c4f2..9aaa3143a277 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1906,6 +1906,214 @@ xfs_inactive( xfs_qm_dqdetach(ip); } +/* + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. + */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. + * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + /* * Point the AGI unlinked bucket at an inode and log the results. The caller * is responsible for validating the old value. @@ -2066,7 +2274,8 @@ xfs_iunlink( return -EFSCORRUPTED; if (next_agino != NULLAGINO) { - xfs_agino_t old_agino; + struct xfs_perag *pag; + xfs_agino_t old_agino; /* * There is already another inode in the bucket, so point this @@ -2077,6 +2286,16 @@ xfs_iunlink( if (error) return error; ASSERT(old_agino == NULLAGINO); + + /* + * agino has been unlinked, add a backref from the next inode + * back to agino. + */ + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_add_backref(pag, agino, next_agino); + xfs_perag_put(pag); + if (error) + return error; } /* Point the head of the list to point to this inode. */ @@ -2133,7 +2352,8 @@ xfs_iunlink_map_prev( xfs_agino_t *agino, struct xfs_imap *imap, struct xfs_dinode **dipp, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + struct xfs_perag *pag) { struct xfs_mount *mp = tp->t_mountp; xfs_agino_t next_agino; @@ -2142,6 +2362,28 @@ xfs_iunlink_map_prev( ASSERT(head_agino != target_agino); *bpp = NULL; + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + if (error) + return error; + + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; + + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); + } + + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. */ next_agino = head_agino; while (next_agino != target_agino) { xfs_agino_t unlinked_agino; @@ -2187,6 +2429,7 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; + struct xfs_perag *pag = NULL; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; @@ -2222,27 +2465,62 @@ xfs_iunlink_remove( if (error) return error; + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. + */ + if (next_agino != NULLAGINO) { + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_change_backref(pag, next_agino, + NULLAGINO); + if (error) + goto out; + } + if (head_agino == agino) { /* Point the head of the list to the next unlinked inode. */ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, next_agino); if (error) - return error; + goto out; } else { struct xfs_imap imap; xfs_agino_t prev_agino; + if (!pag) + pag = xfs_perag_get(mp, agno); + /* We need to search the list for the inode being freed. */ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp); + &prev_agino, &imap, &last_dip, &last_ibp, + pag); if (error) - return error; + goto out; /* Point the previous inode on the list to the next inode. */ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, last_dip, &imap, next_agino); + + /* + * Now we deal with the backref for this inode. If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. + */ + error = xfs_iunlink_change_backref(pag, agino, next_agino); + if (error) + goto out; } - return 0; + +out: + if (pag) + xfs_perag_put(pag); + return error; } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index be2014520155..e62074a5257c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone; bool xfs_inode_verify_forks(struct xfs_inode *ip); +int xfs_iunlink_init(struct xfs_perag *pag); +void xfs_iunlink_destroy(struct xfs_perag *pag); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b4d8c318be3c..fd63b0b1307c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -149,6 +149,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -227,6 +228,9 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; } index = xfs_set_inode_alloc(mp, agcount); @@ -249,6 +253,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7daafe064af8..a33f45077867 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -396,6 +396,13 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. + */ + struct rhashtable pagi_unlinked_hash; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a6e384a642b1..c83ce022a355 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3447,6 +3447,7 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \ TP_ARGS(ip)) DEFINE_AGINODE_EVENT(xfs_iunlink); DEFINE_AGINODE_EVENT(xfs_iunlink_remove); +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); #endif /* _TRACE_XFS_H */ From 75d0230314997b18946c96dc3d93c8d61cfdb9a5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 6 Feb 2019 09:25:29 -0800 Subject: [PATCH 29/70] xfs: clarify documentation for the function to reverify buffers Improve the documentation around xfs_buf_ensure_ops, which is the function that is responsible for cleaning up the b_ops state of buffers that go through xrep_findroot_block but don't match anything. Rename the function to xfs_buf_reverify. [darrick: this started off as bfoster mods of a previous patch of mine, but the renaming part is now this separate patch.] Signed-off-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Signed-off-by: Brian Foster --- fs/xfs/xfs_buf.c | 35 +++++++++++++++-------------------- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_trans_buf.c | 2 +- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4f5f2ff3f70f..d29246ac3721 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -776,29 +776,24 @@ _xfs_buf_read( } /* - * Set buffer ops on an unchecked buffer and validate it, if possible. + * Reverify a buffer found in cache without an attached ->b_ops. * - * If the caller passed in an ops structure and the buffer doesn't have ops - * assigned, set the ops and use them to verify the contents. If the contents - * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no - * recorded errors and is already in XBF_DONE state. + * If the caller passed an ops structure and the buffer doesn't have ops + * assigned, set the ops and use it to verify the contents. If verification + * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is + * already in XBF_DONE state on entry. * - * Under normal operations, every in-core buffer must have buffer ops assigned - * to them when the buffer is read in from disk so that we can validate the - * metadata. - * - * However, there are two scenarios where one can encounter in-core buffers - * that don't have buffer ops. The first is during log recovery of buffers on - * a V4 filesystem, though these buffers are purged at the end of recovery. - * - * The other is online repair, which tries to match arbitrary metadata blocks - * with btree types in order to find the root. If online repair doesn't match - * the buffer with /any/ btree type, the buffer remains in memory in DONE state - * with no ops, and a subsequent read_buf call from elsewhere will not set the - * ops. This function helps us fix this situation. + * Under normal operations, every in-core buffer is verified on read I/O + * completion. There are two scenarios that can lead to in-core buffers without + * an assigned ->b_ops. The first is during log recovery of buffers on a V4 + * filesystem, though these buffers are purged at the end of recovery. The + * other is online repair, which intentionally reads with a NULL buffer ops to + * run several verifiers across an in-core buffer in order to establish buffer + * type. If repair can't establish that, the buffer will be left in memory + * with NULL buffer ops. */ int -xfs_buf_ensure_ops( +xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { @@ -840,7 +835,7 @@ xfs_buf_read_map( return bp; } - xfs_buf_ensure_ops(bp, ops); + xfs_buf_reverify(bp, ops); if (flags & XBF_ASYNC) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9f5511ea998..1c436a85b71d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -385,6 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 629f1479c9d2..7d65ebf1e847 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -277,7 +277,7 @@ xfs_trans_read_buf_map( * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); - error = xfs_buf_ensure_ops(bp, ops); + error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __func__); From e34d3e74eb8f6eb020312cec747ff55ee1d1ca18 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:45 -0800 Subject: [PATCH 30/70] xfs: always check magic values in on-disk byte order Most verifiers that check on-disk magic values convert the CPU endian magic value constant to disk endian to facilitate compile time optimization of the byte swap and reduce the need for runtime byte swaps in buffer verifiers. Several buffer verifiers do not follow this pattern. Update those verifiers for consistency. Also fix up a random typo in the inode readahead verifier name. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_attr_leaf.c | 4 ++-- fs/xfs/libxfs/xfs_da_btree.c | 4 ++-- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- fs/xfs/libxfs/xfs_sb.c | 3 ++- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b715668886a4..48aab07e7138 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -570,7 +570,7 @@ xfs_agfl_verify( if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (agfl->agfl_magicnum != cpu_to_be32(XFS_AGFL_MAGIC)) return __this_address; /* * during growfs operations, the perag is not fully initialised, diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2652d00842d6..e60eba7f129c 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -251,7 +251,7 @@ xfs_attr3_leaf_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + if (hdr3->info.hdr.magic != cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) return __this_address; if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) @@ -261,7 +261,7 @@ xfs_attr3_leaf_verify( if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) return __this_address; } else { - if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + if (leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) return __this_address; } /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd..355322688c9f 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -132,7 +132,7 @@ xfs_da3_node_verify( if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - if (ichdr.magic != XFS_DA3_NODE_MAGIC) + if (hdr3->info.hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC)) return __this_address; if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) @@ -142,7 +142,7 @@ xfs_da3_node_verify( if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) return __this_address; } else { - if (ichdr.magic != XFS_DA_NODE_MAGIC) + if (hdr->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC)) return __this_address; } if (ichdr.level == 0) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 82c4374acb4c..01ebc9557f69 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -151,7 +151,7 @@ const struct xfs_buf_ops xfs_inode_buf_ops = { }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { - .name = "xxfs_inode_ra", + .name = "xfs_inode_ra", .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b5a82acd7dfe..a2f52a958091 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -225,10 +225,11 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); uint32_t agcount = 0; uint32_t rem; - if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (dsb->sb_magicnum != cpu_to_be32(XFS_SB_MAGIC)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } From 01e68f40bf7846b58d2734aa11b0cbcaadbeaa3e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:46 -0800 Subject: [PATCH 31/70] xfs: create a separate finobt verifier The inobt verifier is reused for the inobt and finobt, which prevents the ability to distinguish between magic values on a per-tree basis. Create a separate finobt structure in preparation for changes to enforce the appropriate magic value for the associated tree. This patch has no functional change. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 9 ++++++++- fs/xfs/libxfs/xfs_shared.h | 1 + fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/xfs_log_recover.c | 6 ++++-- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 999ad8d00d43..bde67ef3ff43 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -361,7 +361,7 @@ xfs_ag_init_headers( { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_inobt_buf_ops, + .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b25e7a0df47..798269eb4767 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -333,6 +333,13 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_struct = xfs_inobt_verify, }; +const struct xfs_buf_ops xfs_finobt_buf_ops = { + .name = "xfs_finobt", + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, +}; + STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -389,7 +396,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 1c5debe748f0..9855f4d2f98f 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -36,6 +36,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_finobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 03d1e15cceba..673be3cf7b8d 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -879,7 +879,7 @@ xrep_agi( }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .magic = XFS_FIBT_CRC_MAGIC, }, [XREP_AGI_END] = { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9fe88d125f0a..228c754bb137 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2445,11 +2445,13 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_allocbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: - case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: - case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; From 8473fee340e37711b9ac6a5cc591305ccaaa4778 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:46 -0800 Subject: [PATCH 32/70] xfs: distinguish between inobt and finobt magic values The inode btree verifier code is shared between the inode btree and free inode btree because the underlying metadata formats are essentially equivalent. A side effect of this is that the verifier cannot determine whether a particular btree block should have an inobt or finobt magic value. This logic allows an unfortunate xfs_repair bug to escape detection where certain level > 0 nodes of the finobt are stamped with inobt magic by xfs_repair finobt reconstruction. This is fortunately not a severe problem since the inode btree magic values do not contribute to any changes in kernel behavior, but we do need a means to detect and prevent this problem in the future. Add a field to xfs_buf_ops to store the v4 and v5 superblock magic values expected by a particular verifier. Add a helper to check an on-disk magic value against the value expected by the verifier. Call the helper from the shared [f]inobt verifier code for magic value verification. This ensures that the inode btree blocks each have the appropriate magic value based on specific tree type and superblock version. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 16 +++++++--------- fs/xfs/xfs_buf.c | 19 +++++++++++++++++++ fs/xfs/xfs_buf.h | 2 ++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 798269eb4767..c2df1f89eec8 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -260,6 +260,9 @@ xfs_inobt_verify( xfs_failaddr_t fa; unsigned int level; + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + /* * During growfs operations, we can't verify the exact owner as the * perag is not fully initialised and hence not attached to the buffer. @@ -270,18 +273,10 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - switch (block->bb_magic) { - case cpu_to_be32(XFS_IBT_CRC_MAGIC): - case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (xfs_sb_version_hascrc(&mp->m_sb)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_IBT_MAGIC): - case cpu_to_be32(XFS_FIBT_MAGIC): - break; - default: - return __this_address; } /* level verification */ @@ -328,6 +323,7 @@ xfs_inobt_write_verify( const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", + .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, @@ -335,6 +331,8 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { const struct xfs_buf_ops xfs_finobt_buf_ops = { .name = "xfs_finobt", + .magic = { cpu_to_be32(XFS_FIBT_MAGIC), + cpu_to_be32(XFS_FIBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d29246ac3721..52a382b8cbce 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2204,3 +2204,22 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) atomic_set(&bp->b_lru_ref, lru_ref); } + +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic( + struct xfs_buf *bp, + uint32_t dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + return false; + return dmagic == bp->b_ops->magic[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c436a85b71d..44f9423a1861 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -125,6 +125,7 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; + uint32_t magic[2]; /* v4 and v5 on disk magic values */ void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -386,5 +387,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +bool xfs_verify_magic(struct xfs_buf *bp, uint32_t dmagic); #endif /* __XFS_BUF_H__ */ From 27df4f5045fc68766980c4dfba5ffc9ad1f71ebb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:47 -0800 Subject: [PATCH 33/70] xfs: split up allocation btree verifier Similar to the inode btree verifier, the same allocation btree verifier structure is shared between the by-bno (bnobt) and by-size (cntbt) btrees. This prevents the ability to distinguish magic values between them. Separate the verifier into two, one for each tree, and assign them appropriately. No functional changes. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 4 ++-- fs/xfs/libxfs/xfs_alloc_btree.c | 14 ++++++++++---- fs/xfs/libxfs/xfs_shared.h | 3 ++- fs/xfs/scrub/agheader_repair.c | 4 ++-- fs/xfs/xfs_log_recover.c | 6 ++++-- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index bde67ef3ff43..1ef8acf35e7d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -339,14 +339,14 @@ xfs_ag_init_headers( { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 4e59cc8a2802..480d0f52da64 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -377,13 +377,19 @@ xfs_allocbt_write_verify( } -const struct xfs_buf_ops xfs_allocbt_buf_ops = { - .name = "xfs_allocbt", +const struct xfs_buf_ops xfs_bnobt_buf_ops = { + .name = "xfs_bnobt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, }; +const struct xfs_buf_ops xfs_cntbt_buf_ops = { + .name = "xfs_cntbt", + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, +}; STATIC int xfs_bnobt_keys_inorder( @@ -448,7 +454,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_bnobt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, @@ -470,7 +476,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_cntbt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 9855f4d2f98f..4e909791aeac 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 673be3cf7b8d..2799d1531639 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -341,12 +341,12 @@ xrep_agf( struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .magic = XFS_ABTB_CRC_MAGIC, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .magic = XFS_ABTC_CRC_MAGIC, }, [XREP_AGF_RMAPBT] = { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 228c754bb137..5ad42d598333 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2439,10 +2439,12 @@ xlog_recover_validate_buf_type( case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_allocbt_buf_ops; + bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: From b8f89801664f8413a88cf2c7539d1aeae07dd3c5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:47 -0800 Subject: [PATCH 34/70] xfs: distinguish between bnobt and cntbt magic values The allocation btree verifiers share code that is unable to detect cross-tree magic value corruptions such as a bnobt block with a cntbt magic value. Populate the b_ops->magic field of the associated verifier structures such that the structure verifier can check the magic value against the expected value based on tree type. The btree level check requires knowledge of the tree type to determine the appropriate maximum value. This was previously part of the hardcoded magic value checks. With that code removed, peek at the first magic value in the verifier to determine the expected tree type of the current block. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 60 ++++++++++++++------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 480d0f52da64..9fe949f6055e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -297,48 +297,34 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; + xfs_btnum_t btnum = XFS_BTNUM_BNOi; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; + } /* - * magic number and level verification + * The perag may not be attached during grow operations or fully + * initialized from the AGF during log recovery. Therefore we can only + * check against maximum tree depth from those contexts. * - * During growfs operations, we can't verify the exact level or owner as - * the perag is not fully initialised and hence not attached to the - * buffer. In this case, check against the maximum tree depth. - * - * Similarly, during log recovery we will have a perag structure - * attached, but the agf information will not yet have been initialised - * from the on disk AGF. Again, we can only check against maximum limits - * in this case. + * Otherwise check against the per-tree limit. Peek at one of the + * verifier magic values to determine the type of tree we're verifying + * against. */ level = be16_to_cpu(block->bb_level); - switch (block->bb_magic) { - case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) + btnum = XFS_BTNUM_CNTi; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[btnum]) return __this_address; - break; - case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) - return __this_address; - break; - default: + } else if (level >= mp->m_ag_maxlevels) return __this_address; - } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } @@ -379,6 +365,8 @@ xfs_allocbt_write_verify( const struct xfs_buf_ops xfs_bnobt_buf_ops = { .name = "xfs_bnobt", + .magic = { cpu_to_be32(XFS_ABTB_MAGIC), + cpu_to_be32(XFS_ABTB_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, @@ -386,6 +374,8 @@ const struct xfs_buf_ops xfs_bnobt_buf_ops = { const struct xfs_buf_ops xfs_cntbt_buf_ops = { .name = "xfs_cntbt", + .magic = { cpu_to_be32(XFS_ABTC_MAGIC), + cpu_to_be32(XFS_ABTC_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, From 09f420197d7ced360b4809606efd7a65f842c2c0 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:47 -0800 Subject: [PATCH 35/70] xfs: use verifier magic field in dir2 leaf verifiers The dir2 leaf verifiers share the same underlying structure verification code, but implement six accessor functions to multiplex the code across the two verifiers. Further, the magic value isn't sufficiently abstracted such that the common helper has to manually fix up the magic from the caller on v5 filesystems. Use the magic field in the verifier structure to eliminate the duplicate code and clean this all up. No functional change. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_dir2_leaf.c | 87 ++++++++--------------------------- 1 file changed, 19 insertions(+), 68 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 1728a3e6f5cf..dee0fd333d9d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -142,41 +142,31 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp, - uint16_t magic) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; - ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + if (!xfs_verify_magic(bp, leaf->hdr.info.magic)) + return __this_address; if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - uint16_t magic3; - magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC - : XFS_DIR3_LEAFN_MAGIC; - - if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return __this_address; if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return __this_address; } return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void -__read_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_read_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; @@ -185,23 +175,22 @@ __read_verify( !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void -__write_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_write_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -216,60 +205,22 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } -static xfs_failaddr_t -xfs_dir3_leaf1_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static xfs_failaddr_t -xfs_dir3_leafn_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .verify_read = xfs_dir3_leaf1_read_verify, - .verify_write = xfs_dir3_leaf1_write_verify, - .verify_struct = xfs_dir3_leaf1_verify, + .magic = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .verify_read = xfs_dir3_leafn_read_verify, - .verify_write = xfs_dir3_leafn_write_verify, - .verify_struct = xfs_dir3_leafn_verify, + .magic = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; int From 39708c20ab51337c3eb282a824eb0aaff7ebe2e1 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:48 -0800 Subject: [PATCH 36/70] xfs: miscellaneous verifier magic value fixups Most buffer verifiers have hardcoded magic value checks conditionalized on the version of the filesystem. The magic value field of the verifier structure facilitates abstraction of some of this code. Populate the ->magic field of various verifiers to take advantage of this abstraction. No functional changes. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 12 ++++++++---- fs/xfs/libxfs/xfs_attr_leaf.c | 11 +++++------ fs/xfs/libxfs/xfs_attr_remote.c | 8 +++++--- fs/xfs/libxfs/xfs_bmap_btree.c | 13 ++++++------- fs/xfs/libxfs/xfs_da_btree.c | 11 +++++------ fs/xfs/libxfs/xfs_dir2_block.c | 10 +++++----- fs/xfs/libxfs/xfs_dir2_data.c | 12 +++++++----- fs/xfs/libxfs/xfs_dir2_node.c | 10 +++++----- fs/xfs/libxfs/xfs_ialloc.c | 3 ++- fs/xfs/libxfs/xfs_refcount_btree.c | 3 ++- fs/xfs/libxfs/xfs_rmap_btree.c | 3 ++- fs/xfs/libxfs/xfs_sb.c | 4 +++- fs/xfs/libxfs/xfs_symlink_remote.c | 3 ++- 13 files changed, 57 insertions(+), 46 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 48aab07e7138..bc3367b8b7bb 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -568,9 +568,9 @@ xfs_agfl_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (agfl->agfl_magicnum != cpu_to_be32(XFS_AGFL_MAGIC)) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -643,6 +643,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -2587,8 +2588,10 @@ xfs_agf_verify( return __this_address; } - if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + if (!xfs_verify_magic(bp, agf->agf_magicnum)) + return __this_address; + + if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && @@ -2670,6 +2673,7 @@ xfs_agf_write_verify( const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e60eba7f129c..f164f296f1b8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -248,21 +248,18 @@ xfs_attr3_leaf_verify( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + if (!xfs_verify_magic(bp, leaf->hdr.info.magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - if (hdr3->info.hdr.magic != cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) - return __this_address; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) - return __this_address; } /* * In recovery there is a transient state where count == 0 is valid @@ -369,6 +366,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", + .magic = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d89363c6b523..65ff600a8067 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok( static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, + struct xfs_buf *bp, void *ptr, int fsbsize, xfs_daddr_t bno) @@ -87,7 +88,7 @@ xfs_attr3_rmt_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify( const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", + .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) }, .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, .verify_struct = xfs_attr3_rmt_verify_struct, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cdb74d2e2a43..aff82ed112c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -416,8 +416,10 @@ xfs_bmbt_verify( xfs_failaddr_t fa; unsigned int level; - switch (block->bb_magic) { - case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -425,11 +427,6 @@ xfs_bmbt_verify( fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_BMAP_MAGIC): - break; - default: - return __this_address; } /* @@ -481,6 +478,8 @@ xfs_bmbt_write_verify( const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", + .magic = { cpu_to_be32(XFS_BMAP_MAGIC), + cpu_to_be32(XFS_BMAP_CRC_MAGIC) }, .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, .verify_struct = xfs_bmbt_verify, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 355322688c9f..e02d2f407e12 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -129,21 +129,18 @@ xfs_da3_node_verify( ops->node_hdr_from_disk(&ichdr, hdr); + if (!xfs_verify_magic(bp, hdr->hdr.info.magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - if (hdr3->info.hdr.magic != cpu_to_be16(XFS_DA3_NODE_MAGIC)) - return __this_address; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) return __this_address; - } else { - if (hdr->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC)) - return __this_address; } if (ichdr.level == 0) return __this_address; @@ -257,6 +254,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", + .magic = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 30ed5919da72..b7d6d78f4ce2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,18 +53,16 @@ xfs_dir3_block_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -112,6 +110,8 @@ xfs_dir3_block_write_verify( const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", + .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), + cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 01162c62ec8f..b7b9ce002cb9 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -252,18 +252,16 @@ xfs_dir3_data_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -339,6 +337,8 @@ xfs_dir3_data_write_verify( const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, .verify_struct = xfs_dir3_data_verify, @@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .name = "xfs_dir3_data_reada", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index f1bb3434f51c..3b03703c5c3d 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -87,20 +87,18 @@ xfs_dir3_free_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ @@ -151,6 +149,8 @@ xfs_dir3_free_write_verify( const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", + .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC), + cpu_to_be32(XFS_DIR3_FREE_MAGIC) }, .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, .verify_struct = xfs_dir3_free_verify, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d32152fc8a6c..fe9898875097 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2508,7 +2508,7 @@ xfs_agi_verify( /* * Validate the magic number of the agi block. */ - if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; @@ -2582,6 +2582,7 @@ xfs_agi_write_verify( const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", + .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d9eab657b63e..6f47ab876d90 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -209,7 +209,7 @@ xfs_refcountbt_verify( xfs_failaddr_t fa; unsigned int level; - if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) @@ -264,6 +264,7 @@ xfs_refcountbt_write_verify( const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", + .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f79cf040d745..5738e11055e6 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -310,7 +310,7 @@ xfs_rmapbt_verify( * from the on disk AGF. Again, we can only check against maximum limits * in this case. */ - if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) @@ -365,6 +365,7 @@ xfs_rmapbt_write_verify( const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, .verify_struct = xfs_rmapbt_verify, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a2f52a958091..4e5029c37966 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -229,7 +229,7 @@ xfs_validate_sb_common( uint32_t agcount = 0; uint32_t rem; - if (dsb->sb_magicnum != cpu_to_be32(XFS_SB_MAGIC)) { + if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } @@ -782,12 +782,14 @@ out_error: const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 77d80106f989..a0ccc253c43d 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -95,7 +95,7 @@ xfs_symlink_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -159,6 +159,7 @@ xfs_symlink_write_verify( const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", + .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, From 8764f98351fa561296f70c3435a5cb1eb6272c39 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 7 Feb 2019 10:45:48 -0800 Subject: [PATCH 37/70] xfs: factor xfs_da3_blkinfo verification into common helper With the verifier magic value helper in place, we've left a bit more duplicate code across the verifiers that involve struct xfs_da3_blkinfo. This includes the da node, xattr leaf and dir leaf verifiers, all of which perform similar checks for v4 and v5 filesystems. Create a common helper to verify an xfs_da3_blkinfo structure, taking care to only access v5 fields where appropriate, and refactor the aforementioned verifiers to use the helper. No functional changes. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr_leaf.c | 16 ++++--------- fs/xfs/libxfs/xfs_da_btree.c | 44 +++++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_da_format.h | 3 +++ fs/xfs/libxfs/xfs_dir2_leaf.c | 17 ++++---------- 4 files changed, 43 insertions(+), 37 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index f164f296f1b8..0c92987f57fc 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -245,22 +245,14 @@ xfs_attr3_leaf_verify( struct xfs_attr_leaf_entry *entries; uint32_t end; /* must be 32bit - see below */ int i; + xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); - if (!xfs_verify_magic(bp, leaf->hdr.info.magic)) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } /* * In recovery there is a transient state where count == 0 is valid * because we may have transitioned an empty shortform attr to a leaf diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e02d2f407e12..eb2cee428f26 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } +/* + * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only + * accessible on v5 filesystems. This header format is common across da node, + * attr leaf and dir leaf blocks. + */ +xfs_failaddr_t +xfs_da3_blkinfo_verify( + struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *hdr = &hdr3->hdr; + + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return __this_address; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return __this_address; + } + + return 0; +} + static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) @@ -124,24 +152,16 @@ xfs_da3_node_verify( struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; const struct xfs_dir_ops *ops; + xfs_failaddr_t fa; ops = xfs_dir_get_ops(mp, NULL); ops->node_hdr_from_disk(&ichdr, hdr); - if (!xfs_verify_magic(bp, hdr->hdr.info.magic)) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } if (ichdr.level == 0) return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5d5bf3bffc78..ae654e06b2fb 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); } +xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3); + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index dee0fd333d9d..094028b7b162 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -146,20 +146,11 @@ xfs_dir3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_failaddr_t fa; - if (!xfs_verify_magic(bp, leaf->hdr.info.magic)) - return __this_address; - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return __this_address; - } + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } From 2bfe7069f71e56a301976d08eae3027b1eebc30d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Feb 2019 10:01:50 -0800 Subject: [PATCH 38/70] xfs: add inode magic to inode verifier Use xfs_verify_magic to check the magic numbers of inodes. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_inode_buf.c | 6 +++++- fs/xfs/xfs_log_recover.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 01ebc9557f69..f92f14e93ad3 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -97,7 +97,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + di_ok = xfs_verify_magic(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -146,12 +146,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", + .magic = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { .name = "xfs_inode_ra", + .magic = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5ad42d598333..f5948d16015b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3049,7 +3049,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + if (unlikely(!xfs_verify_magic(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); From 4260baac629e15723574f42c5c9ba13cb037db8e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Feb 2019 10:05:04 -0800 Subject: [PATCH 39/70] xfs: add magic numbers to dquot buffer ops Add dquot magic numbers to the buffer ops type, in case we ever want to use them. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_dquot_buf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index d293f371dd54..b956638a3532 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify( const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", + .magic = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", + .magic = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; From 9228d751ebf9c7876902db6458a33c26674f7ccc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Feb 2019 10:20:54 -0800 Subject: [PATCH 40/70] xfs: use buf ops magic to detect btree block type Now that we encode block magic numbers in all the buffer ops, use that for block type detection in the ag header repair code instead of encoding magics directly in the repair code. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/agheader_repair.c | 6 ------ fs/xfs/scrub/repair.c | 3 ++- fs/xfs/scrub/repair.h | 3 --- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 2799d1531639..64e31f87d490 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -342,22 +342,18 @@ xrep_agf( [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_bnobt_buf_ops, - .magic = XFS_ABTB_CRC_MAGIC, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_cntbt_buf_ops, - .magic = XFS_ABTC_CRC_MAGIC, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, - .magic = XFS_RMAP_CRC_MAGIC, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, - .magic = XFS_REFC_CRC_MAGIC, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -875,12 +871,10 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_IBT_CRC_MAGIC, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_finobt_buf_ops, - .magic = XFS_FIBT_CRC_MAGIC, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 6acf1bfa0bfe..f28f4bad317b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -743,7 +743,8 @@ xrep_findroot_block( /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(btblock->bb_magic) != fab->magic) + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) goto out; /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f2fc18bb7605..d990314eb08b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -42,9 +42,6 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; - /* in: magic number of the btree */ - uint32_t magic; - /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; From 670105de15cd98f0ad6e34464c720cf9c96f4e6e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 8 Feb 2019 09:02:29 -0800 Subject: [PATCH 41/70] xfs: compile time offset checks for common v4/v5 metadata The v5 superblock format added various metadata fields (such as crc, metadata lsn, owner uuid, etc.) to v4 metadata headers or created new v5 headers for blocks where no such headers existed on v4. Where v4 headers did exist, the v5 structures are careful to place v4 metadata at the original location. For example, the magic value is expected to be at the same location in certain blocks to facilitate version detection. While failure of this invariant is likely to cause severe and obvious problems at runtime, we can detect this condition at compile time via the more recently added on-disk format check infrastructure. Since there is no runtime cost, add some offset checks that start with v5 structure definitions, traverse down to the first bit of common metadata with v4 and ensure that common metadata is at the expected offset. Note that we don't care about blocks which had no v4 header because there is no common metadata in those cases. No functional changes. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_ondisk.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index d3e04d20d8d4..c8ba98fae30a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + + /* + * The v5 superblock format extended several v4 header structures with + * additional data. While new fields are only accessible on v5 + * superblocks, it's important that the v5 structures place original v4 + * fields/headers in the correct location on-disk. For example, we must + * be able to find magic values at the same location in certain blocks + * regardless of superblock version. + * + * The following checks ensure that various v5 data structures place the + * subset of v4 metadata associated with the same type of block at the + * start of the on-disk block. If there is no data structure definition + * for certain types of v4 blocks, traverse down to the first field of + * common metadata (e.g., magic value) and make sure it is at offset + * zero. + */ + XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); } #endif /* __XFS_ONDISK_H */ From 3b50086f0c0d78c144d9483fa292c1509c931b70 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 13 Feb 2019 11:15:17 -0800 Subject: [PATCH 42/70] xfs: don't overflow xattr listent buffer For VFS listxattr calls, xfs_xattr_put_listent calls __xfs_xattr_put_listent twice if it sees an attribute "trusted.SGI_ACL_FILE": once for that name, and again for "system.posix_acl_access". Unfortunately, if we happen to run out of buffer space while emitting the first name, we set count to -1 (so that we can feed ERANGE to the caller). The second invocation doesn't check that the context parameters make sense and overwrites the byte before the buffer, triggering a KASAN report: ================================================================== BUG: KASAN: slab-out-of-bounds in strncpy+0xb3/0xd0 Write of size 1 at addr ffff88807fbd317f by task syz/1113 CPU: 3 PID: 1113 Comm: syz Not tainted 5.0.0-rc6-xfsx #rc6 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0xcc/0x180 print_address_description+0x6c/0x23c kasan_report.cold.3+0x1c/0x35 strncpy+0xb3/0xd0 __xfs_xattr_put_listent+0x1a9/0x2c0 [xfs] xfs_attr_list_int_ilocked+0x11af/0x1800 [xfs] xfs_attr_list_int+0x20c/0x2e0 [xfs] xfs_vn_listxattr+0x225/0x320 [xfs] listxattr+0x11f/0x1b0 path_listxattr+0xbd/0x130 do_syscall_64+0x139/0x560 While we're at it we add an assert to the other put_listent to avoid this sort of thing ever happening to the attrlist_by_handle code. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_list.c | 1 + fs/xfs/xfs_xattr.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a58034049995..3d213a7394c5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -555,6 +555,7 @@ xfs_attr_put_listent( attrlist_ent_t *aep; int arraytop; + ASSERT(!context->seen_enough); ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 63ee1d5bf1d7..9a63016009a1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -129,6 +129,9 @@ __xfs_xattr_put_listent( char *offset; int arraytop; + if (context->count < 0 || context->seen_enough) + return; + if (!context->alist) goto compute_size; From e1f6ca11381588e3ef138c10de60eeb34cb8466a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Feb 2019 09:33:15 -0800 Subject: [PATCH 43/70] xfs: rename m_inotbt_nores to m_finobt_nores Rename this flag variable to imply more strongly that it's related to the free inode btree (finobt) operation. No functional changes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_ag_resv.c | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++-- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_mount.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e701ebc36c06..e2ba2a3b63b2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -281,7 +281,7 @@ xfs_ag_resv_init( */ ask = used = 0; - mp->m_inotbt_nores = true; + mp->m_finobt_nores = true; error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index c2df1f89eec8..1080381ff243 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -124,7 +124,7 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); @@ -154,7 +154,7 @@ xfs_finobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_free_block(cur, bp); return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9aaa3143a277..d1411c168700 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1754,7 +1754,7 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - if (unlikely(mp->m_inotbt_nores)) { + if (unlikely(mp->m_finobt_nores)) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a33f45077867..864ecf27aa75 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -138,7 +138,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint64_t m_flags; /* global mount flags */ - bool m_inotbt_nores; /* no per-AG finobt resv. */ + bool m_finobt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode From 15a268d9f263ed3a0601a1296568241a5a3da7aa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 13 Feb 2019 11:46:16 -0800 Subject: [PATCH 44/70] xfs: reserve blocks for ifree transaction during log recovery Log recovery frees all the inodes stored in the unlinked list, which can cause expansion of the free inode btree. The ifree code skips block reservations if it thinks there's a per-AG space reservation, but we don't set up the reservation until after log recovery, which means that a finobt expansion blows up in xfs_trans_mod_sb when we exceed the transaction's block reservation. To fix this, we set the "no finobt reservation" flag to true when we create the xfs_mount and only set it to false if we confirm that every AG had enough free space to put aside for the finobt. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 1 + fs/xfs/xfs_super.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f3ef70c542e1..584648582ba7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks( int error = 0; int err2; + mp->m_finobt_nores = false; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { pag = xfs_perag_get(mp, agno); err2 = xfs_ag_resv_init(pag, NULL); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c9097cb0b955..08033ac040d6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1594,6 +1594,13 @@ xfs_mount_alloc( INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; return mp; } From c4a6bf7f6cc7eb4cce120fb7eb1e1fb8b2d65e09 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 13 Feb 2019 11:15:17 -0800 Subject: [PATCH 45/70] xfs: don't ever put nlink > 0 inodes on the unlinked list When XFS creates an O_TMPFILE file, the inode is created with nlink = 1, put on the unlinked list, and then the VFS sets nlink = 0 in d_tmpfile. If we crash before anything logs the inode (it's dirty incore but the vfs doesn't tell us it's dirty so we never log that change), the iunlink processing part of recovery will then explode with a pile of: XFS: Assertion failed: VFS_I(ip)->i_nlink == 0, file: fs/xfs/xfs_log_recover.c, line: 5072 Worse yet, since nlink is nonzero, the inodes also don't get cleaned up and they just leak until the next xfs_repair run. Therefore, change xfs_iunlink to require that inodes being put on the unlinked list have nlink == 0, change the tmpfile callers to instantiate nodes that way, and set the nlink to 1 just prior to calling d_tmpfile. Fix the comment for xfs_iunlink while we're at it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 16 ++++++---------- fs/xfs/xfs_iops.c | 13 +++++++++++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d1411c168700..f643a9295179 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1332,7 +1332,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); + error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -2231,11 +2231,8 @@ out: } /* - * This is called when the inode's link count goes to 0 or we are creating a - * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be - * set to true as the link count is dropped to zero by the VFS after we've - * created the file successfully, so we have to add it to the unlinked list - * while the link count is non-zero. + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. @@ -2254,6 +2251,7 @@ xfs_iunlink( short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); trace_xfs_iunlink(ip); @@ -3184,11 +3182,9 @@ xfs_rename_alloc_whiteout( /* * Prepare the tmpfile inode as if it were created through the VFS. - * Otherwise, the link increment paths will complain about nlink 0->1. - * Drop the link count as done by d_tmpfile(), complete the inode setup - * and flag it as linkable. + * Complete the inode setup and flag it as linkable. nlink is already + * zero, so we can skip the drop_nlink. */ - drop_nlink(VFS_I(tmpfile)); xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7a8d3e..1efef69a7f1c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -191,9 +191,18 @@ xfs_generic_create( xfs_setup_iops(ip); - if (tmpfile) + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); - else + } else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); From be225fec72ed10a7e2c81a1ddd0d081a80238bff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:46 -0800 Subject: [PATCH 46/70] xfs: remove the io_type field from the writeback context and ioend The io_type field contains what is basically a summary of information from the inode fork and the imap. But we can just as easily use that information directly, simplifying a few bits here and there and improving the trace points. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 93 ++++++++++++++++++++------------------------ fs/xfs/xfs_aops.h | 24 +----------- fs/xfs/xfs_iomap.c | 8 ++-- fs/xfs/xfs_reflink.c | 2 +- fs/xfs/xfs_trace.h | 34 +++++++--------- 5 files changed, 63 insertions(+), 98 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 515532f45beb..a3fa60d1d2df 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -28,7 +28,7 @@ */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - unsigned int io_type; + int fork; unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; @@ -256,30 +256,20 @@ xfs_end_io( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) xfs_reflink_cancel_cow_range(ip, offset, size, true); - break; - } - goto done; } /* - * Success: commit the COW or unwritten blocks if needed. + * Success: commit the COW or unwritten blocks if needed. */ - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) error = xfs_reflink_end_cow(ip, offset, size); - break; - case XFS_IO_UNWRITTEN: - /* writeback should never update isize */ + else if (ioend->io_state == XFS_EXT_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - break; - default: + else ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); - break; - } done: if (ioend->io_append_trans) @@ -294,7 +284,8 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) + if (ioend->io_fork == XFS_COW_FORK || + ioend->io_state == XFS_EXT_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -320,7 +311,7 @@ xfs_imap_valid( * covers the offset. Be careful to check this first because the caller * can revalidate a COW mapping without updating the data seqno. */ - if (wpc->io_type == XFS_IO_COW) + if (wpc->fork == XFS_COW_FORK) return true; /* @@ -350,7 +341,6 @@ xfs_map_blocks( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; - int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; int error = 0; @@ -400,6 +390,9 @@ xfs_map_blocks( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + + wpc->fork = XFS_COW_FORK; + /* * Truncate can race with writeback since writeback doesn't * take the iolock and truncate decreases the file size before @@ -412,11 +405,13 @@ xfs_map_blocks( * will kill the contents anyway. */ if (offset > i_size_read(inode)) { - wpc->io_type = XFS_IO_HOLE; + wpc->imap.br_blockcount = end_fsb - offset_fsb; + wpc->imap.br_startoff = offset_fsb; + wpc->imap.br_startblock = HOLESTARTBLOCK; + wpc->imap.br_state = XFS_EXT_NORM; return 0; } - whichfork = XFS_COW_FORK; - wpc->io_type = XFS_IO_COW; + goto allocate_blocks; } @@ -439,12 +434,14 @@ xfs_map_blocks( wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + wpc->fork = XFS_DATA_FORK; + if (imap.br_startoff > offset_fsb) { /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; - wpc->io_type = XFS_IO_HOLE; + imap.br_state = XFS_EXT_NORM; } else { /* * Truncate to the next COW extent if there is one. This is the @@ -456,31 +453,24 @@ xfs_map_blocks( cow_fsb < imap.br_startoff + imap.br_blockcount) imap.br_blockcount = cow_fsb - imap.br_startoff; - if (isnullstartblock(imap.br_startblock)) { - /* got a delalloc extent */ - wpc->io_type = XFS_IO_DELALLOC; + /* got a delalloc extent? */ + if (isnullstartblock(imap.br_startblock)) goto allocate_blocks; - } - - if (imap.br_state == XFS_EXT_UNWRITTEN) - wpc->io_type = XFS_IO_UNWRITTEN; - else - wpc->io_type = XFS_IO_OVERWRITE; } wpc->imap = imap; - trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - whichfork == XFS_COW_FORK ? + error = xfs_iomap_write_allocate(ip, wpc->fork, offset, &imap, + wpc->fork == XFS_COW_FORK ? &wpc->cow_seq : &wpc->data_seq); if (error) return error; - ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || + ASSERT(wpc->fork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } @@ -505,7 +495,7 @@ xfs_submit_ioend( int status) { /* Convert CoW extents to regular */ - if (!status && ioend->io_type == XFS_IO_COW) { + if (!status && ioend->io_fork == XFS_COW_FORK) { /* * Yuk. This can do memory allocation, but is not a * transactional operation so everything is done in GFP_KERNEL @@ -523,7 +513,8 @@ xfs_submit_ioend( /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && + (ioend->io_fork == XFS_COW_FORK || + ioend->io_state != XFS_EXT_UNWRITTEN) && xfs_ioend_is_append(ioend) && !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); @@ -552,7 +543,8 @@ xfs_submit_ioend( static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, - unsigned int type, + int fork, + xfs_exntst_t state, xfs_off_t offset, struct block_device *bdev, sector_t sector) @@ -566,7 +558,8 @@ xfs_alloc_ioend( ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; + ioend->io_fork = fork; + ioend->io_state = state; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; @@ -627,13 +620,15 @@ xfs_add_to_ioend( sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + if (!wpc->ioend || + wpc->fork != wpc->ioend->io_fork || + wpc->imap.br_state != wpc->ioend->io_state || sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, - bdev, sector); + wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, + wpc->imap.br_state, offset, bdev, sector); } if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { @@ -742,7 +737,7 @@ xfs_writepage_map( error = xfs_map_blocks(wpc, inode, file_offset); if (error) break; - if (wpc->io_type == XFS_IO_HOLE) + if (wpc->imap.br_startblock == HOLESTARTBLOCK) continue; xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, &submit_list); @@ -937,9 +932,7 @@ xfs_vm_writepage( struct page *page, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; ret = xfs_do_writepage(page, wbc, &wpc); @@ -953,9 +946,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e5c23948a8ab..6c2615b83c5d 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -8,33 +8,13 @@ extern struct bio_set xfs_ioend_bioset; -/* - * Types of I/O for bmap clustering and I/O completion tracking. - * - * This enum is used in string mapping in xfs_trace.h; please keep the - * TRACE_DEFINE_ENUMs for it up to date. - */ -enum { - XFS_IO_HOLE, /* covers region without any block allocation */ - XFS_IO_DELALLOC, /* covers delalloc region */ - XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ - XFS_IO_OVERWRITE, /* covers already allocated extent */ - XFS_IO_COW, /* covers copy-on-write extent */ -}; - -#define XFS_IO_TYPES \ - { XFS_IO_HOLE, "hole" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } - /* * Structure for buffered I/O completions. */ struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ - unsigned int io_type; /* delalloc / unwritten */ + int io_fork; /* inode fork written back */ + xfs_exntst_t io_state; /* extent state */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6af1d3ec0a9c..fd3aacd4bf02 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -575,7 +575,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - trace_xfs_iomap_found(ip, offset, count, 0, &got); + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got); goto done; } @@ -647,7 +647,7 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got); done: if (isnullstartblock(got.br_startblock)) got.br_startblock = DELAYSTARTBLOCK; @@ -1082,7 +1082,7 @@ xfs_file_iomap_begin( return error; iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields @@ -1098,7 +1098,7 @@ out_finish: out_found: ASSERT(nimaps); xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); goto out_finish; out_unlock: diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c5b4fa004ca4..2babc2cbe103 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1192,7 +1192,7 @@ xfs_reflink_remap_blocks( break; ASSERT(nimaps == 1); - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, &imap); /* Translate imap into the destination file. */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c83ce022a355..f1e18ae8a209 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \ DEFINE_READPAGE_EVENT(xfs_vm_readpage); DEFINE_READPAGE_EVENT(xfs_vm_readpages); -TRACE_DEFINE_ENUM(XFS_IO_HOLE); -TRACE_DEFINE_ENUM(XFS_IO_DELALLOC); -TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN); -TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE); -TRACE_DEFINE_ENUM(XFS_IO_COW); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int whichfork, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->whichfork == XFS_COW_FORK ? "cow" : "data", __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) -#define DEFINE_IOMAP_EVENT(name) \ +#define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_found); + int whichfork, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, whichfork, irec)) +DEFINE_IMAP_EVENT(xfs_map_blocks_found); +DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); TRACE_EVENT(xfs_reflink_remap_blocks_loop, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, From b4e29032f254fed39ef2251807322f36c3ede1aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:47 -0800 Subject: [PATCH 47/70] xfs: remove the s_maxbytes checks in xfs_map_blocks We already ensure all data fits into s_maxbytes in the write / fault path. The only reason we have them here is that they were copy and pasted from xfs_bmapi_read when we stopped using that function. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a3fa60d1d2df..8bfb62d8776f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -338,7 +338,8 @@ xfs_map_blocks( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; @@ -374,11 +375,6 @@ xfs_map_blocks( xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_super->s_maxbytes); - - if (offset > mp->m_super->s_maxbytes - count) - count = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); /* * Check if this is offset is covered by a COW extents, and if yes use From b101e3342a34404f2cc2daaad569afcae68452b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:47 -0800 Subject: [PATCH 48/70] xfs: simplify the xfs_bmap_btree_to_extents calling conventions Move boilerplate code from the callers into xfs_bmap_btree_to_extents: - exit early without failure if we don't need to convert to the extent format - assert that we have a btree cursor - don't reinitialize the passed in logflags argument Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 78 ++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f4a65330a2a9..3c0dee417d31 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -577,42 +577,44 @@ __xfs_bmap_add_free( */ /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Convert the inode format to extent format if it currently is in btree format, + * but the extent list is small enough that it fits into the extent format. + * + * Since the extents are already in-core, all we have to do is give up the space + * for the btree root and pitch the leaf block. */ STATIC int /* error */ xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - /* REFERENCED */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ - struct xfs_ifork *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ struct xfs_owner_info oinfo; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* check if we actually need the extent format first: */ + if (!xfs_bmap_wants_extents(ip, whichfork)) + return 0; + + ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); - *logflagsp = 0; #ifdef DEBUG XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, xfs_btree_check_lptr(cur, cbno, 1)); @@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -4424,19 +4426,10 @@ xfs_bmapi_write( } *nmap = n; - /* - * Transform from btree to extents, give it cur. - */ - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - ASSERT(bma.cur); - error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, - &tmp_logflags, whichfork); - bma.logflags |= tmp_logflags; - if (error) - goto error0; - } + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto error0; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > @@ -4574,13 +4567,7 @@ xfs_bmapi_remap( if (error) goto error0; - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - } + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) @@ -5444,24 +5431,11 @@ nodelete: error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; - if (error) - goto error0; - } - /* - * transform from btree to extents, give it cur - */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { - ASSERT(cur != NULL); - error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + } else { + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; } - /* - * transform from extents to local? - */ - error = 0; + error0: /* * Log everything. Do this after conversion, there's no point in From c8b54673b30a9668d626a9e48d1659c21300f2a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:48 -0800 Subject: [PATCH 49/70] xfs: factor out two helpers from xfs_bmapi_write We want to be able to reuse them for the upcoming dedidcated delalloc convert routine. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 78 ++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 34 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3c0dee417d31..0ba1ade4b822 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4194,6 +4194,44 @@ xfs_bmapi_convert_unwritten( return 0; } +static inline xfs_extlen_t +xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, + int fork) +{ + if (tp && tp->t_firstblock != NULLFSBLOCK) + return 0; + if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + return 1; + return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; +} + +/* + * Log whatever the flags say, even if error. Otherwise we might miss detecting + * a case where the data is changed, there's an error, and it's not logged so we + * don't shutdown when we should. Don't bother logging extents/btree changes if + * we converted to the other format. + */ +static void +xfs_bmapi_finish( + struct xfs_bmalloca *bma, + int whichfork, + int error) +{ + if ((bma->logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma->logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma->logflags &= ~xfs_ilog_fbroot(whichfork); + + if (bma->logflags) + xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags); + if (bma->cur) + xfs_btree_del_cursor(bma->cur, error); +} + /* * Map file blocks to filesystem blocks, and allocate blocks or convert the * extent state if necessary. Details behaviour is controlled by the flags @@ -4273,15 +4311,6 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!tp || tp->t_firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - bma.minleft = 1; - } else { - bma.minleft = 0; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) @@ -4296,6 +4325,7 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.datatype = 0; + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* * The delalloc flag means the caller wants to allocate the entire @@ -4434,32 +4464,12 @@ xfs_bmapi_write( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork)); - error = 0; + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return 0; error0: - /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. - */ - if ((bma.logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - bma.logflags &= ~xfs_ilog_fext(whichfork); - else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - bma.logflags &= ~xfs_ilog_fbroot(whichfork); - /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. - */ - if (bma.logflags) - xfs_trans_log_inode(tp, ip, bma.logflags); - - if (bma.cur) { - xfs_btree_del_cursor(bma.cur, error); - } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + xfs_bmapi_finish(&bma, whichfork, error); return error; } From d8ae82e394bd5d836a32864b1ca22757ef8bb8ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:48 -0800 Subject: [PATCH 50/70] xfs: split XFS_BMAPI_DELALLOC handling from xfs_bmapi_write Delalloc conversion has traditionally been part of our function to allocate blocks on disk (first xfs_bmapi, then xfs_bmapi_write), but delalloc conversion is a little special as we really do not want to allocate blocks over holes, for which we don't have reservations. Split the delalloc conversions into a separate helper to keep the code simple and structured. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 106 +++++++++++++++++++++------------------ fs/xfs/libxfs/xfs_bmap.h | 4 -- 2 files changed, 58 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0ba1ade4b822..edab329df83b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4327,22 +4327,6 @@ xfs_bmapi_write( bma.datatype = 0; bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); - /* - * The delalloc flag means the caller wants to allocate the entire - * delalloc extent backing bno where bno may not necessarily match the - * startoff. Now that we've looked up the extent, reset the range to - * map based on the extent in the file. If we're in a hole, this may be - * an error so don't adjust anything. - */ - if ((flags & XFS_BMAPI_DELALLOC) && - !eof && bno >= bma.got.br_startoff) { - bno = bma.got.br_startoff; - len = bma.got.br_blockcount; -#ifdef DEBUG - orig_bno = bno; - orig_len = len; -#endif - } n = 0; end = bno + len; obno = bno; @@ -4359,26 +4343,7 @@ xfs_bmapi_write( ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); - if (flags & XFS_BMAPI_DELALLOC) { - /* - * For the COW fork we can reasonably get a - * request for converting an extent that races - * with other threads already having converted - * part of it, as there converting COW to - * regular blocks is not protected using the - * IOLOCK. - */ - ASSERT(flags & XFS_BMAPI_COWFORK); - if (!(flags & XFS_BMAPI_COWFORK)) { - error = -EIO; - goto error0; - } - - if (eof || bno >= end) - break; - } else { - need_alloc = true; - } + need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } @@ -4487,23 +4452,68 @@ xfs_bmapi_convert_delalloc( int whichfork, struct xfs_bmbt_irec *imap) { - int flags = XFS_BMAPI_DELALLOC; - int nimaps = 1; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmalloca bma = { NULL }; int error; - int total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, - XFS_DATA_FORK); - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || + bma.got.br_startoff > offset_fsb) { + /* + * No extent found in the range we are trying to convert. This + * should only happen for the COW fork, where another thread + * might have moved the extent to the data fork in the meantime. + */ + WARN_ON_ONCE(whichfork != XFS_COW_FORK); + return -EAGAIN; + } /* - * The delalloc flag means to allocate the entire extent; pass a dummy - * length of 1. + * If we find a real extent here we raced with another thread converting + * the extent. Just return the real extent at this offset. */ - error = xfs_bmapi_write(tp, ip, offset_fsb, 1, flags, total, imap, - &nimaps); - if (!error && !nimaps) - error = -EFSCORRUPTED; + if (!isnullstartblock(bma.got.br_startblock)) { + *imap = bma.got; + return 0; + } + + bma.tp = tp; + bma.ip = ip; + bma.wasdel = true; + bma.offset = bma.got.br_startoff; + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + if (whichfork == XFS_COW_FORK) + bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; + + error = xfs_bmapi_allocate(&bma); + if (error) + goto out_finish; + + error = -ENOSPC; + if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) + goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + goto out_finish; + + ASSERT(!isnullstartblock(bma.got.br_startblock)); + *imap = bma.got; + + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); + if (error) + goto out_finish; + } + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); +out_finish: + xfs_bmapi_finish(&bma, whichfork, error); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4dc7d1a02b35..b5eca7a26949 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -95,9 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert delalloc space, don't allocate entirely new extents */ -#define XFS_BMAPI_DELALLOC 0x400 - /* Only convert unwritten extents, don't allocate new blocks */ #define XFS_BMAPI_CONVERT_ONLY 0x800 @@ -117,7 +114,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } From 491ce61e939f76399e344b0414dc5a4c08c1f0cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:49 -0800 Subject: [PATCH 51/70] xfs: move transaction handling to xfs_bmapi_convert_delalloc No need to deal with the transaction and the inode locking in the caller. Note that we also switch to passing whichfork as the second paramter, matching what most related functions do. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 38 +++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_bmap.h | 5 +++-- fs/xfs/xfs_iomap.c | 32 ++++---------------------------- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index edab329df83b..e245b5dfee9d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4446,16 +4446,30 @@ error0: */ int xfs_bmapi_convert_delalloc( - struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, int whichfork, - struct xfs_bmbt_irec *imap) + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap, + unsigned int *seq) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; struct xfs_bmalloca bma = { NULL }; + struct xfs_trans *tp; int error; + /* + * Space for the extent and indirect blocks was reserved when the + * delalloc extent was created so there's no need to do so here. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* @@ -4464,7 +4478,8 @@ xfs_bmapi_convert_delalloc( * might have moved the extent to the data fork in the meantime. */ WARN_ON_ONCE(whichfork != XFS_COW_FORK); - return -EAGAIN; + error = -EAGAIN; + goto out_trans_cancel; } /* @@ -4473,7 +4488,8 @@ xfs_bmapi_convert_delalloc( */ if (!isnullstartblock(bma.got.br_startblock)) { *imap = bma.got; - return 0; + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; } bma.tp = tp; @@ -4502,6 +4518,7 @@ xfs_bmapi_convert_delalloc( ASSERT(!isnullstartblock(bma.got.br_startblock)); *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) { error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, @@ -4512,8 +4529,19 @@ xfs_bmapi_convert_delalloc( error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); + if (error) + goto out_finish; + + xfs_bmapi_finish(&bma, whichfork, 0); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + out_finish: xfs_bmapi_finish(&bma, whichfork, error); +out_trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b5eca7a26949..78b190b6e908 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -223,8 +223,9 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); -int xfs_bmapi_convert_delalloc(struct xfs_trans *, struct xfs_inode *, - xfs_fileoff_t, int, struct xfs_bmbt_irec *); +int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, + unsigned int *seq); static inline void xfs_bmap_add_free( diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fd3aacd4bf02..39be741cac5a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -684,11 +684,9 @@ xfs_iomap_write_allocate( unsigned int *seq) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb; xfs_fileoff_t map_start_fsb; xfs_extlen_t map_count_fsb; - struct xfs_trans *tp; int error = 0; /* @@ -716,17 +714,8 @@ xfs_iomap_write_allocate( /* * Allocate in a loop because it may take several attempts to * allocate real blocks for a contiguous delalloc extent if free - * space is sufficiently fragmented. Note that space for the - * extent and indirect blocks was reserved when the delalloc - * extent was created so there's no need to do so here. + * space is sufficiently fragmented. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); /* * ilock was dropped since imap was populated which means it @@ -737,17 +726,10 @@ xfs_iomap_write_allocate( * caller. We'll trim it down to the caller's most recently * validated range before we return. */ - error = xfs_bmapi_convert_delalloc(tp, ip, offset_fsb, - whichfork, imap); + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset_fsb, + imap, seq); if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - *seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; /* * See if we were able to allocate an extent that covers at @@ -766,12 +748,6 @@ xfs_iomap_write_allocate( return 0; } } - -trans_cancel: - xfs_trans_cancel(tp); -error0: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; } int From 125851ac92d62b966df851c6f34147121020af2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:49 -0800 Subject: [PATCH 52/70] xfs: move stat accounting to xfs_bmapi_convert_delalloc This way we can actually count how many bytes got converted and how many calls we need, unlike in the caller which doesn't have the detailed view. Note that this includes a slight change in behavior as the xs_xstrat_quick is now bumped for every allocation instead of just the one covering the requested writeback offset, which makes a lot more sense. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_iomap.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e245b5dfee9d..e4f29ebdf4d2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4516,6 +4516,9 @@ xfs_bmapi_convert_delalloc( if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) goto out_finish; + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); + XFS_STATS_INC(mp, xs_xstrat_quick); + ASSERT(!isnullstartblock(bma.got.br_startblock)); *imap = bma.got; *seq = READ_ONCE(ifp->if_seq); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 39be741cac5a..15da53b5fb53 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -707,9 +707,6 @@ xfs_iomap_write_allocate( map_start_fsb = imap->br_startoff; map_count_fsb = imap->br_blockcount; - XFS_STATS_ADD(mp, xs_xstrat_bytes, - XFS_FSB_TO_B(mp, imap->br_blockcount)); - while (true) { /* * Allocate in a loop because it may take several attempts to @@ -741,7 +738,6 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { - XFS_STATS_INC(mp, xs_xstrat_quick); xfs_trim_extent(imap, map_start_fsb, map_count_fsb); ASSERT(offset_fsb >= imap->br_startoff && offset_fsb < imap->br_startoff + imap->br_blockcount); From 4ad765edb02a5333ce2fade642f116a67a3370ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:49 -0800 Subject: [PATCH 53/70] xfs: move xfs_iomap_write_allocate to xfs_aops.c This function is a small wrapper only used by the writeback code, so move it together with the writeback code and simplify it down to the glorified do { } while loop that is now is. A few bits intentionally got lost here: no need to call xfs_qm_dqattach because quotas are always attached when we create the delalloc reservation, and no need for the imap->br_startblock == 0 check given that xfs_bmapi_convert_delalloc already has a WARN_ON_ONCE for exactly that condition. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 51 +++++++++++++++++++++++++---- fs/xfs/xfs_iomap.c | 81 ---------------------------------------------- fs/xfs/xfs_iomap.h | 2 -- 3 files changed, 45 insertions(+), 89 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8bfb62d8776f..42017ecf78ed 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -329,6 +329,38 @@ xfs_imap_valid( return true; } +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in wpc->imap. + * + * The current page is held locked so nothing could have removed the block + * backing offset_fsb. + */ +static int +xfs_convert_blocks( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result into wpc->imap. Allocate in a loop + * because it may take several attempts to allocate real blocks for a + * contiguous delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, + &wpc->imap, wpc->fork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); + if (error) + return error; + } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + + return 0; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -458,14 +490,21 @@ xfs_map_blocks( trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, wpc->fork, offset, &imap, - wpc->fork == XFS_COW_FORK ? - &wpc->cow_seq : &wpc->data_seq); + error = xfs_convert_blocks(wpc, ip, offset_fsb); if (error) return error; - ASSERT(wpc->fork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || - imap.br_startoff + imap.br_blockcount <= cow_fsb); - wpc->imap = imap; + + /* + * Due to merging the return real extent might be larger than the + * original delalloc one. Trim the return extent to the next COW + * boundary again to force a re-lookup. + */ + if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && + cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) + wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + + ASSERT(wpc->imap.br_startoff <= offset_fsb); + ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 15da53b5fb53..361dfe7af783 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -665,87 +665,6 @@ out_unlock: return error; } -/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating callers request. - * - * Called without a lock on the inode. - * - * We no longer bother to look at the incoming map - all we have to - * guarantee is that whatever we allocate fills the required range. - */ -int -xfs_iomap_write_allocate( - struct xfs_inode *ip, - int whichfork, - xfs_off_t offset, - struct xfs_bmbt_irec *imap, - unsigned int *seq) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t map_start_fsb; - xfs_extlen_t map_count_fsb; - int error = 0; - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; - - /* - * Store the file range the caller is interested in because it encodes - * state such as potential overlap with COW fork blocks. We must trim - * the allocated extent down to this range to maintain consistency with - * what the caller expects. Revalidation of the range itself is the - * responsibility of the caller. - */ - offset_fsb = XFS_B_TO_FSBT(mp, offset); - map_start_fsb = imap->br_startoff; - map_count_fsb = imap->br_blockcount; - - while (true) { - /* - * Allocate in a loop because it may take several attempts to - * allocate real blocks for a contiguous delalloc extent if free - * space is sufficiently fragmented. - */ - - /* - * ilock was dropped since imap was populated which means it - * might no longer be valid. The current page is held locked so - * nothing could have removed the block backing offset_fsb. - * Attempt to allocate whatever delalloc extent currently backs - * offset_fsb and put the result in the imap pointer from the - * caller. We'll trim it down to the caller's most recently - * validated range before we return. - */ - error = xfs_bmapi_convert_delalloc(ip, whichfork, offset_fsb, - imap, seq); - if (error) - return error; - - /* - * See if we were able to allocate an extent that covers at - * least part of the callers request. - */ - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, imap); - - if ((offset_fsb >= imap->br_startoff) && - (offset_fsb < (imap->br_startoff + - imap->br_blockcount))) { - xfs_trim_extent(imap, map_start_fsb, map_count_fsb); - ASSERT(offset_fsb >= imap->br_startoff && - offset_fsb < imap->br_startoff + imap->br_blockcount); - return 0; - } - } -} - int xfs_iomap_write_unwritten( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c6170548831b..6b16243db0b7 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,8 +13,6 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, From 19c8e4e25866fac5ba7138b902cc45b6d3c8e827 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:50 -0800 Subject: [PATCH 54/70] xfs: remove the truncate short cut in xfs_map_blocks Now that we properly handle the race with truncate in the delalloc allocator there is no need to short cut this exceptional case earlier on. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 42017ecf78ed..a6abb7125203 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -420,26 +420,6 @@ xfs_map_blocks( xfs_iunlock(ip, XFS_ILOCK_SHARED); wpc->fork = XFS_COW_FORK; - - /* - * Truncate can race with writeback since writeback doesn't - * take the iolock and truncate decreases the file size before - * it starts truncating the pages between new_size and old_size. - * Therefore, we can end up in the situation where writeback - * gets a CoW fork mapping but the truncate makes the mapping - * invalid and we end up in here trying to get a new mapping. - * bail out here so that we simply never get a valid mapping - * and so we drop the write altogether. The page truncation - * will kill the contents anyway. - */ - if (offset > i_size_read(inode)) { - wpc->imap.br_blockcount = end_fsb - offset_fsb; - wpc->imap.br_startoff = offset_fsb; - wpc->imap.br_startblock = HOLESTARTBLOCK; - wpc->imap.br_state = XFS_EXT_NORM; - return 0; - } - goto allocate_blocks; } From 7588cbeec6df925ef6142a7e48762896c06007a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2019 08:02:50 -0800 Subject: [PATCH 55/70] xfs: retry COW fork delalloc conversion when no extent was found While we can only truncate a block under the page lock for the current page, there is no high-level synchronization for moving extents from the COW to the data fork. This means that for example we can have another thread doing a direct I/O completion that moves extents from the COW to the data fork race with writeback. While this race is very hard to hit the always_cow seems to reproduce it reasonably well, and it also exists without that. Because of that there is a chance that a delalloc conversion for the COW fork might not find any extents to convert. In that case we should retry the whole block lookup and now find the blocks in the data fork. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a6abb7125203..2ed8733eca49 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -334,7 +334,8 @@ xfs_imap_valid( * extent that maps offset_fsb in wpc->imap. * * The current page is held locked so nothing could have removed the block - * backing offset_fsb. + * backing offset_fsb, although it could have moved from the COW to the data + * fork by another thread. */ static int xfs_convert_blocks( @@ -375,6 +376,7 @@ xfs_map_blocks( xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; + int retries = 0; int error = 0; if (XFS_FORCED_SHUTDOWN(mp)) @@ -404,6 +406,7 @@ xfs_map_blocks( * into real extents. If we return without a valid map, it means we * landed in a hole and we skip the block. */ +retry: xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); @@ -471,8 +474,19 @@ xfs_map_blocks( return 0; allocate_blocks: error = xfs_convert_blocks(wpc, ip, offset_fsb); - if (error) + if (error) { + /* + * If we failed to find the extent in the COW fork we might have + * raced with a COW to data fork conversion or truncate. + * Restart the lookup to catch the extent in the data fork for + * the former case, but prevent additional retries to avoid + * looping forever for the latter case. + */ + if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + goto retry; + ASSERT(error != -EAGAIN); return error; + } /* * Due to merging the return real extent might be larger than the From 15baadf72cedc2a09ea792c1fc59451502b55da2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 16 Feb 2019 11:47:28 -0800 Subject: [PATCH 56/70] xfs: fix xfs_buf magic number endian checks Create a separate magic16 check function so that we don't run afoul of static checkers. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_attr_leaf.c | 4 ++-- fs/xfs/libxfs/xfs_da_btree.c | 8 ++++---- fs/xfs/libxfs/xfs_dir2_leaf.c | 8 ++++---- fs/xfs/libxfs/xfs_dquot_buf.c | 8 ++++---- fs/xfs/libxfs/xfs_inode_buf.c | 10 +++++----- fs/xfs/xfs_buf.c | 20 +++++++++++++++++++- fs/xfs/xfs_buf.h | 8 ++++++-- fs/xfs/xfs_log_recover.c | 2 +- 8 files changed, 45 insertions(+), 23 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 0c92987f57fc..1f6e3965ff74 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -358,8 +358,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", - .magic = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), - cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index eb2cee428f26..e2737e2ac2ae 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -129,7 +129,7 @@ xfs_da3_blkinfo_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *hdr = &hdr3->hdr; - if (!xfs_verify_magic(bp, hdr->magic)) + if (!xfs_verify_magic16(bp, hdr->magic)) return __this_address; if (xfs_sb_version_hascrc(&mp->m_sb)) { @@ -141,7 +141,7 @@ xfs_da3_blkinfo_verify( return __this_address; } - return 0; + return NULL; } static xfs_failaddr_t @@ -274,8 +274,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", - .magic = { cpu_to_be16(XFS_DA_NODE_MAGIC), - cpu_to_be16(XFS_DA3_NODE_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 094028b7b162..9a3767818c50 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -198,8 +198,8 @@ xfs_dir3_leaf_write_verify( const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .magic = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), - cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, .verify_read = xfs_dir3_leaf_read_verify, .verify_write = xfs_dir3_leaf_write_verify, .verify_struct = xfs_dir3_leaf_verify, @@ -207,8 +207,8 @@ const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .magic = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), - cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, .verify_read = xfs_dir3_leaf_read_verify, .verify_write = xfs_dir3_leaf_write_verify, .verify_struct = xfs_dir3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index b956638a3532..fb5bd9a804f6 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -277,8 +277,8 @@ xfs_dquot_buf_write_verify( const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", - .magic = { cpu_to_be16(XFS_DQUOT_MAGIC), - cpu_to_be16(XFS_DQUOT_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -286,8 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", - .magic = { cpu_to_be16(XFS_DQUOT_MAGIC), - cpu_to_be16(XFS_DQUOT_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f92f14e93ad3..e021d5133ccb 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -97,7 +97,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = xfs_verify_magic(bp, dip->di_magic) && + di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -146,16 +146,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", - .magic = { cpu_to_be16(XFS_DINODE_MAGIC), - cpu_to_be16(XFS_DINODE_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { .name = "xfs_inode_ra", - .magic = { cpu_to_be16(XFS_DINODE_MAGIC), - cpu_to_be16(XFS_DINODE_MAGIC) }, + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 52a382b8cbce..548344e25128 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2213,7 +2213,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) bool xfs_verify_magic( struct xfs_buf *bp, - uint32_t dmagic) + __be32 dmagic) { struct xfs_mount *mp = bp->b_target->bt_mount; int idx; @@ -2223,3 +2223,21 @@ xfs_verify_magic( return false; return dmagic == bp->b_ops->magic[idx]; } +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic16( + struct xfs_buf *bp, + __be16 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + return false; + return dmagic == bp->b_ops->magic16[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 44f9423a1861..d0b96e071cec 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -125,7 +125,10 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; - uint32_t magic[2]; /* v4 and v5 on disk magic values */ + union { + __be32 magic[2]; /* v4 and v5 on disk magic values */ + __be16 magic16[2]; /* v4 and v5 on disk magic values */ + }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -387,6 +390,7 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); -bool xfs_verify_magic(struct xfs_buf *bp, uint32_t dmagic); +bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); +bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index f5948d16015b..3371d1ff27c4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3049,7 +3049,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(!xfs_verify_magic(bp, dip->di_magic))) { + if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); From 16be1433737ee46f88da57d47f594c4fc1376538 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:46 -0800 Subject: [PATCH 57/70] xfs: make xfs_bmbt_to_iomap more useful Move checking for invalid zero blocks and setting of various iomap flags into this helper. Also make it deal with "raw" delalloc extents to avoid clutter in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 84 +++++++++++++++++++++------------------------- fs/xfs/xfs_iomap.h | 4 +-- fs/xfs/xfs_pnfs.c | 2 +- 3 files changed, 41 insertions(+), 49 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 361dfe7af783..284c5e68f695 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -35,18 +35,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -void +static int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool shared) { struct xfs_mount *mp = ip->i_mount; + if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { + } else if (imap->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { @@ -60,6 +82,13 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + if (shared) + iomap->flags |= IOMAP_F_SHARED; + return 0; } static void @@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_alert_fsblock_zero( - xfs_inode_t *ip, - xfs_bmbt_irec_t *imap) -{ - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)imap->br_startblock, - (unsigned long long)imap->br_startoff, - (unsigned long long)imap->br_blockcount, - imap->br_state); - return -EFSCORRUPTED; -} - int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -649,17 +661,7 @@ retry: iomap->flags |= IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got); done: - if (isnullstartblock(got.br_startblock)) - got.br_startblock = DELAYSTARTBLOCK; - - if (!got.br_startblock) { - error = xfs_alert_fsblock_zero(ip, &got); - if (error) - goto out_unlock; - } - - xfs_bmbt_to_iomap(ip, iomap, &got); - + error = xfs_bmbt_to_iomap(ip, iomap, &got, false); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -976,15 +978,7 @@ xfs_file_iomap_begin( trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields - & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; - - xfs_bmbt_to_iomap(ip, iomap, &imap); - - if (shared) - iomap->flags |= IOMAP_F_SHARED; - return 0; + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: ASSERT(nimaps); @@ -1107,12 +1101,10 @@ xfs_xattr_iomap_begin( out_unlock: xfs_iunlock(ip, lockmode); - if (!error) { - ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } - - return error; + if (error) + return error; + ASSERT(nimaps); + return xfs_bmbt_to_iomap(ip, iomap, &imap, false); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6b16243db0b7..37b584c3069b 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -15,8 +15,8 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); -void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); +int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *, bool shared); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f44c3599527d..bde2c9f56a46 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -185,7 +185,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); *device_generation = mp->m_generation; return error; out_unlock: From 60271ab79d40b99ce6cb28d8dc48aa5e9ffb6df3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:46 -0800 Subject: [PATCH 58/70] xfs: fix SEEK_DATA for speculative COW fork preallocation We speculatively allocate extents in the COW fork to reduce fragmentation. But when we write data into such COW fork blocks that do now shadow an allocation in the data fork SEEK_DATA will not correctly report it, as it only looks at the data fork extents. The only reason why that hasn't been an issue so far is because we even use these speculative COW fork preallocations over holes in the data fork at all for buffered writes, and blocks in the COW fork that are written by direct writes are moved into the data fork immediately at I/O completion time. Add a new set of iomap_ops for SEEK_HOLE/SEEK_DATA which looks into both the COW and data fork, and reports all COW extents as unwritten to the iomap layer. While this isn't strictly true for COW fork extents that were already converted to real extents, the practical semantics that you can't read data from them until they are moved into the data fork are very similar, and this will force the iomap layer into probing the extents for actually present data. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 4 +-- fs/xfs/xfs_iomap.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iomap.h | 1 + 3 files changed, 89 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e47425071e65..1d07dcfbbff3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1068,10 +1068,10 @@ xfs_file_llseek( default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); break; case SEEK_DATA: - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); break; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 284c5e68f695..df6eda336f17 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1068,6 +1068,92 @@ const struct iomap_ops xfs_iomap_ops = { .iomap_end = xfs_file_iomap_end, }; +static int +xfs_seek_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { + /* + * If we found a data extent we are done. + */ + if (imap.br_startoff <= offset_fsb) + goto done; + data_fsb = imap.br_startoff; + } else { + /* + * Fake a hole until the end of the file. + */ + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + } + + /* + * If a COW fork extent covers the hole, report it - capped to the next + * data fork extent: + */ + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cow_fsb = cmap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed + * by this extent. + */ + iomap->type = IOMAP_UNWRITTEN; + goto out_unlock; + } + + /* + * Else report a hole, capped to the next found data or COW extent. + */ + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) + imap.br_blockcount = cow_fsb - offset_fsb; + else + imap.br_blockcount = data_fsb - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; +done: + xfs_trim_extent(&imap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +const struct iomap_ops xfs_seek_iomap_ops = { + .iomap_begin = xfs_seek_iomap_begin, +}; + static int xfs_xattr_iomap_begin( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 37b584c3069b..5c2f6aa6d78f 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -40,6 +40,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ From 78f0cc9d55cbe75faccc0135371c45912a34e6ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:46 -0800 Subject: [PATCH 59/70] xfs: don't use delalloc extents for COW on files with extsize hints While using delalloc for extsize hints is generally a good idea, the current code that does so only for COW doesn't help us much and creates a lot of special cases. Switch it to use real allocations like we do for direct I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 29 ++++++++++++++++++----------- fs/xfs/xfs_reflink.c | 10 +++++++++- fs/xfs/xfs_reflink.h | 3 ++- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index df6eda336f17..be9d2a4b190a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -918,22 +918,29 @@ xfs_file_iomap_begin( * been done up front, so we don't need to do them here. */ if (xfs_is_reflink_inode(ip)) { + struct xfs_bmbt_irec orig = imap; + /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && !needs_cow_for_zeroing(&imap, nimaps)) goto out_found; - if (flags & IOMAP_DIRECT) { - /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, - &lockmode); - if (error) - goto out_unlock; - } else { - error = xfs_reflink_reserve_cow(ip, &imap); - if (error) - goto out_unlock; - } + /* may drop and re-acquire the ilock */ + error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode, + flags); + if (error) + goto out_unlock; + + /* + * For buffered writes we need to report the address of the + * previous block (if there was any) so that the higher level + * write code can perform read-modify-write operations. For + * direct I/O code, which must be block aligned we need to + * report the newly allocated address. + */ + if (!(flags & IOMAP_DIRECT) && + orig.br_startblock != HOLESTARTBLOCK) + imap = orig; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 2babc2cbe103..8a5353daf9ab 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -397,7 +397,8 @@ xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, - uint *lockmode) + uint *lockmode, + unsigned iomap_flags) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -471,6 +472,13 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!(iomap_flags & IOMAP_DIRECT)) + return 0; return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); out_unreserve: diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 6d73daef1f13..70d68a1a9b49 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -15,7 +15,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, + unsigned iomap_flags); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); From 12df89f28fa92e54bfb2ae01f9ee059e74e1acc0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:47 -0800 Subject: [PATCH 60/70] xfs: also truncate holes covered by COW blocks This only matters if we want to write data through the COW fork that is not actually an overwrite of existing data. Reasons for that are speculative COW fork allocations using the cowextsize, or a mode where we always write through the COW fork. Currently both can't actually happen, but I plan to enable them. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2ed8733eca49..983d11c27d32 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -447,28 +447,29 @@ retry: wpc->fork = XFS_DATA_FORK; + /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { - /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; - } else { - /* - * Truncate to the next COW extent if there is one. This is the - * only opportunity to do this because we can skip COW fork - * lookups for the subsequent blocks in the mapping; however, - * the requirement to treat the COW range separately remains. - */ - if (cow_fsb != NULLFILEOFF && - cow_fsb < imap.br_startoff + imap.br_blockcount) - imap.br_blockcount = cow_fsb - imap.br_startoff; - - /* got a delalloc extent? */ - if (isnullstartblock(imap.br_startblock)) - goto allocate_blocks; } + /* + * Truncate to the next COW extent if there is one. This is the only + * opportunity to do this because we can skip COW fork lookups for the + * subsequent blocks in the mapping; however, the requirement to treat + * the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + /* got a delalloc extent? */ + if (imap.br_startblock != HOLESTARTBLOCK && + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + wpc->imap = imap; trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; From db46e604adf8c923214a63b46e87ca2411d3d580 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:47 -0800 Subject: [PATCH 61/70] xfs: merge COW handling into xfs_file_iomap_begin_delay Besides simplifying the code a bit this allows to actually implement the behavior of using COW preallocation for non-COW data mentioned in the current comments. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 133 ++++++++++++++++++++++++++++++------------- fs/xfs/xfs_reflink.c | 67 ---------------------- fs/xfs/xfs_reflink.h | 2 - fs/xfs/xfs_trace.h | 3 - 4 files changed, 94 insertions(+), 111 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index be9d2a4b190a..08c4d1d8f90e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -534,15 +534,16 @@ xfs_file_iomap_begin_delay( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); xfs_fileoff_t end_fsb; - int error = 0, eof = 0; - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared; + int whichfork = XFS_DATA_FORK; + int error = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -560,7 +561,7 @@ xfs_file_iomap_begin_delay( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; @@ -568,51 +569,92 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) - got.br_startoff = end_fsb; /* fake hole until the end */ + imap.br_startoff = end_fsb; /* fake hole until the end */ - if (got.br_startoff <= offset_fsb) { + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } + + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. + */ + if (xfs_is_reflink_inode(ip)) { + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + whichfork = XFS_COW_FORK; + goto done; + } + } + + if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - got.br_state != XFS_EXT_UNWRITTEN)) { - xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got); - if (error) - goto out_unlock; + if (!xfs_is_reflink_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; } - trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &got); - goto done; - } + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); - goto out_unlock; + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + whichfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages - * to keep the chunks of work done where somewhat symmetric with the - * work writeback does. This is a completely arbitrary number pulled - * out of thin air as a best guess for initial testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - - if (eof) { + if (eof && whichfork == XFS_DATA_FORK) { prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, &icur); if (prealloc_blocks) { @@ -635,9 +677,11 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &icur, - eof); + error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + whichfork == XFS_DATA_FORK ? &imap : &cmap, + whichfork == XFS_DATA_FORK ? &icur : &ccur, + whichfork == XFS_DATA_FORK ? eof : cow_eof); switch (error) { case 0: break; @@ -659,9 +703,20 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &got); + trace_xfs_iomap_alloc(ip, offset, count, whichfork, + whichfork == XFS_DATA_FORK ? &imap : &cmap); done: - error = xfs_bmbt_to_iomap(ip, iomap, &got, false); + if (whichfork == XFS_COW_FORK) { + if (imap.br_startoff > offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false); + goto out_unlock; + } + /* ensure we only report blocks we have a reservation for */ + xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + } + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 8a5353daf9ab..9ef1f79cb3ae 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -234,73 +234,6 @@ xfs_reflink_trim_around_shared( } } -/* - * Trim the passed in imap to the next shared/unshared extent boundary, and - * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. - * - * Note that imap will always contain the block numbers for the existing blocks - * in the data fork, as the upper layers need them for read-modify-write - * operations. - */ -int -xfs_reflink_reserve_cow( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got; - int error = 0; - bool eof = false; - struct xfs_iext_cursor icur; - bool shared; - - /* - * Search the COW fork extent list first. This serves two purposes: - * first this implement the speculative preallocation using cowextisze, - * so that we also unshared block adjacent to shared blocks instead - * of just the shared blocks themselves. Second the lookup in the - * extent list is generally faster than going out to the shared extent - * tree. - */ - - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) - eof = true; - if (!eof && got.br_startoff <= imap->br_startoff) { - trace_xfs_reflink_cow_found(ip, imap); - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - return 0; - } - - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, &shared); - if (error) - return error; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) - return 0; - - /* - * Fork all the shared blocks from our write offset until the end of - * the extent. - */ - error = xfs_qm_dqattach_locked(ip, false); - if (error) - return error; - - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &icur, eof); - if (error == -ENOSPC || error == -EDQUOT) - trace_xfs_reflink_cow_enospc(ip, imap); - if (error) - return error; - - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - trace_xfs_reflink_cow_alloc(ip, &got); - return 0; -} - /* Convert part of an unwritten CoW extent to a real one. */ STATIC int xfs_reflink_convert_cow_extent( diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 70d68a1a9b49..4a9e3cd4768a 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -12,8 +12,6 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, unsigned iomap_flags); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f1e18ae8a209..47fb07d86efd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3196,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); From 26b91c728b2d15952432371dc2b6ba1dda1fb61f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:48 -0800 Subject: [PATCH 62/70] xfs: make COW fork unwritten extent conversions more robust If we have racing buffered and direct I/O COW fork extents under writeback can have been moved to the data fork by the time we call xfs_reflink_convert_cow from xfs_submit_ioend. This would be mostly harmless as the block numbers don't change by this move, except for the fact that xfs_bmapi_write will crash or trigger asserts when not finding existing extents, even despite trying to paper over this with the XFS_BMAPI_CONVERT_ONLY flag. Instead of special casing non-transaction conversions in the already way too complicated xfs_bmapi_write just add a new helper for the much simpler non-transactional COW fork case, which simplify ignores not found extents. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 9 ++---- fs/xfs/libxfs/xfs_bmap.h | 8 +++--- fs/xfs/xfs_reflink.c | 61 +++++++++++++++++++++++++--------------- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e4f29ebdf4d2..48502cb9990f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2031,7 +2031,7 @@ done: /* * Convert an unwritten allocation to a real allocation or vice versa. */ -STATIC int /* error */ +int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ @@ -4276,9 +4276,7 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(tp != NULL || - (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == - (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4352,8 +4350,7 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if ((need_alloc || wasdelay) && - !(flags & XFS_BMAPI_CONVERT_ONLY)) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 78b190b6e908..8f597f9abdbe 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -95,9 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert unwritten extents, don't allocate new blocks */ -#define XFS_BMAPI_CONVERT_ONLY 0x800 - /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 @@ -114,7 +111,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } @@ -226,6 +222,10 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, unsigned int *seq); +int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); static inline void xfs_bmap_add_free( diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 9ef1f79cb3ae..f84b37fa4f17 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -234,26 +234,42 @@ xfs_reflink_trim_around_shared( } } -/* Convert part of an unwritten CoW extent to a real one. */ -STATIC int -xfs_reflink_convert_cow_extent( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - xfs_fileoff_t offset_fsb, - xfs_filblks_t count_fsb) +static int +xfs_reflink_convert_cow_locked( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb) { - int nimaps = 1; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_btree_cur *dummy_cur = NULL; + int dummy_logflags; + int error; - if (imap->br_state == XFS_EXT_NORM) + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; - xfs_trim_extent(imap, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, imap); - if (imap->br_blockcount == 0) - return 0; - return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, - &nimaps); + do { + if (got.br_startoff >= offset_fsb + count_fsb) + break; + if (got.br_state == XFS_EXT_NORM) + continue; + if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) + return -EIO; + + xfs_trim_extent(&got, offset_fsb, count_fsb); + if (!got.br_blockcount) + continue; + + got.br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent_unwritten_real(NULL, ip, + XFS_COW_FORK, &icur, &dummy_cur, &got, + &dummy_logflags); + if (error) + return error; + } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); + + return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ @@ -267,15 +283,12 @@ xfs_reflink_convert_cow( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; - struct xfs_bmbt_irec imap; - int nimaps = 1, error = 0; + int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | - XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -405,14 +418,16 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: + xfs_trim_extent(imap, offset_fsb, count_fsb); /* * COW fork extents are supposed to remain unwritten until we're ready * to initiate a disk write. For direct I/O we are going to write the * data and need the conversion, but for buffered writes we're done. */ - if (!(iomap_flags & IOMAP_DIRECT)) + if (!(iomap_flags & IOMAP_DIRECT) || imap->br_state == XFS_EXT_NORM) return 0; - return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); + trace_xfs_reflink_convert_cow(ip, imap); + return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, From c4feb0b194f37b33514ee30a9e86a68194a1361e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:48 -0800 Subject: [PATCH 63/70] xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay No user of it in the iomap code at the moment, but we should not actively report wrong information if we can trivially get it right. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iomap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 08c4d1d8f90e..0803ed76e296 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -541,7 +541,7 @@ xfs_file_iomap_begin_delay( struct xfs_bmbt_irec imap, cmap; struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; - bool eof = false, cow_eof = false, shared; + bool eof = false, cow_eof = false, shared = false; int whichfork = XFS_DATA_FORK; int error = 0; @@ -710,13 +710,14 @@ done: if (imap.br_startoff > offset_fsb) { xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); goto out_unlock; } /* ensure we only report blocks we have a reservation for */ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + shared = true; } - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; From 66ae56a53f0e34113da1a70068422b9444fe66f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:38:49 -0800 Subject: [PATCH 64/70] xfs: introduce an always_cow mode Add a mode where XFS never overwrites existing blocks in place. This is to aid debugging our COW code, and also put infatructure in place for things like possible future support for zoned block devices, which can't support overwrites. This mode is enabled globally by doing a: echo 1 > /sys/fs/xfs/debug/always_cow Note that the parameter is global to allow running all tests in xfstests easily in this mode, which would not easily be possible with a per-fs sysfs file. In always_cow mode persistent preallocations are disabled, and fallocate will fail when called with a 0 mode (with our without FALLOC_FL_KEEP_SIZE), and not create unwritten extent for zeroed space when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE. There are a few interesting xfstests failures when run in always_cow mode: - generic/392 fails because the bytes used in the file used to test hole punch recovery are less after the log replay. This is because the blocks written and then punched out are only freed with a delay due to the logging mechanism. - xfs/170 will fail as the already fragile file streams mechanism doesn't seem to interact well with the COW allocator - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim the file system is badly fragmented, but there is not much we can do to avoid that when always writing out of place - xfs/205 fails because overwriting a file in always_cow mode will require new space allocation and the assumption in the test thus don't work anymore. - xfs/326 fails to modify the file at all in always_cow mode after injecting the refcount error, leading to an unexpected md5sum after the remount, but that again is expected Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_bmap_util.c | 9 +++------ fs/xfs/xfs_file.c | 27 ++++++++++++++++++++------- fs/xfs/xfs_iomap.c | 28 ++++++++++++++++++---------- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_reflink.c | 28 ++++++++++++++++++++++++---- fs/xfs/xfs_reflink.h | 13 +++++++++++++ fs/xfs/xfs_super.c | 15 +++++++++++---- fs/xfs/xfs_sysctl.h | 1 + fs/xfs/xfs_sysfs.c | 24 ++++++++++++++++++++++++ 10 files changed, 116 insertions(+), 32 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 983d11c27d32..7b8bb6bde981 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1023,7 +1023,7 @@ xfs_vm_bmap( * Since we don't pass back blockdev info, we can't return bmap * information for rt files either. */ - if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) + if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; return iomap_bmap(mapping, block, &xfs_iomap_ops); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1ee8c5539fa4..2db43ff4f8b5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1162,16 +1162,13 @@ xfs_zero_file_space( * by virtue of the hole punch. */ error = xfs_free_file_space(ip, offset, len); - if (error) - goto out; + if (error || xfs_is_always_cow_inode(ip)) + return error; - error = xfs_alloc_file_space(ip, round_down(offset, blksize), + return xfs_alloc_file_space(ip, round_down(offset, blksize), round_up(offset + len, blksize) - round_down(offset, blksize), XFS_BMAPI_PREALLOC); -out: - return error; - } static int diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1d07dcfbbff3..770cc2edf777 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -507,7 +507,7 @@ xfs_file_dio_aio_write( * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } @@ -872,14 +872,27 @@ xfs_file_fallocate( goto out_unlock; } - if (mode & FALLOC_FL_ZERO_RANGE) + if (mode & FALLOC_FL_ZERO_RANGE) { error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + + if (!xfs_is_always_cow_inode(ip)) { + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); } + } else { + /* + * If always_cow mode we can't use preallocations and + * thus should not create them. + */ + if (xfs_is_always_cow_inode(ip)) { + error = -EOPNOTSUPP; + goto out_unlock; + } + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 0803ed76e296..01210eae8bf3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -395,12 +395,13 @@ xfs_quota_calc_throttle( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, + int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmbt_irec prev; int shift = 0; @@ -593,7 +594,11 @@ xfs_file_iomap_begin_delay( * themselves. Second the lookup in the extent list is generally faster * than going out to the shared extent tree. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &ccur, &cmap); if (!cow_eof && cmap.br_startoff <= offset_fsb) { @@ -609,7 +614,7 @@ xfs_file_iomap_begin_delay( * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (!xfs_is_reflink_inode(ip) || + if (!xfs_is_cow_inode(ip) || ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &imap); @@ -619,7 +624,7 @@ xfs_file_iomap_begin_delay( xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared); + error = xfs_inode_need_cow(ip, &imap, &shared); if (error) goto out_unlock; @@ -648,15 +653,18 @@ xfs_file_iomap_begin_delay( */ count = min_t(loff_t, count, 1024 * PAGE_SIZE); end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (xfs_is_always_cow_inode(ip)) + whichfork = XFS_COW_FORK; } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - if (eof && whichfork == XFS_DATA_FORK) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, - &icur); + if (eof) { + prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, + count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -867,7 +875,7 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && is_write) { + if (xfs_is_cow_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -901,7 +909,7 @@ relock: * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; @@ -973,7 +981,7 @@ xfs_file_iomap_begin( * Break shared extents if necessary. Checks for non-blocking IO have * been done up front, so we don't need to do them here. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { struct xfs_bmbt_irec orig = imap; /* if zeroing doesn't need COW allocation, then we are done. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 864ecf27aa75..110f927cf943 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -194,6 +194,7 @@ typedef struct xfs_mount { */ uint32_t m_generation; + bool m_always_cow; bool m_fail_unmount; #ifdef DEBUG /* diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f84b37fa4f17..e2d9179bd50d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -234,6 +234,23 @@ xfs_reflink_trim_around_shared( } } +bool +xfs_inode_need_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared) +{ + /* We can't update any real extents in always COW mode. */ + if (xfs_is_always_cow_inode(ip) && + !isnullstartblock(imap->br_startblock)) { + *shared = true; + return 0; + } + + /* Trim the mapping to the nearest shared extent boundary. */ + return xfs_reflink_trim_around_shared(ip, imap, shared); +} + static int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, @@ -321,7 +338,7 @@ xfs_find_trim_cow_extent( if (got.br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, got.br_startoff - imap->br_startoff); - return xfs_reflink_trim_around_shared(ip, imap, shared); + return xfs_inode_need_cow(ip, imap, shared); } *shared = true; @@ -356,7 +373,10 @@ xfs_reflink_allocate_cow( xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) @@ -542,7 +562,7 @@ xfs_reflink_cancel_cow_range( int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); - ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 4a9e3cd4768a..2a3052fbe23e 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,11 +6,24 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && + xfs_sb_version_hasreflink(&ip->i_mount->m_sb); +} + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + bool *shared); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 08033ac040d6..f093ea244849 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1736,11 +1736,18 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, "reflink not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 168488130a19..ad7f9be13087 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,7 @@ struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ + bool always_cow; /* use COW fork for all overwrites */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index cd6a994a7250..cabda13f3c64 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -183,10 +183,34 @@ mount_delay_show( } XFS_SYSFS_ATTR_RW(mount_delay); +static ssize_t +always_cow_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.always_cow); + if (ret < 0) + return ret; + return count; +} + +static ssize_t +always_cow_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); +} +XFS_SYSFS_ATTR_RW(always_cow); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), + ATTR_LIST(always_cow), NULL, }; From 081a8ae2a54e85ddd77db77d50b1a04b3c4731c9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 21 Feb 2019 07:53:20 -0800 Subject: [PATCH 65/70] xfs: fix uninitialized error variable A previous commit removed the initialization of variable 'error' to zero, and can cause a bogus error return. This occurs when error contains a non-zero garbage value and the call to xchk_should_terminate detects a pending fatal signal and checks for a zero error before setting it to -EAGAIN. Fix the issue by initializing error to zero. Fixes: b9454fe056bd ("xfs: clean up the inode cluster checking in the inobt scrub") Signed-off-by: Colin Ian King Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 2c9dad2b61b1..700114f79a7d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -161,7 +161,7 @@ xchk_iallocbt_check_cluster_ifree( bool irec_free; bool ino_inuse; bool freemask_ok; - int error; + int error = 0; if (xchk_should_terminate(bs->sc, &error)) return error; From affe250a085d8230fa44fee1a0b7929a986e7580 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Feb 2019 16:26:35 -0800 Subject: [PATCH 66/70] xfs: don't pass iomap flags to xfs_reflink_allocate_cow Don't pass raw iomap flags to xfs_reflink_allocate_cow; signal our intention with a boolean argument. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_iomap.c | 13 +++++++------ fs/xfs/xfs_reflink.c | 4 ++-- fs/xfs/xfs_reflink.h | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 01210eae8bf3..fa6621448d5e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -983,6 +983,7 @@ xfs_file_iomap_begin( */ if (xfs_is_cow_inode(ip)) { struct xfs_bmbt_irec orig = imap; + bool directio = (flags & IOMAP_DIRECT); /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && @@ -991,19 +992,19 @@ xfs_file_iomap_begin( /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode, - flags); + directio); if (error) goto out_unlock; /* * For buffered writes we need to report the address of the * previous block (if there was any) so that the higher level - * write code can perform read-modify-write operations. For - * direct I/O code, which must be block aligned we need to - * report the newly allocated address. + * write code can perform read-modify-write operations; we + * won't need the CoW fork mapping until writeback. For direct + * I/O, which must be block aligned, we need to report the + * newly allocated address. */ - if (!(flags & IOMAP_DIRECT) && - orig.br_startblock != HOLESTARTBLOCK) + if (!directio && orig.br_startblock != HOLESTARTBLOCK) imap = orig; end_fsb = imap.br_startoff + imap.br_blockcount; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e2d9179bd50d..d42e3ef9050e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -361,7 +361,7 @@ xfs_reflink_allocate_cow( struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, - unsigned iomap_flags) + bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -444,7 +444,7 @@ convert: * to initiate a disk write. For direct I/O we are going to write the * data and need the conversion, but for buffered writes we're done. */ - if (!(iomap_flags & IOMAP_DIRECT) || imap->br_state == XFS_EXT_NORM) + if (!convert_now || imap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, imap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 2a3052fbe23e..28a43b7f581d 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -27,7 +27,7 @@ bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, - unsigned iomap_flags); + bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); From 4f29e10d689f5678b947df978bd0c96e4d70686f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Feb 2019 16:36:17 -0800 Subject: [PATCH 67/70] xfs: rework breaking of shared extents in xfs_file_iomap_begin Rework the data flow in xfs_file_iomap_begin where we decide if we have to break shared extents. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_iomap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fa6621448d5e..63d323916bba 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -982,7 +982,7 @@ xfs_file_iomap_begin( * been done up front, so we don't need to do them here. */ if (xfs_is_cow_inode(ip)) { - struct xfs_bmbt_irec orig = imap; + struct xfs_bmbt_irec cmap; bool directio = (flags & IOMAP_DIRECT); /* if zeroing doesn't need COW allocation, then we are done. */ @@ -991,7 +991,8 @@ xfs_file_iomap_begin( goto out_found; /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode, + cmap = imap; + error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, directio); if (error) goto out_unlock; @@ -1002,10 +1003,11 @@ xfs_file_iomap_begin( * write code can perform read-modify-write operations; we * won't need the CoW fork mapping until writeback. For direct * I/O, which must be block aligned, we need to report the - * newly allocated address. + * newly allocated address. If the data fork has a hole, copy + * the COW fork mapping to avoid allocating to the data fork. */ - if (!directio && orig.br_startblock != HOLESTARTBLOCK) - imap = orig; + if (directio || imap.br_startblock == HOLESTARTBLOCK) + imap = cmap; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; From c1a4447f5e6ae8fb1f34a474f3083fb91cc4da90 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Feb 2019 09:35:34 -0800 Subject: [PATCH 68/70] xfs: fix uninitialized error variables smatch complained about some uninitialized error returns, so fix those. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson --- fs/xfs/libxfs/xfs_sb.c | 2 +- fs/xfs/xfs_reflink.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4e5029c37966..77a3a4085de3 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -877,7 +877,7 @@ xfs_initialize_perag_data( uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; - int error; + int error = 0; for (index = 0; index < agcount; index++) { /* diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d42e3ef9050e..680ae7662a78 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -261,7 +261,7 @@ xfs_reflink_convert_cow_locked( struct xfs_bmbt_irec got; struct xfs_btree_cur *dummy_cur = NULL; int dummy_logflags; - int error; + int error = 0; if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; From 3d129e1be3d941d9b1d7509d046307ec350fb535 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 25 Feb 2019 09:36:36 -0800 Subject: [PATCH 69/70] xfs: fix backwards endian conversion in scrub Fix a backwards endian conversion of a constant. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson --- fs/xfs/scrub/agheader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 9d4e8293d37e..ddf06bfaa29d 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -399,7 +399,7 @@ xchk_agf_xref_cntbt( if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) return; if (!have) { - if (agf->agf_freeblks != be32_to_cpu(0)) + if (agf->agf_freeblks != cpu_to_be32(0)) xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); return; } From 1b9598c8fb9965fff901c4caa21fed9644c34df3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 1 Mar 2019 08:14:57 -0800 Subject: [PATCH 70/70] xfs: fix reporting supported extra file attributes for statx() statx(2) notes that any attribute that is not indicated as supported by stx_attributes_mask has no usable value. Commit 5f955f26f3d42d ("xfs: report crtime and attribute flags to statx") added support for informing userspace of extra file attributes but forgot to list these flags as supported making reporting them rather useless for the pedantic userspace author. $ git describe --contains 5f955f26f3d42d04aba65590a32eb70eedb7f37d v4.11-rc6~5^2^2~2 Fixes: 5f955f26f3d42d ("xfs: report crtime and attribute flags to statx") Signed-off-by: Luis R. Rodriguez Reviewed-by: Darrick J. Wong [darrick: add a comment reminding people to keep attributes_mask up to date] Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iops.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1efef69a7f1c..74047bd0c1ae 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -531,6 +531,10 @@ xfs_vn_getattr( } } + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) @@ -538,6 +542,10 @@ xfs_vn_getattr( if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); + switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: