xfs: define the on-disk format for the metadir feature

Define the on-disk layout and feature flags for the metadata inode
directory feature.  Add a xfs_sb_version_hasmetadir for benefit of
xfs_repair, which needs to know where the new end of the superblock
lies.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
This commit is contained in:
Darrick J. Wong 2024-11-03 20:18:49 -08:00
parent ecc8065dfa
commit 4f3d4dd1b0
15 changed files with 152 additions and 25 deletions

View File

@ -174,6 +174,8 @@ typedef struct xfs_sb {
xfs_lsn_t sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
xfs_ino_t sb_metadirino; /* metadata directory tree root */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@ -259,6 +261,8 @@ struct xfs_dsb {
__be64 sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
__be64 sb_metadirino; /* metadata directory tree root */
/* must be padded to 64 bit alignment */
};
@ -374,6 +378,7 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@ -790,6 +795,27 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
}
enum xfs_metafile_type {
XFS_METAFILE_UNKNOWN, /* unknown */
XFS_METAFILE_DIR, /* metadir directory */
XFS_METAFILE_USRQUOTA, /* user quota */
XFS_METAFILE_GRPQUOTA, /* group quota */
XFS_METAFILE_PRJQUOTA, /* project quota */
XFS_METAFILE_RTBITMAP, /* rt bitmap */
XFS_METAFILE_RTSUMMARY, /* rt summary */
XFS_METAFILE_MAX
} __packed;
#define XFS_METAFILE_TYPE_STR \
{ XFS_METAFILE_UNKNOWN, "unknown" }, \
{ XFS_METAFILE_DIR, "dir" }, \
{ XFS_METAFILE_USRQUOTA, "usrquota" }, \
{ XFS_METAFILE_GRPQUOTA, "grpquota" }, \
{ XFS_METAFILE_PRJQUOTA, "prjquota" }, \
{ XFS_METAFILE_RTBITMAP, "rtbitmap" }, \
{ XFS_METAFILE_RTSUMMARY, "rtsummary" }
/*
* On-disk inode structure.
*
@ -812,7 +838,7 @@ struct xfs_dinode {
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
__u8 di_format; /* format of di_c data */
__be16 di_onlink; /* old number of links to file */
__be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */
__be32 di_uid; /* owner's user id */
__be32 di_gid; /* owner's group id */
__be32 di_nlink; /* number of links to file */
@ -1088,21 +1114,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* Values for di_flags2 These start by being exposed to userspace in the upper
* 16 bits of the XFS_XFLAG_s range.
*/
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
/* use DAX for this inode */
#define XFS_DIFLAG2_DAX_BIT 0
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
/* file's blocks may be shared */
#define XFS_DIFLAG2_REFLINK_BIT 1
/* copy on write extent size hint */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2
/* big timestamps */
#define XFS_DIFLAG2_BIGTIME_BIT 3
/* large extent counters */
#define XFS_DIFLAG2_NREXT64_BIT 4
/*
* The inode contains filesystem metadata and can be found through the metadata
* directory tree. Metadata inodes must satisfy the following constraints:
*
* - V5 filesystem (and ftype) are enabled;
* - The only valid modes are regular files and directories;
* - The access bits must be zero;
* - DMAPI event and state masks are zero;
* - The user and group IDs must be zero;
* - The project ID can be used as a u32 annotation;
* - The immutable, sync, noatime, nodump, nodefrag flags must be set.
* - The dax flag must not be set.
* - Directories must have nosymlinks set.
*
* These requirements are chosen defensively to minimize the ability of
* userspace to read or modify the contents, should a metadata file ever
* escape to userspace.
*
* There are further constraints on the directory tree itself:
*
* - Metadata inodes must never be resolvable through the root directory;
* - They must never be accessed by userspace;
* - Metadata directory entries must have correct ftype.
*
* Superblock-rooted metadata files must have the METADATA iflag set even
* though they do not have a parent directory.
*/
#define XFS_DIFLAG2_METADATA_BIT 5
#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@ -1117,6 +1182,12 @@ static inline bool xfs_dinode_has_large_extent_counts(
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
}
static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip)
{
return dip->di_version >= 3 &&
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA));
}
/*
* Inode number format:
* low inopblog bits - offset in block

View File

@ -209,12 +209,15 @@ xfs_inode_from_disk(
* They will also be unconditionally written back to disk as v2 inodes.
*/
if (unlikely(from->di_version == 1)) {
set_nlink(inode, be16_to_cpu(from->di_onlink));
/* di_metatype used to be di_onlink */
set_nlink(inode, be16_to_cpu(from->di_metatype));
ip->i_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
be16_to_cpu(from->di_projid_lo);
if (xfs_dinode_is_metadir(from))
ip->i_metatype = be16_to_cpu(from->di_metatype);
}
i_uid_write(inode, be32_to_cpu(from->di_uid));
@ -315,7 +318,10 @@ xfs_inode_to_disk(
struct inode *inode = VFS_I(ip);
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
to->di_onlink = 0;
if (xfs_is_metadir_inode(ip))
to->di_metatype = cpu_to_be16(ip->i_metatype);
else
to->di_metatype = 0;
to->di_format = xfs_ifork_format(&ip->i_df);
to->di_uid = cpu_to_be32(i_uid_read(inode));
@ -523,8 +529,11 @@ xfs_dinode_verify(
* di_nlink==0 on a V1 inode. V2/3 inodes would get written out with
* di_onlink==0, so we can check that.
*/
if (dip->di_version >= 2) {
if (dip->di_onlink)
if (dip->di_version == 2) {
if (dip->di_metatype)
return __this_address;
} else if (dip->di_version >= 3) {
if (!xfs_dinode_is_metadir(dip) && dip->di_metatype)
return __this_address;
}
@ -546,7 +555,8 @@ xfs_dinode_verify(
if (dip->di_nlink)
return __this_address;
} else {
if (dip->di_onlink)
/* di_metatype used to be di_onlink */
if (dip->di_metatype)
return __this_address;
}
}

View File

@ -224,6 +224,8 @@ xfs_inode_inherit_flags2(
}
if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
ip->i_diflags2 |= XFS_DIFLAG2_DAX;
if (xfs_is_metadir_inode(pip))
ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
/* Don't let invalid cowextsize hints propagate. */
failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,

View File

@ -404,7 +404,7 @@ struct xfs_log_dinode {
uint16_t di_mode; /* mode and type of file */
int8_t di_version; /* inode version */
int8_t di_format; /* format of di_c data */
uint8_t di_pad3[2]; /* unused in v2/3 inodes */
uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */
uint32_t di_uid; /* owner's user id */
uint32_t di_gid; /* owner's group id */
uint32_t di_nlink; /* number of links to file */

View File

@ -37,7 +37,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 272);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);

View File

@ -180,6 +180,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_EXCHANGE_RANGE;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
return features;
}
@ -689,6 +691,11 @@ __xfs_sb_from_disk(
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
to->sb_metadirino = be64_to_cpu(from->sb_metadirino);
else
to->sb_metadirino = NULLFSINO;
}
void
@ -836,6 +843,9 @@ xfs_sb_to_disk(
to->sb_lsn = cpu_to_be64(from->sb_lsn);
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
to->sb_metadirino = cpu_to_be64(from->sb_metadirino);
}
/*

View File

@ -421,7 +421,7 @@ xchk_dinode(
break;
case 2:
case 3:
if (dip->di_onlink != 0)
if (!xfs_dinode_is_metadir(dip) && dip->di_metatype)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_mode == 0 && sc->ip)

View File

@ -521,10 +521,13 @@ STATIC void
xrep_dinode_nlinks(
struct xfs_dinode *dip)
{
if (dip->di_version > 1)
dip->di_onlink = 0;
else
if (dip->di_version < 2) {
dip->di_nlink = 0;
return;
}
if (!xfs_dinode_is_metadir(dip))
dip->di_metatype = 0;
}
/* Fix any conflicting flags that the verifiers complain about. */

View File

@ -65,6 +65,7 @@ typedef struct xfs_inode {
uint16_t i_flushiter; /* incremented on flush */
};
uint8_t i_forkoff; /* attr fork offset >> 3 */
enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */
uint16_t i_diflags; /* XFS_DIFLAG_... */
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
@ -276,10 +277,23 @@ static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
}
static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip)
{
return ip->i_diflags2 & XFS_DIFLAG2_METADATA;
}
static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
/* Any file in the metadata directory tree is a metadata inode. */
if (xfs_has_metadir(mp))
return xfs_is_metadir_inode(ip);
/*
* Before metadata directories, the only metadata inodes were the
* three quota files, the realtime bitmap, and the realtime summary.
*/
return ip->i_ino == mp->m_sb.sb_rbmino ||
ip->i_ino == mp->m_sb.sb_rsumino ||
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);

View File

@ -556,7 +556,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
@ -590,10 +589,16 @@ xfs_inode_to_log_dinode(
/* dummy value for initialisation */
to->di_crc = 0;
if (xfs_is_metadir_inode(ip))
to->di_metatype = ip->i_metatype;
else
to->di_metatype = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
to->di_metatype = 0;
}
xfs_inode_to_log_dinode_iext_counters(ip, to);

View File

@ -175,7 +175,7 @@ xfs_log_dinode_to_disk(
to->di_mode = cpu_to_be16(from->di_mode);
to->di_version = from->di_version;
to->di_format = from->di_format;
to->di_onlink = 0;
to->di_metatype = cpu_to_be16(from->di_metatype);
to->di_uid = cpu_to_be32(from->di_uid);
to->di_gid = cpu_to_be32(from->di_gid);
to->di_nlink = cpu_to_be32(from->di_nlink);

View File

@ -169,6 +169,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_PPTR,
.name = "parent pointer",
},
[XFS_EXPERIMENTAL_METADIR] = {
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);

View File

@ -98,6 +98,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_LBS,
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_MAX,
};

View File

@ -332,6 +332,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@ -387,6 +388,7 @@ __XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
/*
* Some features are always on for v5 file systems, allow the compiler to
@ -487,6 +489,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_EXCHRANGE 15
/* Kernel has logged a warning about parent pointers being used on this fs. */
#define XFS_OPSTATE_WARNED_PPTR 16
/* Kernel has logged a warning about metadata dirs being used on this fs. */
#define XFS_OPSTATE_WARNED_METADIR 17
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \

View File

@ -1731,6 +1731,9 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
if (xfs_has_metadir(mp))
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,