2018-06-06 02:42:14 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2016-08-03 01:36:07 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2014 Red Hat, Inc.
|
|
|
|
* All Rights Reserved.
|
|
|
|
*/
|
|
|
|
#include "xfs.h"
|
|
|
|
#include "xfs_fs.h"
|
|
|
|
#include "xfs_shared.h"
|
|
|
|
#include "xfs_format.h"
|
|
|
|
#include "xfs_log_format.h"
|
|
|
|
#include "xfs_trans_resv.h"
|
|
|
|
#include "xfs_mount.h"
|
|
|
|
#include "xfs_trans.h"
|
|
|
|
#include "xfs_alloc.h"
|
|
|
|
#include "xfs_btree.h"
|
2020-03-11 18:11:42 +00:00
|
|
|
#include "xfs_btree_staging.h"
|
2016-08-03 01:39:05 +00:00
|
|
|
#include "xfs_rmap.h"
|
2016-08-03 01:36:07 +00:00
|
|
|
#include "xfs_rmap_btree.h"
|
2024-02-22 20:39:47 +00:00
|
|
|
#include "xfs_health.h"
|
2016-08-03 01:36:07 +00:00
|
|
|
#include "xfs_trace.h"
|
|
|
|
#include "xfs_error.h"
|
|
|
|
#include "xfs_extent_busy.h"
|
2021-06-02 00:48:24 +00:00
|
|
|
#include "xfs_ag.h"
|
2016-10-03 16:11:44 +00:00
|
|
|
#include "xfs_ag_resv.h"
|
2016-08-03 01:36:07 +00:00
|
|
|
|
2021-09-27 21:26:19 +00:00
|
|
|
/*
 * Slab cache handle private to this file.
 * NOTE(review): judging by the name this presumably backs rmap btree cursor
 * allocations; the code that creates and uses it is outside this view --
 * confirm before relying on that.
 */
static struct kmem_cache *xfs_rmapbt_cur_cache;
|
2021-09-23 19:21:37 +00:00
|
|
|
|
2016-08-03 01:39:05 +00:00
|
|
|
/*
|
|
|
|
* Reverse map btree.
|
|
|
|
*
|
|
|
|
* This is a per-ag tree used to track the owner(s) of a given extent. With
|
|
|
|
* reflink it is possible for there to be multiple owners, which is a departure
|
|
|
|
* from classic XFS. Owner records for data extents are inserted when the
|
|
|
|
* extent is mapped and removed when an extent is unmapped. Owner records for
|
|
|
|
* all other block types (i.e. metadata) are inserted when an extent is
|
|
|
|
* allocated and removed when an extent is freed. There can only be one owner
|
|
|
|
* of a metadata extent, usually an inode or some other metadata structure like
|
|
|
|
* an AG btree.
|
|
|
|
*
|
|
|
|
* The rmap btree is part of the free space management, so blocks for the tree
|
|
|
|
* are sourced from the agfl. Hence we need transaction reservation support for
|
|
|
|
* this tree so that the freelist is always large enough. This also impacts on
|
|
|
|
* the minimum space we need to leave free in the AG.
|
|
|
|
*
|
|
|
|
* The tree is ordered by [ag block, owner, offset]. This is a large key size,
|
|
|
|
* but it is the only way to enforce unique keys when a block can be owned by
|
|
|
|
* multiple files at any offset. There's no need to order/search by extent
|
|
|
|
* size for online updating/management of the tree. It is intended that most
|
|
|
|
* reverse lookups will be to find the owner(s) of a particular block, or to
|
|
|
|
* try to recover tree and file data from corrupt primary metadata.
|
|
|
|
*/
|
|
|
|
|
2016-08-03 01:36:07 +00:00
|
|
|
static struct xfs_btree_cur *
|
|
|
|
xfs_rmapbt_dup_cursor(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
|
|
|
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
|
2021-06-02 00:48:24 +00:00
|
|
|
cur->bc_ag.agbp, cur->bc_ag.pag);
|
2016-08-03 01:36:07 +00:00
|
|
|
}
|
|
|
|
|
2016-08-03 01:39:05 +00:00
|
|
|
STATIC void
|
|
|
|
xfs_rmapbt_set_root(
|
2021-08-12 16:49:03 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
int inc)
|
2016-08-03 01:39:05 +00:00
|
|
|
{
|
2020-03-11 00:51:15 +00:00
|
|
|
struct xfs_buf *agbp = cur->bc_ag.agbp;
|
2020-03-10 15:57:29 +00:00
|
|
|
struct xfs_agf *agf = agbp->b_addr;
|
2016-08-03 01:39:05 +00:00
|
|
|
|
|
|
|
ASSERT(ptr->s != 0);
|
|
|
|
|
2024-02-22 20:39:46 +00:00
|
|
|
agf->agf_rmap_root = ptr->s;
|
|
|
|
be32_add_cpu(&agf->agf_rmap_level, inc);
|
|
|
|
cur->bc_ag.pag->pagf_rmap_level += inc;
|
2016-08-03 01:39:05 +00:00
|
|
|
|
|
|
|
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC int
|
|
|
|
xfs_rmapbt_alloc_block(
|
2021-08-12 16:53:27 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *start,
|
|
|
|
union xfs_btree_ptr *new,
|
|
|
|
int *stat)
|
2016-08-03 01:39:05 +00:00
|
|
|
{
|
2020-03-11 00:51:15 +00:00
|
|
|
struct xfs_buf *agbp = cur->bc_ag.agbp;
|
2020-03-10 15:57:29 +00:00
|
|
|
struct xfs_agf *agf = agbp->b_addr;
|
2021-06-02 00:48:24 +00:00
|
|
|
struct xfs_perag *pag = cur->bc_ag.pag;
|
2016-08-03 01:39:05 +00:00
|
|
|
int error;
|
|
|
|
xfs_agblock_t bno;
|
|
|
|
|
|
|
|
/* Allocate the new block from the freelist. If we can't, give up. */
|
2022-07-07 09:08:01 +00:00
|
|
|
error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
|
2016-08-03 01:39:05 +00:00
|
|
|
&bno, 1);
|
2018-03-07 01:03:30 +00:00
|
|
|
if (error)
|
2016-08-03 01:39:05 +00:00
|
|
|
return error;
|
|
|
|
if (bno == NULLAGBLOCK) {
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-06-02 00:48:24 +00:00
|
|
|
xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
|
2016-08-03 01:39:05 +00:00
|
|
|
|
|
|
|
new->s = cpu_to_be32(bno);
|
2016-08-16 22:31:49 +00:00
|
|
|
be32_add_cpu(&agf->agf_rmap_blocks, 1);
|
|
|
|
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
|
2016-08-03 01:39:05 +00:00
|
|
|
|
2021-06-02 00:48:24 +00:00
|
|
|
xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
|
2018-03-09 22:02:32 +00:00
|
|
|
|
2016-08-03 01:39:05 +00:00
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC int
|
|
|
|
xfs_rmapbt_free_block(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2020-03-11 00:51:15 +00:00
|
|
|
struct xfs_buf *agbp = cur->bc_ag.agbp;
|
2020-03-10 15:57:29 +00:00
|
|
|
struct xfs_agf *agf = agbp->b_addr;
|
2021-06-02 00:48:24 +00:00
|
|
|
struct xfs_perag *pag = cur->bc_ag.pag;
|
2016-08-03 01:39:05 +00:00
|
|
|
xfs_agblock_t bno;
|
|
|
|
int error;
|
|
|
|
|
2021-08-19 01:46:57 +00:00
|
|
|
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
|
2016-08-16 22:31:49 +00:00
|
|
|
be32_add_cpu(&agf->agf_rmap_blocks, -1);
|
|
|
|
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
|
2022-07-07 09:08:08 +00:00
|
|
|
error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
|
2016-08-03 01:39:05 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
2021-06-02 00:48:24 +00:00
|
|
|
xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
|
2016-08-03 01:39:05 +00:00
|
|
|
XFS_EXTENT_BUSY_SKIP_DISCARD);
|
|
|
|
|
2020-07-13 16:13:00 +00:00
|
|
|
xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
|
2016-08-03 01:39:05 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC int
|
|
|
|
xfs_rmapbt_get_minrecs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
return cur->bc_mp->m_rmap_mnr[level != 0];
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC int
|
|
|
|
xfs_rmapbt_get_maxrecs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
return cur->bc_mp->m_rmap_mxr[level != 0];
|
|
|
|
}
|
|
|
|
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
/*
|
|
|
|
* Convert the ondisk record's offset field into the ondisk key's offset field.
|
|
|
|
* Fork and bmbt are significant parts of the rmap record key, but written
|
|
|
|
* status is merely a record attribute.
|
|
|
|
*/
|
|
|
|
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
|
|
|
|
{
|
|
|
|
return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:39:05 +00:00
|
|
|
STATIC void
|
|
|
|
xfs_rmapbt_init_key_from_rec(
|
2021-08-11 00:02:16 +00:00
|
|
|
union xfs_btree_key *key,
|
|
|
|
const union xfs_btree_rec *rec)
|
2016-08-03 01:39:05 +00:00
|
|
|
{
|
|
|
|
key->rmap.rm_startblock = rec->rmap.rm_startblock;
|
|
|
|
key->rmap.rm_owner = rec->rmap.rm_owner;
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
|
2016-08-03 01:39:05 +00:00
|
|
|
}
|
|
|
|
|
2016-08-03 01:40:56 +00:00
|
|
|
/*
|
|
|
|
* The high key for a reverse mapping record can be computed by shifting
|
|
|
|
* the startblock and offset to the highest value that would still map
|
|
|
|
* to that record. In practice this means that we add blockcount-1 to
|
|
|
|
* the startblock for all records, and if the record is for a data/attr
|
|
|
|
* fork mapping, we add blockcount-1 to the offset too.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_rmapbt_init_high_key_from_rec(
|
2021-08-11 00:02:16 +00:00
|
|
|
union xfs_btree_key *key,
|
|
|
|
const union xfs_btree_rec *rec)
|
2016-08-03 01:40:56 +00:00
|
|
|
{
|
2021-08-11 00:02:16 +00:00
|
|
|
uint64_t off;
|
|
|
|
int adj;
|
2016-08-03 01:40:56 +00:00
|
|
|
|
|
|
|
adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
|
|
|
|
|
|
|
|
key->rmap.rm_startblock = rec->rmap.rm_startblock;
|
|
|
|
be32_add_cpu(&key->rmap.rm_startblock, adj);
|
|
|
|
key->rmap.rm_owner = rec->rmap.rm_owner;
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
|
2016-08-03 01:40:56 +00:00
|
|
|
if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
|
|
|
|
XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
|
|
|
|
return;
|
|
|
|
off = be64_to_cpu(key->rmap.rm_offset);
|
|
|
|
off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
|
|
|
|
key->rmap.rm_offset = cpu_to_be64(off);
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:39:05 +00:00
|
|
|
STATIC void
|
|
|
|
xfs_rmapbt_init_rec_from_cur(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_rec *rec)
|
|
|
|
{
|
|
|
|
rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
|
|
|
|
rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
|
|
|
|
rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
|
|
|
|
rec->rmap.rm_offset = cpu_to_be64(
|
|
|
|
xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC void
|
|
|
|
xfs_rmapbt_init_ptr_from_cur(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr)
|
|
|
|
{
|
2020-03-11 00:51:15 +00:00
|
|
|
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
|
2016-08-03 01:39:05 +00:00
|
|
|
|
2021-06-02 00:48:24 +00:00
|
|
|
ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
|
2016-08-03 01:39:05 +00:00
|
|
|
|
2024-02-22 20:39:46 +00:00
|
|
|
ptr->s = agf->agf_rmap_root;
|
2016-08-03 01:39:05 +00:00
|
|
|
}
|
|
|
|
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
/*
|
|
|
|
* Mask the appropriate parts of the ondisk key field for a key comparison.
|
|
|
|
* Fork and bmbt are significant parts of the rmap record key, but written
|
|
|
|
* status is merely a record attribute.
|
|
|
|
*/
|
|
|
|
static inline uint64_t offset_keymask(uint64_t offset)
|
|
|
|
{
|
|
|
|
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
|
|
|
|
}
|
|
|
|
|
2017-06-16 18:00:05 +00:00
|
|
|
STATIC int64_t
|
2016-08-03 01:39:05 +00:00
|
|
|
xfs_rmapbt_key_diff(
|
2021-08-11 00:02:15 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key)
|
2016-08-03 01:39:05 +00:00
|
|
|
{
|
2021-08-11 00:02:15 +00:00
|
|
|
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
|
|
|
|
const struct xfs_rmap_key *kp = &key->rmap;
|
|
|
|
__u64 x, y;
|
|
|
|
int64_t d;
|
2016-08-03 01:39:05 +00:00
|
|
|
|
2017-06-16 18:00:05 +00:00
|
|
|
d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
|
2016-08-03 01:39:05 +00:00
|
|
|
if (d)
|
|
|
|
return d;
|
|
|
|
|
|
|
|
x = be64_to_cpu(kp->rm_owner);
|
|
|
|
y = rec->rm_owner;
|
|
|
|
if (x > y)
|
|
|
|
return 1;
|
|
|
|
else if (y > x)
|
|
|
|
return -1;
|
|
|
|
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
x = offset_keymask(be64_to_cpu(kp->rm_offset));
|
|
|
|
y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
|
2016-08-03 01:39:05 +00:00
|
|
|
if (x > y)
|
|
|
|
return 1;
|
|
|
|
else if (y > x)
|
|
|
|
return -1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-06-16 18:00:05 +00:00
|
|
|
STATIC int64_t
|
2016-08-03 01:40:56 +00:00
|
|
|
xfs_rmapbt_diff_two_keys(
|
2021-08-11 00:02:15 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *k1,
|
2023-04-12 02:00:11 +00:00
|
|
|
const union xfs_btree_key *k2,
|
|
|
|
const union xfs_btree_key *mask)
|
2016-08-03 01:40:56 +00:00
|
|
|
{
|
2021-08-11 00:02:15 +00:00
|
|
|
const struct xfs_rmap_key *kp1 = &k1->rmap;
|
|
|
|
const struct xfs_rmap_key *kp2 = &k2->rmap;
|
|
|
|
int64_t d;
|
|
|
|
__u64 x, y;
|
2016-08-03 01:40:56 +00:00
|
|
|
|
2023-04-12 02:00:11 +00:00
|
|
|
/* Doesn't make sense to mask off the physical space part */
|
|
|
|
ASSERT(!mask || mask->rmap.rm_startblock);
|
|
|
|
|
2017-06-16 18:00:05 +00:00
|
|
|
d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
|
2023-04-12 02:00:11 +00:00
|
|
|
be32_to_cpu(kp2->rm_startblock);
|
2016-08-03 01:40:56 +00:00
|
|
|
if (d)
|
|
|
|
return d;
|
|
|
|
|
2023-04-12 02:00:11 +00:00
|
|
|
if (!mask || mask->rmap.rm_owner) {
|
|
|
|
x = be64_to_cpu(kp1->rm_owner);
|
|
|
|
y = be64_to_cpu(kp2->rm_owner);
|
|
|
|
if (x > y)
|
|
|
|
return 1;
|
|
|
|
else if (y > x)
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!mask || mask->rmap.rm_offset) {
|
|
|
|
/* Doesn't make sense to allow offset but not owner */
|
|
|
|
ASSERT(!mask || mask->rmap.rm_owner);
|
|
|
|
|
|
|
|
x = offset_keymask(be64_to_cpu(kp1->rm_offset));
|
|
|
|
y = offset_keymask(be64_to_cpu(kp2->rm_offset));
|
|
|
|
if (x > y)
|
|
|
|
return 1;
|
|
|
|
else if (y > x)
|
|
|
|
return -1;
|
|
|
|
}
|
2016-08-03 01:40:56 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-01-08 18:51:03 +00:00
|
|
|
static xfs_failaddr_t
|
2016-08-03 01:36:07 +00:00
|
|
|
xfs_rmapbt_verify(
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2016-08-03 01:36:07 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
|
|
|
struct xfs_perag *pag = bp->b_pag;
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t fa;
|
2016-08-03 01:36:07 +00:00
|
|
|
unsigned int level;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* magic number and level verification
|
|
|
|
*
|
|
|
|
* During growfs operations, we can't verify the exact level or owner as
|
|
|
|
* the perag is not fully initialised and hence not attached to the
|
|
|
|
* buffer. In this case, check against the maximum tree depth.
|
|
|
|
*
|
|
|
|
* Similarly, during log recovery we will have a perag structure
|
|
|
|
* attached, but the agf information will not yet have been initialised
|
|
|
|
* from the on disk AGF. Again, we can only check against maximum limits
|
|
|
|
* in this case.
|
|
|
|
*/
|
2019-02-07 18:45:48 +00:00
|
|
|
if (!xfs_verify_magic(bp, block->bb_magic))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2016-08-03 01:36:07 +00:00
|
|
|
|
2021-08-19 01:46:37 +00:00
|
|
|
if (!xfs_has_rmapbt(mp))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_agblock_v5hdr_verify(bp);
|
2018-01-08 18:51:03 +00:00
|
|
|
if (fa)
|
|
|
|
return fa;
|
2016-08-03 01:36:07 +00:00
|
|
|
|
|
|
|
level = be16_to_cpu(block->bb_level);
|
2023-02-12 22:14:52 +00:00
|
|
|
if (pag && xfs_perag_initialised_agf(pag)) {
|
2024-02-22 20:43:38 +00:00
|
|
|
unsigned int maxlevel = pag->pagf_rmap_level;
|
|
|
|
|
|
|
|
#ifdef CONFIG_XFS_ONLINE_REPAIR
|
|
|
|
/*
|
|
|
|
* Online repair could be rewriting the free space btrees, so
|
|
|
|
* we'll validate against the larger of either tree while this
|
|
|
|
* is going on.
|
|
|
|
*/
|
|
|
|
maxlevel = max_t(unsigned int, maxlevel,
|
|
|
|
pag->pagf_repair_rmap_level);
|
|
|
|
#endif
|
|
|
|
if (level >= maxlevel)
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2016-08-03 01:36:07 +00:00
|
|
|
} else if (level >= mp->m_rmap_maxlevels)
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2016-08-03 01:36:07 +00:00
|
|
|
|
2024-02-22 20:40:58 +00:00
|
|
|
return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
|
2016-08-03 01:36:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
xfs_rmapbt_read_verify(
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t fa;
|
|
|
|
|
2024-02-22 20:40:58 +00:00
|
|
|
if (!xfs_btree_agblock_verify_crc(bp))
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
|
|
|
|
else {
|
|
|
|
fa = xfs_rmapbt_verify(bp);
|
|
|
|
if (fa)
|
|
|
|
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
|
|
|
|
}
|
2016-08-03 01:36:07 +00:00
|
|
|
|
2018-01-08 18:51:02 +00:00
|
|
|
if (bp->b_error)
|
2016-08-03 01:36:07 +00:00
|
|
|
trace_xfs_btree_corrupt(bp, _RET_IP_);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
xfs_rmapbt_write_verify(
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t fa;
|
|
|
|
|
|
|
|
fa = xfs_rmapbt_verify(bp);
|
|
|
|
if (fa) {
|
2016-08-03 01:36:07 +00:00
|
|
|
trace_xfs_btree_corrupt(bp, _RET_IP_);
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
|
2016-08-03 01:36:07 +00:00
|
|
|
return;
|
|
|
|
}
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_agblock_calc_crc(bp);
|
2016-08-03 01:36:07 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
|
|
|
|
.name = "xfs_rmapbt",
|
2019-02-07 18:45:48 +00:00
|
|
|
.magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
|
2016-08-03 01:36:07 +00:00
|
|
|
.verify_read = xfs_rmapbt_read_verify,
|
|
|
|
.verify_write = xfs_rmapbt_write_verify,
|
2018-01-08 18:51:08 +00:00
|
|
|
.verify_struct = xfs_rmapbt_verify,
|
2016-08-03 01:36:07 +00:00
|
|
|
};
|
|
|
|
|
2016-08-03 01:39:05 +00:00
|
|
|
STATIC int
|
|
|
|
xfs_rmapbt_keys_inorder(
|
2021-08-11 00:02:17 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *k1,
|
|
|
|
const union xfs_btree_key *k2)
|
2016-08-03 01:39:05 +00:00
|
|
|
{
|
2017-06-16 18:00:05 +00:00
|
|
|
uint32_t x;
|
|
|
|
uint32_t y;
|
|
|
|
uint64_t a;
|
|
|
|
uint64_t b;
|
2016-08-03 01:39:05 +00:00
|
|
|
|
|
|
|
x = be32_to_cpu(k1->rmap.rm_startblock);
|
|
|
|
y = be32_to_cpu(k2->rmap.rm_startblock);
|
|
|
|
if (x < y)
|
|
|
|
return 1;
|
|
|
|
else if (x > y)
|
|
|
|
return 0;
|
|
|
|
a = be64_to_cpu(k1->rmap.rm_owner);
|
|
|
|
b = be64_to_cpu(k2->rmap.rm_owner);
|
|
|
|
if (a < b)
|
|
|
|
return 1;
|
|
|
|
else if (a > b)
|
|
|
|
return 0;
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
|
|
|
|
b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
|
2016-08-03 01:39:05 +00:00
|
|
|
if (a <= b)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC int
|
|
|
|
xfs_rmapbt_recs_inorder(
|
2021-08-11 00:02:17 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_rec *r1,
|
|
|
|
const union xfs_btree_rec *r2)
|
2016-08-03 01:39:05 +00:00
|
|
|
{
|
2017-06-16 18:00:05 +00:00
|
|
|
uint32_t x;
|
|
|
|
uint32_t y;
|
|
|
|
uint64_t a;
|
|
|
|
uint64_t b;
|
2016-08-03 01:39:05 +00:00
|
|
|
|
|
|
|
x = be32_to_cpu(r1->rmap.rm_startblock);
|
|
|
|
y = be32_to_cpu(r2->rmap.rm_startblock);
|
|
|
|
if (x < y)
|
|
|
|
return 1;
|
|
|
|
else if (x > y)
|
|
|
|
return 0;
|
|
|
|
a = be64_to_cpu(r1->rmap.rm_owner);
|
|
|
|
b = be64_to_cpu(r2->rmap.rm_owner);
|
|
|
|
if (a < b)
|
|
|
|
return 1;
|
|
|
|
else if (a > b)
|
|
|
|
return 0;
|
xfs: fix rm_offset flag handling in rmap keys
Keys for extent interval records in the reverse mapping btree are
supposed to be computed as follows:
(physical block, owner, fork, is_btree, offset)
This provides users the ability to look up a reverse mapping from a file
block mapping record -- start with the physical block; then if there are
multiple records for the same block, move on to the owner; then the
inode fork type; and so on to the file offset.
Unfortunately, the code that creates rmap lookup keys from rmap records
forgot to mask off the record attribute flags, leading to ondisk keys
that look like this:
(physical block, owner, fork, is_btree, unwritten state, offset)
Fortunately, this has all worked ok for the past six years because the
key comparison functions incorrectly ignore the fork/bmbt/unwritten
information that's encoded in the on-disk offset. This means that
lookup comparisons are only done with:
(physical block, owner, offset)
Queries can (theoretically) return incorrect results because of this
omission. On consistent filesystems this isn't an issue because xattr
and bmbt blocks cannot be shared and hence the comparisons succeed
purely on the contents of the rm_startblock field. For the one case
where we support sharing (written data fork blocks) all flag bits are
zero, so the omission in the comparison has no ill effects.
Unfortunately, this bug prevents scrub from detecting incorrect fork and
bmbt flag bits in the rmap btree, so we really do need to fix the
compare code. Old filesystems with the unwritten bit erroneously set in
the rmap key struct will work fine on new kernels since we still ignore
the unwritten bit. New filesystems on older kernels will work fine
since the old kernels never paid attention to the unwritten bit.
A previous version of this patch forgot to keep the (un)written state
flag masked during the comparison and caused a major regression in
5.9.x since unwritten extent conversion can update an rmap record
without requiring key updates.
Note that blocks cannot go directly from data fork to attr fork without
being deallocated and reallocated, nor can they be added to or removed
from a bmbt without a free/alloc cycle, so this should not cause any
regressions.
Found by fuzzing keys[1].attrfork = ones on xfs/371.
Fixes: 4b8ed67794fe ("xfs: add rmap btree operations")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2023-04-12 02:00:07 +00:00
|
|
|
a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
|
|
|
|
b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
|
2016-08-03 01:39:05 +00:00
|
|
|
if (a <= b)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
STATIC enum xbtree_key_contig
|
|
|
|
xfs_rmapbt_keys_contiguous(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
2023-04-12 02:00:11 +00:00
|
|
|
const union xfs_btree_key *key2,
|
|
|
|
const union xfs_btree_key *mask)
|
2023-04-12 02:00:10 +00:00
|
|
|
{
|
2023-04-12 02:00:11 +00:00
|
|
|
ASSERT(!mask || mask->rmap.rm_startblock);
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/*
|
|
|
|
* We only support checking contiguity of the physical space component.
|
|
|
|
* If any callers ever need more specificity than that, they'll have to
|
|
|
|
* implement it here.
|
|
|
|
*/
|
2023-04-12 02:00:11 +00:00
|
|
|
ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
|
|
|
|
be32_to_cpu(key2->rmap.rm_startblock));
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:15 +00:00
|
|
|
const struct xfs_btree_ops xfs_rmapbt_ops = {
|
2024-02-22 20:39:47 +00:00
|
|
|
.name = "rmap",
|
2024-02-22 20:36:17 +00:00
|
|
|
.type = XFS_BTREE_TYPE_AG,
|
2024-02-22 20:34:29 +00:00
|
|
|
.geom_flags = XFS_BTGEO_OVERLAPPING,
|
|
|
|
|
2016-08-03 01:36:07 +00:00
|
|
|
.rec_len = sizeof(struct xfs_rmap_rec),
|
2024-02-22 20:37:25 +00:00
|
|
|
/* Overlapping btree; 2 keys per pointer. */
|
2016-08-03 01:36:07 +00:00
|
|
|
.key_len = 2 * sizeof(struct xfs_rmap_key),
|
2024-02-22 20:35:36 +00:00
|
|
|
.ptr_len = XFS_BTREE_SHORT_PTR_LEN,
|
2016-08-03 01:36:07 +00:00
|
|
|
|
2024-02-22 20:35:20 +00:00
|
|
|
.lru_refs = XFS_RMAP_BTREE_REF,
|
2024-02-22 20:35:21 +00:00
|
|
|
.statoff = XFS_STATS_CALC_INDEX(xs_rmap_2),
|
2024-02-22 20:39:47 +00:00
|
|
|
.sick_mask = XFS_SICK_AG_RMAPBT,
|
2024-02-22 20:35:20 +00:00
|
|
|
|
2016-08-03 01:36:07 +00:00
|
|
|
.dup_cursor = xfs_rmapbt_dup_cursor,
|
2016-08-03 01:39:05 +00:00
|
|
|
.set_root = xfs_rmapbt_set_root,
|
|
|
|
.alloc_block = xfs_rmapbt_alloc_block,
|
|
|
|
.free_block = xfs_rmapbt_free_block,
|
|
|
|
.get_minrecs = xfs_rmapbt_get_minrecs,
|
|
|
|
.get_maxrecs = xfs_rmapbt_get_maxrecs,
|
|
|
|
.init_key_from_rec = xfs_rmapbt_init_key_from_rec,
|
2016-08-03 01:40:56 +00:00
|
|
|
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
|
2016-08-03 01:39:05 +00:00
|
|
|
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
|
|
|
|
.init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
|
|
|
|
.key_diff = xfs_rmapbt_key_diff,
|
2016-08-03 01:36:07 +00:00
|
|
|
.buf_ops = &xfs_rmapbt_buf_ops,
|
2016-08-03 01:40:56 +00:00
|
|
|
.diff_two_keys = xfs_rmapbt_diff_two_keys,
|
2016-08-03 01:39:05 +00:00
|
|
|
.keys_inorder = xfs_rmapbt_keys_inorder,
|
|
|
|
.recs_inorder = xfs_rmapbt_recs_inorder,
|
2023-04-12 02:00:10 +00:00
|
|
|
.keys_contiguous = xfs_rmapbt_keys_contiguous,
|
2016-08-03 01:36:07 +00:00
|
|
|
};
|
|
|
|
|
2024-02-22 20:39:41 +00:00
|
|
|
/*
|
|
|
|
* Create a new reverse mapping btree cursor.
|
|
|
|
*
|
|
|
|
* For staging cursors tp and agbp are NULL.
|
|
|
|
*/
|
2020-03-11 18:11:42 +00:00
|
|
|
struct xfs_btree_cur *
|
|
|
|
xfs_rmapbt_init_cursor(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_trans *tp,
|
|
|
|
struct xfs_buf *agbp,
|
2021-06-02 00:48:24 +00:00
|
|
|
struct xfs_perag *pag)
|
2020-03-11 18:11:42 +00:00
|
|
|
{
|
|
|
|
struct xfs_btree_cur *cur;
|
|
|
|
|
2024-02-22 20:40:51 +00:00
|
|
|
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
|
2024-02-22 20:39:41 +00:00
|
|
|
mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
|
|
|
|
cur->bc_ag.pag = xfs_perag_hold(pag);
|
2020-03-11 00:51:15 +00:00
|
|
|
cur->bc_ag.agbp = agbp;
|
2024-02-22 20:39:41 +00:00
|
|
|
if (agbp) {
|
|
|
|
struct xfs_agf *agf = agbp->b_addr;
|
|
|
|
|
2024-02-22 20:39:46 +00:00
|
|
|
cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
|
2024-02-22 20:39:41 +00:00
|
|
|
}
|
2020-03-11 18:11:42 +00:00
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Install a new reverse mapping btree root. Caller is responsible for
|
|
|
|
* invalidating and freeing the old btree blocks.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_rmapbt_commit_staged_btree(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_trans *tp,
|
|
|
|
struct xfs_buf *agbp)
|
|
|
|
{
|
|
|
|
struct xfs_agf *agf = agbp->b_addr;
|
|
|
|
struct xbtree_afakeroot *afake = cur->bc_ag.afake;
|
|
|
|
|
|
|
|
ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
|
|
|
|
|
2024-02-22 20:39:46 +00:00
|
|
|
agf->agf_rmap_root = cpu_to_be32(afake->af_root);
|
|
|
|
agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
|
2020-03-11 18:11:42 +00:00
|
|
|
agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
|
|
|
|
xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
|
|
|
|
XFS_AGF_RMAP_BLOCKS);
|
2024-02-22 20:37:35 +00:00
|
|
|
xfs_btree_commit_afakeroot(cur, tp, agbp);
|
2020-03-11 18:11:42 +00:00
|
|
|
}
|
|
|
|
|
2021-09-23 17:32:06 +00:00
|
|
|
/* Calculate number of records in a reverse mapping btree block. */
|
|
|
|
static inline unsigned int
|
|
|
|
xfs_rmapbt_block_maxrecs(
|
|
|
|
unsigned int blocklen,
|
|
|
|
bool leaf)
|
|
|
|
{
|
|
|
|
if (leaf)
|
|
|
|
return blocklen / sizeof(struct xfs_rmap_rec);
|
|
|
|
return blocklen /
|
|
|
|
(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:36:07 +00:00
|
|
|
/*
|
|
|
|
* Calculate number of records in an rmap btree block.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_rmapbt_maxrecs(
|
|
|
|
int blocklen,
|
|
|
|
int leaf)
|
|
|
|
{
|
|
|
|
blocklen -= XFS_RMAP_BLOCK_LEN;
|
2021-09-23 17:32:06 +00:00
|
|
|
return xfs_rmapbt_block_maxrecs(blocklen, leaf);
|
|
|
|
}
|
2016-08-03 01:36:07 +00:00
|
|
|
|
2021-09-23 17:32:06 +00:00
|
|
|
/* Compute the max possible height for reverse mapping btrees. */
|
|
|
|
unsigned int
|
|
|
|
xfs_rmapbt_maxlevels_ondisk(void)
|
|
|
|
{
|
|
|
|
unsigned int minrecs[2];
|
|
|
|
unsigned int blocklen;
|
|
|
|
|
|
|
|
blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
|
|
|
|
|
|
|
|
minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
|
|
|
|
minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
|
|
|
|
*
|
|
|
|
* On a reflink filesystem, each AG block can have up to 2^32 (per the
|
|
|
|
* refcount record format) owners, which means that theoretically we
|
|
|
|
* could face up to 2^64 rmap records. However, we're likely to run
|
|
|
|
* out of blocks in the AG long before that happens, which means that
|
|
|
|
* we must compute the max height based on what the btree will look
|
|
|
|
* like if it consumes almost all the blocks in the AG due to maximal
|
|
|
|
* sharing factor.
|
|
|
|
*/
|
|
|
|
return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
|
2016-08-03 01:36:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute the maximum height of an rmap btree. */
|
|
|
|
void
|
|
|
|
xfs_rmapbt_compute_maxlevels(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2021-09-16 19:27:43 +00:00
|
|
|
if (!xfs_has_rmapbt(mp)) {
|
|
|
|
mp->m_rmap_maxlevels = 0;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xfs_has_reflink(mp)) {
|
|
|
|
/*
|
|
|
|
* Compute the asymptotic maxlevels for an rmap btree on a
|
|
|
|
* filesystem that supports reflink.
|
|
|
|
*
|
|
|
|
* On a reflink filesystem, each AG block can have up to 2^32
|
|
|
|
* (per the refcount record format) owners, which means that
|
|
|
|
* theoretically we could face up to 2^64 rmap records.
|
|
|
|
* However, we're likely to run out of blocks in the AG long
|
|
|
|
* before that happens, which means that we must compute the
|
|
|
|
* max height based on what the btree will look like if it
|
|
|
|
* consumes almost all the blocks in the AG due to maximal
|
|
|
|
* sharing factor.
|
|
|
|
*/
|
|
|
|
mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
|
|
|
|
mp->m_sb.sb_agblocks);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* If there's no block sharing, compute the maximum rmapbt
|
|
|
|
* height assuming one rmap record per AG block.
|
|
|
|
*/
|
2018-04-06 17:09:42 +00:00
|
|
|
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
|
2016-10-03 16:11:16 +00:00
|
|
|
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
|
2021-09-16 19:27:43 +00:00
|
|
|
}
|
2021-09-23 17:32:06 +00:00
|
|
|
ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
|
2016-08-03 01:36:07 +00:00
|
|
|
}
|
2016-10-03 16:11:44 +00:00
|
|
|
|
|
|
|
/* Calculate the refcount btree size for some records. */
|
|
|
|
xfs_extlen_t
|
|
|
|
xfs_rmapbt_calc_size(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
unsigned long long len)
|
|
|
|
{
|
2018-04-06 17:09:42 +00:00
|
|
|
return xfs_btree_calc_size(mp->m_rmap_mnr, len);
|
2016-10-03 16:11:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the maximum refcount btree size.
|
|
|
|
*/
|
|
|
|
xfs_extlen_t
|
|
|
|
xfs_rmapbt_max_size(
|
2017-01-04 02:39:33 +00:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
xfs_agblock_t agblocks)
|
2016-10-03 16:11:44 +00:00
|
|
|
{
|
|
|
|
/* Bail out if we're uninitialized, which can happen in mkfs. */
|
|
|
|
if (mp->m_rmap_mxr[0] == 0)
|
|
|
|
return 0;
|
|
|
|
|
2017-01-04 02:39:33 +00:00
|
|
|
return xfs_rmapbt_calc_size(mp, agblocks);
|
2016-10-03 16:11:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Figure out how many blocks to reserve and how many are used by this btree.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_rmapbt_calc_reserves(
|
|
|
|
struct xfs_mount *mp,
|
2018-07-30 05:37:08 +00:00
|
|
|
struct xfs_trans *tp,
|
2021-06-02 00:48:24 +00:00
|
|
|
struct xfs_perag *pag,
|
2016-10-03 16:11:44 +00:00
|
|
|
xfs_extlen_t *ask,
|
|
|
|
xfs_extlen_t *used)
|
|
|
|
{
|
|
|
|
struct xfs_buf *agbp;
|
|
|
|
struct xfs_agf *agf;
|
2017-01-04 02:39:33 +00:00
|
|
|
xfs_agblock_t agblocks;
|
2016-10-03 16:11:44 +00:00
|
|
|
xfs_extlen_t tree_len;
|
|
|
|
int error;
|
|
|
|
|
2021-08-19 01:46:37 +00:00
|
|
|
if (!xfs_has_rmapbt(mp))
|
2016-10-03 16:11:44 +00:00
|
|
|
return 0;
|
|
|
|
|
2022-07-07 09:07:40 +00:00
|
|
|
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
|
2016-10-03 16:11:44 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
2020-03-10 15:57:29 +00:00
|
|
|
agf = agbp->b_addr;
|
2017-01-04 02:39:33 +00:00
|
|
|
agblocks = be32_to_cpu(agf->agf_length);
|
2016-10-03 16:11:44 +00:00
|
|
|
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
|
2018-07-30 05:37:08 +00:00
|
|
|
xfs_trans_brelse(tp, agbp);
|
2016-10-03 16:11:44 +00:00
|
|
|
|
2019-05-20 18:25:39 +00:00
|
|
|
/*
|
|
|
|
* The log is permanently allocated, so the space it occupies will
|
|
|
|
* never be available for the kinds of things that would require btree
|
|
|
|
* expansion. We therefore can pretend the space isn't there.
|
|
|
|
*/
|
2022-07-07 09:13:21 +00:00
|
|
|
if (xfs_ag_contains_log(mp, pag->pag_agno))
|
2019-05-20 18:25:39 +00:00
|
|
|
agblocks -= mp->m_sb.sb_logblocks;
|
|
|
|
|
2017-01-04 02:39:33 +00:00
|
|
|
/* Reserve 1% of the AG or enough for 1 block per record. */
|
|
|
|
*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
|
2016-10-03 16:11:44 +00:00
|
|
|
*used += tree_len;
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
2021-09-23 19:21:37 +00:00
|
|
|
|
|
|
|
int __init
|
|
|
|
xfs_rmapbt_init_cur_cache(void)
|
|
|
|
{
|
|
|
|
xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
|
|
|
|
xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
|
|
|
|
0, 0, NULL);
|
|
|
|
|
|
|
|
if (!xfs_rmapbt_cur_cache)
|
|
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
xfs_rmapbt_destroy_cur_cache(void)
|
|
|
|
{
|
|
|
|
kmem_cache_destroy(xfs_rmapbt_cur_cache);
|
|
|
|
xfs_rmapbt_cur_cache = NULL;
|
|
|
|
}
|