linux/fs/xfs/libxfs/xfs_btree.h

535 lines
17 KiB
C
Raw Normal View History

/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __XFS_BTREE_H__
#define __XFS_BTREE_H__
struct xfs_buf;
struct xfs_defer_ops;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
extern kmem_zone_t *xfs_btree_cur_zone;
/*
* Generic key, ptr and record wrapper structures.
*
* These are disk format structures, and are converted where necessary
* by the btree specific code that needs to interpret them.
*/
union xfs_btree_ptr {
__be32 s; /* short form ptr */
__be64 l; /* long form ptr */
};
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
/*
* The in-core btree key. Overlapping btrees actually store two keys
* per pointer, so we reserve enough memory to hold both. The __*bigkey
* items should never be accessed directly.
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
*/
union xfs_btree_key {
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
struct xfs_bmbt_key bmbt;
xfs_bmdr_key_t bmbr; /* bmbt root block */
xfs_alloc_key_t alloc;
struct xfs_inobt_key inobt;
struct xfs_rmap_key rmap;
struct xfs_rmap_key __rmap_bigkey[2];
struct xfs_refcount_key refc;
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
};
union xfs_btree_rec {
struct xfs_bmbt_rec bmbt;
xfs_bmdr_rec_t bmbr; /* bmbt root block */
struct xfs_alloc_rec alloc;
struct xfs_inobt_rec inobt;
struct xfs_rmap_rec rmap;
struct xfs_refcount_rec refc;
};
/*
* This nonsense is to make -wlint happy.
*/
#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi)
#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi)
uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
*/
xfs: swap extents operations for CRC filesystems For CRC enabled filesystems, we can't just swap inode forks from one inode to another when defragmenting a file - the blocks in the inode fork bmap btree contain pointers back to the owner inode. Hence if we are to swap the inode forks we have to atomically modify every block in the btree during the transaction. We are doing an entire fork swap here, so we could create a new transaction item type that indicates we are changing the owner of a certain structure from one value to another. If we combine this with ordered buffer logging to modify all the buffers in the tree, then we can change the buffers in the tree without needing log space for the operation. However, this then requires log recovery to perform the modification of the owner information of the objects/structures in question. This does introduce some interesting ordering details into recovery: we have to make sure that the owner change replay occurs after the change that moves the objects is made, not before. Hence we can't use a separate log item for this as we have no guarantee of strict ordering between multiple items in the log due to the relogging action of asynchronous transaction commits. Hence there is no "generic" method we can use for changing the ownership of arbitrary metadata structures. For inode forks, however, there is a simple method of communicating that the fork contents need the owner rewritten - we can pass a inode log format flag for the fork for the transaction that does a fork swap. This flag will then follow the inode fork through relogging actions so when the swap actually gets replayed the ownership can be changed immediately by log recovery. So that gives us a simple method of "whole fork" exchange between two inodes. This is relatively simple to implement, so it makes sense to do this as an initial implementation to support xfs_fsr on CRC enabled filesytems in the same manner as we do on existing filesystems. This commit introduces the swapext driven functionality, the recovery functionality will be in a separate patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:44 +00:00
#define XFS_BB_MAGIC (1 << 0)
#define XFS_BB_LEVEL (1 << 1)
#define XFS_BB_NUMRECS (1 << 2)
#define XFS_BB_LEFTSIB (1 << 3)
#define XFS_BB_RIGHTSIB (1 << 4)
#define XFS_BB_BLKNO (1 << 5)
#define XFS_BB_LSN (1 << 6)
#define XFS_BB_UUID (1 << 7)
#define XFS_BB_OWNER (1 << 8)
#define XFS_BB_NUM_BITS 5
#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
xfs: swap extents operations for CRC filesystems For CRC enabled filesystems, we can't just swap inode forks from one inode to another when defragmenting a file - the blocks in the inode fork bmap btree contain pointers back to the owner inode. Hence if we are to swap the inode forks we have to atomically modify every block in the btree during the transaction. We are doing an entire fork swap here, so we could create a new transaction item type that indicates we are changing the owner of a certain structure from one value to another. If we combine this with ordered buffer logging to modify all the buffers in the tree, then we can change the buffers in the tree without needing log space for the operation. However, this then requires log recovery to perform the modification of the owner information of the objects/structures in question. This does introduce some interesting ordering details into recovery: we have to make sure that the owner change replay occurs after the change that moves the objects is made, not before. Hence we can't use a separate log item for this as we have no guarantee of strict ordering between multiple items in the log due to the relogging action of asynchronous transaction commits. Hence there is no "generic" method we can use for changing the ownership of arbitrary metadata structures. For inode forks, however, there is a simple method of communicating that the fork contents need the owner rewritten - we can pass a inode log format flag for the fork for the transaction that does a fork swap. This flag will then follow the inode fork through relogging actions so when the swap actually gets replayed the ownership can be changed immediately by log recovery. So that gives us a simple method of "whole fork" exchange between two inodes. This is relatively simple to implement, so it makes sense to do this as an initial implementation to support xfs_fsr on CRC enabled filesytems in the same manner as we do on existing filesystems. This commit introduces the swapext driven functionality, the recovery functionality will be in a separate patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:44 +00:00
#define XFS_BB_NUM_BITS_CRC 9
#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
*/
#define XFS_BTREE_STATS_INC(cur, stat) \
XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
struct xfs_btree_ops {
/* size of the key and record structures */
size_t key_len;
size_t rec_len;
/* cursor operations */
struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
void (*update_cursor)(struct xfs_btree_cur *src,
struct xfs_btree_cur *dst);
/* update btree root pointer */
void (*set_root)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *nptr, int level_change);
/* block allocation / freeing */
int (*alloc_block)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *start_bno,
union xfs_btree_ptr *new_bno,
int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* update last record information */
void (*update_lastrec)(struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
union xfs_btree_rec *rec,
int ptr, int reason);
/* records in block/level */
int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
/* records on disk. Matter for the root in inode case. */
int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
/* init values of btree structures */
void (*init_key_from_rec)(union xfs_btree_key *key,
union xfs_btree_rec *rec);
void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec);
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
void (*init_high_key_from_rec)(union xfs_btree_key *key,
union xfs_btree_rec *rec);
/* difference between key value and cursor value */
int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
/*
* Difference between key2 and key1 -- positive if key1 > key2,
* negative if key1 < key2, and zero if equal.
*/
int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
union xfs_btree_key *key1,
union xfs_btree_key *key2);
const struct xfs_buf_ops *buf_ops;
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2);
/* check that r1 is lower than r2 */
int (*recs_inorder)(struct xfs_btree_cur *cur,
union xfs_btree_rec *r1,
union xfs_btree_rec *r2);
};
/*
* Reasons for the update_lastrec method to be called.
*/
#define LASTREC_UPDATE 0
#define LASTREC_INSREC 1
#define LASTREC_DELREC 2
union xfs_btree_irec {
struct xfs_alloc_rec_incore a;
struct xfs_bmbt_irec b;
struct xfs_inobt_rec_incore i;
struct xfs_rmap_irec r;
struct xfs_refcount_irec rc;
};
/* Per-AG btree private information. */
union xfs_btree_cur_private {
struct {
unsigned long nr_ops; /* # record updates */
int shape_changes; /* # of extent splits */
} refc;
};
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
*/
typedef struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
const struct xfs_btree_ops *bc_ops;
uint bc_flags; /* btree features - below */
union xfs_btree_irec bc_rec; /* current insert/search record value */
struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
uint8_t bc_nlevels; /* number of levels in the tree */
uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
int bc_statoff; /* offset of btre stats array */
union {
struct { /* needed for BNO, CNT, INO */
struct xfs_buf *agbp; /* agf/agi buffer pointer */
struct xfs_defer_ops *dfops; /* deferred updates */
xfs_agnumber_t agno; /* ag number */
union xfs_btree_cur_private priv;
} a;
struct { /* needed for BMAP */
struct xfs_inode *ip; /* pointer to our inode */
struct xfs_defer_ops *dfops; /* deferred updates */
xfs_fsblock_t firstblock; /* 1st blk allocated */
int allocated; /* count of alloced */
short forksize; /* fork's inode space */
char whichfork; /* data or attr fork */
char flags; /* flags */
xfs: skip bmbt block ino validation during owner change Extent swap uses xfs_btree_visit_blocks() to fix up bmbt block owners on v5 (!rmapbt) filesystems. The bmbt scan uses xfs_btree_lookup_get_block() to read bmbt blocks which verifies the current owner of the block against the parent inode of the bmbt. This works during extent swap because the bmbt owners are updated to the opposite inode number before the inode extent forks are swapped. The modified bmbt blocks are marked as ordered buffers which allows everything to commit in a single transaction. If the transaction commits to the log and the system crashes such that recovery of the extent swap is required, log recovery restarts the bmbt scan to fix up any bmbt blocks that may have not been written back before the crash. The log recovery bmbt scan occurs after the inode forks have been swapped, however. This causes the bmbt block owner verification to fail, leads to log recovery failure and requires xfs_repair to zap the log to recover. Define a new invalid inode owner flag to inform the btree block lookup mechanism that the current inode may be invalid with respect to the current owner of the bmbt block. Set this flag on the cursor used for change owner scans to allow this operation to work at runtime and during log recovery. Signed-off-by: Brian Foster <bfoster@redhat.com> Fixes: bb3be7e7c ("xfs: check for bogus values in btree block headers") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-08-29 17:08:39 +00:00
#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
} b;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
/* cursor flags */
#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
xfs: support btrees with overlapping intervals for keys On a filesystem with both reflink and reverse mapping enabled, it's possible to have multiple rmap records referring to the same blocks on disk. When overlapping intervals are possible, querying a classic btree to find all records intersecting a given interval is inefficient because we cannot use the left side of the search interval to filter out non-matching records the same way that we can use the existing btree key to filter out records coming after the right side of the search interval. This will become important once we want to use the rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl. (For the non-overlapping case, we can perform such queries trivially by starting at the left side of the interval and walking the tree until we pass the right side.) Therefore, extend the btree code to come closer to supporting intervals as a first-class record attribute. This involves widening the btree node's key space to store both the lowest key reachable via the node pointer (as the btree does now) and the highest key reachable via the same pointer and teaching the btree modifying functions to keep the highest-key records up to date. This behavior can be turned on via a new btree ops flag so that btrees that cannot store overlapping intervals don't pay the overhead costs in terms of extra code and disk format changes. When we're deleting a record in a btree that supports overlapped interval records and the deletion results in two btree blocks being joined, we defer updating the high/low keys until after all possible joining (at higher levels in the tree) have finished. At this point, the btree pointers at all levels have been updated to remove the empty blocks and we can update the low and high keys. When we're doing this, we must be careful to update the keys of all node pointers up to the root instead of stopping at the first set of keys that don't need updating. This is because it's possible for a single deletion to cause joining of multiple levels of tree, and so we need to update everything going back to the root. The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than, equal to, or greater than key2, respectively. This is consistent with the rest of the kernel and the C library. In btree_updkeys(), we need to evaluate the force_all parameter before running the key diff to avoid reading uninitialized memory when we're forcing a key update. This happens when we've allocated an empty slot at level N + 1 to point to a new block at level N and we're in the process of filling out the new keys. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
/*
* Convert from buffer to btree block header.
*/
#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
/*
* Internal long and short btree block checks. They return NULL if the
* block is ok or the address of the failed check otherwise.
*/
xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, int level, struct xfs_buf *bp);
xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, int level, struct xfs_buf *bp);
/*
* Check that block header is ok.
*/
int
xfs_btree_check_block(
struct xfs_btree_cur *cur, /* btree cursor */
struct xfs_btree_block *block, /* generic btree block pointer */
int level, /* level of the btree block */
struct xfs_buf *bp); /* buffer containing block, if any */
/*
* Check that (long) pointer is ok.
*/
bool /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_fsblock_t fsbno, /* btree block disk address */
int level); /* btree block level */
/*
* Check that (short) pointer is ok.
*/
bool /* error (0 or EFSCORRUPTED) */
xfs_btree_check_sptr(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t agbno, /* btree block disk address */
int level); /* btree block level */
/*
* Delete the btree cursor.
*/
void
xfs_btree_del_cursor(
xfs_btree_cur_t *cur, /* btree cursor */
int error); /* del because of error */
/*
* Duplicate the btree cursor.
* Allocate a new one, copy the record, re-get the buffers.
*/
int /* error */
xfs_btree_dup_cursor(
xfs_btree_cur_t *cur, /* input cursor */
xfs_btree_cur_t **ncur);/* output cursor */
/*
* Get a buffer for the block, return it with no data read.
* Long-form addressing.
*/
struct xfs_buf * /* buffer for fsbno */
xfs_btree_get_bufl(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t fsbno, /* file system block number */
uint lock); /* lock flags for get_buf */
/*
* Get a buffer for the block, return it with no data read.
* Short-form addressing.
*/
struct xfs_buf * /* buffer for agno/agbno */
xfs_btree_get_bufs(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_agnumber_t agno, /* allocation group number */
xfs_agblock_t agbno, /* allocation group block number */
uint lock); /* lock flags for get_buf */
/*
* Check for the cursor referring to the last block at the given level.
*/
int /* 1=is last block, 0=not last block */
xfs_btree_islastblock(
xfs_btree_cur_t *cur, /* btree cursor */
int level); /* level to check */
/*
* Compute first and last byte offsets for the fields given.
* Interprets the offsets table, which contains struct field offsets.
*/
void
xfs_btree_offsets(
int64_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last); /* output: last byte offset */
/*
* Get a buffer for the block, return it read in.
* Long-form addressing.
*/
int /* error */
xfs_btree_read_bufl(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t fsbno, /* file system block number */
uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
int refval, /* ref count value for buffer */
const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
void /* error */
xfs_btree_reada_bufl(
struct xfs_mount *mp, /* file system mount point */
xfs_fsblock_t fsbno, /* file system block number */
xfs_extlen_t count, /* count of filesystem blocks */
const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Short-form addressing.
*/
void /* error */
xfs_btree_reada_bufs(
struct xfs_mount *mp, /* file system mount point */
xfs_agnumber_t agno, /* allocation group number */
xfs_agblock_t agbno, /* allocation group block number */
xfs_extlen_t count, /* count of filesystem blocks */
const struct xfs_buf_ops *ops);
/*
* Initialise a new btree block header
*/
void
xfs_btree_init_block(
struct xfs_mount *mp,
struct xfs_buf *bp,
xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
__u64 owner,
unsigned int flags);
void
xfs_btree_init_block_int(
struct xfs_mount *mp,
struct xfs_btree_block *buf,
xfs_daddr_t blkno,
xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
__u64 owner,
unsigned int flags);
/*
* Common btree core entry points.
*/
int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
xfs: recovery of swap extents operations for CRC filesystems This is the recovery side of the btree block owner change operation performed by swapext on CRC enabled filesystems. We detect that an owner change is needed by the flag that has been placed on the inode log format flag field. Because the inode recovery is being replayed after the buffers that make up the BMBT in the given checkpoint, we can walk all the buffers and directly modify them when we see the flag set on an inode. Because the inode can be relogged and hence present in multiple chekpoints with the "change owner" flag set, we could do multiple passes across the inode to do this change. While this isn't optimal, we can't directly ignore the flag as there may be multiple independent swap extent operations being replayed on the same inode in different checkpoints so we can't ignore them. Further, because the owner change operation uses ordered buffers, we might have buffers that are newer on disk than the current checkpoint and so already have the owner changed in them. Hence we cannot just peek at a buffer in the tree and check that it has the correct owner and assume that the change was completed. So, for the moment just brute force the owner change every time we see an inode with the flag set. Note that we have to be careful here because the owner of the buffers may point to either the old owner or the new owner. Currently the verifier can't verify the owner directly, so there is no failure case here right now. If we verify the owner exactly in future, then we'll have to take this into account. This was tested in terms of normal operation via xfstests - all of the fsr tests now pass without failure. however, we really need to modify xfs/227 to stress v3 inodes correctly to ensure we fully cover this case for v5 filesystems. In terms of recovery testing, I used a hacked version of xfs_fsr that held the temp inode open for a few seconds before exiting so that the filesystem could be shut down with an open owner change recovery flags set on at least the temp inode. fsr leaves the temp inode unlinked and in btree format, so this was necessary for the owner change to be reliably replayed. logprint confirmed the tmp inode in the log had the correct flag set: INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88 INODE: #regs:3 ino:0x44 flags:0x209 dsize:88 ^^^^^ 0x200 is set, indicating a data fork owner change needed to be replayed on inode 0x44. A printk in the revoery code confirmed that the inode change was recovered: XFS (vdc): Mounting Filesystem XFS (vdc): Starting recovery (logdev: internal) recovering owner change ino 0x44 XFS (vdc): Version 5 superblock detected. This kernel L support enabled! Use of these features in this kernel is at your own risk! XFS (vdc): Ending recovery (logdev: internal) The script used to test this was: $ cat ./recovery-fsr.sh #!/bin/bash dev=/dev/vdc mntpt=/mnt/scratch testfile=$mntpt/testfile umount $mntpt mkfs.xfs -f -m crc=1 $dev mount $dev $mntpt chmod 777 $mntpt for i in `seq 10000 -1 0`; do xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1 done xfs_bmap -vp $testfile |head -20 xfs_fsr -d -v $testfile & sleep 10 /home/dave/src/xfstests-dev/src/godown -f $mntpt wait umount $mntpt xfs_logprint -t $dev |tail -20 time mount $dev $mntpt xfs_bmap -vp $testfile umount $mntpt $ Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:45 +00:00
struct list_head *buffer_list);
/*
* btree block CRC helpers
*/
void xfs_btree_lblock_calc_crc(struct xfs_buf *);
bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
void xfs_btree_sblock_calc_crc(struct xfs_buf *);
bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
*/
void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
* Helpers.
*/
static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
{
return be16_to_cpu(block->bb_numrecs);
}
static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
uint16_t numrecs)
{
block->bb_numrecs = cpu_to_be16(numrecs);
}
static inline int xfs_btree_get_level(struct xfs_btree_block *block)
{
return be16_to_cpu(block->bb_level);
}
/*
* Min and max functions for extlen, agblock, fileoff, and filblks types.
*/
#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
uint64_t owner);
xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
unsigned long len);
xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits,
unsigned long long len);
/* return codes */
#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec, void *priv);
int xfs_btree_query_range(struct xfs_btree_cur *cur,
union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn, void *priv);
int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
void *priv);
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
void *data);
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn, void *data);
int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
struct xfs_btree_block *block);
int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr);
int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b);
void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
union xfs_btree_ptr *ptr, int lr);
void xfs_btree_get_keys(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
union xfs_btree_irec *high, bool *exists);
#endif /* __XFS_BTREE_H__ */