mirror of
https://github.com/torvalds/linux.git
synced 2024-11-23 12:42:02 +00:00
a8b3a677e7
This adds support for nocow mode, where we do writes in-place when possible. Patch components: - New boolean filesystem and inode option, nocow: note that when nocow is enabled, data checksumming and compression are implicitly disabled - To prevent in-place writes from racing with data moves (data_update.c) or bucket reuse (i.e. a bucket being reused and re-allocated while a nocow write is in flight), we have a new locking mechanism. Buckets can be locked for either data update or data move, using a fixed size hash table of two_state_shared locks. We don't have any chaining, meaning updates and moves to different buckets that hash to the same lock will wait unnecessarily - we'll want to watch for this becoming an issue. - The allocator path also needs to check for in-place writes in flight to a given bucket before giving it out: thus we add another counter to bucket_alloc_state so we can track this. - Fsync now may need to issue cache flushes to block devices instead of flushing the journal. We add a device bitmask to bch_inode_info, ei_devs_need_flush, which tracks devices that need to have flushes issued - note that this will lead to unnecessary flushes when other codepaths have already issued flushes; we may want to replace this with a sequence number. - New nocow write path: look up extents, and if they're writable write to them - otherwise fall back to the normal COW write path. XXX: switch to sequence numbers instead of bitmask for devs needing journal flush XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to run in process context - see if we can improve this Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
206 lines
5.4 KiB
C
206 lines
5.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _BCACHEFS_FS_H
|
|
#define _BCACHEFS_FS_H
|
|
|
|
#include "inode.h"
|
|
#include "opts.h"
|
|
#include "str_hash.h"
|
|
#include "quota_types.h"
|
|
#include "two_state_shared_lock.h"
|
|
|
|
#include <linux/seqlock.h>
|
|
#include <linux/stat.h>
|
|
|
|
struct bch_inode_info {
|
|
struct inode v;
|
|
unsigned long ei_flags;
|
|
|
|
struct mutex ei_update_lock;
|
|
u64 ei_quota_reserved;
|
|
unsigned long ei_last_dirtied;
|
|
two_state_lock_t ei_pagecache_lock;
|
|
|
|
struct mutex ei_quota_lock;
|
|
struct bch_qid ei_qid;
|
|
|
|
u32 ei_subvol;
|
|
|
|
/*
|
|
* When we've been doing nocow writes we'll need to issue flushes to the
|
|
* underlying block devices
|
|
*
|
|
* XXX: a device may have had a flush issued by some other codepath. It
|
|
* would be better to keep for each device a sequence number that's
|
|
* incremented when we isusue a cache flush, and track here the sequence
|
|
* number that needs flushing.
|
|
*/
|
|
struct bch_devs_mask ei_devs_need_flush;
|
|
|
|
/* copy of inode in btree: */
|
|
struct bch_inode_unpacked ei_inode;
|
|
};
|
|
|
|
/*
 * Wrappers around the two-state lock embedded in a bch_inode_info
 * (ei_pagecache_lock).
 *
 * The add_* helpers operate on state 0 and the block_* helpers on state 1.
 * NOTE(review): presumably "add" means adding pages to the page cache and
 * "block" means blocking such additions - confirm against the callers.
 *
 * @i is a pointer to a struct bch_inode_info. It is parenthesized in the
 * expansion so that an expression argument (e.g. "p + 1" or "cond ? a : b")
 * binds as a whole before "->" is applied.
 */
#define bch2_pagecache_add_put(i)	bch2_two_state_unlock(&(i)->ei_pagecache_lock, 0)
#define bch2_pagecache_add_tryget(i)	bch2_two_state_trylock(&(i)->ei_pagecache_lock, 0)
#define bch2_pagecache_add_get(i)	bch2_two_state_lock(&(i)->ei_pagecache_lock, 0)

#define bch2_pagecache_block_put(i)	bch2_two_state_unlock(&(i)->ei_pagecache_lock, 1)
#define bch2_pagecache_block_get(i)	bch2_two_state_lock(&(i)->ei_pagecache_lock, 1)
|
|
|
|
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
|
|
{
|
|
return (subvol_inum) {
|
|
.subvol = inode->ei_subvol,
|
|
.inum = inode->ei_inode.bi_inum,
|
|
};
|
|
}
|
|
|
|
/*
 * EI_INODE_ERROR: set if we've gotten a btree error for this inode, meaning
 * the vfs inode and the btree inode may be inconsistent.
 */
#define EI_INODE_ERROR			0

/*
 * EI_INODE_SNAPSHOT: set if the inode is in a snapshot subvolume - we don't
 * do quota accounting in those.
 */
#define EI_INODE_SNAPSHOT		1
|
|
|
|
/*
 * Convert a pointer to the embedded VFS inode (member "v") into a pointer to
 * the containing struct bch_inode_info, using container_of_or_null().
 * NOTE(review): going by the helper's name a NULL @_inode yields NULL -
 * confirm against its definition.
 */
#define to_bch_ei(_inode)	container_of_or_null(_inode, struct bch_inode_info, v)
|
|
|
|
/*
 * Compare two pointers with cmp_int(). Passed to bubble_sort() by
 * bch2_lock_inodes() and bch2_unlock_inodes() to order the inode pointers.
 */
static inline int ptrcmp(void *l, void *r)
{
	int order = cmp_int(l, r);

	return order;
}
|
|
|
|
/*
 * Bitmask selecting which per-inode locks bch2_lock_inodes() and
 * bch2_unlock_inodes() operate on; the values may be OR-ed together.
 */
enum bch_inode_lock_op {
	INODE_LOCK		= 1U << 0,	/* v.i_rwsem, held for write */
	INODE_PAGECACHE_BLOCK	= 1U << 1,	/* ei_pagecache_lock, state 1 */
	INODE_UPDATE_LOCK	= 1U << 2,	/* ei_update_lock */
};
|
|
|
|
/*
 * Take the locks selected by @_locks (a mask of enum bch_inode_lock_op bits)
 * on each struct bch_inode_info pointer passed in the variadic arguments.
 *
 * The pointers are sorted with ptrcmp before any lock is taken, and an entry
 * equal to its predecessor is skipped, so an inode passed more than once is
 * only locked once. Slot 0 holds a NULL sentinel that is never locked.
 * NOTE(review): an argument that is NULL and sorts into slot 1 therefore
 * compares equal to the sentinel and is skipped too - this assumes
 * bubble_sort() sorts ascending; confirm.
 *
 * For each inode the order is: i_rwsem (write), pagecache block lock,
 * ei_update_lock. The array index is passed as the nesting subclass to
 * down_write_nested() / mutex_lock_nested().
 *
 * @_locks is evaluated several times; pass a side-effect-free expression.
 */
#define bch2_lock_inodes(_locks, ...)					\
do {									\
	struct bch_inode_info *_inodes[] = { NULL, __VA_ARGS__ };	\
	unsigned _idx;							\
									\
	bubble_sort(&_inodes[1], ARRAY_SIZE(_inodes) - 1, ptrcmp);	\
									\
	for (_idx = 1; _idx < ARRAY_SIZE(_inodes); _idx++) {		\
		/* same inode as the previous slot: already handled */	\
		if (_inodes[_idx] == _inodes[_idx - 1])			\
			continue;					\
									\
		if ((_locks) & INODE_LOCK)				\
			down_write_nested(&_inodes[_idx]->v.i_rwsem, _idx); \
		if ((_locks) & INODE_PAGECACHE_BLOCK)			\
			bch2_pagecache_block_get(_inodes[_idx]);	\
		if ((_locks) & INODE_UPDATE_LOCK)			\
			mutex_lock_nested(&_inodes[_idx]->ei_update_lock, _idx); \
	}								\
} while (0)
|
|
|
|
/*
 * Release the locks selected by @_locks (a mask of enum bch_inode_lock_op
 * bits) on each struct bch_inode_info pointer passed in the variadic
 * arguments; the counterpart of bch2_lock_inodes().
 *
 * The pointers are sorted with ptrcmp and an entry equal to its predecessor
 * is skipped, so an inode passed more than once is only unlocked once. Slot 0
 * holds a NULL sentinel that is never touched.
 *
 * For each inode the order is: i_rwsem, pagecache block lock, ei_update_lock.
 *
 * @_locks is evaluated several times; pass a side-effect-free expression.
 */
#define bch2_unlock_inodes(_locks, ...)					\
do {									\
	struct bch_inode_info *_inodes[] = { NULL, __VA_ARGS__ };	\
	unsigned _idx;							\
									\
	bubble_sort(&_inodes[1], ARRAY_SIZE(_inodes) - 1, ptrcmp);	\
									\
	for (_idx = 1; _idx < ARRAY_SIZE(_inodes); _idx++) {		\
		/* same inode as the previous slot: already handled */	\
		if (_inodes[_idx] == _inodes[_idx - 1])			\
			continue;					\
									\
		if ((_locks) & INODE_LOCK)				\
			up_write(&_inodes[_idx]->v.i_rwsem);		\
		if ((_locks) & INODE_PAGECACHE_BLOCK)			\
			bch2_pagecache_block_put(_inodes[_idx]);	\
		if ((_locks) & INODE_UPDATE_LOCK)			\
			mutex_unlock(&_inodes[_idx]->ei_update_lock);	\
	}								\
} while (0)
|
|
|
|
/*
 * Return the bch_inode_info for @file: file_inode() gives the VFS inode and
 * to_bch_ei() converts that to the containing bcachefs inode.
 */
static inline struct bch_inode_info *file_bch_inode(struct file *file)
{
	struct inode *vfs_inode = file_inode(file);

	return to_bch_ei(vfs_inode);
}
|
|
|
|
static inline bool inode_attr_changing(struct bch_inode_info *dir,
|
|
struct bch_inode_info *inode,
|
|
enum inode_opt_id id)
|
|
{
|
|
return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
|
|
bch2_inode_opt_get(&dir->ei_inode, id) !=
|
|
bch2_inode_opt_get(&inode->ei_inode, id);
|
|
}
|
|
|
|
static inline bool inode_attrs_changing(struct bch_inode_info *dir,
|
|
struct bch_inode_info *inode)
|
|
{
|
|
unsigned id;
|
|
|
|
for (id = 0; id < Inode_opt_nr; id++)
|
|
if (inode_attr_changing(dir, inode, id))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
struct bch_inode_unpacked;
|
|
|
|
#ifndef NO_BCACHEFS_FS
|
|
|
|
struct bch_inode_info *
|
|
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
|
|
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
|
|
|
|
int bch2_fs_quota_transfer(struct bch_fs *,
|
|
struct bch_inode_info *,
|
|
struct bch_qid,
|
|
unsigned,
|
|
enum quota_acct_mode);
|
|
|
|
static inline int bch2_set_projid(struct bch_fs *c,
|
|
struct bch_inode_info *inode,
|
|
u32 projid)
|
|
{
|
|
struct bch_qid qid = inode->ei_qid;
|
|
|
|
qid.q[QTYP_PRJ] = projid;
|
|
|
|
return bch2_fs_quota_transfer(c, inode, qid,
|
|
1 << QTYP_PRJ,
|
|
KEY_TYPE_QUOTA_PREALLOC);
|
|
}
|
|
|
|
struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
|
|
|
|
/* returns 0 if we want to do the update, or error is passed up */
|
|
typedef int (*inode_set_fn)(struct bch_inode_info *,
|
|
struct bch_inode_unpacked *, void *);
|
|
|
|
void bch2_inode_update_after_write(struct btree_trans *,
|
|
struct bch_inode_info *,
|
|
struct bch_inode_unpacked *,
|
|
unsigned);
|
|
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
|
|
inode_set_fn, void *, unsigned);
|
|
|
|
int bch2_setattr_nonsize(struct mnt_idmap *,
|
|
struct bch_inode_info *,
|
|
struct iattr *);
|
|
int __bch2_unlink(struct inode *, struct dentry *, bool);
|
|
|
|
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
|
|
|
|
void bch2_vfs_exit(void);
|
|
int bch2_vfs_init(void);
|
|
|
|
#else
|
|
|
|
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
|
|
snapshot_id_list *s) {}
|
|
/* NO_BCACHEFS_FS build: stub that does nothing. */
static inline void bch2_vfs_exit(void)
{
}

/* NO_BCACHEFS_FS build: stub that always reports success (0). */
static inline int bch2_vfs_init(void)
{
	return 0;
}
|
|
|
|
#endif /* NO_BCACHEFS_FS */
|
|
|
|
#endif /* _BCACHEFS_FS_H */
|