diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index ce66894e6537..eeb2787e6f79 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -427,6 +427,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ + x(blocked_write_buffer_full) \ x(nocow_lock_contended) enum bch_time_stats { @@ -1122,6 +1123,16 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) #endif } +static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_going_ro, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget(&c->writes); +#endif +} + static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { #ifdef BCH_WRITE_REF_DEBUG diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 545df77bcd46..3d9393eecb93 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -2162,7 +2162,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(clock, 7) \ x(dev_usage, 8) \ x(log, 9) \ - x(overwrite, 10) + x(overwrite, 10) \ + x(write_buffer_keys, 11) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index abdf4fd10b6a..e52386fdc7ec 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -659,10 +659,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, i->k->k.needs_whiteout = false; } - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; - /* * Don't get journal reservation until after we know insert will * succeed: @@ -697,14 +693,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; - if (trans->nr_wb_updates) { - EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } - h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -757,7 +745,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_wb_update(trans, wb) { entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, + BCH_JSET_ENTRY_write_buffer_keys, wb->btree, 0, wb->k.k.u64s); bkey_copy((struct bkey_i *) entry->start, &wb->k); @@ -948,30 +936,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = bch2_trans_relock(trans); break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flush_lock); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - ret = bch2_trans_relock(trans); - } - } - break; - } default: BUG_ON(ret >= 0); break; @@ -1070,20 +1034,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; } - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&c->btree_write_buffer.flush_lock); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); trans->journal_u64s = trans->journal_entries_u64s; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 6ab26576252c..b19aeea5a80b 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,30 +7,25 @@ #include "btree_write_buffer.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" +#include #include static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); + +static inline int wb_key_cmp(const void *_l, const void *_r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + const struct wb_key_ref *l = _l; + const struct wb_key_ref *r = _r; - return cmp_int(l->btree, r->btree) ?: - bpos_cmp(l->k.k.p, r->k.k.p) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); -} - -static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) -{ - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; - - return cmp_int(l->journal_seq, r->journal_seq); + return cmp_int(l->hi, r->hi) ?: + cmp_int(l->mi, r->mi) ?: + cmp_int(l->lo, r->lo); } static noinline int wb_flush_one_slowpath(struct btree_trans *trans, @@ -59,6 +54,9 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite int ret; EBUG_ON(!wb->journal_seq); + EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; @@ -91,26 +89,6 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite return 0; } -static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) -{ - union btree_write_buffer_state old, new; - u64 v = READ_ONCE(wb->state.v); - - do { - old.v = new.v = v; - - new.nr = 0; - new.idx++; - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); - - while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) - cpu_relax(); - - smp_mb(); - - return old; -} - /* * Update a btree with a write buffered key using the journal seq of the * original write buffer insert. @@ -140,28 +118,79 @@ btree_write_buffered_insert(struct btree_trans *trans, return ret; } -int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) +static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) +{ + struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); + struct journal *j = &c->journal; + + if (!wb->inc.keys.nr) + return; + + bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); + darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { + swap(wb->flushing.keys, wb->inc.keys); + goto out; + } + + size_t nr = min(darray_room(wb->flushing.keys), + wb->sorted.size - wb->flushing.keys.nr); + nr = min(nr, wb->inc.keys.nr); + + memcpy(&darray_top(wb->flushing.keys), + wb->inc.keys.data, + sizeof(wb->inc.keys.data[0]) * nr); + + memmove(wb->inc.keys.data, + wb->inc.keys.data + nr, + sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); + + wb->flushing.keys.nr += nr; + wb->inc.keys.nr -= nr; +out: + if (!wb->inc.keys.nr) + bch2_journal_pin_drop(j, &wb->inc.pin); + else + bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, + bch2_btree_write_buffer_journal_flush); + + if (j->watermark) { + spin_lock(&j->lock); + bch2_journal_set_watermark(j); + spin_unlock(&j->lock); + } + + BUG_ON(wb->sorted.size < wb->flushing.keys.nr); +} + +static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *keys; + struct wb_key_ref *i; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + size_t skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; - union btree_write_buffer_state s; int ret = 0; - memset(&pin, 0, sizeof(pin)); + bch2_trans_unlock(trans); + bch2_trans_begin(trans); - bch2_journal_pin_copy(j, &pin, &wb->journal_pin, - bch2_btree_write_buffer_journal_flush); - bch2_journal_pin_drop(j, &wb->journal_pin); + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); + mutex_unlock(&wb->inc.lock); - s = btree_write_buffer_switch(wb); - keys = wb->keys[s.idx]; - nr = s.nr; + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { + wb->sorted.data[i].idx = i; + wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; + wb->sorted.data[i].pos = wb->flushing.keys.data[i].k.k.p; + } + wb->sorted.nr = wb->flushing.keys.nr; /* * We first sort so that we can detect and skip redundant updates, and @@ -177,32 +206,46 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. */ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_key_cmp, NULL); + sort(wb->sorted.data, wb->sorted.nr, + sizeof(wb->sorted.data[0]), + wb_key_cmp, NULL); - for (i = keys; i < keys + nr; i++) { - if (i + 1 < keys + nr && + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); + + BUG_ON(!k->journal_seq); + + if (i + 1 < &darray_top(wb->sorted) && i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) { + bpos_eq(i[0].pos, i[1].pos)) { + struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; + skipped++; - i->journal_seq = 0; + n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); + k->journal_seq = 0; continue; } - if (write_locked && - (iter.path->btree_id != i->btree || - bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); - write_locked = false; + if (write_locked) { + struct btree_path *path = iter.path; + + if (path->btree_id != i->btree || + bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + write_locked = false; + } } - if (!iter.path || iter.path->btree_id != i->btree) { + if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } - bch2_btree_iter_set_pos(&iter, i->k.k.p); + bch2_btree_iter_set_pos(&iter, k->k.k.p); iter.path->preserve = false; do { @@ -211,13 +254,13 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) break; } - ret = wb_flush_one(trans, &iter, i, &write_locked, &fast); + ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); if (!ret) { - i->journal_seq = 0; + k->journal_seq = 0; } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; ret = 0; @@ -239,18 +282,17 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) * The fastpath zapped the seq of keys that were successfully flushed so * we can skip those here. */ - trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, nr); + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_journal_cmp, - NULL); - - for (i = keys; i < keys + nr; i++) { + struct btree_write_buffered_key *i; + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; - bch2_journal_pin_update(j, i->journal_seq, &pin, - bch2_btree_write_buffer_journal_flush); + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + bch2_trans_begin(trans); ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| @@ -265,26 +307,67 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) } err: bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); - trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - bch2_journal_pin_drop(j, &pin); + trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; return ret; } +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +{ + struct journal *j = &c->journal; + struct journal_buf *buf; + int ret = 0; + + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + ret = bch2_journal_keys_to_write_buffer(c, buf); + mutex_unlock(&j->buf_lock); + } + + return ret; +} + +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0, fetch_from_journal_err; + + do { + bch2_trans_unlock(trans); + + fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + + /* + * On memory allocation failure, bch2_btree_write_buffer_flush_locked() + * is not guaranteed to empty wb->inc: + */ + mutex_lock(&wb->flushing.lock); + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } while (!ret && + (fetch_from_journal_err || + (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); + + return ret; +} + +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); +} + int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; - trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - bch2_trans_unlock(trans); - mutex_lock(&c->btree_write_buffer.flush_lock); - int ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&c->btree_write_buffer.flush_lock); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); - return ret; + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); } int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) @@ -293,9 +376,9 @@ int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) struct btree_write_buffer *wb = &c->btree_write_buffer; int ret = 0; - if (mutex_trylock(&wb->flush_lock)) { + if (mutex_trylock(&wb->flushing.lock)) { ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flush_lock); + mutex_unlock(&wb->flushing.lock); } return ret; @@ -313,90 +396,179 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) +static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; - mutex_lock(&wb->flush_lock); - int ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - mutex_unlock(&wb->flush_lock); + mutex_lock(&wb->flushing.lock); + do { + ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); + } while (!ret && bch2_btree_write_buffer_should_flush(c)); + mutex_unlock(&wb->flushing.lock); - return ret; + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); } -static inline u64 btree_write_buffer_ref(int idx) +int __bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) { - return ((union btree_write_buffer_state) { - .ref0 = idx == 0, - .ref1 = idx == 1, - }).v; -} - -int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key *i; - union btree_write_buffer_state old, new; - int ret = 0; - u64 v; + int ret; +retry: + ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); + if (!ret && dst->wb == &wb->flushing) + ret = darray_resize(&wb->sorted, wb->flushing.keys.size); - trans_for_each_wb_update(trans, i) { - EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + if (unlikely(ret)) { + if (dst->wb == &c->btree_write_buffer.flushing) { + mutex_unlock(&dst->wb->lock); + dst->wb = &c->btree_write_buffer.inc; + bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, + bch2_btree_write_buffer_journal_flush); + goto retry; + } - i->journal_seq = trans->journal_res.seq; - i->journal_offset = trans->journal_res.offset; + return ret; } - preempt_disable(); - v = READ_ONCE(wb->state.v); - do { - old.v = new.v = v; + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + BUG_ON(!dst->room); + BUG_ON(!dst->seq); - new.v += btree_write_buffer_ref(new.idx); - new.nr += trans->nr_wb_updates; - if (new.nr > wb->size) { - ret = -BCH_ERR_btree_insert_need_flush_buffer; - goto out; + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} + +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (mutex_trylock(&wb->flushing.lock)) { + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); + + /* + * Attempt to skip wb->inc, and add keys directly to + * wb->flushing, saving us a copy later: + */ + + if (!wb->inc.keys.nr) { + dst->wb = &wb->flushing; + } else { + mutex_unlock(&wb->flushing.lock); + dst->wb = &wb->inc; } - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + } else { + mutex_lock(&wb->inc.lock); + dst->wb = &wb->inc; + } - memcpy(wb->keys[new.idx] + old.nr, - trans->wb_updates, - sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + dst->seq = seq; - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); +} - atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (!dst->wb->keys.nr) + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); + + if (bch2_btree_write_buffer_should_flush(c) && + __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + + if (dst->wb == &wb->flushing) + mutex_unlock(&wb->flushing.lock); + mutex_unlock(&wb->inc.lock); +} + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + struct jset_entry *entry; + struct bkey_i *k; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + buf->need_flush_to_write_buffer = false; out: - preempt_enable(); + bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; } +static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) +{ + if (wb->keys.size >= new_size) + return 0; + + if (!mutex_trylock(&wb->lock)) + return -EINTR; + + int ret = darray_resize(&wb->keys, new_size); + mutex_unlock(&wb->lock); + return ret; +} + +int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb_keys_resize(&wb->flushing, new_size) ?: + wb_keys_resize(&wb->inc, new_size); +} + void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && + !bch2_journal_error(&c->journal)); - kvfree(wb->keys[1]); - kvfree(wb->keys[0]); + darray_exit(&wb->sorted); + darray_exit(&wb->flushing.keys); + darray_exit(&wb->inc.keys); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - mutex_init(&wb->flush_lock); - wb->size = c->opts.btree_write_buffer_size; + mutex_init(&wb->inc.lock); + mutex_init(&wb->flushing.lock); + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); - wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); - wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); - if (!wb->keys[0] || !wb->keys[1]) - return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + /* Will be resized by journal as needed: */ + unsigned initial_size = 1 << 16; - return 0; + return darray_make_room(&wb->inc.keys, initial_size) ?: + darray_make_room(&wb->flushing.keys, initial_size) ?: + darray_make_room(&wb->sorted, initial_size); } diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index dec2c9a8bab2..1f645f529ed2 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -2,13 +2,59 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H #define _BCACHEFS_BTREE_WRITE_BUFFER_H -int bch2_btree_write_buffer_flush_locked(struct btree_trans *); -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); +#include "bkey.h" + +static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; +} + +static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; +} + +struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); -int bch2_btree_insert_keys_write_buffer(struct btree_trans *); +struct journal_keys_to_wb { + struct btree_write_buffer_keys *wb; + size_t room; + u64 seq; +}; +int __bch2_journal_key_to_wb(struct bch_fs *, + struct journal_keys_to_wb *, + enum btree_id, struct bkey_i *); + +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + if (unlikely(!dst->room)) + return __bch2_journal_key_to_wb(c, dst, btree, k); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} + +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); + +int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h index 99993ba77aea..8758d6adabf4 100644 --- a/fs/bcachefs/btree_write_buffer_types.h +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -2,43 +2,56 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) +struct wb_key_ref { +union { + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned idx:24; + struct bpos pos; + enum btree_id btree:8; +#else + enum btree_id btree:8; + struct bpos pos; + unsigned idx:24; +#endif + } __packed; + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + u64 lo; + u64 mi; + u64 hi; +#else + u64 hi; + u64 mi; + u64 lo; +#endif + }; +}; +}; + struct btree_write_buffered_key { - u64 journal_seq; - unsigned journal_offset; - enum btree_id btree; + enum btree_id btree:8; + u64 journal_seq:56; __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); }; -union btree_write_buffer_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 nr:23; - u64 idx:1; - u64 ref0:20; - u64 ref1:20; - }; +struct btree_write_buffer_keys { + DARRAY(struct btree_write_buffered_key) keys; + struct journal_entry_pin pin; + struct mutex lock; }; struct btree_write_buffer { - struct mutex flush_lock; - struct journal_entry_pin journal_pin; - - union btree_write_buffer_state state; - size_t size; - - struct btree_write_buffered_key *keys[2]; + DARRAY(struct wb_key_ref) sorted; + struct btree_write_buffer_keys inc; + struct btree_write_buffer_keys flushing; + struct work_struct flush_work; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1a3303c65961..76163c2ea3b3 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ac6201403fbe..c39bea441983 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -151,7 +151,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 6d56a71243bd..8294d7fd6632 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -10,6 +10,7 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -332,6 +333,7 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; + buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -768,6 +770,48 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret = NULL; + + mutex_lock(&j->buf_lock); + spin_lock(&j->lock); + max_seq = min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= max_seq; + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + idx; + + if (buf->need_flush_to_write_buffer) { + if (seq == journal_cur_seq(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + union journal_res_state s; + s.v = atomic64_read_acquire(&j->reservations.counter); + + ret = journal_state_count(s, idx) + ? ERR_PTR(-EAGAIN) + : buf; + break; + } + } + + spin_unlock(&j->lock); + if (IS_ERR_OR_NULL(ret)) + mutex_unlock(&j->buf_lock); + return ret; +} + +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index a1384433d193..1e14e6b324f8 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -425,6 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index afcb2a435956..4d8e10c901a8 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -723,6 +724,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -1503,6 +1520,8 @@ done: static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1510,6 +1529,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; + size_t btree_write_buffer_size = new_size / 64; + + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) + return; + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1703,9 +1727,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct jset_entry *start, *end, *i, *next, *prev = NULL; struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - bool validate_before_checksum = false; unsigned long btree_roots_have = 0; + bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); int ret; /* @@ -1733,9 +1759,28 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we have to get any missing btree roots and * add them to this journal entry: */ - if (i->type == BCH_JSET_ENTRY_btree_root) { + switch (i->type) { + case BCH_JSET_ENTRY_btree_root: bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); + break; + case BCH_JSET_ENTRY_write_buffer_keys: + EBUG_ON(!w->need_flush_to_write_buffer); + + if (!wb.wb) + bch2_journal_keys_to_write_buffer_start(c, &wb, seq); + + struct bkey_i *k; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); + if (ret) { + bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_journal_keys_to_write_buffer_end(c, &wb); + return ret; + } + } + i->type = BCH_JSET_ENTRY_btree_keys; + break; } /* Can we merge with previous entry? */ @@ -1758,6 +1803,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) memmove_u64s_down(prev, i, jset_u64s(u64s)); } + if (wb.wb) + bch2_journal_keys_to_write_buffer_end(c, &wb); + w->need_flush_to_write_buffer = false; + prev = prev ? vstruct_next(prev) : jset->start; jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); @@ -1765,8 +1814,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1789,7 +1837,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = le64_to_cpu(jset->seq); + j->last_empty_seq = seq; if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index dc415e0ec493..60b9d3572387 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -50,20 +51,23 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static inline void journal_set_watermark(struct journal *j) +void bch2_journal_set_watermark(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool low_on_space = j->space[journal_space_clean].total * 4 <= j->space[journal_space_total].total; bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - unsigned watermark = low_on_space || low_on_pin + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); + unsigned watermark = low_on_space || low_on_pin || low_on_wb ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], &j->low_on_space_start, low_on_space) || track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin)) + &j->low_on_pin_start, low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], + &j->write_buffer_full_start, low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -230,7 +234,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - journal_set_watermark(j); + bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h index 7b15d682a0f5..ec84c3345281 100644 --- a/fs/bcachefs/journal_reclaim.h +++ b/fs/bcachefs/journal_reclaim.h @@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); +void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 5c8d3a8ec4df..38817c7a0851 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -36,6 +36,7 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; + bool need_flush_to_write_buffer; }; /* @@ -276,6 +277,7 @@ struct journal { u64 low_on_space_start; u64 low_on_pin_start; u64 max_in_flight_start; + u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 7f9e3001bf55..cf69b92cbd03 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -233,11 +233,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(btree_write_buffer_size, u32, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(16, (1U << 20) - 1), \ - BCH2_NO_SB_OPT, 1U << 13, \ - NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index c7c7d4a11eb9..88a762bce7da 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -363,7 +363,8 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.state.nr); + BUG_ON(c->btree_write_buffer.inc.keys.nr); + BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c);