bcachefs: Journal reclaim refactoring

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-11-22 20:22:09 +00:00 · 2019-02-21 13:33:21 -05:00 · 2019-02-21 13:33:21 -05:00 · e5a66496a0
commit e5a66496a0
parent 2d3b581039
6 changed files with 293 additions and 281 deletions
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@ -84,17 +84,12 @@ void bch2_journal_halt(struct journal *j)
 	journal_wake(j);
 	closure_wake_up(&journal_cur_buf(j)->wait);
 	closure_wake_up(&journal_prev_buf(j)->wait);
 }
 /* journal entry close/open: */
 void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
 {
 	struct journal_buf *w = journal_prev_buf(j);
 	atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
 	if (!need_write_just_set &&
 	    test_bit(JOURNAL_NEED_WRITE, &j->flags))
 		bch2_time_stats_update(j->delay_time,
@ -175,7 +170,6 @@ static bool __journal_entry_close(struct journal *j)
 	 * Hence, we want update/set last_seq on the current journal entry right
 	 * before we open a new one:
 	 */
 	bch2_journal_reclaim_fast(j);
 	buf->data->last_seq	= cpu_to_le64(journal_last_seq(j));
 	if (journal_entry_empty(buf->data))
@ -189,8 +183,8 @@ static bool __journal_entry_close(struct journal *j)
 	cancel_delayed_work(&j->write_work);
-	/* ugh - might be called from __journal_res_get() under wait_event() */
+	bch2_journal_space_available(j);
-	__set_current_state(TASK_RUNNING);
+
 	bch2_journal_buf_put(j, old.idx, set_need_write);
 	return true;
 }
@ -220,7 +214,7 @@ static int journal_entry_open(struct journal *j)
 {
 	struct journal_buf *buf = journal_cur_buf(j);
 	union journal_res_state old, new;
-	int u64s, ret;
+	int u64s;
 	u64 v;
 	lockdep_assert_held(&j->lock);
@ -229,12 +223,10 @@ static int journal_entry_open(struct journal *j)
 	if (j->blocked)
 		return -EAGAIN;
-	if (!fifo_free(&j->pin))
+	if (j->cur_entry_error)
-		return -ENOSPC;
+		return j->cur_entry_error;
-	ret = bch2_journal_space_available(j);
+	BUG_ON(!j->cur_entry_sectors);
 	if (ret)
 		return ret;
 	buf->u64s_reserved	= j->entry_u64s_reserved;
 	buf->disk_sectors	= j->cur_entry_sectors;
@ -411,7 +403,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
 {
 	int ret;
-	wait_event(j->wait,
+	closure_wait_event(&j->async_wait,
 		   (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
 		   (flags & JOURNAL_RES_GET_NONBLOCK));
 	return ret;
@ -969,6 +961,7 @@ void bch2_fs_journal_start(struct journal *j)
 	c->last_bucket_seq_cleanup = journal_cur_seq(j);
 	bch2_journal_space_available(j);
 	spin_unlock(&j->lock);
 	/*
@ -1144,9 +1137,12 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 		pr_buf(&out,
 		       "dev %u:\n"
 		       "\tnr\t\t%u\n"
 		       "\tavailable\t%u:%u\n"
 		       "\tcur_idx\t\t%u (seq %llu)\n"
 		       "\tlast_idx\t%u (seq %llu)\n",
 		       iter, ja->nr,
 		       bch2_journal_dev_buckets_available(j, ja),
 		       ja->sectors_free,
 		       ja->cur_idx,	ja->bucket_seq[ja->cur_idx],
 		       ja->last_idx,	ja->bucket_seq[ja->last_idx]);
 	}
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@ -825,7 +825,6 @@ fsck_err:
 int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
 {
 	struct journal *j = &c->journal;
 	struct journal_entry_pin_list *pin_list;
 	struct bkey_i *k, *_n;
 	struct jset_entry *entry;
 	struct journal_replay *i, *n;
@ -867,10 +866,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
 			cond_resched();
 		}
-		pin_list = journal_seq_pin(j, j->replay_journal_seq);
+		bch2_journal_pin_put(j, j->replay_journal_seq);
 		if (atomic_dec_and_test(&pin_list->count))
 			journal_wake(j);
 	}
 	j->replay_journal_seq = 0;
@ -885,101 +881,6 @@ err:
 /* journal write: */
 /*
  * Number of journal buckets on @ja that may still be used after the current
  * one: the circular distance from the bucket following cur_idx up to
  * last_idx, reduced by the reserves described in the body.
  */
 static unsigned journal_dev_buckets_available(struct journal *j,
 					      struct journal_device *ja)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	/* Index of the bucket following the one at cur_idx (wraps at ja->nr): */
 	unsigned next = (ja->cur_idx + 1) % ja->nr;
 	/* NOTE: both expressions divide by ja->nr, so it must be nonzero here */
 	unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
 	/*
 	 * Allocator startup needs some journal space before we can do journal
 	 * replay:
 	 */
 	if (available &&
 	    test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
 		available--;
 	/*
 	 * Don't use the last bucket unless writing the new last_seq
 	 * will make another bucket available:
 	 */
 	if (available &&
 	    journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
 		--available;
 	return available;
 }
 /*
  * Recompute how many sectors the next journal entry may use and store the
  * result in j->cur_entry_sectors. Must be called with j->lock held.
  *
  * Returns 0 on success, -EROFS when fewer journal devices are online than
  * metadata_replicas_required, or -ENOSPC when fewer devices have usable space
  * than min(nr_online, metadata_replicas). On error cur_entry_sectors is 0.
  */
 int bch2_journal_space_available(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
 	unsigned sectors_next_entry	= UINT_MAX;
 	/* nr_online: devices with journal buckets; nr_devs: those with room */
 	unsigned i, nr_online = 0, nr_devs = 0;
 	/* Size of the previous buffer if it has not been written out yet: */
 	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
 		? journal_prev_buf(j)->sectors
 		: 0;
 	int ret = 0;
 	lockdep_assert_held(&j->lock);
 	rcu_read_lock();
 	for_each_member_device_rcu(ca, c, i,
 				   &c->rw_devs[BCH_DATA_JOURNAL]) {
 		struct journal_device *ja = &ca->journal;
 		unsigned buckets_this_device, sectors_this_device;
 		/* Device has no journal buckets: */
 		if (!ja->nr)
 			continue;
 		nr_online++;
 		buckets_this_device = journal_dev_buckets_available(j, ja);
 		sectors_this_device = ja->sectors_free;
 		/*
 		 * Note that we don't allocate the space for a journal entry
 		 * until we write it out - thus, account for it here:
 		 */
 		if (unwritten_sectors >= sectors_this_device) {
 			/* Pending write needs a fresh bucket; skip if none left */
 			if (!buckets_this_device)
 				continue;
 			buckets_this_device--;
 			sectors_this_device = ca->mi.bucket_size;
 		}
 		sectors_this_device -= unwritten_sectors;
 		/* With a spare bucket the next entry may use a whole bucket: */
 		if (buckets_this_device)
 			sectors_this_device = ca->mi.bucket_size;
 		if (!sectors_this_device)
 			continue;
 		/* The entry size is limited by the most constrained device: */
 		sectors_next_entry = min(sectors_next_entry,
 					 sectors_this_device);
 		nr_devs++;
 	}
 	rcu_read_unlock();
 	if (nr_online < c->opts.metadata_replicas_required) {
 		ret = -EROFS;
 		sectors_next_entry = 0;
 	} else if (!sectors_next_entry ||
 		   nr_devs < min_t(unsigned, nr_online,
 				   c->opts.metadata_replicas)) {
 		ret = -ENOSPC;
 		sectors_next_entry = 0;
 	}
 	WRITE_ONCE(j->cur_entry_sectors, sectors_next_entry);
 	return ret;
 }
 static void __journal_write_alloc(struct journal *j,
 				  struct journal_buf *w,
 				  struct dev_alloc_list *devs_sorted,
@ -1053,7 +954,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
 					  &c->rw_devs[BCH_DATA_JOURNAL]);
 	spin_lock(&j->lock);
 	__journal_write_alloc(j, w, &devs_sorted,
 			      sectors, &replicas, replicas_want);
@ -1069,7 +969,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 		if (sectors > ja->sectors_free &&
 		    sectors <= ca->mi.bucket_size &&
-		    journal_dev_buckets_available(j, ja)) {
+		    bch2_journal_dev_buckets_available(j, ja)) {
 			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 			ja->sectors_free = ca->mi.bucket_size;
 		}
@ -1078,7 +978,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 	__journal_write_alloc(j, w, &devs_sorted,
 			      sectors, &replicas, replicas_want);
 done:
 	spin_unlock(&j->lock);
 	rcu_read_unlock();
 	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
@ -1237,6 +1136,9 @@ void bch2_journal_write(struct closure *cl)
 	struct bch_extent_ptr *ptr;
 	bool validate_before_checksum = false;
 	unsigned i, sectors, bytes, u64s;
 	int ret;
 	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
 	journal_buf_realloc(j, w);
 	jset = w->data;
@ -1293,7 +1195,23 @@ void bch2_journal_write(struct closure *cl)
 	bytes = vstruct_bytes(jset);
 	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
-	if (journal_write_alloc(j, w, sectors)) {
+	spin_lock(&j->lock);
 	ret = journal_write_alloc(j, w, sectors);
 	/*
 	 * write is allocated, no longer need to account for it in
 	 * bch2_journal_space_available():
 	 */
 	w->sectors = 0;
 	/*
 	 * journal entry has been compacted and allocated, recalculate space
 	 * available:
 	 */
 	bch2_journal_space_available(j);
 	spin_unlock(&j->lock);
 	if (ret) {
 		bch2_journal_halt(j);
 		bch_err(c, "Unable to allocate journal write");
 		bch2_fatal_error(c);
@ -1301,12 +1219,6 @@ void bch2_journal_write(struct closure *cl)
 		return;
 	}
 	/*
 	 * write is allocated, no longer need to account for it in
 	 * bch2_journal_entry_sectors:
 	 */
 	w->sectors = 0;
 	/*
 	 * XXX: we really should just disable the entire journal in nochanges
 	 * mode
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@ -40,7 +40,6 @@ int bch2_journal_read(struct bch_fs *, struct list_head *);
 void bch2_journal_entries_free(struct list_head *);
 int bch2_journal_replay(struct bch_fs *, struct list_head *);
 int bch2_journal_space_available(struct journal *);
 void bch2_journal_write(struct closure *);
 #endif /* _BCACHEFS_JOURNAL_IO_H */
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@ -2,15 +2,213 @@
 #include "bcachefs.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
 #include "super.h"
 /* Free space calculations: */
 /*
  * Number of journal buckets on @ja that may still be used after the current
  * one: the circular distance from the bucket following cur_idx up to
  * last_idx, reduced by the reserves described in the body.
  */
 unsigned bch2_journal_dev_buckets_available(struct journal *j,
 					    struct journal_device *ja)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	/* Index of the bucket following the one at cur_idx (wraps at ja->nr): */
 	unsigned next = (ja->cur_idx + 1) % ja->nr;
 	/* NOTE: both expressions divide by ja->nr, so it must be nonzero here */
 	unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
 	/*
 	 * Allocator startup needs some journal space before we can do journal
 	 * replay:
 	 */
 	if (available &&
 	    test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
 		available--;
 	/*
 	 * Don't use the last bucket unless writing the new last_seq
 	 * will make another bucket available:
 	 */
 	if (available &&
 	    journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
 		--available;
 	return available;
 }
 /*
  * Recompute how many sectors the next journal entry may use and record the
  * outcome in j->cur_entry_sectors and j->cur_entry_error. Must be called with
  * j->lock held.
  *
  * cur_entry_error is set to 0 on success, -EROFS when fewer journal devices
  * are online than metadata_replicas_required, or -ENOSPC when too few devices
  * have usable space or the pin fifo has no free slot. On error
  * cur_entry_sectors is 0; on success journal_wake() is called.
  */
 void bch2_journal_space_available(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
 	unsigned sectors_next_entry	= UINT_MAX;
 	/*
 	 * sectors_total and max_entry_size are computed below but not read
 	 * again within this function.
 	 */
 	unsigned sectors_total		= UINT_MAX;
 	unsigned max_entry_size		= min(j->buf[0].buf_size >> 9,
 					      j->buf[1].buf_size >> 9);
 	/* nr_online: devices with journal buckets; nr_devs: those with room */
 	unsigned i, nr_online = 0, nr_devs = 0;
 	/* Size of the previous buffer if it has not been written out yet: */
 	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
 		? journal_prev_buf(j)->sectors
 		: 0;
 	int ret = 0;
 	lockdep_assert_held(&j->lock);
 	rcu_read_lock();
 	for_each_member_device_rcu(ca, c, i,
 				   &c->rw_devs[BCH_DATA_JOURNAL]) {
 		struct journal_device *ja = &ca->journal;
 		unsigned buckets_this_device, sectors_this_device;
 		/* Device has no journal buckets: */
 		if (!ja->nr)
 			continue;
 		nr_online++;
 		buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
 		sectors_this_device = ja->sectors_free;
 		/*
 		 * Note that we don't allocate the space for a journal entry
 		 * until we write it out - thus, account for it here:
 		 */
 		if (unwritten_sectors >= sectors_this_device) {
 			/* Pending write needs a fresh bucket; skip if none left */
 			if (!buckets_this_device)
 				continue;
 			buckets_this_device--;
 			sectors_this_device = ca->mi.bucket_size;
 		}
 		sectors_this_device -= unwritten_sectors;
 		/*
 		 * Less than a full bucket remains but a spare bucket exists:
 		 * count the next entry as going into that spare bucket.
 		 */
 		if (sectors_this_device < ca->mi.bucket_size &&
 		    buckets_this_device) {
 			buckets_this_device--;
 			sectors_this_device = ca->mi.bucket_size;
 		}
 		if (!sectors_this_device)
 			continue;
 		/* The entry size is limited by the most constrained device: */
 		sectors_next_entry = min(sectors_next_entry,
 					 sectors_this_device);
 		sectors_total = min(sectors_total,
 			buckets_this_device * ca->mi.bucket_size +
 			sectors_this_device);
 		max_entry_size = min_t(unsigned, max_entry_size,
 				       ca->mi.bucket_size);
 		nr_devs++;
 	}
 	rcu_read_unlock();
 	if (nr_online < c->opts.metadata_replicas_required) {
 		ret = -EROFS;
 		sectors_next_entry = 0;
 	} else if (!sectors_next_entry ||
 		   nr_devs < min_t(unsigned, nr_online,
 				   c->opts.metadata_replicas)) {
 		ret = -ENOSPC;
 		sectors_next_entry = 0;
 	} else if (!fifo_free(&j->pin)) {
 		/* No free slot in the pin fifo: */
 		ret = -ENOSPC;
 		sectors_next_entry = 0;
 	}
 	j->cur_entry_sectors	= sectors_next_entry;
 	j->cur_entry_error	= ret;
 	if (!ret)
 		journal_wake(j);
 }
 /* Discards - last part of journal reclaim: */
 /*
  * True if @ja has journal buckets, last_idx has not caught up with cur_idx,
  * and the sequence number recorded for the bucket at last_idx is older than
  * j->last_seq_ondisk. Takes and releases j->lock around the check.
  */
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
 	bool ret;
 	spin_lock(&j->lock);
 	/* Short-circuits on !ja->nr, so bucket_seq[] is not read in that case */
 	ret = ja->nr &&
 		ja->last_idx != ja->cur_idx &&
 		ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
 	spin_unlock(&j->lock);
 	return ret;
 }
 /*
 * Advance ja->last_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
 static void journal_do_discards(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
 	unsigned iter;
 	/* The whole pass runs under reclaim_lock: */
 	mutex_lock(&j->reclaim_lock);
 	for_each_rw_member(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
 		while (should_discard_bucket(j, ja)) {
 			/*
 			 * Only issue a discard when the member has discard
 			 * enabled and bdev_max_discard_sectors() is nonzero.
 			 * The return value of blkdev_issue_discard() is ignored.
 			 */
 			if (ca->mi.discard &&
 			    bdev_max_discard_sectors(ca->disk_sb.bdev))
 				blkdev_issue_discard(ca->disk_sb.bdev,
 					bucket_to_sector(ca,
 						ja->buckets[ja->last_idx]),
 					ca->mi.bucket_size, GFP_NOIO);
 			/*
 			 * Advance last_idx and recompute available space under
 			 * j->lock; the discard above ran without j->lock held.
 			 */
 			spin_lock(&j->lock);
 			ja->last_idx = (ja->last_idx + 1) % ja->nr;
 			bch2_journal_space_available(j);
 			spin_unlock(&j->lock);
 		}
 	}
 	mutex_unlock(&j->reclaim_lock);
 }
 /*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */
 /*
  * Pop entries off the front of the pin fifo while their reference count is
  * zero, then recompute available journal space if anything was popped.
  * Must be called with j->lock held.
  */
 static void bch2_journal_reclaim_fast(struct journal *j)
 {
 	/* Receives each popped entry; its contents are not used afterwards */
 	struct journal_entry_pin_list temp;
 	bool popped = false;
 	lockdep_assert_held(&j->lock);
 	/*
 	 * Unpin journal entries whose reference counts reached zero, meaning
 	 * all btree nodes got written out
 	 */
 	while (!fifo_empty(&j->pin) &&
 	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
 		/* An entry with zero refs must have no pins left on its list */
 		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
 		BUG_ON(!fifo_pop(&j->pin, temp));
 		popped = true;
 	}
 	/* Popping freed pin fifo slots, so recompute what is available: */
 	if (popped)
 		bch2_journal_space_available(j);
 }
 /*
  * Drop one reference on the pin list for journal sequence number @seq. If
  * that was the last reference, take j->lock and run
  * bch2_journal_reclaim_fast() so released entries at the front of the pin
  * fifo are popped.
  */
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
 	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
 	/* The atomic decrement happens outside j->lock: */
 	if (atomic_dec_and_test(&pin_list->count)) {
 		spin_lock(&j->lock);
 		bch2_journal_reclaim_fast(j);
 		spin_unlock(&j->lock);
 	}
 }
 static inline void __journal_pin_add(struct journal *j,
 				     u64 seq,
 				     struct journal_entry_pin *pin,
@ -25,10 +223,7 @@ static inline void __journal_pin_add(struct journal *j,
 	pin->seq	= seq;
 	pin->flush	= flush_fn;
-	if (flush_fn)
+	list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
 		list_add(&pin->list, &pin_list->list);
 	else
 		INIT_LIST_HEAD(&pin->list);
 	/*
 	 * If the journal is currently full,  we might want to call flush_fn
@ -130,88 +325,55 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
 * data off of a specific device:
 */
 /**
 * bch2_journal_reclaim_fast - do the fast part of journal reclaim
 *
 * Called from IO submission context, does not block. Cleans up after btree
 * write completions by advancing the journal pin and each cache's last_idx,
 * kicking off discards and background reclaim as necessary.
 */
 void bch2_journal_reclaim_fast(struct journal *j)
 {
 	/* Receives each popped entry; its contents are not used afterwards */
 	struct journal_entry_pin_list temp;
 	bool popped = false;
 	/* Caller must hold j->lock: */
 	lockdep_assert_held(&j->lock);
 	/*
 	 * Unpin journal entries whose reference counts reached zero, meaning
 	 * all btree nodes got written out
 	 */
 	while (!fifo_empty(&j->pin) &&
 	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
 		/* An entry with zero refs must have no pins left on its list */
 		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
 		BUG_ON(!fifo_pop(&j->pin, temp));
 		popped = true;
 	}
 	/* Only call journal_wake() when at least one entry was popped: */
 	if (popped)
 		journal_wake(j);
 }
 /*
  * Move @pin onto the "flushed" list of the pin list for @seq and record it in
  * j->flush_in_progress. Requires reclaim_lock to be held; BUGs if another
  * flush is already recorded as in progress.
  */
 static void journal_pin_mark_flushing(struct journal *j,
 				      struct journal_entry_pin *pin,
 				      u64 seq)
 {
 	lockdep_assert_held(&j->reclaim_lock);
 	list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
 	/* At most one pin may be mid-flush at a time: */
 	BUG_ON(j->flush_in_progress);
 	j->flush_in_progress = pin;
 }
 /*
  * Invoke @pin's flush callback with @seq, then clear j->flush_in_progress and
  * wake anyone waiting on j->pin_flush_wait. BUGs if flush_in_progress does
  * not point at @pin once the callback has returned.
  */
 static void journal_pin_flush(struct journal *j,
 			      struct journal_entry_pin *pin,
 			      u64 seq)
 {
 	pin->flush(j, pin, seq);
 	BUG_ON(j->flush_in_progress != pin);
 	j->flush_in_progress = NULL;
 	wake_up(&j->pin_flush_wait);
 }
 static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
 {
 	struct journal_entry_pin_list *pin_list;
 	struct journal_entry_pin *ret = NULL;
-	/* no need to iterate over empty fifo entries: */
+	spin_lock(&j->lock);
-	bch2_journal_reclaim_fast(j);
+
 	BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count));
 	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
-		if (*seq > seq_to_flush ||
+		if (*seq > max_seq ||
 		    (ret = list_first_entry_or_null(&pin_list->list,
 				struct journal_entry_pin, list)))
 			break;
-	return ret;
+	if (ret) {
-}
+		list_move(&ret->list, &pin_list->flushed);
 		BUG_ON(j->flush_in_progress);
 		j->flush_in_progress = ret;
 		j->last_flushed = jiffies;
 	}
 /*
  * True if @ja has journal buckets, last_idx has not caught up with cur_idx,
  * and the sequence number recorded for the bucket at last_idx is older than
  * j->last_seq_ondisk. Takes and releases j->lock around the check.
  */
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
 	bool ret;
 	spin_lock(&j->lock);
 	/* Short-circuits on !ja->nr, so bucket_seq[] is not read in that case */
 	ret = ja->nr &&
 		(ja->last_idx != ja->cur_idx &&
 		 ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
 	spin_unlock(&j->lock);
 	return ret;
 }
 /*
  * Repeatedly fetch a pin from journal_get_next_pin() and call its flush
  * callback until no pin is returned. While @min_nr is nonzero the sequence
  * limit passed down is U64_MAX; once it reaches zero the limit is
  * @seq_to_flush. Requires reclaim_lock to be held.
  */
 static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
 			       unsigned min_nr)
 {
 	struct journal_entry_pin *pin;
 	u64 seq;
 	lockdep_assert_held(&j->reclaim_lock);
 	while ((pin = journal_get_next_pin(j, min_nr
 				? U64_MAX : seq_to_flush, &seq))) {
 		/* Count this pin against the minimum number to flush: */
 		if (min_nr)
 			min_nr--;
 		pin->flush(j, pin, seq);
 		/* The callback must leave flush_in_progress pointing at this pin */
 		BUG_ON(j->flush_in_progress != pin);
 		j->flush_in_progress = NULL;
 		wake_up(&j->pin_flush_wait);
 	}
 }
 /**
 * bch2_journal_reclaim_work - free up journal buckets
 *
@ -236,104 +398,44 @@ void bch2_journal_reclaim_work(struct work_struct *work)
 				struct bch_fs, journal.reclaim_work);
 	struct journal *j = &c->journal;
 	struct bch_dev *ca;
-	struct journal_entry_pin *pin;
+	unsigned iter, bucket_to_flush, min_nr = 0;
-	u64 seq, seq_to_flush = 0;
+	u64 seq_to_flush = 0;
-	unsigned iter, bucket_to_flush;
+
-	unsigned long next_flush;
+	journal_do_discards(j);
-	bool reclaim_lock_held = false, need_flush;
+
 	mutex_lock(&j->reclaim_lock);
 	spin_lock(&j->lock);
 	/*
 	 * Advance last_idx to point to the oldest journal entry containing
 	 * btree node updates that have not yet been written out
 	 */
 	for_each_rw_member(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
 		if (!ja->nr)
 			continue;
 		while (should_discard_bucket(j, ja)) {
 			if (!reclaim_lock_held) {
 				/*
 				 * ugh:
 				 * might be called from __journal_res_get()
 				 * under wait_event() - have to go back to
 				 * TASK_RUNNING before doing something that
 				 * would block, but only if we're doing work:
 				 */
 				__set_current_state(TASK_RUNNING);
-				mutex_lock(&j->reclaim_lock);
+		/* Try to keep the journal at most half full: */
 				reclaim_lock_held = true;
 				/* recheck under reclaim_lock: */
 				continue;
 			}
 			if (ca->mi.discard &&
 			    bdev_max_discard_sectors(ca->disk_sb.bdev))
 				blkdev_issue_discard(ca->disk_sb.bdev,
 					bucket_to_sector(ca,
 						ja->buckets[ja->last_idx]),
 					ca->mi.bucket_size, GFP_NOIO);
 			spin_lock(&j->lock);
 			ja->last_idx = (ja->last_idx + 1) % ja->nr;
 			spin_unlock(&j->lock);
 			journal_wake(j);
 		}
 		/*
 		 * Write out enough btree nodes to free up 50% journal
 		 * buckets
 		 */
 		spin_lock(&j->lock);
 		bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
 		seq_to_flush = max_t(u64, seq_to_flush,
 				     ja->bucket_seq[bucket_to_flush]);
 		spin_unlock(&j->lock);
 	}
 	/* Also flush if the pin fifo is more than half full */
 	spin_lock(&j->lock);
 	seq_to_flush = max_t(s64, seq_to_flush,
 			     (s64) journal_cur_seq(j) -
 			     (j->pin.size >> 1));
 	spin_unlock(&j->lock);
 	/*
 	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
 	 * make sure to flush at least one journal pin:
 	 */
-	next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+	if (time_after(jiffies, j->last_flushed +
-	need_flush = time_after(jiffies, next_flush);
+		       msecs_to_jiffies(j->reclaim_delay_ms)))
 		min_nr = 1;
-	while ((pin = journal_get_next_pin(j, need_flush
+	journal_flush_pins(j, seq_to_flush, min_nr);
 					   ? U64_MAX
 					   : seq_to_flush, &seq))) {
 		if (!reclaim_lock_held) {
 			spin_unlock(&j->lock);
 			__set_current_state(TASK_RUNNING);
 			mutex_lock(&j->reclaim_lock);
 			reclaim_lock_held = true;
 			spin_lock(&j->lock);
 			continue;
 		}
-		journal_pin_mark_flushing(j, pin, seq);
+	mutex_unlock(&j->reclaim_lock);
 		spin_unlock(&j->lock);
 		journal_pin_flush(j, pin, seq);
 		need_flush = false;
 		j->last_flushed = jiffies;
 		spin_lock(&j->lock);
 	}
 	spin_unlock(&j->lock);
 	if (reclaim_lock_held)
 		mutex_unlock(&j->reclaim_lock);
 	if (!test_bit(BCH_FS_RO, &c->flags))
 		queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
@ -342,8 +444,6 @@ void bch2_journal_reclaim_work(struct work_struct *work)
 static int journal_flush_done(struct journal *j, u64 seq_to_flush)
 {
 	struct journal_entry_pin *pin;
 	u64 pin_seq;
 	int ret;
 	ret = bch2_journal_error(j);
@ -351,16 +451,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
 		return ret;
 	mutex_lock(&j->reclaim_lock);
 	journal_flush_pins(j, seq_to_flush, 0);
 	spin_lock(&j->lock);
 	while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) {
 		journal_pin_mark_flushing(j, pin, pin_seq);
 		spin_unlock(&j->lock);
 		journal_pin_flush(j, pin, pin_seq);
 		spin_lock(&j->lock);
 	}
 	/*
 	 * If journal replay hasn't completed, the unreplayed journal entries
 	 * hold refs on their corresponding sequence numbers
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@ -4,6 +4,10 @@
 #define JOURNAL_PIN	(32 * 1024)
 unsigned bch2_journal_dev_buckets_available(struct journal *,
 					    struct journal_device *);
 void bch2_journal_space_available(struct journal *);
 static inline bool journal_pin_active(struct journal_entry_pin *pin)
 {
 	return pin->seq != 0;
@ -17,6 +21,8 @@ journal_seq_pin(struct journal *j, u64 seq)
 	return &j->pin.data[seq & j->pin.mask];
 }
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
 			  journal_pin_flush_fn);
 void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
@ -28,7 +34,6 @@ void bch2_journal_pin_add_if_older(struct journal *,
 				  journal_pin_flush_fn);
 void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 void bch2_journal_reclaim_fast(struct journal *);
 void bch2_journal_reclaim_work(struct work_struct *);
 void bch2_journal_flush_pins(struct journal *, u64);
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@ -136,6 +136,12 @@ struct journal {
 	unsigned		cur_entry_u64s;
 	unsigned		cur_entry_sectors;
 	/*
 	 * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
 	 * insufficient devices:
 	 */
 	int			cur_entry_error;
 	/* Reserved space in journal entry to be used just prior to write */
 	unsigned		entry_u64s_reserved;