2017-03-17 06:18:50 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include "bcachefs.h"
|
2020-11-20 00:54:40 +00:00
|
|
|
#include "btree_key_cache.h"
|
2023-03-07 12:28:20 +00:00
|
|
|
#include "btree_update.h"
|
2023-11-02 22:57:19 +00:00
|
|
|
#include "btree_write_buffer.h"
|
2023-08-05 19:40:21 +00:00
|
|
|
#include "buckets.h"
|
2022-07-18 23:42:58 +00:00
|
|
|
#include "errcode.h"
|
2020-12-05 21:25:05 +00:00
|
|
|
#include "error.h"
|
2017-03-17 06:18:50 +00:00
|
|
|
#include "journal.h"
|
2019-02-21 18:33:21 +00:00
|
|
|
#include "journal_io.h"
|
2017-03-17 06:18:50 +00:00
|
|
|
#include "journal_reclaim.h"
|
|
|
|
#include "replicas.h"
|
2023-08-05 19:40:21 +00:00
|
|
|
#include "sb-members.h"
|
2020-11-20 00:54:40 +00:00
|
|
|
#include "trace.h"
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2020-11-20 01:55:33 +00:00
|
|
|
#include <linux/kthread.h>
|
2020-11-20 02:15:39 +00:00
|
|
|
#include <linux/sched/mm.h>
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
/* Free space calculations: */
|
|
|
|
|
2019-03-03 21:50:40 +00:00
|
|
|
static unsigned journal_space_from(struct journal_device *ja,
|
|
|
|
enum journal_space_from from)
|
|
|
|
{
|
|
|
|
switch (from) {
|
|
|
|
case journal_space_discarded:
|
|
|
|
return ja->discard_idx;
|
|
|
|
case journal_space_clean_ondisk:
|
|
|
|
return ja->dirty_idx_ondisk;
|
|
|
|
case journal_space_clean:
|
|
|
|
return ja->dirty_idx;
|
|
|
|
default:
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
unsigned bch2_journal_dev_buckets_available(struct journal *j,
|
2019-03-03 21:50:40 +00:00
|
|
|
struct journal_device *ja,
|
|
|
|
enum journal_space_from from)
|
2019-02-21 18:33:21 +00:00
|
|
|
{
|
2022-02-23 11:56:35 +00:00
|
|
|
unsigned available = (journal_space_from(ja, from) -
|
|
|
|
ja->cur_idx - 1 + ja->nr) % ja->nr;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't use the last bucket unless writing the new last_seq
|
|
|
|
* will make another bucket available:
|
|
|
|
*/
|
2019-03-03 20:15:55 +00:00
|
|
|
if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
|
2019-02-21 18:33:21 +00:00
|
|
|
--available;
|
|
|
|
|
|
|
|
return available;
|
|
|
|
}
|
|
|
|
|
2023-11-02 22:57:19 +00:00
|
|
|
void bch2_journal_set_watermark(struct journal *j)
|
2019-02-19 18:41:36 +00:00
|
|
|
{
|
2023-11-10 03:07:42 +00:00
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
bool low_on_space = j->space[journal_space_clean].total * 4 <=
|
|
|
|
j->space[journal_space_total].total;
|
|
|
|
bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
|
2023-11-02 22:57:19 +00:00
|
|
|
bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
|
|
|
|
unsigned watermark = low_on_space || low_on_pin || low_on_wb
|
2023-11-10 03:07:42 +00:00
|
|
|
? BCH_WATERMARK_reclaim
|
|
|
|
: BCH_WATERMARK_stripe;
|
|
|
|
|
2024-03-14 00:16:40 +00:00
|
|
|
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
|
|
|
|
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
|
|
|
|
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
|
2023-11-10 03:07:42 +00:00
|
|
|
trace_and_count(c, journal_full, c);
|
2023-11-05 02:54:26 +00:00
|
|
|
|
2024-04-30 10:20:37 +00:00
|
|
|
mod_bit(JOURNAL_space_low, &j->flags, low_on_space || low_on_pin);
|
2024-04-06 03:27:27 +00:00
|
|
|
|
2023-11-05 02:54:26 +00:00
|
|
|
swap(watermark, j->watermark);
|
|
|
|
if (watermark > j->watermark)
|
|
|
|
journal_wake(j);
|
2019-02-19 18:41:36 +00:00
|
|
|
}
|
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
static struct journal_space
|
|
|
|
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
|
2019-03-03 21:50:40 +00:00
|
|
|
enum journal_space_from from)
|
2019-02-21 18:33:21 +00:00
|
|
|
{
|
2020-11-14 17:29:21 +00:00
|
|
|
struct journal_device *ja = &ca->journal;
|
2022-03-01 00:17:27 +00:00
|
|
|
unsigned sectors, buckets, unwritten;
|
|
|
|
u64 seq;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
if (from == journal_space_total)
|
|
|
|
return (struct journal_space) {
|
|
|
|
.next_entry = ca->mi.bucket_size,
|
|
|
|
.total = ca->mi.bucket_size * ja->nr,
|
|
|
|
};
|
2019-03-03 20:15:55 +00:00
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
buckets = bch2_journal_dev_buckets_available(j, ja, from);
|
|
|
|
sectors = ja->sectors_free;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
/*
|
|
|
|
* We that we don't allocate the space for a journal entry
|
|
|
|
* until we write it out - thus, account for it here:
|
|
|
|
*/
|
2022-03-01 00:17:27 +00:00
|
|
|
for (seq = journal_last_unwritten_seq(j);
|
|
|
|
seq <= journal_cur_seq(j);
|
|
|
|
seq++) {
|
|
|
|
unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
|
|
|
|
|
|
|
|
if (!unwritten)
|
|
|
|
continue;
|
|
|
|
|
2021-05-31 04:13:39 +00:00
|
|
|
/* entry won't fit on this device, skip: */
|
|
|
|
if (unwritten > ca->mi.bucket_size)
|
|
|
|
continue;
|
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
if (unwritten >= sectors) {
|
|
|
|
if (!buckets) {
|
|
|
|
sectors = 0;
|
|
|
|
break;
|
2020-11-13 23:36:33 +00:00
|
|
|
}
|
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
buckets--;
|
|
|
|
sectors = ca->mi.bucket_size;
|
2019-02-21 18:33:21 +00:00
|
|
|
}
|
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
sectors -= unwritten;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sectors < ca->mi.bucket_size && buckets) {
|
|
|
|
buckets--;
|
|
|
|
sectors = ca->mi.bucket_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (struct journal_space) {
|
|
|
|
.next_entry = sectors,
|
|
|
|
.total = sectors + buckets * ca->mi.bucket_size,
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
|
|
|
|
enum journal_space_from from)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
2023-12-17 07:34:05 +00:00
|
|
|
unsigned pos, nr_devs = 0;
|
2020-11-14 17:29:21 +00:00
|
|
|
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
|
|
|
|
|
|
|
|
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
rcu_read_lock();
|
2023-12-17 07:34:05 +00:00
|
|
|
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
|
2020-11-14 17:29:21 +00:00
|
|
|
if (!ca->journal.nr)
|
2019-02-21 18:33:21 +00:00
|
|
|
continue;
|
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
space = journal_dev_space_available(j, ca, from);
|
|
|
|
if (!space.next_entry)
|
|
|
|
continue;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
for (pos = 0; pos < nr_devs; pos++)
|
|
|
|
if (space.total > dev_space[pos].total)
|
|
|
|
break;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
array_insert_item(dev_space, nr_devs, pos, space);
|
2019-02-21 18:33:21 +00:00
|
|
|
}
|
2019-03-03 21:50:40 +00:00
|
|
|
rcu_read_unlock();
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2019-03-03 21:50:40 +00:00
|
|
|
if (nr_devs < nr_devs_want)
|
|
|
|
return (struct journal_space) { 0, 0 };
|
|
|
|
|
2020-11-14 17:29:21 +00:00
|
|
|
/*
|
|
|
|
* We sorted largest to smallest, and we want the smallest out of the
|
|
|
|
* @nr_devs_want largest devices:
|
|
|
|
*/
|
|
|
|
return dev_space[nr_devs_want - 1];
|
2019-03-03 21:50:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_journal_space_available(struct journal *j)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
bcachefs: Don't require flush/fua on every journal write
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
- non flush/fua journal writes don't update last_seq (i.e. they don't
free up space in the journal), thus the journal free space
calculations now check whether nonflush journal writes are currently
allowed (i.e. are we low on free space, or would doing a flush write
free up a lot of space in the journal)
- write_delay_ms, the user configurable option for when open journal
entries are automatically written, is now interpreted as the max
delay between flush journal writes (default 1 second).
- bch2_journal_flush_seq_async is changed to ensure a flush write >=
the requested sequence number has happened
- journal read/replay must now ignore, and blacklist, any journal
entries newer than the most recent flush entry in the journal. Also,
the way the read_entire_journal option is handled has been improved;
struct journal_replay now has an entry, 'ignore', for entries that
were read but should not be used.
- assorted refactoring and improvements related to journal read in
journal_io.c and recovery.c
Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write heavy workloads.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-11-14 14:59:58 +00:00
|
|
|
unsigned clean, clean_ondisk, total;
|
2019-02-19 18:41:36 +00:00
|
|
|
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
|
|
|
|
j->buf[1].buf_size >> 9);
|
2023-12-17 07:34:05 +00:00
|
|
|
unsigned nr_online = 0, nr_devs_want;
|
2019-03-03 21:50:40 +00:00
|
|
|
bool can_discard = false;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
lockdep_assert_held(&j->lock);
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2023-12-17 07:34:05 +00:00
|
|
|
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
|
2019-03-03 21:50:40 +00:00
|
|
|
struct journal_device *ja = &ca->journal;
|
|
|
|
|
|
|
|
if (!ja->nr)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
while (ja->dirty_idx != ja->cur_idx &&
|
|
|
|
ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
|
|
|
|
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
|
|
|
|
|
|
|
|
while (ja->dirty_idx_ondisk != ja->dirty_idx &&
|
|
|
|
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
|
|
|
|
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
|
|
|
|
|
|
|
|
if (ja->discard_idx != ja->dirty_idx_ondisk)
|
|
|
|
can_discard = true;
|
|
|
|
|
|
|
|
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
|
|
|
|
nr_online++;
|
2019-02-21 18:33:21 +00:00
|
|
|
}
|
2019-03-03 20:15:55 +00:00
|
|
|
rcu_read_unlock();
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2019-03-03 21:50:40 +00:00
|
|
|
j->can_discard = can_discard;
|
|
|
|
|
2024-02-11 02:01:40 +00:00
|
|
|
if (nr_online < metadata_replicas_required(c)) {
|
2024-07-15 20:30:44 +00:00
|
|
|
struct printbuf buf = PRINTBUF;
|
2024-07-22 15:25:03 +00:00
|
|
|
buf.atomic++;
|
2024-07-15 20:30:44 +00:00
|
|
|
prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
|
|
|
|
"rw journal devs:", nr_online, metadata_replicas_required(c));
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
|
|
|
|
prt_printf(&buf, " %s", ca->name);
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
bch_err(c, "%s", buf.buf);
|
|
|
|
printbuf_exit(&buf);
|
2022-03-15 01:48:42 +00:00
|
|
|
ret = JOURNAL_ERR_insufficient_devices;
|
2019-03-03 21:50:40 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
|
|
|
|
|
2023-12-17 07:34:05 +00:00
|
|
|
for (unsigned i = 0; i < journal_space_nr; i++)
|
2020-11-14 17:29:21 +00:00
|
|
|
j->space[i] = __journal_space_available(j, nr_devs_want, i);
|
2019-03-03 21:50:40 +00:00
|
|
|
|
bcachefs: Don't require flush/fua on every journal write
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
- non flush/fua journal writes don't update last_seq (i.e. they don't
free up space in the journal), thus the journal free space
calculations now check whether nonflush journal writes are currently
allowed (i.e. are we low on free space, or would doing a flush write
free up a lot of space in the journal)
- write_delay_ms, the user configurable option for when open journal
entries are automatically written, is now interpreted as the max
delay between flush journal writes (default 1 second).
- bch2_journal_flush_seq_async is changed to ensure a flush write >=
the requested sequence number has happened
- journal read/replay must now ignore, and blacklist, any journal
entries newer than the most recent flush entry in the journal. Also,
the way the read_entire_journal option is handled has been improved;
struct journal_replay now has an entry, 'ignore', for entries that
were read but should not be used.
- assorted refactoring and improvements related to journal read in
journal_io.c and recovery.c
Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write heavy workloads.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-11-14 14:59:58 +00:00
|
|
|
clean_ondisk = j->space[journal_space_clean_ondisk].total;
|
2020-11-14 17:29:21 +00:00
|
|
|
clean = j->space[journal_space_clean].total;
|
bcachefs: Don't require flush/fua on every journal write
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
- non flush/fua journal writes don't update last_seq (i.e. they don't
free up space in the journal), thus the journal free space
calculations now check whether nonflush journal writes are currently
allowed (i.e. are we low on free space, or would doing a flush write
free up a lot of space in the journal)
- write_delay_ms, the user configurable option for when open journal
entries are automatically written, is now interpreted as the max
delay between flush journal writes (default 1 second).
- bch2_journal_flush_seq_async is changed to ensure a flush write >=
the requested sequence number has happened
- journal read/replay must now ignore, and blacklist, any journal
entries newer than the most recent flush entry in the journal. Also,
the way the read_entire_journal option is handled has been improved;
struct journal_replay now has an entry, 'ignore', for entries that
were read but should not be used.
- assorted refactoring and improvements related to journal read in
journal_io.c and recovery.c
Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write heavy workloads.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-11-14 14:59:58 +00:00
|
|
|
total = j->space[journal_space_total].total;
|
2020-11-14 17:29:21 +00:00
|
|
|
|
2023-03-21 12:09:16 +00:00
|
|
|
if (!j->space[journal_space_discarded].next_entry)
|
2022-03-15 01:48:42 +00:00
|
|
|
ret = JOURNAL_ERR_journal_full;
|
2019-02-19 18:41:36 +00:00
|
|
|
|
2020-12-05 21:25:05 +00:00
|
|
|
if ((j->space[journal_space_clean_ondisk].next_entry <
|
|
|
|
j->space[journal_space_clean_ondisk].total) &&
|
|
|
|
(clean - clean_ondisk <= total / 8) &&
|
2022-10-19 22:31:33 +00:00
|
|
|
(clean_ondisk * 2 > clean))
|
2024-04-30 10:20:37 +00:00
|
|
|
set_bit(JOURNAL_may_skip_flush, &j->flags);
|
bcachefs: Don't require flush/fua on every journal write
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
- non flush/fua journal writes don't update last_seq (i.e. they don't
free up space in the journal), thus the journal free space
calculations now check whether nonflush journal writes are currently
allowed (i.e. are we low on free space, or would doing a flush write
free up a lot of space in the journal)
- write_delay_ms, the user configurable option for when open journal
entries are automatically written, is now interpreted as the max
delay between flush journal writes (default 1 second).
- bch2_journal_flush_seq_async is changed to ensure a flush write >=
the requested sequence number has happened
- journal read/replay must now ignore, and blacklist, any journal
entries newer than the most recent flush entry in the journal. Also,
the way the read_entire_journal option is handled has been improved;
struct journal_replay now has an entry, 'ignore', for entries that
were read but should not be used.
- assorted refactoring and improvements related to journal read in
journal_io.c and recovery.c
Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write heavy workloads.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-11-14 14:59:58 +00:00
|
|
|
else
|
2024-04-30 10:20:37 +00:00
|
|
|
clear_bit(JOURNAL_may_skip_flush, &j->flags);
|
bcachefs: Don't require flush/fua on every journal write
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
- non flush/fua journal writes don't update last_seq (i.e. they don't
free up space in the journal), thus the journal free space
calculations now check whether nonflush journal writes are currently
allowed (i.e. are we low on free space, or would doing a flush write
free up a lot of space in the journal)
- write_delay_ms, the user configurable option for when open journal
entries are automatically written, is now interpreted as the max
delay between flush journal writes (default 1 second).
- bch2_journal_flush_seq_async is changed to ensure a flush write >=
the requested sequence number has happened
- journal read/replay must now ignore, and blacklist, any journal
entries newer than the most recent flush entry in the journal. Also,
the way the read_entire_journal option is handled has been improved;
struct journal_replay now has an entry, 'ignore', for entries that
were read but should not be used.
- assorted refactoring and improvements related to journal read in
journal_io.c and recovery.c
Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write heavy workloads.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-11-14 14:59:58 +00:00
|
|
|
|
2023-11-02 22:57:19 +00:00
|
|
|
bch2_journal_set_watermark(j);
|
2019-03-03 21:50:40 +00:00
|
|
|
out:
|
2020-11-14 17:29:21 +00:00
|
|
|
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
|
2019-02-21 18:33:21 +00:00
|
|
|
j->cur_entry_error = ret;
|
|
|
|
|
|
|
|
if (!ret)
|
|
|
|
journal_wake(j);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Discards - last part of journal reclaim: */
|
|
|
|
|
|
|
|
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
|
|
|
{
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
spin_lock(&j->lock);
|
2019-03-03 20:15:55 +00:00
|
|
|
ret = ja->discard_idx != ja->dirty_idx_ondisk;
|
2019-02-21 18:33:21 +00:00
|
|
|
spin_unlock(&j->lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2019-03-03 20:15:55 +00:00
|
|
|
* Advance ja->discard_idx as long as it points to buckets that are no longer
|
2019-02-21 18:33:21 +00:00
|
|
|
* dirty, issuing discards if necessary:
|
|
|
|
*/
|
2019-03-03 23:39:07 +00:00
|
|
|
void bch2_journal_do_discards(struct journal *j)
|
2019-02-21 18:33:21 +00:00
|
|
|
{
|
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
|
2019-03-03 20:15:55 +00:00
|
|
|
mutex_lock(&j->discard_lock);
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2023-12-17 04:47:29 +00:00
|
|
|
for_each_rw_member(c, ca) {
|
2019-02-21 18:33:21 +00:00
|
|
|
struct journal_device *ja = &ca->journal;
|
|
|
|
|
|
|
|
while (should_discard_bucket(j, ja)) {
|
2022-02-21 10:05:29 +00:00
|
|
|
if (!c->opts.nochanges &&
|
|
|
|
ca->mi.discard &&
|
2019-02-21 18:33:21 +00:00
|
|
|
bdev_max_discard_sectors(ca->disk_sb.bdev))
|
|
|
|
blkdev_issue_discard(ca->disk_sb.bdev,
|
|
|
|
bucket_to_sector(ca,
|
2019-03-03 20:15:55 +00:00
|
|
|
ja->buckets[ja->discard_idx]),
|
2023-05-28 22:02:38 +00:00
|
|
|
ca->mi.bucket_size, GFP_NOFS);
|
2019-02-21 18:33:21 +00:00
|
|
|
|
|
|
|
spin_lock(&j->lock);
|
2019-03-03 20:15:55 +00:00
|
|
|
ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
|
|
|
bch2_journal_space_available(j);
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-03-03 20:15:55 +00:00
|
|
|
mutex_unlock(&j->discard_lock);
|
2019-02-21 18:33:21 +00:00
|
|
|
}
|
|
|
|
|
2017-03-17 06:18:50 +00:00
|
|
|
/*
|
|
|
|
* Journal entry pinning - machinery for holding a reference on a given journal
|
|
|
|
* entry, holding it open to ensure it gets replayed during recovery:
|
|
|
|
*/
|
|
|
|
|
2023-09-15 12:51:51 +00:00
|
|
|
void bch2_journal_reclaim_fast(struct journal *j)
|
2019-02-21 18:33:21 +00:00
|
|
|
{
|
|
|
|
bool popped = false;
|
|
|
|
|
|
|
|
lockdep_assert_held(&j->lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unpin journal entries whose reference counts reached zero, meaning
|
|
|
|
* all btree nodes got written out
|
|
|
|
*/
|
|
|
|
while (!fifo_empty(&j->pin) &&
|
2023-11-07 23:08:38 +00:00
|
|
|
j->pin.front <= j->seq_ondisk &&
|
2019-02-21 18:33:21 +00:00
|
|
|
!atomic_read(&fifo_peek_front(&j->pin).count)) {
|
2023-09-12 22:41:22 +00:00
|
|
|
j->pin.front++;
|
2019-02-21 18:33:21 +00:00
|
|
|
popped = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (popped)
|
|
|
|
bch2_journal_space_available(j);
|
|
|
|
}
|
|
|
|
|
2023-09-15 12:51:51 +00:00
|
|
|
/*
 * Drop one reference on the pin list for journal sequence number @seq.
 *
 * Returns true if that was the last reference (the count reached zero).
 * Does not take j->lock and does not run reclaim.
 */
bool __bch2_journal_pin_put(struct journal *j, u64 seq)
{
	return atomic_dec_and_test(&journal_seq_pin(j, seq)->count);
}
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
/*
 * Drop one reference on the pin list for @seq; if it was the last one, run
 * bch2_journal_reclaim_fast() under j->lock.
 */
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
	if (!__bch2_journal_pin_put(j, seq))
		return;

	spin_lock(&j->lock);
	bch2_journal_reclaim_fast(j);
	spin_unlock(&j->lock);
}
|
|
|
|
|
2023-01-05 15:13:37 +00:00
|
|
|
static inline bool __journal_pin_drop(struct journal *j,
|
2017-03-17 06:18:50 +00:00
|
|
|
struct journal_entry_pin *pin)
|
|
|
|
{
|
2018-07-17 16:19:14 +00:00
|
|
|
struct journal_entry_pin_list *pin_list;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
|
|
|
if (!journal_pin_active(pin))
|
2023-01-05 15:13:37 +00:00
|
|
|
return false;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2021-04-03 20:24:13 +00:00
|
|
|
if (j->flush_in_progress == pin)
|
|
|
|
j->flush_in_progress_dropped = true;
|
|
|
|
|
2018-07-17 16:19:14 +00:00
|
|
|
pin_list = journal_seq_pin(j, pin->seq);
|
|
|
|
pin->seq = 0;
|
2017-03-17 06:18:50 +00:00
|
|
|
list_del_init(&pin->list);
|
|
|
|
|
|
|
|
/*
|
2023-08-07 16:04:05 +00:00
|
|
|
* Unpinning a journal entry may make journal_next_bucket() succeed, if
|
2017-03-17 06:18:50 +00:00
|
|
|
* writing a new last_seq will now make another bucket available:
|
|
|
|
*/
|
2023-01-05 15:13:37 +00:00
|
|
|
return atomic_dec_and_test(&pin_list->count) &&
|
|
|
|
pin_list == &fifo_peek_front(&j->pin);
|
2017-03-17 06:18:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_journal_pin_drop(struct journal *j,
|
2018-07-17 16:19:14 +00:00
|
|
|
struct journal_entry_pin *pin)
|
2017-03-17 06:18:50 +00:00
|
|
|
{
|
|
|
|
spin_lock(&j->lock);
|
2023-01-05 15:13:37 +00:00
|
|
|
if (__journal_pin_drop(j, pin))
|
|
|
|
bch2_journal_reclaim_fast(j);
|
2017-03-17 06:18:50 +00:00
|
|
|
spin_unlock(&j->lock);
|
|
|
|
}
|
|
|
|
|
2023-07-07 02:47:42 +00:00
|
|
|
/*
 * Classify a pin by its flush callback:
 *  - either btree node flush callback	-> JOURNAL_PIN_btree
 *  - the btree key cache flush callback	-> JOURNAL_PIN_key_cache
 *  - anything else			-> JOURNAL_PIN_other
 */
static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
{
	if (fn == bch2_btree_node_flush0 ||
	    fn == bch2_btree_node_flush1)
		return JOURNAL_PIN_btree;

	if (fn == bch2_btree_key_cache_journal_flush)
		return JOURNAL_PIN_key_cache;

	return JOURNAL_PIN_other;
}
|
|
|
|
|
2023-11-07 17:32:50 +00:00
|
|
|
static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
|
2020-12-01 16:48:08 +00:00
|
|
|
struct journal_entry_pin *pin,
|
2023-11-07 17:32:50 +00:00
|
|
|
journal_pin_flush_fn flush_fn,
|
|
|
|
enum journal_pin_type type)
|
|
|
|
{
|
|
|
|
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
|
|
|
|
|
2023-11-09 18:19:00 +00:00
|
|
|
/*
|
|
|
|
* flush_fn is how we identify journal pins in debugfs, so must always
|
|
|
|
* exist, even if it doesn't do anything:
|
|
|
|
*/
|
|
|
|
BUG_ON(!flush_fn);
|
|
|
|
|
2023-11-07 17:32:50 +00:00
|
|
|
atomic_inc(&pin_list->count);
|
|
|
|
pin->seq = seq;
|
|
|
|
pin->flush = flush_fn;
|
2023-11-09 18:19:00 +00:00
|
|
|
list_add(&pin->list, &pin_list->list[type]);
|
2023-11-07 17:32:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_journal_pin_copy(struct journal *j,
|
|
|
|
struct journal_entry_pin *dst,
|
|
|
|
struct journal_entry_pin *src,
|
|
|
|
journal_pin_flush_fn flush_fn)
|
2018-07-17 17:50:15 +00:00
|
|
|
{
|
2020-12-01 16:48:08 +00:00
|
|
|
spin_lock(&j->lock);
|
2020-12-16 20:41:29 +00:00
|
|
|
|
2023-11-07 17:32:50 +00:00
|
|
|
u64 seq = READ_ONCE(src->seq);
|
|
|
|
|
2020-12-16 20:41:29 +00:00
|
|
|
if (seq < journal_last_seq(j)) {
|
|
|
|
/*
|
|
|
|
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
|
|
|
|
* the src pin - with the pin dropped, the entry to pin might no
|
|
|
|
* longer to exist, but that means there's no longer anything to
|
|
|
|
* copy and we can bail out here:
|
|
|
|
*/
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2024-01-31 16:06:59 +00:00
|
|
|
bool reclaim = __journal_pin_drop(j, dst);
|
2020-02-27 20:03:44 +00:00
|
|
|
|
2023-11-07 17:32:50 +00:00
|
|
|
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
|
2020-02-27 20:03:44 +00:00
|
|
|
|
2023-11-07 17:32:50 +00:00
|
|
|
if (reclaim)
|
|
|
|
bch2_journal_reclaim_fast(j);
|
2020-02-27 20:03:44 +00:00
|
|
|
|
2023-11-07 17:32:50 +00:00
|
|
|
/*
|
|
|
|
* If the journal is currently full, we might want to call flush_fn
|
|
|
|
* immediately:
|
|
|
|
*/
|
2024-01-31 16:06:59 +00:00
|
|
|
if (seq == journal_last_seq(j))
|
|
|
|
journal_wake(j);
|
|
|
|
spin_unlock(&j->lock);
|
2023-11-07 17:32:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_journal_pin_set(struct journal *j, u64 seq,
|
|
|
|
struct journal_entry_pin *pin,
|
|
|
|
journal_pin_flush_fn flush_fn)
|
|
|
|
{
|
|
|
|
spin_lock(&j->lock);
|
|
|
|
|
|
|
|
BUG_ON(seq < journal_last_seq(j));
|
|
|
|
|
2024-01-31 16:06:59 +00:00
|
|
|
bool reclaim = __journal_pin_drop(j, pin);
|
2023-11-07 17:32:50 +00:00
|
|
|
|
|
|
|
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
|
2023-01-05 15:13:37 +00:00
|
|
|
|
|
|
|
if (reclaim)
|
|
|
|
bch2_journal_reclaim_fast(j);
|
2019-03-08 00:46:10 +00:00
|
|
|
/*
|
|
|
|
* If the journal is currently full, we might want to call flush_fn
|
|
|
|
* immediately:
|
|
|
|
*/
|
2024-01-31 16:06:59 +00:00
|
|
|
if (seq == journal_last_seq(j))
|
|
|
|
journal_wake(j);
|
|
|
|
|
|
|
|
spin_unlock(&j->lock);
|
2019-03-08 00:46:10 +00:00
|
|
|
}
|
|
|
|
|
2020-02-27 20:03:44 +00:00
|
|
|
/**
|
|
|
|
* bch2_journal_pin_flush: ensure journal pin callback is no longer running
|
2023-09-12 22:41:22 +00:00
|
|
|
* @j: journal object
|
|
|
|
* @pin: pin to flush
|
2020-02-27 20:03:44 +00:00
|
|
|
*/
|
2018-07-17 16:19:14 +00:00
|
|
|
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
|
|
|
|
{
|
|
|
|
BUG_ON(journal_pin_active(pin));
|
|
|
|
|
|
|
|
wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
|
|
|
|
}
|
|
|
|
|
2017-03-17 06:18:50 +00:00
|
|
|
/*
|
|
|
|
* Journal reclaim: flush references to open journal entries to reclaim space in
|
|
|
|
* the journal
|
|
|
|
*
|
|
|
|
* May be done by the journal code in the background as needed to free up space
|
|
|
|
* for more journal entries, or as part of doing a clean shutdown, or to migrate
|
|
|
|
* data off of a specific device:
|
|
|
|
*/
|
|
|
|
|
|
|
|
static struct journal_entry_pin *
|
2021-04-01 01:44:55 +00:00
|
|
|
journal_get_next_pin(struct journal *j,
|
2023-03-07 12:28:20 +00:00
|
|
|
u64 seq_to_flush,
|
|
|
|
unsigned allowed_below_seq,
|
|
|
|
unsigned allowed_above_seq,
|
|
|
|
u64 *seq)
|
2017-03-17 06:18:50 +00:00
|
|
|
{
|
2018-07-17 16:19:14 +00:00
|
|
|
struct journal_entry_pin_list *pin_list;
|
|
|
|
struct journal_entry_pin *ret = NULL;
|
2023-03-07 12:28:20 +00:00
|
|
|
unsigned i;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2021-04-01 01:44:55 +00:00
|
|
|
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
|
2023-03-07 12:28:20 +00:00
|
|
|
if (*seq > seq_to_flush && !allowed_above_seq)
|
2018-07-17 16:19:14 +00:00
|
|
|
break;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2023-03-07 12:28:20 +00:00
|
|
|
for (i = 0; i < JOURNAL_PIN_NR; i++)
|
|
|
|
if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
|
|
|
|
((1U << i) & allowed_above_seq)) {
|
|
|
|
ret = list_first_entry_or_null(&pin_list->list[i],
|
|
|
|
struct journal_entry_pin, list);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2021-04-01 01:44:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
2017-03-17 06:18:50 +00:00
|
|
|
}
|
|
|
|
|
2020-05-28 20:06:13 +00:00
|
|
|
/* returns true if we did work */
|
2023-03-07 12:28:20 +00:00
|
|
|
static size_t journal_flush_pins(struct journal *j,
|
|
|
|
u64 seq_to_flush,
|
|
|
|
unsigned allowed_below_seq,
|
|
|
|
unsigned allowed_above_seq,
|
2021-04-01 01:44:55 +00:00
|
|
|
unsigned min_any,
|
|
|
|
unsigned min_key_cache)
|
2017-03-17 06:18:50 +00:00
|
|
|
{
|
2019-02-21 18:33:21 +00:00
|
|
|
struct journal_entry_pin *pin;
|
2021-04-03 20:24:13 +00:00
|
|
|
size_t nr_flushed = 0;
|
|
|
|
journal_pin_flush_fn flush_fn;
|
|
|
|
u64 seq;
|
|
|
|
int err;
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
lockdep_assert_held(&j->reclaim_lock);
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2020-11-20 00:54:40 +00:00
|
|
|
while (1) {
|
2023-03-07 12:28:20 +00:00
|
|
|
unsigned allowed_above = allowed_above_seq;
|
|
|
|
unsigned allowed_below = allowed_below_seq;
|
|
|
|
|
|
|
|
if (min_any) {
|
|
|
|
allowed_above |= ~0;
|
|
|
|
allowed_below |= ~0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (min_key_cache) {
|
|
|
|
allowed_above |= 1U << JOURNAL_PIN_key_cache;
|
|
|
|
allowed_below |= 1U << JOURNAL_PIN_key_cache;
|
|
|
|
}
|
|
|
|
|
2020-11-20 00:54:40 +00:00
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
j->last_flushed = jiffies;
|
|
|
|
|
2021-04-03 20:24:13 +00:00
|
|
|
spin_lock(&j->lock);
|
2023-03-07 12:28:20 +00:00
|
|
|
pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
|
2021-04-03 20:24:13 +00:00
|
|
|
if (pin) {
|
|
|
|
BUG_ON(j->flush_in_progress);
|
|
|
|
j->flush_in_progress = pin;
|
|
|
|
j->flush_in_progress_dropped = false;
|
|
|
|
flush_fn = pin->flush;
|
|
|
|
}
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
|
2020-11-20 00:54:40 +00:00
|
|
|
if (!pin)
|
|
|
|
break;
|
|
|
|
|
2021-04-01 01:44:55 +00:00
|
|
|
if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
|
|
|
|
min_key_cache--;
|
|
|
|
|
|
|
|
if (min_any)
|
|
|
|
min_any--;
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2021-04-03 20:24:13 +00:00
|
|
|
err = flush_fn(j, pin, seq);
|
2019-02-21 18:33:21 +00:00
|
|
|
|
2021-04-03 20:24:13 +00:00
|
|
|
spin_lock(&j->lock);
|
|
|
|
/* Pin might have been dropped or rearmed: */
|
|
|
|
if (likely(!err && !j->flush_in_progress_dropped))
|
|
|
|
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
|
2019-02-21 18:33:21 +00:00
|
|
|
j->flush_in_progress = NULL;
|
2021-04-03 20:24:13 +00:00
|
|
|
j->flush_in_progress_dropped = false;
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
wake_up(&j->pin_flush_wait);
|
2021-04-03 20:24:13 +00:00
|
|
|
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
|
|
|
|
nr_flushed++;
|
2019-02-21 18:33:21 +00:00
|
|
|
}
|
2020-05-28 20:06:13 +00:00
|
|
|
|
2021-04-03 20:24:13 +00:00
|
|
|
return nr_flushed;
|
2017-03-17 06:18:50 +00:00
|
|
|
}
|
|
|
|
|
2020-11-02 22:51:38 +00:00
|
|
|
static u64 journal_seq_to_flush(struct journal *j)
|
2017-03-17 06:18:50 +00:00
|
|
|
{
|
2019-02-28 19:22:52 +00:00
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
2019-02-21 18:33:21 +00:00
|
|
|
u64 seq_to_flush = 0;
|
|
|
|
|
|
|
|
spin_lock(&j->lock);
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2023-12-17 04:47:29 +00:00
|
|
|
for_each_rw_member(c, ca) {
|
2017-03-17 06:18:50 +00:00
|
|
|
struct journal_device *ja = &ca->journal;
|
2019-02-19 18:41:36 +00:00
|
|
|
unsigned nr_buckets, bucket_to_flush;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
|
|
|
if (!ja->nr)
|
|
|
|
continue;
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
/* Try to keep the journal at most half full: */
|
2019-02-19 18:41:36 +00:00
|
|
|
nr_buckets = ja->nr / 2;
|
|
|
|
|
|
|
|
nr_buckets = min(nr_buckets, ja->nr);
|
|
|
|
|
|
|
|
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
|
2019-03-07 22:19:04 +00:00
|
|
|
seq_to_flush = max(seq_to_flush,
|
|
|
|
ja->bucket_seq[bucket_to_flush]);
|
2017-03-17 06:18:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Also flush if the pin fifo is more than half full */
|
|
|
|
seq_to_flush = max_t(s64, seq_to_flush,
|
|
|
|
(s64) journal_cur_seq(j) -
|
|
|
|
(j->pin.size >> 1));
|
2019-02-21 18:33:21 +00:00
|
|
|
spin_unlock(&j->lock);
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2020-11-02 22:51:38 +00:00
|
|
|
return seq_to_flush;
|
|
|
|
}
|
2018-07-17 16:19:14 +00:00
|
|
|
|
2020-11-02 22:51:38 +00:00
|
|
|
/**
|
2023-09-12 22:41:22 +00:00
|
|
|
* __bch2_journal_reclaim - free up journal buckets
|
|
|
|
* @j: journal object
|
|
|
|
* @direct: direct or background reclaim?
|
|
|
|
* @kicked: requested to run since we last ran?
|
|
|
|
* Returns: 0 on success, or -EIO if the journal has been shutdown
|
2020-11-02 22:51:38 +00:00
|
|
|
*
|
|
|
|
* Background journal reclaim writes out btree nodes. It should be run
|
|
|
|
* early enough so that we never completely run out of journal buckets.
|
|
|
|
*
|
|
|
|
* High watermarks for triggering background reclaim:
|
|
|
|
* - FIFO has fewer than 512 entries left
|
|
|
|
* - fewer than 25% journal buckets free
|
|
|
|
*
|
|
|
|
* Background reclaim runs until low watermarks are reached:
|
|
|
|
* - FIFO has more than 1024 entries left
|
|
|
|
* - more than 50% journal buckets free
|
|
|
|
*
|
|
|
|
* As long as a reclaim can complete in the time it takes to fill up
|
|
|
|
* 512 journal entries or 25% of all journal buckets, then
|
|
|
|
* journal_next_bucket() should not stall.
|
|
|
|
*/
|
2022-04-17 22:06:31 +00:00
|
|
|
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
|
2020-11-02 22:51:38 +00:00
|
|
|
{
|
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-05 00:49:37 +00:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
2020-11-20 01:55:33 +00:00
|
|
|
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
2021-04-03 20:24:13 +00:00
|
|
|
u64 seq_to_flush;
|
2021-04-16 16:38:14 +00:00
|
|
|
size_t min_nr, min_key_cache, nr_flushed;
|
2020-11-20 02:15:39 +00:00
|
|
|
unsigned flags;
|
2020-12-03 18:23:58 +00:00
|
|
|
int ret = 0;
|
2020-11-02 22:51:38 +00:00
|
|
|
|
2020-11-20 02:15:39 +00:00
|
|
|
/*
|
|
|
|
* We can't invoke memory reclaim while holding the reclaim_lock -
|
|
|
|
* journal reclaim is required to make progress for memory reclaim
|
|
|
|
* (cleaning the caches), so we can't get stuck in memory reclaim while
|
|
|
|
* we're holding the reclaim lock:
|
|
|
|
*/
|
2020-11-02 22:51:38 +00:00
|
|
|
lockdep_assert_held(&j->reclaim_lock);
|
2020-11-20 02:15:39 +00:00
|
|
|
flags = memalloc_noreclaim_save();
|
2020-11-02 22:51:38 +00:00
|
|
|
|
|
|
|
do {
|
2020-11-20 01:55:33 +00:00
|
|
|
if (kthread && kthread_should_stop())
|
|
|
|
break;
|
|
|
|
|
2020-12-03 18:23:58 +00:00
|
|
|
if (bch2_journal_error(j)) {
|
|
|
|
ret = -EIO;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-11-02 22:51:38 +00:00
|
|
|
bch2_journal_do_discards(j);
|
|
|
|
|
|
|
|
seq_to_flush = journal_seq_to_flush(j);
|
|
|
|
min_nr = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it's been longer than j->reclaim_delay_ms since we last flushed,
|
|
|
|
* make sure to flush at least one journal pin:
|
|
|
|
*/
|
|
|
|
if (time_after(jiffies, j->last_flushed +
|
2021-12-05 01:07:19 +00:00
|
|
|
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
|
2020-11-02 22:51:38 +00:00
|
|
|
min_nr = 1;
|
2019-02-19 18:41:36 +00:00
|
|
|
|
2023-11-05 02:54:26 +00:00
|
|
|
if (j->watermark != BCH_WATERMARK_stripe)
|
2020-12-05 21:25:05 +00:00
|
|
|
min_nr = 1;
|
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-05 00:49:37 +00:00
|
|
|
size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
|
|
|
|
if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
|
2021-11-11 20:50:22 +00:00
|
|
|
min_nr = 1;
|
|
|
|
|
2022-04-17 22:06:31 +00:00
|
|
|
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
|
|
|
|
|
2022-08-27 16:48:36 +00:00
|
|
|
trace_and_count(c, journal_reclaim_start, c,
|
|
|
|
direct, kicked,
|
2022-04-17 22:06:31 +00:00
|
|
|
min_nr, min_key_cache,
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-05 00:49:37 +00:00
|
|
|
atomic_long_read(&bc->nr_dirty), btree_cache_live,
|
2021-03-25 03:37:33 +00:00
|
|
|
atomic_long_read(&c->btree_key_cache.nr_dirty),
|
|
|
|
atomic_long_read(&c->btree_key_cache.nr_keys));
|
2020-11-20 00:54:40 +00:00
|
|
|
|
2021-04-01 01:44:55 +00:00
|
|
|
nr_flushed = journal_flush_pins(j, seq_to_flush,
|
2023-03-07 12:28:20 +00:00
|
|
|
~0, 0,
|
2021-04-16 16:38:14 +00:00
|
|
|
min_nr, min_key_cache);
|
2020-11-20 01:55:33 +00:00
|
|
|
|
|
|
|
if (direct)
|
|
|
|
j->nr_direct_reclaim += nr_flushed;
|
|
|
|
else
|
|
|
|
j->nr_background_reclaim += nr_flushed;
|
2022-08-27 16:48:36 +00:00
|
|
|
trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
|
2021-03-31 21:52:52 +00:00
|
|
|
|
|
|
|
if (nr_flushed)
|
|
|
|
wake_up(&j->reclaim_wait);
|
2022-02-26 03:45:58 +00:00
|
|
|
} while ((min_nr || min_key_cache) && nr_flushed && !direct);
|
2020-11-20 00:54:40 +00:00
|
|
|
|
2020-11-20 02:15:39 +00:00
|
|
|
memalloc_noreclaim_restore(flags);
|
2020-12-03 18:23:58 +00:00
|
|
|
|
|
|
|
return ret;
|
2020-11-20 01:55:33 +00:00
|
|
|
}
|
|
|
|
|
2020-12-03 18:23:58 +00:00
|
|
|
int bch2_journal_reclaim(struct journal *j)
|
2020-11-20 01:55:33 +00:00
|
|
|
{
|
2022-04-17 22:06:31 +00:00
|
|
|
return __bch2_journal_reclaim(j, true, true);
|
2020-11-20 01:55:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int bch2_journal_reclaim_thread(void *arg)
|
|
|
|
{
|
|
|
|
struct journal *j = arg;
|
2021-12-05 01:07:19 +00:00
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
2021-04-29 02:12:07 +00:00
|
|
|
unsigned long delay, now;
|
2022-03-06 22:20:39 +00:00
|
|
|
bool journal_empty;
|
2020-12-03 18:23:58 +00:00
|
|
|
int ret = 0;
|
2020-11-20 01:55:33 +00:00
|
|
|
|
2020-11-29 22:09:13 +00:00
|
|
|
set_freezable();
|
|
|
|
|
2021-04-29 02:12:07 +00:00
|
|
|
j->last_flushed = jiffies;
|
|
|
|
|
2020-12-03 18:23:58 +00:00
|
|
|
while (!ret && !kthread_should_stop()) {
|
2022-04-17 22:06:31 +00:00
|
|
|
bool kicked = j->reclaim_kicked;
|
|
|
|
|
2020-11-20 01:55:33 +00:00
|
|
|
j->reclaim_kicked = false;
|
|
|
|
|
|
|
|
mutex_lock(&j->reclaim_lock);
|
2022-04-17 22:06:31 +00:00
|
|
|
ret = __bch2_journal_reclaim(j, false, kicked);
|
2020-11-20 01:55:33 +00:00
|
|
|
mutex_unlock(&j->reclaim_lock);
|
|
|
|
|
2021-04-29 02:12:07 +00:00
|
|
|
now = jiffies;
|
2021-12-05 01:07:19 +00:00
|
|
|
delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
|
2021-04-29 02:12:07 +00:00
|
|
|
j->next_reclaim = j->last_flushed + delay;
|
|
|
|
|
|
|
|
if (!time_in_range(j->next_reclaim, now, now + delay))
|
|
|
|
j->next_reclaim = now + delay;
|
2020-11-20 02:15:39 +00:00
|
|
|
|
2020-11-20 01:55:33 +00:00
|
|
|
while (1) {
|
2021-04-29 02:12:07 +00:00
|
|
|
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
|
2020-11-20 01:55:33 +00:00
|
|
|
if (kthread_should_stop())
|
|
|
|
break;
|
|
|
|
if (j->reclaim_kicked)
|
|
|
|
break;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2022-03-06 22:20:39 +00:00
|
|
|
spin_lock(&j->lock);
|
|
|
|
journal_empty = fifo_empty(&j->pin);
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
|
|
|
|
if (journal_empty)
|
|
|
|
schedule();
|
|
|
|
else if (time_after(j->next_reclaim, jiffies))
|
|
|
|
schedule_timeout(j->next_reclaim - jiffies);
|
|
|
|
else
|
|
|
|
break;
|
2020-11-20 01:55:33 +00:00
|
|
|
}
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2017-03-17 06:18:50 +00:00
|
|
|
}
|
|
|
|
|
2020-11-20 01:55:33 +00:00
|
|
|
void bch2_journal_reclaim_stop(struct journal *j)
|
2019-02-28 19:22:52 +00:00
|
|
|
{
|
2020-11-20 01:55:33 +00:00
|
|
|
struct task_struct *p = j->reclaim_thread;
|
2019-02-28 19:22:52 +00:00
|
|
|
|
2020-11-20 01:55:33 +00:00
|
|
|
j->reclaim_thread = NULL;
|
|
|
|
|
|
|
|
if (p) {
|
|
|
|
kthread_stop(p);
|
|
|
|
put_task_struct(p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_journal_reclaim_start(struct journal *j)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
struct task_struct *p;
|
2022-07-18 23:42:58 +00:00
|
|
|
int ret;
|
2020-11-20 01:55:33 +00:00
|
|
|
|
|
|
|
if (j->reclaim_thread)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
p = kthread_create(bch2_journal_reclaim_thread, j,
|
|
|
|
"bch-reclaim/%s", c->name);
|
2022-07-18 23:42:58 +00:00
|
|
|
ret = PTR_ERR_OR_ZERO(p);
|
2023-12-17 03:43:41 +00:00
|
|
|
bch_err_msg(c, ret, "creating journal reclaim thread");
|
|
|
|
if (ret)
|
2022-07-18 23:42:58 +00:00
|
|
|
return ret;
|
2020-11-20 01:55:33 +00:00
|
|
|
|
|
|
|
get_task_struct(p);
|
|
|
|
j->reclaim_thread = p;
|
|
|
|
wake_up_process(p);
|
|
|
|
return 0;
|
2019-02-28 19:22:52 +00:00
|
|
|
}
|
|
|
|
|
2020-05-28 20:06:13 +00:00
|
|
|
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
|
|
|
|
bool *did_work)
|
2017-03-17 06:18:50 +00:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = bch2_journal_error(j);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2019-01-14 05:38:47 +00:00
|
|
|
mutex_lock(&j->reclaim_lock);
|
|
|
|
|
2023-03-07 12:28:20 +00:00
|
|
|
if (journal_flush_pins(j, seq_to_flush,
|
|
|
|
(1U << JOURNAL_PIN_key_cache)|
|
|
|
|
(1U << JOURNAL_PIN_other), 0, 0, 0) ||
|
|
|
|
journal_flush_pins(j, seq_to_flush,
|
|
|
|
(1U << JOURNAL_PIN_btree), 0, 0, 0))
|
2022-02-26 03:14:35 +00:00
|
|
|
*did_work = true;
|
2019-01-14 05:38:47 +00:00
|
|
|
|
2023-12-10 20:23:27 +00:00
|
|
|
if (seq_to_flush > journal_cur_seq(j))
|
|
|
|
bch2_journal_entry_close(j);
|
|
|
|
|
2019-02-21 18:33:21 +00:00
|
|
|
spin_lock(&j->lock);
|
2017-03-17 06:18:50 +00:00
|
|
|
/*
|
|
|
|
* If journal replay hasn't completed, the unreplayed journal entries
|
|
|
|
* hold refs on their corresponding sequence numbers
|
|
|
|
*/
|
2024-04-30 10:20:37 +00:00
|
|
|
ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
|
2017-03-17 06:18:50 +00:00
|
|
|
journal_last_seq(j) > seq_to_flush ||
|
2022-02-28 21:35:42 +00:00
|
|
|
!fifo_used(&j->pin);
|
2018-07-17 16:19:14 +00:00
|
|
|
|
2017-03-17 06:18:50 +00:00
|
|
|
spin_unlock(&j->lock);
|
2019-01-14 05:38:47 +00:00
|
|
|
mutex_unlock(&j->reclaim_lock);
|
2017-03-17 06:18:50 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-05-28 20:06:13 +00:00
|
|
|
/*
 * Flush journal pins up to @seq_to_flush, waiting on j->async_wait until
 * journal_flush_done() reports completion (or a journal error).
 *
 * Returns true if any pin was flushed; false if nothing was flushed or the
 * journal is not running (JOURNAL_running clear).
 */
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	/* time_stats this */
	bool did_work = false;

	if (test_bit(JOURNAL_running, &j->flags))
		closure_wait_event(&j->async_wait,
			journal_flush_done(j, seq_to_flush, &did_work));

	return did_work;
}
|
|
|
|
|
|
|
|
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
|
|
struct journal_entry_pin_list *p;
|
|
|
|
u64 iter, seq = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
spin_lock(&j->lock);
|
|
|
|
fifo_for_each_entry_ptr(p, &j->pin, iter)
|
|
|
|
if (dev_idx >= 0
|
|
|
|
? bch2_dev_list_has_dev(p->devs, dev_idx)
|
|
|
|
: p->devs.nr < c->opts.metadata_replicas)
|
|
|
|
seq = iter;
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
|
|
|
|
bch2_journal_flush_pins(j, seq);
|
|
|
|
|
|
|
|
ret = bch2_journal_error(j);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
mutex_lock(&c->replicas_gc_lock);
|
2020-07-09 22:28:11 +00:00
|
|
|
bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2023-06-30 14:51:46 +00:00
|
|
|
/*
|
|
|
|
* Now that we've populated replicas_gc, write to the journal to mark
|
|
|
|
* active journal devices. This handles the case where the journal might
|
|
|
|
* be empty. Otherwise we could clear all journal replicas and
|
|
|
|
* temporarily put the fs into an unrecoverable state. Journal recovery
|
|
|
|
* expects to find devices marked for journal data on unclean mount.
|
|
|
|
*/
|
|
|
|
ret = bch2_journal_meta(&c->journal);
|
2023-07-09 02:21:45 +00:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
2017-03-17 06:18:50 +00:00
|
|
|
|
2023-06-30 14:51:46 +00:00
|
|
|
seq = 0;
|
2017-03-17 06:18:50 +00:00
|
|
|
spin_lock(&j->lock);
|
2022-03-07 06:35:55 +00:00
|
|
|
while (!ret) {
|
2019-01-21 20:32:13 +00:00
|
|
|
struct bch_replicas_padded replicas;
|
|
|
|
|
2017-03-17 06:18:50 +00:00
|
|
|
seq = max(seq, journal_last_seq(j));
|
2022-03-07 06:35:55 +00:00
|
|
|
if (seq >= j->pin.back)
|
|
|
|
break;
|
2020-07-09 22:28:11 +00:00
|
|
|
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
|
2019-01-21 20:32:13 +00:00
|
|
|
journal_seq_pin(j, seq)->devs);
|
2017-03-17 06:18:50 +00:00
|
|
|
seq++;
|
|
|
|
|
2024-02-18 01:38:47 +00:00
|
|
|
if (replicas.e.nr_devs) {
|
|
|
|
spin_unlock(&j->lock);
|
|
|
|
ret = bch2_mark_replicas(c, &replicas.e);
|
|
|
|
spin_lock(&j->lock);
|
|
|
|
}
|
2017-03-17 06:18:50 +00:00
|
|
|
}
|
|
|
|
spin_unlock(&j->lock);
|
2023-07-09 02:21:45 +00:00
|
|
|
err:
|
2017-03-17 06:18:50 +00:00
|
|
|
ret = bch2_replicas_gc_end(c, ret);
|
|
|
|
mutex_unlock(&c->replicas_gc_lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|