Commit 5222a460, authored and committed by Kent Overstreet
Browse files

bcachefs: BTREE_ITER_WITH_JOURNAL



This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is
automatically enabled when initializing a btree iterator before journal
replay has completed - it overlays the contents of the journal with the
btree.

This lets us delete bch2_btree_and_journal_walk() and just use the
normal btree iterator interface instead - which also lets us delete a
significant amount of duplicated code.

Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch -
we're redoing the binary search over keys in the journal every time we
call bch2_btree_iter_peek().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent f28620c1
Loading
Loading
Loading
Loading
+30 −30
Original line number Diff line number Diff line
@@ -340,15 +340,23 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
#undef  x
}

static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
int bch2_alloc_read(struct bch_fs *c)
{
	struct bch_fs *c = trans->c;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	struct bucket *g;
	struct bkey_alloc_unpacked u;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	down_read(&c->gc_lock);

	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		if (!bkey_is_alloc(k.k))
		return 0;
			continue;

		ca = bch_dev_bkey_exists(c, k.k->p.inode);
		g = bucket(ca, k.k->p.offset);
@@ -366,20 +374,12 @@ static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
		g->io_time[WRITE]	= u.write_time;
		g->oldest_gen		= u.oldest_gen;
		g->gen_valid		= 1;

	return 0;
	}
	bch2_trans_iter_exit(&trans, &iter);

int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	down_read(&c->gc_lock);
	ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
	up_read(&c->gc_lock);
	bch2_trans_exit(&trans);

	if (ret) {
		bch_err(c, "error reading alloc info: %i", ret);
		return ret;
+0 −1
Original line number Diff line number Diff line
@@ -860,7 +860,6 @@ mempool_t bio_bounce_pages;
	u64			reflink_hint;
	reflink_gc_table	reflink_gc_table;
	size_t			reflink_gc_nr;
	size_t			reflink_gc_idx;

	/* VFS IO PATH - fs-io.c */
	struct bio_set		writepage_bioset;
+45 −140
Original line number Diff line number Diff line
@@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c,
	return 0;
}

static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
					   struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct reflink_gc *r;
	const __le64 *refcount = bkey_refcount_c(k);
	char buf[200];
	int ret = 0;

	if (!refcount)
		return 0;

	r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
	if (!r)
		return -ENOMEM;

	if (!r ||
	    r->offset != k.k->p.offset ||
	    r->size != k.k->size) {
		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
		return -EINVAL;
	}

	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
			"reflink key has wrong refcount:\n"
			"  %s\n"
			"  should be %u",
			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
			r->refcount)) {
		struct bkey_i *new;

		new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
		if (!new) {
			ret = -ENOMEM;
			goto fsck_err;
		}

		bkey_reassemble(new, k);

		if (!r->refcount) {
			new->k.type = KEY_TYPE_deleted;
			new->k.size = 0;
		} else {
			*bkey_refcount(new) = cpu_to_le64(r->refcount);
		}

		ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
		kfree(new);
	}
fsck_err:
	return ret;
}

static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
				bool metadata_only)
{
@@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,

	bch2_trans_init(&trans, c, 0, 0);

	if (initial) {
		c->reflink_gc_idx = 0;

		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
				bch2_gc_reflink_done_initial_fn);
		goto out;
	}

	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		const __le64 *refcount = bkey_refcount_c(k);
@@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
		if (!refcount)
			continue;

		r = genradix_ptr(&c->reflink_gc_table, idx);
		r = genradix_ptr(&c->reflink_gc_table, idx++);
		if (!r ||
		    r->offset != k.k->p.offset ||
		    r->size != k.k->size) {
@@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
			else
				*bkey_refcount(new) = cpu_to_le64(r->refcount);

			ret = __bch2_trans_do(&trans, NULL, NULL, 0,
			ret = initial
			       ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
			       : __bch2_trans_do(&trans, NULL, NULL, 0,
					__bch2_btree_insert(&trans, BTREE_ID_reflink, new));
			kfree(new);

@@ -1466,33 +1407,40 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
	}
fsck_err:
	bch2_trans_iter_exit(&trans, &iter);
out:
	c->reflink_gc_nr = 0;
	bch2_trans_exit(&trans);
	return ret;
}

static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
					   struct bkey_s_c k)
static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
				bool metadata_only)
{
	struct bch_fs *c = trans->c;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct gc_stripe *m;
	const struct bch_stripe *s;
	char buf[200];
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
	if (metadata_only)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		if (k.k->type != KEY_TYPE_stripe)
			continue;

		s = bkey_s_c_to_stripe(k).v;
		m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

		for (i = 0; i < s->nr_blocks; i++)
			if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
				goto inconsistent;
	return 0;
		continue;
inconsistent:
		if (fsck_err_on(true, c,
				"stripe has wrong block sector count %u:\n"
@@ -1505,7 +1453,7 @@ static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
			new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
			if (!new) {
				ret = -ENOMEM;
			goto fsck_err;
				break;
			}

			bkey_reassemble(&new->k_i, k);
@@ -1513,57 +1461,20 @@ static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
			for (i = 0; i < new->v.nr_blocks; i++)
				stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
			ret = initial
				? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
				: __bch2_trans_do(&trans, NULL, NULL, 0,
					__bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
			kfree(new);
		}
fsck_err:
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
				bool metadata_only)
{
	struct btree_trans trans;
	int ret = 0;

	if (metadata_only)
		return 0;

	bch2_trans_init(&trans, c, 0, 0);

	if (initial) {
		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
				bch2_gc_stripes_done_initial_fn);
	} else {
		BUG();
	}
fsck_err:
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);
	return ret;
}

static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
					    struct bkey_s_c k)
{

	struct bch_fs *c = trans->c;
	struct reflink_gc *r;
	const __le64 *refcount = bkey_refcount_c(k);

	if (!refcount)
		return 0;

	r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
			       GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	r->offset	= k.k->p.offset;
	r->size		= k.k->size;
	r->refcount	= 0;
	return 0;
}

static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
				 bool metadata_only)
{
@@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
	bch2_trans_init(&trans, c, 0, 0);
	c->reflink_gc_nr = 0;

	if (initial) {
		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
						bch2_gc_reflink_start_initial_fn);
		goto out;
	}

	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		const __le64 *refcount = bkey_refcount_c(k);
@@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
		r->refcount	= 0;
	}
	bch2_trans_iter_exit(&trans, &iter);
out:

	bch2_trans_exit(&trans);
	return ret;
}
+166 −28
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"
@@ -1064,6 +1065,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
static void btree_path_verify_new_node(struct btree_trans *trans,
				       struct btree_path *path, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l;
	unsigned plevel;
	bool parent_locked;
@@ -1072,6 +1074,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		return;

	if (trans->journal_replay_not_finished)
		return;

	plevel = b->c.level + 1;
	if (!btree_path_node(path, plevel))
		return;
@@ -1092,7 +1097,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
		char buf4[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_dump_btree_node(trans->c, l->b);
		bch2_dump_btree_node(c, l->b);
		bch2_bpos_to_text(&PBUF(buf1), path->pos);
		bch2_bkey_to_text(&PBUF(buf2), &uk);
		bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
@@ -1283,6 +1288,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
	return ret;
}

/*
 * Prefetch child nodes following the current one, walking @jiter (an iterator
 * over a btree node overlaid with journal keys).
 *
 * @jiter is advanced before every peek, so the key it is positioned on when
 * this function is called is skipped; only the keys after it are prefetched.
 *
 * At most @nr nodes are prefetched. The loop also stops early if the node at
 * path->level can't be relocked, if the iterator runs out of keys, or if
 * bch2_btree_node_prefetch() returns nonzero.
 *
 * Returns 0, or the first nonzero value from bch2_btree_node_prefetch().
 */
static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
				 struct btree_and_journal_iter *jiter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	/*
	 * Prefetch budget: 0 or 2 once BCH_FS_STARTED is set, 1 or 16 before
	 * that; the smaller value of each pair applies when path->level > 1.
	 */
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	/*
	 * Consume one unit of budget per iteration - without the decrement the
	 * budget would never bound the loop, and we'd prefetch every remaining
	 * key in the node.
	 */
	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_and_journal_iter_advance(jiter);
		k = bch2_btree_and_journal_iter_peek(jiter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	/* Leave the node unlocked if that's how we found it: */
	if (!was_locked)
		btree_node_unlock(path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned plevel, struct btree *b)
@@ -1305,6 +1345,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
		btree_node_unlock(path, plevel);
}

static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
						     struct btree_path *path,
						     unsigned flags,
						     struct bkey_buf *out)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_and_journal_iter jiter;
	struct bkey_s_c k;
	int ret = 0;

	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);

	k = bch2_btree_and_journal_iter_peek(&jiter);

	bch2_bkey_buf_reassemble(out, c, k);

	if (flags & BTREE_ITER_PREFETCH)
		ret = btree_path_prefetch_j(trans, path, &jiter);

	bch2_btree_and_journal_iter_exit(&jiter);
	return ret;
}

static __always_inline int btree_path_down(struct btree_trans *trans,
					   struct btree_path *path,
					   unsigned flags,
@@ -1321,9 +1385,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
	EBUG_ON(!btree_node_locked(path, path->level));

	bch2_bkey_buf_init(&tmp);

	if (unlikely(trans->journal_replay_not_finished)) {
		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
		if (ret)
			goto err;
	} else {
		bch2_bkey_buf_unpack(&tmp, c, l->b,
				 bch2_btree_node_iter_peek(&l->iter, l->b));

		if (flags & BTREE_ITER_PREFETCH) {
			ret = btree_path_prefetch(trans, path);
			if (ret)
				goto err;
		}
	}

	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
@@ -1332,13 +1409,11 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
	mark_btree_node_locked(path, level, lock_type);
	btree_path_level_init(trans, path, b);

	if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
	if (likely(!trans->journal_replay_not_finished &&
		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
	    unlikely(b != btree_node_mem_ptr(tmp.k)))
		btree_node_mem_ptr_set(trans, path, level + 1, b);

	if (flags & BTREE_ITER_PREFETCH)
		ret = btree_path_prefetch(trans, path);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(path, level + 1);
	path->level = level;
@@ -2113,6 +2188,55 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
	return ret;
}

/*
 * Find the first journal key at or after path->pos (as located by
 * bch2_journal_key_search()) that hasn't been marked overwritten.
 *
 * Returns that key if it belongs to the same btree and level as @path,
 * otherwise NULL.
 */
static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
						 struct btree_path *path)
{
	struct journal_keys *jkeys = &trans->c->journal_keys;
	size_t i;

	/* Start at the search position and skip overwritten entries: */
	for (i = bch2_journal_key_search(jkeys, path->btree_id,
					 path->level, path->pos);
	     i < jkeys->nr;
	     i++)
		if (!jkeys->d[i].overwritten)
			break;

	if (i >= jkeys->nr)
		return NULL;

	/* Ran off the end of this btree/level's keys: */
	if (jkeys->d[i].btree_id != path->btree_id ||
	    jkeys->d[i].level != path->level)
		return NULL;

	return jkeys->d[i].k;
}

/*
 * Slot lookup against the journal: if the next journal key for the iterator's
 * path sits exactly at iter->pos, copy its header into iter->k and return it;
 * otherwise return bkey_s_c_null.
 */
static noinline
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
					      struct btree_iter *iter)
{
	struct bkey_i *jkey = __btree_trans_peek_journal(trans, iter->path);

	/* No journal key, or it's not at the position we were asked about: */
	if (!jkey || bpos_cmp(jkey->k.p, iter->pos))
		return bkey_s_c_null;

	iter->k = jkey->k;
	return bkey_i_to_s_c(jkey);
}

static noinline
struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
					 struct btree_iter *iter,
					 struct bkey_s_c k)
{
	struct bkey_i *next_journal =
		__btree_trans_peek_journal(trans, iter->path);

	if (next_journal &&
	    bpos_cmp(next_journal->k.p,
		     k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
		iter->k = next_journal->k;
		k = bkey_i_to_s_c(next_journal);
	}

	return k;
}

/**
 * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
 * current position
@@ -2141,16 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
			goto out;
		}

		next_update = btree_trans_peek_updates(iter);
		k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);

		/* * In the btree, deleted keys sort before non deleted: */
		if (k.k && bkey_deleted(k.k) &&
		    (!next_update ||
		     bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
			search_key = k.k->p;
			continue;
		}
		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
			k = btree_trans_peek_journal(trans, iter, k);

		next_update = btree_trans_peek_updates(iter);

		if (next_update &&
		    bpos_cmp(next_update->k.p,
@@ -2159,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
			k = bkey_i_to_s_c(next_update);
		}

		if (k.k && bkey_deleted(k.k)) {
			/*
			 * If we've got a whiteout, and it's after the search
			 * key, advance the search key to the whiteout instead
			 * of just after the whiteout - it might be a btree
			 * whiteout, with a real key at the same position, since
			 * in the btree deleted keys sort before non deleted.
			 */
			search_key = bpos_cmp(search_key, k.k->p)
				? k.k->p
				: bpos_successor(k.k->p);
			continue;
		}

		if (likely(k.k)) {
			/*
			 * We can never have a key in a leaf node at POS_MAX, so
@@ -2249,6 +2383,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)

	EBUG_ON(iter->path->cached || iter->path->level);
	EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);

	if (iter->flags & BTREE_ITER_WITH_JOURNAL)
		return bkey_s_c_err(-EIO);

	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);

@@ -2395,23 +2533,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
	    !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
		struct bkey_i *next_update;

		next_update = btree_trans_peek_updates(iter);
		if (next_update &&
		if ((next_update = btree_trans_peek_updates(iter)) &&
		    !bpos_cmp(next_update->k.p, iter->pos)) {
			iter->k = next_update->k;
			k = bkey_i_to_s_c(next_update);
		} else {
			k = bch2_btree_path_peek_slot(iter->path, &iter->k);
			goto out;
		}

		if (!k.k ||
		    ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
		     ? bpos_cmp(iter->pos, k.k->p)
		     : bkey_cmp(iter->pos, k.k->p))) {
			bkey_init(&iter->k);
			iter->k.p = iter->pos;
			k = (struct bkey_s_c) { &iter->k, NULL };
		}
		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
			goto out;

		k = bch2_btree_path_peek_slot(iter->path, &iter->k);
	} else {
		struct bpos next;

@@ -2455,7 +2588,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
			k = (struct bkey_s_c) { &iter->k, NULL };
		}
	}

out:
	iter->path->should_be_locked = true;

	bch2_btree_iter_verify_entry_exit(iter);
@@ -2635,6 +2768,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
	    btree_type_has_snapshots(btree_id))
		flags |= BTREE_ITER_FILTER_SNAPSHOTS;

	if (trans->journal_replay_not_finished)
		flags |= BTREE_ITER_WITH_JOURNAL;

	iter->trans	= trans;
	iter->path	= NULL;
	iter->btree_id	= btree_id;
@@ -2801,6 +2937,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
	memset(trans, 0, sizeof(*trans));
	trans->c		= c;
	trans->ip		= _RET_IP_;
	trans->journal_replay_not_finished =
		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);

	bch2_trans_alloc_paths(trans, c);

+6 −4
Original line number Diff line number Diff line
@@ -207,10 +207,11 @@ struct btree_node_iter {
#define BTREE_ITER_CACHED_NOFILL	(1 << 8)
#define BTREE_ITER_CACHED_NOCREATE	(1 << 9)
#define BTREE_ITER_WITH_UPDATES		(1 << 10)
#define __BTREE_ITER_ALL_SNAPSHOTS	(1 << 11)
#define BTREE_ITER_ALL_SNAPSHOTS	(1 << 12)
#define BTREE_ITER_FILTER_SNAPSHOTS	(1 << 13)
#define BTREE_ITER_NOPRESERVE		(1 << 14)
#define BTREE_ITER_WITH_JOURNAL		(1 << 11)
#define __BTREE_ITER_ALL_SNAPSHOTS	(1 << 12)
#define BTREE_ITER_ALL_SNAPSHOTS	(1 << 13)
#define BTREE_ITER_FILTER_SNAPSHOTS	(1 << 14)
#define BTREE_ITER_NOPRESERVE		(1 << 15)

enum btree_path_uptodate {
	BTREE_ITER_UPTODATE		= 0,
@@ -381,6 +382,7 @@ struct btree_trans {
	bool			restarted:1;
	bool			paths_sorted:1;
	bool			journal_transaction_names:1;
	bool			journal_replay_not_finished:1;
	/*
	 * For when bch2_trans_update notices we'll be splitting a compressed
	 * extent:
Loading