Commit 8479938d authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: Convert snapshot table to RCU array



This switches the generic radix tree for the in-memory table of snapshot
nodes to a simple rcu array. This means we have to add new locking to
deal with reallocations, but is faster than traversing the radix tree.

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent d82978ca
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -774,9 +774,10 @@ struct bch_fs {
	struct mutex		sb_lock;

	/* snapshot.c: */
	GENRADIX(struct snapshot_t) snapshots;
	struct bch_snapshot_table __rcu *snapshot_table;
	struct snapshot_table __rcu *snapshots;
	size_t			snapshot_table_size;
	struct mutex		snapshot_table_lock;

	struct work_struct	snapshot_delete_work;
	struct work_struct	snapshot_wait_for_pagecache_and_delete_work;
	snapshot_id_list	snapshots_unlinked;
+2 −2
Original line number Diff line number Diff line
@@ -311,7 +311,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
		!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
		test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
		i->k->k.p.snapshot &&
		bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}

static noinline int
@@ -1229,7 +1229,7 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
					  struct bpos pos)
{
	if (!btree_type_has_snapshots(id) ||
	    !snapshot_t(trans->c, pos.snapshot)->children[0])
	    bch2_snapshot_is_leaf(trans->c, pos.snapshot))
		return 0;

	return __check_pos_snapshot_overwritten(trans, id, pos);
+1 −1
Original line number Diff line number Diff line
@@ -894,7 +894,7 @@ static int check_inode(struct btree_trans *trans,
	 * particular is not atomic, so on the internal snapshot nodes
	 * we can see inodes marked for deletion after a clean shutdown
	 */
	if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
	if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot))
		return 0;

	if (!bkey_is_inode(k.k))
+1 −1
Original line number Diff line number Diff line
@@ -562,7 +562,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
	int ret;

	ret = bch2_snapshot_tree_lookup(trans,
			snapshot_t(c, k.k->p.snapshot)->tree, &s_t);
			bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
			"%s: snapshot tree %u not found", __func__,
			snapshot_t(c, k.k->p.snapshot)->tree);
+117 −25
Original line number Diff line number Diff line
@@ -12,9 +12,9 @@

static int bch2_subvolume_delete(struct btree_trans *, u32);

static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor)
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	struct snapshot_t *s = snapshot_t(c, id);
	const struct snapshot_t *s = __snapshot_t(t, id);

	if (s->skip[2] <= ancestor)
		return s->skip[2];
@@ -27,22 +27,102 @@ static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor)

bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;

	EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	while (id && id < ancestor)
		id = get_ancestor_below(c, id, ancestor);
		id = get_ancestor_below(t, id, ancestor);
	rcu_read_unlock();

	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	while (id && id < ancestor)
		id = snapshot_t(c, id)->parent;
		id = __snapshot_t(t, id)->parent;
	rcu_read_unlock();

	return id == ancestor;
}

static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
{
	u32 depth;

	rcu_read_lock();
	depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
	rcu_read_unlock();

	return depth;
}

struct snapshot_t_free_rcu {
	struct rcu_head		rcu;
	struct snapshot_table	*t;
};

static void snapshot_t_free_rcu(struct rcu_head *rcu)
{
	struct snapshot_t_free_rcu *free_rcu =
		container_of(rcu, struct snapshot_t_free_rcu, rcu);

	kvfree(free_rcu->t);
	kfree(free_rcu);
}

static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	size_t new_size;
	struct snapshot_table *new, *old;

	new_size = max(16UL, roundup_pow_of_two(idx + 1));

	new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
	if (!new)
		return NULL;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s,
		       rcu_dereference_protected(c->snapshots, true)->s,
		       sizeof(new->s[0]) * c->snapshot_table_size);

	rcu_assign_pointer(c->snapshots, new);
	c->snapshot_table_size = new_size;
	if (old) {
		struct snapshot_t_free_rcu *rcu =
			kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL);

		rcu->t = old;
		call_rcu(&rcu->rcu, snapshot_t_free_rcu);
	}

	return &rcu_dereference_protected(c->snapshots, true)->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(idx < c->snapshot_table_size))
		return &rcu_dereference_protected(c->snapshots, true)->s[idx];

	return __snapshot_t_mut(c, id);
}

/* Snapshot tree: */

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
@@ -209,12 +289,15 @@ int bch2_mark_snapshot(struct btree_trans *trans,
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	int ret = 0;

	t = genradix_ptr_alloc(&c->snapshots,
			       U32_MAX - new.k->p.offset,
			       GFP_KERNEL);
	if (!t)
		return -BCH_ERR_ENOMEM_mark_snapshot;
	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, new.k->p.offset);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
@@ -246,8 +329,9 @@ int bch2_mark_snapshot(struct btree_trans *trans,
		t->subvol	= 0;
		t->tree		= 0;
	}

	return 0;
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

static int snapshot_lookup(struct btree_trans *trans, u32 id,
@@ -300,9 +384,14 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
		nr_live += ret;
	}

	snapshot_t(c, id)->equiv = nr_live == 1
		? snapshot_t(c, child[live_idx])->equiv
	mutex_lock(&c->snapshot_table_lock);

	snapshot_t_mut(c, id)->equiv = nr_live == 1
		? snapshot_t_mut(c, child[live_idx])->equiv
		: id;

	mutex_unlock(&c->snapshot_table_lock);

	return 0;
}

@@ -520,16 +609,18 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans,

static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	struct snapshot_t *s;
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (!s->parent)
		return id;
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	return id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s)
@@ -633,9 +724,6 @@ static int check_snapshot(struct btree_trans *trans,
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct snapshot_t *parent = parent_id
		? snapshot_t(c, parent_id)
		: NULL;
	struct printbuf buf = PRINTBUF;
	bool should_have_subvol;
	u32 i, id;
@@ -726,7 +814,7 @@ static int check_snapshot(struct btree_trans *trans,
	}
	ret = 0;

	real_depth = parent ? parent->depth + 1 : 0;
	real_depth = bch2_snapshot_depth(c, parent_id);

	if (le32_to_cpu(s.depth) != real_depth &&
	    (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
@@ -823,9 +911,13 @@ static int check_subvol(struct btree_trans *trans,

	if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
		u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree;
		u32 snapshot_tree;
		struct bch_snapshot_tree st;

		rcu_read_lock();
		snapshot_tree = snapshot_t(c, snapshot_root)->tree;
		rcu_read_unlock();

		ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);

		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
@@ -869,7 +961,7 @@ int bch2_check_subvols(struct bch_fs *c)

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	genradix_free(&c->snapshots);
	kfree(rcu_dereference_protected(c->snapshots, true));
}

int bch2_snapshots_read(struct bch_fs *c)
@@ -1011,7 +1103,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
@@ -1150,7 +1242,7 @@ static int snapshot_delete_key(struct btree_trans *trans,
			       struct bpos *last_pos)
{
	struct bch_fs *c = trans->c;
	u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
	u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);

	if (!bkey_eq(k.k->p, *last_pos))
		equiv_seen->nr = 0;
Loading