Commit 7a51608d authored by Kent Overstreet
Browse files

bcachefs: Rework btree node pinning



In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers

Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).

Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.

Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 91ddd715
Loading
Loading
Loading
Loading
+7 −6
Original line number Diff line number Diff line
@@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
	s64 mem_may_pin = mem_may_pin_bytes(c);
	int ret = 0;

	bch2_btree_cache_unpin(c);

	btree_interior_mask |= btree_leaf_mask;

	c->btree_cache.pinned_nodes_leaf_mask		= btree_leaf_mask;
	c->btree_cache.pinned_nodes_interior_mask	= btree_interior_mask;
	c->btree_cache.pinned_nodes_mask[0]		= btree_leaf_mask;
	c->btree_cache.pinned_nodes_mask[1]		= btree_interior_mask;
	c->btree_cache.pinned_nodes_start		= start;
	c->btree_cache.pinned_nodes_end			= *end = BBPOS_MAX;

@@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
					BBPOS(btree, b->key.k.p);
				break;
			}
			bch2_node_pin(c, b);
			0;
		}));
	}
@@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&s.last_flushed, c);

	c->btree_cache.pinned_nodes_leaf_mask = 0;
	c->btree_cache.pinned_nodes_interior_mask = 0;
	bch2_btree_cache_unpin(c);

	bch_err_fn(c, ret);
	return ret;
@@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
	}
	bch2_trans_put(trans);

	c->btree_cache.pinned_nodes_leaf_mask = 0;
	c->btree_cache.pinned_nodes_interior_mask = 0;
	bch2_btree_cache_unpin(c);

	bch_err_fn(c, ret);
	return ret;
+112 −52
Original line number Diff line number Diff line
@@ -47,9 +47,14 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
	c->btree_cache.nr_reserve = reserve;
}

static inline size_t btree_cache_can_free(struct btree_cache *bc)
static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
	return max_t(int, 0, bc->nr_live + bc->nr_freeable - bc->nr_reserve);
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);

	size_t can_free = list->nr;
	if (!list->idx)
		can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
	return can_free;
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -184,6 +189,51 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
	six_unlock_intent(&b->c.lock);
}

static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = bc->pinned_nodes_mask[!!b->c.level];

	return ((mask & BIT_ULL(b->c.btree_id)) &&
		bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
		bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}

/*
 * Mark @b as pinned and move it from the normal live list (live[0]) to the
 * pinned live list (live[1]), keeping both list counters in sync.
 *
 * Btree roots and nodes that are already flagged as pinned are left alone.
 * The node must lie within the currently configured pin range; anything
 * else trips the BUG_ON().
 *
 * Takes and releases the btree cache lock.
 */
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *cache = &c->btree_cache;

	mutex_lock(&cache->lock);

	BUG_ON(!__btree_node_pinned(cache, b));

	/* Nothing to do for roots, or for nodes that were pinned earlier */
	if (b == btree_node_root(c, b) || btree_node_pinned(b))
		goto out;

	set_btree_node_pinned(b);
	list_move(&b->list, &cache->live[1].list);
	cache->live[1].nr++;
	cache->live[0].nr--;
out:
	mutex_unlock(&cache->lock);
}

/*
 * Drop all btree node pinning.
 *
 * Clears both pin masks (leaf and interior), then walks the pinned live
 * list (live[1]), clearing each node's pinned flag and moving it back to
 * the normal live list (live[0]) while adjusting both list counters.
 *
 * Takes and releases the btree cache lock.
 */
void bch2_btree_cache_unpin(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *node, *next;

	mutex_lock(&bc->lock);

	bc->pinned_nodes_mask[0] = 0;
	bc->pinned_nodes_mask[1] = 0;

	/* _safe variant: every entry is unlinked from the list being walked */
	list_for_each_entry_safe(node, next, &bc->live[1].list, list) {
		clear_btree_node_pinned(node);
		list_move(&node->list, &bc->live[0].list);
		bc->live[1].nr--;
		bc->live[0].nr++;
	}

	mutex_unlock(&bc->lock);
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -199,7 +249,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
	if (b->c.btree_id < BTREE_ID_NR)
		--bc->nr_by_btree[b->c.btree_id];

	bc->nr_live--;
	bc->live[btree_node_pinned(b)].nr--;
	bc->nr_freeable++;
	list_move(&b->list, &bc->freeable);
}
@@ -216,9 +266,14 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)

	if (b->c.btree_id < BTREE_ID_NR)
		bc->nr_by_btree[b->c.btree_id]++;
	bc->nr_live++;

	bool p = __btree_node_pinned(bc, b);
	mod_bit(BTREE_NODE_pinned, &b->flags, p);

	list_move_tail(&b->list, &bc->live[p].list);
	bc->live[p].nr++;

	bc->nr_freeable--;
	list_move_tail(&b->list, &bc->live);
	return 0;
}

@@ -283,20 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	u64 mask = b->c.level
		? bc->pinned_nodes_interior_mask
		: bc->pinned_nodes_leaf_mask;

	if ((mask & BIT_ULL(b->c.btree_id)) &&
	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) {
		BTREE_CACHE_NOT_FREED_INCREMENT(pinned);
		return -BCH_ERR_ENOMEM_btree_node_reclaim;
	}

wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
@@ -401,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree_cache_list *list = shrink->private_data;
	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
@@ -410,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >=
		(bc->nr_live + bc->nr_freeable) * 3 / 4;
	bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;
@@ -426,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	can_free = btree_cache_can_free(list);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
@@ -452,7 +493,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
	list_for_each_entry_safe(b, t, &list->list, list) {
		touched++;

		if (btree_node_accessed(b)) {
@@ -476,7 +517,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			list_move(&list->list, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
@@ -490,8 +531,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
	if (&t->list != &list->list)
		list_move_tail(&list->list, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
@@ -504,40 +545,42 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree_cache_list *list = shrink->private_data;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
	return btree_cache_can_free(list);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned i, flags;
	unsigned long flags;

	shrinker_free(bc->shrink);
	shrinker_free(bc->live[1].shrink);
	shrinker_free(bc->live[0].shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);
		list_move(&c->verify_data->list, &bc->live[0].list);

	kvfree(c->verify_ondisk);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
			list_add(&r->b->list, &bc->live[0].list);
	}

	list_for_each_entry_safe(b, t, &bc->live, list)
	list_for_each_entry_safe(b, t, &bc->live[1].list, list)
		bch2_btree_node_hash_remove(bc, b);
	list_for_each_entry_safe(b, t, &bc->live[0].list, list)
		bch2_btree_node_hash_remove(bc, b);

	list_for_each_entry_safe(b, t, &bc->freeable, list) {
@@ -563,7 +606,8 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)

	for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
		BUG_ON(bc->nr_by_btree[i]);
	BUG_ON(bc->nr_live);
	BUG_ON(bc->live[0].nr);
	BUG_ON(bc->live[1].nr);
	BUG_ON(bc->nr_freeable);

	if (bc->table_init_done)
@@ -589,18 +633,28 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);
	list_splice_init(&bc->live[0].list, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	bc->live[0].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 2;
	shrink->private_data	= &bc->live[0];
	shrinker_register(shrink);

	shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
	if (!shrink)
		goto err;
	bc->live[1].shrink	= shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrink->seeks		= 8;
	shrink->private_data	= &bc->live[1];
	shrinker_register(shrink);

	return 0;
@@ -611,7 +665,10 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
		bc->live[i].idx = i;
		INIT_LIST_HEAD(&bc->live[i].list);
	}
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -673,12 +730,14 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
	for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
		list_for_each_entry_reverse(b, &bc->live[i].list, list)
			if (!btree_node_reclaim(c, b, false))
				return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
		for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
			list_for_each_entry_reverse(b, &bc->live[i].list, list)
				if (!btree_node_write_and_reclaim(c, b))
					return b;

@@ -1387,9 +1446,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_btree_cache_line(out, c, "nr_live:",	bc->nr_live);
	prt_btree_cache_line(out, c, "nr_freeable:",	bc->nr_freeable);
	prt_btree_cache_line(out, c, "nr dirty:",	atomic_long_read(&bc->nr_dirty));
	prt_btree_cache_line(out, c, "live:",		bc->live[0].nr);
	prt_btree_cache_line(out, c, "pinned:",		bc->live[1].nr);
	prt_btree_cache_line(out, c, "freeable:",	bc->nr_freeable);
	prt_btree_cache_line(out, c, "dirty:",		atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "cannibalize lock:\t%p\n",	bc->alloc_lock);
	prt_newline(out);

+3 −0
Original line number Diff line number Diff line
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
				unsigned, enum btree_id);

void bch2_node_pin(struct bch_fs *, struct btree *);
void bch2_btree_cache_unpin(struct bch_fs *);

void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
				      struct bkey_s_c, struct bkey_i *);

+13 −8
Original line number Diff line number Diff line
@@ -147,8 +147,7 @@ struct btree {
	x(noevict)				\
	x(write_blocked)			\
	x(will_make_reachable)			\
	x(access_bit)				\
	x(pinned)				\
	x(access_bit)

enum bch_btree_cache_not_freed_reasons {
#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
@@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons {
	BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
};

/*
 * One list of live cached btree nodes, with its own shrinker.
 *
 * struct btree_cache embeds an array of these as live[]: index 0 holds
 * ordinary nodes and index 1 holds pinned nodes.
 */
struct btree_cache_list {
	/* Position of this entry within btree_cache.live[] */
	unsigned		idx;
	/* Shrinker whose private_data points at this list */
	struct shrinker		*shrink;
	/* Head of the list of nodes, linked through btree.list */
	struct list_head	list;
	/* Number of nodes accounted to this list */
	size_t			nr;
};

struct btree_cache {
	struct rhashtable	table;
	bool			table_init_done;
@@ -174,12 +180,11 @@ struct btree_cache {
	 * should never grow past ~2-3 nodes in practice.
	 */
	struct mutex		lock;
	struct list_head	live;
	struct list_head	freeable;
	struct list_head	freed_pcpu;
	struct list_head	freed_nonpcpu;
	struct btree_cache_list	live[2];

	size_t			nr_live;
	size_t			nr_freeable;
	size_t			nr_reserve;
	size_t			nr_by_btree[BTREE_ID_NR];
@@ -188,7 +193,6 @@ struct btree_cache {
	/* shrinker stats */
	size_t			nr_freed;
	u64			not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
	struct shrinker		*shrink;

	/*
	 * If we need to allocate memory for a new btree node and that
@@ -201,8 +205,8 @@ struct btree_cache {

	struct bbpos		pinned_nodes_start;
	struct bbpos		pinned_nodes_end;
	u64			pinned_nodes_leaf_mask;
	u64			pinned_nodes_interior_mask;
	/* btree id mask: 0 for leaves, 1 for interior */
	u64			pinned_nodes_mask[2];
};

struct btree_node_iter {
@@ -594,7 +598,8 @@ enum btree_write_type {
	x(dying)							\
	x(fake)								\
	x(need_rewrite)							\
	x(never_write)
	x(never_write)							\
	x(pinned)

enum btree_flags {
	/* First bits for btree node write type */
+1 −1
Original line number Diff line number Diff line
@@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
	six_unlock_intent(&n->c.lock);

	mutex_lock(&c->btree_cache.lock);
	list_add_tail(&b->list, &c->btree_cache.live);
	list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
	mutex_unlock(&c->btree_cache.lock);

	bch2_trans_verify_locks(trans);
Loading