Commit fee3e843 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "The main user-reported ones are:

   - Fix a btree iterator locking inconsistency that's been causing us
     to go emergency read-only in evacuate: 'Fix broken btree_path lock
     invariants in next_node()'

   - Minor btree node cache reclaim tweak that should help with OOMs:
     don't set btree nodes as accessed on fill

   - Fix a bch2_bkey_clear_rebalance() issue that was causing rebalance
     to do needless work"

* tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs:
  bcachefs: fix wrong arg to fsck_err()
  bcachefs: Fix missing commit in backpointer to missing target
  bcachefs: Fix accidental O(n^2) in fiemap
  bcachefs: Fix set_should_be_locked() call in peek_slot()
  bcachefs: Fix self deadlock
  bcachefs: Don't set btree nodes as accessed on fill
  bcachefs: Fix livelock in journal_entry_open()
  bcachefs: Fix broken btree_path lock invariants in next_node()
  bcachefs: Don't strip rebalance_opts from indirect extents
parents 4d0be1aa 9c09e59c
Loading
Loading
Loading
Loading
+79 −38
Original line number Diff line number Diff line
@@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
static int backpointer_target_not_found(struct btree_trans *trans,
				  struct bkey_s_c_backpointer bp,
				  struct bkey_s_c target_k,
				  struct bkey_buf *last_flushed)
				  struct bkey_buf *last_flushed,
				  bool commit)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
@@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans,
		}

	if (fsck_err(trans, backpointer_to_missing_ptr,
		     "%s", buf.buf))
		     "%s", buf.buf)) {
		ret = bch2_backpointer_del(trans, bp.k->p);
		if (ret || !commit)
			goto out;

		/*
		 * Normally, on transaction commit from inside a transaction,
		 * we'll return -BCH_ERR_transaction_restart_nested, since a
		 * transaction commit invalidates pointers given out by peek().
		 *
		 * However, since we're updating a write buffer btree, if we
		 * return a transaction restart and loop we won't see that the
		 * backpointer has been deleted without an additional write
		 * buffer flush - and those are expensive.
		 *
		 * So we're relying on the caller immediately advancing to the
		 * next backpointer and starting a new transaction immediately
		 * after backpointer_get_key() returns NULL:
		 */
		ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	}
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
/*
 * Look up the btree node that backpointer @bp refers to.
 *
 * Initializes @iter as a node iterator on bp.v->btree_id at bp.v->pos and
 * peeks the node there; the node found is expected to be at level
 * bp.v->level - 1.
 *
 * Must not be called with a level-0 backpointer (enforced by BUG_ON).
 *
 * Returns:
 *  - the node, with @iter left initialized, if extent_matches_bp() accepts
 *    the node's key for @bp;
 *  - whatever bch2_btree_iter_peek_node() returned, if that was NULL or an
 *    error pointer;
 *  - ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node) if the node does
 *    not match and btree_node_will_make_reachable() is true for it;
 *  - otherwise the outcome of backpointer_target_not_found(): an ERR_PTR for
 *    a nonzero return, NULL for zero.
 *
 * On every path except the successful match, @iter is exited before
 * returning.
 *
 * @last_flushed and @commit are not interpreted here; they are only passed
 * through to backpointer_target_not_found().
 */
static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
						 struct bkey_s_c_backpointer bp,
						 struct btree_iter *iter,
						 struct bkey_buf *last_flushed,
						 bool commit)
{
	struct bch_fs *c = trans->c;

	/* Callers must only pass backpointers with a nonzero level: */
	BUG_ON(!bp.v->level);

	bch2_trans_node_iter_init(trans, iter,
				  bp.v->btree_id,
				  bp.v->pos,
				  0,
				  bp.v->level - 1,
				  0);
	struct btree *b = bch2_btree_iter_peek_node(trans, iter);
	/* NULL or error: hand it back to the caller unchanged, after cleanup */
	if (IS_ERR_OR_NULL(b))
		goto err;

	BUG_ON(b->c.level != bp.v->level - 1);

	/* Success: return with @iter still live so the caller can use it */
	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
			      bkey_i_to_s_c(&b->key), bp))
		return b;

	/* The node exists but does not match the backpointer: */
	if (btree_node_will_make_reachable(b)) {
		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
	} else {
		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
						       last_flushed, commit);
		b = ret ? ERR_PTR(ret) : NULL;
	}
err:
	bch2_trans_iter_exit(trans, iter);
	return b;
}

static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
						  struct bkey_s_c_backpointer bp,
						  struct btree_iter *iter,
						  unsigned iter_flags,
					 struct bkey_buf *last_flushed)
						  struct bkey_buf *last_flushed,
						  bool commit)
{
	struct bch_fs *c = trans->c;

@@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
	bch2_trans_iter_exit(trans, iter);

	if (!bp.v->level) {
		int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
		int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
	} else {
		struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
		struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
			return bkey_s_c_null;
		if (IS_ERR_OR_NULL(b))
@@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
					struct btree_iter *iter,
					struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;

	BUG_ON(!bp.v->level);

	bch2_trans_node_iter_init(trans, iter,
				  bp.v->btree_id,
				  bp.v->pos,
				  0,
				  bp.v->level - 1,
				  0);
	struct btree *b = bch2_btree_iter_peek_node(trans, iter);
	if (IS_ERR_OR_NULL(b))
		goto err;

	BUG_ON(b->c.level != bp.v->level - 1);

	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
			      bkey_i_to_s_c(&b->key), bp))
		return b;

	if (btree_node_will_make_reachable(b)) {
		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
	} else {
		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
		b = ret ? ERR_PTR(ret) : NULL;
	return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
}
err:
	bch2_trans_iter_exit(trans, iter);
	return b;

/*
 * Non-static wrapper around __bch2_backpointer_get_key(): forwards all
 * arguments unchanged and passes true for the trailing commit argument.
 */
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
					 struct bkey_s_c_backpointer bp,
					 struct btree_iter *iter,
					 unsigned iter_flags,
					 struct bkey_buf *last_flushed)
{
	return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
}

static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
@@ -521,7 +562,7 @@ static int check_bp_exists(struct btree_trans *trans,
	struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);

	struct bkey_s_c other_extent =
		bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
		__bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
	ret = bkey_err(other_extent);
	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
		ret = 0;
+4 −5
Original line number Diff line number Diff line
@@ -852,7 +852,6 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);
@@ -1286,6 +1285,10 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
			six_unlock_read(&b->c.lock);
			goto retry;
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	/* XXX: waiting on IO with btree locks held: */
@@ -1301,10 +1304,6 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
+14 −8
Original line number Diff line number Diff line
@@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_
		return NULL;
	}

	/*
	 * We don't correctly handle nodes with extra intent locks here:
	 * downgrade so we don't violate locking invariants
	 */
	bch2_btree_path_downgrade(trans, path);

	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
		__bch2_btree_path_unlock(trans, path);
		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock);
@@ -2743,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
	ret = trans_maybe_inject_restart(trans, _RET_IP_);
	if (unlikely(ret)) {
		k = bkey_s_c_err(ret);
		goto out_no_locked;
		goto out;
	}

	/* extents can't span inode numbers: */
@@ -2763,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
	if (unlikely(ret)) {
		k = bkey_s_c_err(ret);
		goto out_no_locked;
		goto out;
	}

	struct btree_path *path = btree_iter_path(trans, iter);
	if (unlikely(!btree_path_node(path, path->level)))
		return bkey_s_c_null;

	btree_path_set_should_be_locked(trans, path);

	if ((iter->flags & BTREE_ITER_cached) ||
	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
		k = bkey_s_c_null;
@@ -2790,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
			if (!bkey_err(k))
				iter->k = *k.k;
			/* We're not returning a key from iter->path: */
			goto out_no_locked;
			goto out;
		}

		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
		k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
		if (unlikely(!k.k))
			goto out_no_locked;
			goto out;

		if (unlikely(k.k->type == KEY_TYPE_whiteout &&
			     (iter->flags & BTREE_ITER_filter_snapshots) &&
@@ -2833,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
		}

		if (unlikely(bkey_err(k)))
			goto out_no_locked;
			goto out;

		next = k.k ? bkey_start_pos(k.k) : POS_MAX;

@@ -2855,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
		}
	}
out:
	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(trans, iter);
	ret = bch2_btree_iter_verify_ret(trans, iter, k);
+15 −2
Original line number Diff line number Diff line
@@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
	return ret;
}

/*
 * Insert accounting key @a into the in-memory accounting table, checking
 * replicas with bch2_replicas_marked_locked().
 *
 * Unless @mode is BCH_ACCOUNTING_read: if accounting_to_replicas() produces a
 * replicas entry for the key's position and bch2_replicas_marked_locked()
 * reports that entry is not marked, nothing is inserted and
 * -BCH_ERR_btree_insert_need_mark_replicas is returned.
 *
 * Otherwise returns the result of __bch2_accounting_mem_insert().
 *
 * NOTE(review): the "_locked" suffix, together with the write_locked flag on
 * bch2_accounting_mem_mod_locked() that selects this function, suggests the
 * caller is expected to already hold the relevant lock for write - confirm
 * against the callers.
 */
int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
			       enum bch_accounting_mode mode)
{
	struct bch_replicas_padded r;

	/* The replicas check is skipped entirely in BCH_ACCOUNTING_read mode: */
	if (mode != BCH_ACCOUNTING_read &&
	    accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return -BCH_ERR_btree_insert_need_mark_replicas;

	return __bch2_accounting_mem_insert(c, a);
}

static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
	for (unsigned i = 0; i < e->nr_counters; i++)
@@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
					accounting_key_init(&k_i.k, &acc_k, src_v, nr);
					bch2_accounting_mem_mod_locked(trans,
								bkey_i_to_s_c_accounting(&k_i.k),
								BCH_ACCOUNTING_normal);
								BCH_ACCOUNTING_normal, true);

					preempt_disable();
					struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)

	percpu_down_read(&c->mark_lock);
	int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
						 BCH_ACCOUNTING_read);
						 BCH_ACCOUNTING_read, false);
	percpu_up_read(&c->mark_lock);
	return ret;
}
+11 −5
Original line number Diff line number Diff line
@@ -136,6 +136,7 @@ enum bch_accounting_mode {
};

int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);

static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
@@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
 */
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
						 struct bkey_s_c_accounting a,
						 enum bch_accounting_mode mode)
						 enum bch_accounting_mode mode,
						 bool write_locked)
{
	struct bch_fs *c = trans->c;
	struct bch_accounting_mem *acc = &c->accounting;
@@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,

	while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
				      accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
		int ret = bch2_accounting_mem_insert(c, a, mode);
		int ret = 0;
		if (unlikely(write_locked))
			ret = bch2_accounting_mem_insert_locked(c, a, mode);
		else
			ret = bch2_accounting_mem_insert(c, a, mode);
		if (ret)
			return ret;
	}
@@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
	percpu_down_read(&trans->c->mark_lock);
	int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
	int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
	percpu_up_read(&trans->c->mark_lock);
	return ret;
}
@@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
	EBUG_ON(bversion_zero(a->k.bversion));

	return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
		: 0;
}

@@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
		struct bkey_s_accounting a = accounting_i_to_s(a_i);

		bch2_accounting_neg(a);
		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
		bch2_accounting_neg(a);
	}
}
Loading