Commit 6f2a71a9 authored by Linus Torvalds
Browse files

Merge tag 'bcachefs-2025-06-26' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:

 - Lots of small check/repair fixes, primarily in subvol loop and
   directory structure loop (when involving snapshots).

 - Fix a few 6.16 regressions: rare UAF in the foreground allocator path
   when taking a transaction restart from the transaction bump
   allocator, and some small fallout from the change to log the error
   being corrected in the journal when repairing errors, also some
   fallout from the btree node read error logging improvements.

   (Alan, Bharadwaj)

 - New option: journal_rewind

   This lets the entire filesystem be reset to an earlier point in time.

   Note that this is only a disaster recovery tool, and right now there
   are major caveats to using it (discards should be disabled, in
   particular), but it successfully restored the filesystem of one of
   the users who was bitten by the subvolume deletion bug and didn't have
   backups. I'll likely be making some changes to the discard path in
   the future to make this a reliable recovery tool.

 - Some new btree iterator tracepoints, for tracking down some
   livelock-ish behaviour we've been seeing in the main data write path.

* tag 'bcachefs-2025-06-26' of git://evilpiepirate.org/bcachefs: (51 commits)
  bcachefs: Plumb correct ip to trans_relock_fail tracepoint
  bcachefs: Ensure we rewind to run recovery passes
  bcachefs: Ensure btree node scan runs before checking for scanned nodes
  bcachefs: btree_root_unreadable_and_scan_found_nothing should not be autofix
  bcachefs: fix bch2_journal_keys_peek_prev_min() underflow
  bcachefs: Use wait_on_allocator() when allocating journal
  bcachefs: Check for bad write buffer key when moving from journal
  bcachefs: Don't unlock the trans if ret doesn't match BCH_ERR_operation_blocked
  bcachefs: Fix range in bch2_lookup_indirect_extent() error path
  bcachefs: fix spurious error_throw
  bcachefs: Add missing bch2_err_class() to fileattr_set()
  bcachefs: Add missing key type checks to check_snapshot_exists()
  bcachefs: Don't log fsck err in the journal if doing repair elsewhere
  bcachefs: Fix *__bch2_trans_subbuf_alloc() error path
  bcachefs: Fix missing newlines before ero
  bcachefs: fix spurious error in read_btree_roots()
  bcachefs: fsck: Fix oops in key_visible_in_snapshot()
  bcachefs: fsck: fix unhandled restart in topology repair
  bcachefs: fsck: Fix check_directory_structure when no check_dirents
  bcachefs: Fix restart handling in btree_node_scrub_work()
  ...
parents 8a20830f ef6fac0f
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -1406,6 +1406,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
		: BCH_DATA_free;
	struct printbuf buf = PRINTBUF;

	unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
		FSCK_CAN_FIX|FSCK_CAN_IGNORE;

	struct bpos bucket = iter->pos;
	bucket.offset &= ~(~0ULL << 56);
	u64 genbits = iter->pos.offset & (~0ULL << 56);
@@ -1419,7 +1422,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
		return ret;

	if (!bch2_dev_bucket_exists(c, bucket)) {
		if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
		if (__fsck_err(trans, fsck_flags,
			       need_discard_freespace_key_to_invalid_dev_bucket,
			       "entry in %s btree for nonexistant dev:bucket %llu:%llu",
			       bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
			goto delete;
@@ -1433,7 +1437,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
	if (a->data_type != state ||
	    (state == BCH_DATA_free &&
	     genbits != alloc_freespace_genbits(*a))) {
		if (fsck_err(trans, need_discard_freespace_key_bad,
		if (__fsck_err(trans, fsck_flags,
			       need_discard_freespace_key_bad,
			     "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			     (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			     bch2_btree_id_str(iter->btree_id),
+1 −1
Original line number Diff line number Diff line
@@ -353,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
	} else {
		struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
		if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)))
		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
			return bkey_s_c_null;
		if (IS_ERR_OR_NULL(b))
			return ((struct bkey_s_c) { .k = ERR_CAST(b) });
+2 −1
Original line number Diff line number Diff line
@@ -767,7 +767,8 @@ struct btree_trans_buf {
	x(sysfs)							\
	x(btree_write_buffer)						\
	x(btree_node_scrub)						\
	x(async_recovery_passes)
	x(async_recovery_passes)					\
	x(ioctl_data)

enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
+25 −12
Original line number Diff line number Diff line
@@ -503,8 +503,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	/*
	 * XXX: we're not passing the trans object here because we're not set up
	 * to handle a transaction restart - this code needs to be rewritten
	 * when we start doing online topology repair
	 */
	bch2_trans_unlock_long(trans);
	if (mustfix_fsck_err_on(!have_child,
			trans, btree_node_topology_interior_node_empty,
			c, btree_node_topology_interior_node_empty,
			"empty interior btree node at %s", buf.buf))
		ret = DROP_THIS_NODE;
err:
@@ -528,32 +534,39 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
	return ret;
}

static int bch2_check_root(struct btree_trans *trans, enum btree_id i,
static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
			   bool *reconstructed_root)
{
	struct bch_fs *c = trans->c;
	struct btree_root *r = bch2_btree_id_root(c, i);
	struct btree_root *r = bch2_btree_id_root(c, btree);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	bch2_btree_id_to_text(&buf, i);
	bch2_btree_id_to_text(&buf, btree);

	if (r->error) {
		bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);

		r->alive = false;
		r->error = 0;
		ret = bch2_btree_has_scanned_nodes(c, btree);
		if (ret < 0)
			goto err;

		if (!bch2_btree_has_scanned_nodes(c, i)) {
		if (!ret) {
			__fsck_err(trans,
				   FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0),
				   FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
				   btree_root_unreadable_and_scan_found_nothing,
				   "no nodes found for btree %s, continue?", buf.buf);
			bch2_btree_root_alloc_fake_trans(trans, i, 0);

			r->alive = false;
			r->error = 0;
			bch2_btree_root_alloc_fake_trans(trans, btree, 0);
		} else {
			bch2_btree_root_alloc_fake_trans(trans, i, 1);
			bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
			ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
			r->alive = false;
			r->error = 0;
			bch2_btree_root_alloc_fake_trans(trans, btree, 1);

			bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
			ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
			if (ret)
				goto err;
		}
+31 −43
Original line number Diff line number Diff line
@@ -557,7 +557,9 @@ static int __btree_err(int ret,
		       const char *fmt, ...)
{
	if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
		return bch_err_throw(c, fsck_fix);
		return ret == -BCH_ERR_btree_node_read_err_fixable
			? bch_err_throw(c, fsck_fix)
			: ret;

	bool have_retry = false;
	int ret2;
@@ -723,12 +725,11 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)

static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
			 struct btree *b, struct bset *i,
			 unsigned offset, unsigned sectors, int write,
			 unsigned offset, int write,
			 struct bch_io_failures *failed,
			 struct printbuf *err_msg)
{
	unsigned version = le16_to_cpu(i->version);
	unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret = 0;
@@ -778,15 +779,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
		     btree_node_unsupported_version,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

	if (!write &&
	    btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, ca, b, i, NULL,
			 bset_past_end_of_btree_node,
			 "bset past end of btree node (offset %u len %u but written %zu)",
			 offset, sectors, ptr_written ?: btree_sectors(c)))
		i->u64s = 0;

	btree_err_on(offset && !i->u64s,
		     -BCH_ERR_btree_node_read_err_fixable,
		     c, ca, b, i, NULL,
@@ -1151,6 +1143,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

		if (first) {
			sectors = vstruct_sectors(b->data, c->block_bits);
			if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
					 -BCH_ERR_btree_node_read_err_fixable,
					 c, ca, b, i, NULL,
					 bset_past_end_of_btree_node,
					 "bset past end of btree node (offset %u len %u but written %zu)",
					 b->written, sectors, ptr_written ?: btree_sectors(c)))
				i->u64s = 0;
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
				bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
@@ -1178,9 +1178,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
				     c, NULL, b, NULL, NULL,
				     btree_node_unsupported_version,
				     "btree node does not have NEW_EXTENT_OVERWRITE set");

			sectors = vstruct_sectors(b->data, c->block_bits);
		} else {
			sectors = vstruct_sectors(bne, c->block_bits);
			if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
					 -BCH_ERR_btree_node_read_err_fixable,
					 c, ca, b, i, NULL,
					 bset_past_end_of_btree_node,
					 "bset past end of btree node (offset %u len %u but written %zu)",
					 b->written, sectors, ptr_written ?: btree_sectors(c)))
				i->u64s = 0;
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
				bool csum_bad = bch2_crc_cmp(bne->csum, csum);
@@ -1201,14 +1207,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
						"decrypting btree node: %s", bch2_err_str(ret)))
					goto fsck_err;
			}

			sectors = vstruct_sectors(bne, c->block_bits);
		}

		b->version_ondisk = min(b->version_ondisk,
					le16_to_cpu(i->version));

		ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
		ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
		if (ret)
			goto fsck_err;

@@ -1982,28 +1986,12 @@ static void btree_node_scrub_work(struct work_struct *work)
	prt_newline(&err);

	if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
		struct btree_trans *trans = bch2_trans_get(c);

		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, scrub->btree,
					  scrub->key.k->k.p, 0, scrub->level - 1, 0);

		struct btree *b;
		int ret = lockrestart_do(trans,
			PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
		if (ret)
			goto err;

		if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
			bch_err(c, "error validating btree node during scrub on %s at btree %s",
				scrub->ca->name, err.buf);

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0);
		}
err:
		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_begin(trans);
		bch2_trans_put(trans);
		int ret = bch2_trans_do(c,
			bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
						    scrub->key.k, 0));
		if (!bch2_err_matches(ret, ENOENT) &&
		    !bch2_err_matches(ret, EROFS))
			bch_err_fn_ratelimited(c, ret);
	}

	printbuf_exit(&err);
@@ -2267,7 +2255,7 @@ static void btree_node_write_endio(struct bio *bio)
}

static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
				   struct bset *i)
{
	int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
				     (struct bkey_validate_context) {
@@ -2282,7 +2270,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
	}

	ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
		validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
@@ -2475,7 +2463,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
	    validate_bset_for_write(c, b, i))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
@@ -2492,7 +2480,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
	    validate_bset_for_write(c, b, i))
		goto err;

	/*
Loading