Commit ec25bd8d authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs repair code from Kent Overstreet:
 "A couple more small fixes, and new repair code.

  We can now automatically recover from arbitrary corrupted interior
  btree nodes by scanning, and we can reconstruct metadata as needed to
  bring a filesystem back into a working, consistent, read-write state
  and preserve access to whatever wasn't corrupted.

  Meaning - you can blow away all metadata except for extents and
  dirents leaf nodes, and repair will reconstruct everything else and
  give you your data, and under the correct paths. If inodes are missing
  i_size will be slightly off and permissions/ownership/timestamps will
  be gone, and we do still need the snapshots btree if snapshots were in
  use - in the future we'll be able to guess the snapshot tree structure
  in some situations.

  IOW - aside from shaking out remaining bugs (fuzz testing is still
  coming), repair code should be complete and if repair ever doesn't
  work that's the highest priority bug that I want to know about
  immediately.

  This patchset was kindly tested by a user from India who accidentally
  wiped one drive out of a three drive filesystem with no replication on
  the family computer - it took a couple weeks but we got everything
  important back"

* tag 'bcachefs-2024-04-03' of https://evilpiepirate.org/git/bcachefs:
  bcachefs: reconstruct_inode()
  bcachefs: Subvolume reconstruction
  bcachefs: Check for extents that point to same space
  bcachefs: Reconstruct missing snapshot nodes
  bcachefs: Flag btrees with missing data
  bcachefs: Topology repair now uses nodes found by scanning to fill holes
  bcachefs: Repair pass for scanning for btree nodes
  bcachefs: Don't skip fake btree roots in fsck
  bcachefs: bch2_btree_root_alloc() -> bch2_btree_root_alloc_fake()
  bcachefs: Etyzinger cleanups
  bcachefs: bch2_shoot_down_journal_keys()
  bcachefs: Clear recovery_passes_required as they complete without errors
  bcachefs: ratelimit informational fsck errors
  bcachefs: Check for bad needs_discard before doing discard
  bcachefs: Improve bch2_btree_update_to_text()
  mean_and_variance: Drop always failing tests
  bcachefs: fix nocow lock deadlock
  bcachefs: BCH_WATERMARK_interior_updates
  bcachefs: Fix btree node reserve
parents c85af715 09d4c2ac
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ bcachefs-y := \
	btree_journal_iter.o	\
	btree_key_cache.o	\
	btree_locking.o		\
	btree_node_scan.o	\
	btree_trans_commit.o	\
	btree_update.o		\
	btree_update_interior.o	\
@@ -37,6 +38,7 @@ bcachefs-y := \
	error.o			\
	extents.o		\
	extent_update.o		\
	eytzinger.o		\
	fs.o			\
	fs-common.o		\
	fs-ioctl.o		\
+26 −21
Original line number Diff line number Diff line
@@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	if (ret)
		goto out;

	if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
	if (a->v.dirty_sectors) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "attempting to discard bucket with dirty data\n%s",
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.data_type != BCH_DATA_need_discard) {
		if (data_type_is_empty(a->v.data_type) &&
		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
			a->v.gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
			goto write;
		}

	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
			bch2_trans_inconsistent(trans,
				"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "bucket incorrectly set in need_discard btree\n"
					       "%s",
				a->v.journal_seq,
				c->journal.flushed_seq_ondisk,
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		}
		goto out;
	}

	if (a->v.data_type != BCH_DATA_need_discard) {
		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
			bch2_trans_inconsistent(trans,
				"bucket incorrectly set in need_discard btree\n"
				"%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
					       a->v.journal_seq,
					       c->journal.flushed_seq_ondisk,
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		}

		goto out;
	}

@@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo
	if (ret)
		goto err;

	BUG_ON(a->v.dirty_sectors);
	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	a->v.data_type = alloc_data_type(a->v, a->v.data_type);

@@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
		goto out;

	BUG_ON(a->v.data_type != BCH_DATA_cached);
	BUG_ON(a->v.dirty_sectors);

	if (!a->v.cached_sectors)
		bch_err(c, "invalidating empty bucket, confused");
+3 −1
Original line number Diff line number Diff line
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
	switch (watermark) {
	case BCH_WATERMARK_reclaim:
	case BCH_WATERMARK_interior_updates:
		return 0;
	case BCH_WATERMARK_reclaim:
		return OPEN_BUCKETS_COUNT / 6;
	case BCH_WATERMARK_btree:
	case BCH_WATERMARK_btree_copygc:
		return OPEN_BUCKETS_COUNT / 4;
+2 −1
Original line number Diff line number Diff line
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
	x(copygc)			\
	x(btree)			\
	x(btree_copygc)			\
	x(reclaim)
	x(reclaim)			\
	x(interior_updates)

enum bch_watermark {
#define x(name)	BCH_WATERMARK_##name,
+166 −7
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "checksum.h"
#include "error.h"

#include <linux/mm.h>
@@ -418,6 +419,84 @@ struct extents_to_bp_state {
	struct bkey_buf last_flushed;
};

/*
 * Rewrite @extent without its pointer(s) to device @dev.
 *
 * A mutable copy of @extent is obtained from the transaction, the copy is
 * passed through bch2_bkey_drop_device() for @dev, and the result is queued
 * for insertion into @btree via bch2_btree_insert_trans().
 *
 * Returns 0 on success, or the error from obtaining the mutable copy or from
 * the insert.
 */
static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
			       struct bkey_s_c extent, unsigned dev)
{
	struct bkey_i *mut = bch2_bkey_make_mut_noupdate(trans, extent);
	int err = PTR_ERR_OR_ZERO(mut);

	/* Only touch the copy if we actually got one */
	if (!err) {
		bch2_bkey_drop_device(bkey_i_to_s(mut), dev);
		err = bch2_btree_insert_trans(trans, btree, mut, 0);
	}

	return err;
}

static int check_extent_checksum(struct btree_trans *trans,
				 enum btree_id btree, struct bkey_s_c extent,
				 enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct printbuf buf = PRINTBUF;
	void *data_buf = NULL;
	struct bio *bio = NULL;
	size_t bytes;
	int ret = 0;

	if (bkey_is_btree_ptr(extent.k))
		return false;

	bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
		if (p.ptr.dev == dev)
			goto found;
	BUG();
found:
	if (!p.crc.csum_type)
		return false;

	bytes = p.crc.compressed_size << 9;

	struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
	if (!bch2_dev_get_ioref(ca, READ))
		return false;

	data_buf = kvmalloc(bytes, GFP_KERNEL);
	if (!data_buf) {
		ret = -ENOMEM;
		goto err;
	}

	bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = p.ptr.offset;
	bch2_bio_map(bio, data_buf, bytes);
	ret = submit_bio_wait(bio);
	if (ret)
		goto err;

	prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
	prt_printf(&buf, "\n  %s ", bch2_btree_id_str(btree));
	bch2_bkey_val_to_text(&buf, c, extent);
	prt_printf(&buf, "\n  %s ", bch2_btree_id_str(o_btree));
	bch2_bkey_val_to_text(&buf, c, extent2);

	struct nonce nonce = extent_nonce(extent.k->version, p.crc);
	struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
	if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
			c, dup_backpointer_to_bad_csum_extent,
			"%s", buf.buf))
		ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
fsck_err:
err:
	if (bio)
		bio_put(bio);
	kvfree(data_buf);
	percpu_ref_put(&ca->io_ref);
	printbuf_exit(&buf);
	return ret;
}

static int check_bp_exists(struct btree_trans *trans,
			   struct extents_to_bp_state *s,
			   struct bpos bucket,
@@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans,
			   struct bkey_s_c orig_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter bp_iter = { NULL };
	struct btree_iter bp_iter = {};
	struct btree_iter other_extent_iter = {};
	struct printbuf buf = PRINTBUF;
	struct bkey_s_c bp_k;
	struct bkey_buf tmp;
@@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans,

	bch2_bkey_buf_init(&tmp);

	if (!bch2_dev_bucket_exists(c, bucket)) {
		prt_str(&buf, "extent for nonexistent device:bucket ");
		bch2_bpos_to_text(&buf, bucket);
		prt_str(&buf, "\n  ");
		bch2_bkey_val_to_text(&buf, c, orig_k);
		bch_err(c, "%s", buf.buf);
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	if (bpos_lt(bucket, s->bucket_start) ||
	    bpos_gt(bucket, s->bucket_end))
		return 0;

	if (!bch2_dev_bucket_exists(c, bucket))
		goto missing;

	bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
				  bucket_pos_to_bp(c, bucket, bp.bucket_offset),
				  0);
@@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans,
			ret = -BCH_ERR_transaction_restart_write_buffer_flush;
			goto out;
		}
		goto missing;

		goto check_existing_bp;
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &other_extent_iter);
	bch2_trans_iter_exit(trans, &bp_iter);
	bch2_bkey_buf_exit(&tmp, c);
	printbuf_exit(&buf);
	return ret;
check_existing_bp:
	/* Do we have a backpointer for a different extent? */
	if (bp_k.k->type != KEY_TYPE_backpointer)
		goto missing;

	struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;

	struct bkey_s_c other_extent =
		bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
	ret = bkey_err(other_extent);
	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
		ret = 0;
	if (ret)
		goto err;

	if (!other_extent.k)
		goto missing;

	if (bch2_extents_match(orig_k, other_extent)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n  ");
		bch2_bkey_val_to_text(&buf, c, orig_k);
		prt_str(&buf, "\n  ");
		bch2_bkey_val_to_text(&buf, c, other_extent);
		bch_err(c, "%s", buf.buf);

		if (other_extent.k->size <= orig_k.k->size) {
			ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
			if (ret)
				goto err;
			goto out;
		} else {
			ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
			if (ret)
				goto err;
			goto missing;
		}
	}

	ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
	if (ret < 0)
		goto err;
	if (ret) {
		ret = 0;
		goto missing;
	}

	ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
	if (ret < 0)
		goto err;
	if (ret) {
		ret = 0;
		goto out;
	}

	printbuf_reset(&buf);
	prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n  ", bucket.inode);
	bch2_bkey_val_to_text(&buf, c, orig_k);
	prt_str(&buf, "\n  ");
	bch2_bkey_val_to_text(&buf, c, other_extent);
	bch_err(c, "%s", buf.buf);
	ret = -BCH_ERR_fsck_repair_unimplemented;
	goto err;
missing:
	printbuf_reset(&buf);
	prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
	       bch2_btree_id_str(bp.btree_id), bp.level);
	bch2_bkey_val_to_text(&buf, c, orig_k);
	prt_printf(&buf, "\nbp pos ");
	bch2_bpos_to_text(&buf, bp_iter.pos);
	prt_printf(&buf, "\n  got:   ");
	bch2_bkey_val_to_text(&buf, c, bp_k);

	struct bkey_i_backpointer n_bp_k;
	bkey_backpointer_init(&n_bp_k.k_i);
	n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
	n_bp_k.v = bp;
	prt_printf(&buf, "\n  want:  ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));

	if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
		ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
Loading