Commit ff9bce3d authored by Linus Torvalds
Browse files

Merge tag 'bcachefs-2024-05-30' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Assorted odds and ends...

   - two downgrade fixes

   - a couple snapshot deletion and repair fixes, thanks to noradtux for
     finding these and providing the image to debug them

   - a couple assert fixes

   - convert to folio helper, from Matthew

   - some improved error messages

   - bit of code reorganization (just moving things around); doing this
     while things are quiet so I'm not rebasing fixes past reorgs

   - don't return -EROFS on inconsistency error in recovery, this
     confuses util-linux and has it retry the mount

   - fix failure to return error on misaligned dio write; reported as an
     issue with coreutils shred"

* tag 'bcachefs-2024-05-30' of https://evilpiepirate.org/git/bcachefs: (21 commits)
  bcachefs: Fix failure to return error on misaligned dio write
  bcachefs: Don't return -EROFS from mount on inconsistency error
  bcachefs: Fix uninitialized var warning
  bcachefs: Split out sb-errors_format.h
  bcachefs: Split out journal_seq_blacklist_format.h
  bcachefs: Split out replicas_format.h
  bcachefs: Split out disk_groups_format.h
  bcachefs: split out sb-downgrade_format.h
  bcachefs: split out sb-members_format.h
  bcachefs: Better fsck error message for key version
  bcachefs: btree_gc can now handle unknown btrees
  bcachefs: add missing MODULE_DESCRIPTION()
  bcachefs: Fix setting of downgrade recovery passes/errors
  bcachefs: Run check_key_has_snapshot in snapshot_delete_keys()
  bcachefs: Refactor delete_dead_snapshots()
  bcachefs: Fix locking assert
  bcachefs: Fix lookup_first_inode() when inode_generations are present
  bcachefs: Plumb bkey into __btree_err()
  bcachefs: Use copy_folio_from_iter_atomic()
  bcachefs: Fix sb-downgrade validation
  ...
parents d8ec1985 7b038b56
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -690,7 +690,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,

	ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		struct bpos bucket_pos;
		struct bpos bucket_pos = POS_MIN;
		struct bch_backpointer bp;

		if (p.ptr.cached)
+1 −43
Original line number Diff line number Diff line
@@ -457,6 +457,7 @@ enum bch_time_stats {
};

#include "alloc_types.h"
#include "btree_gc_types.h"
#include "btree_types.h"
#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
@@ -488,49 +489,6 @@ enum bch_time_stats {

struct btree;

/*
 * Stage of a gc pass.
 *
 * gc_pos_cmp() compares the phase before anything else, so the order of the
 * enumerators here is the order in which gc positions sort.
 */
enum gc_phase {
	/* gc is not in progress */
	GC_PHASE_NOT_RUNNING,
	/* set by bch2_check_allocations() before superblocks are marked */
	GC_PHASE_START,
	/* set by bch2_mark_superblocks() */
	GC_PHASE_SB,

	/*
	 * One entry per btree: btree_id_to_gc_phase() maps BTREE_ID_<name> to
	 * GC_PHASE_BTREE_<name> by expanding BCH_BTREE_IDS(), so the names
	 * here must match the names in that list.
	 */
	GC_PHASE_BTREE_stripes,
	GC_PHASE_BTREE_extents,
	GC_PHASE_BTREE_inodes,
	GC_PHASE_BTREE_dirents,
	GC_PHASE_BTREE_xattrs,
	GC_PHASE_BTREE_alloc,
	GC_PHASE_BTREE_quotas,
	GC_PHASE_BTREE_reflink,
	GC_PHASE_BTREE_subvolumes,
	GC_PHASE_BTREE_snapshots,
	GC_PHASE_BTREE_lru,
	GC_PHASE_BTREE_freespace,
	GC_PHASE_BTREE_need_discard,
	GC_PHASE_BTREE_backpointers,
	GC_PHASE_BTREE_bucket_gens,
	GC_PHASE_BTREE_snapshot_trees,
	GC_PHASE_BTREE_deleted_inodes,
	GC_PHASE_BTREE_logged_ops,
	GC_PHASE_BTREE_rebalance_work,
	GC_PHASE_BTREE_subvolume_children,

	GC_PHASE_PENDING_DELETE,
};

/*
 * A position within a gc pass.
 *
 * gc_pos_cmp() compares @phase first, compares @level in reverse (the
 * comparison result is negated), and uses @pos as the final tie-break.
 */
struct gc_pos {
	enum gc_phase		phase;
	u16			level;
	struct bpos		pos;
};

/*
 * One entry of the reflink gc table: an offset, a size and a reference count.
 *
 * NOTE(review): the units of @offset and @size are not visible here -
 * presumably sectors; confirm against the users of reflink_gc_table.
 */
struct reflink_gc {
	u64		offset;
	u32		size;
	u32		refcount;
};

/* Table of struct reflink_gc entries, declared with GENRADIX() */
typedef GENRADIX(struct reflink_gc) reflink_gc_table;

/*
 * Sector counters, one per data type (second index, BCH_DATA_NR entries).
 *
 * NOTE(review): the meaning of the first index (size 2) is not visible here -
 * presumably read vs. write; confirm against the code that updates it.
 */
struct io_count {
	u64			sectors[2][BCH_DATA_NR];
};
+10 −185
Original line number Diff line number Diff line
@@ -503,16 +503,22 @@ struct bch_sb_field {

#include "alloc_background_format.h"
#include "extents_format.h"
#include "reflink_format.h"
#include "ec_format.h"
#include "inode_format.h"
#include "dirent_format.h"
#include "xattr_format.h"
#include "quota_format.h"
#include "disk_groups_format.h"
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
#include "sb-downgrade_format.h"
#include "sb-errors_format.h"
#include "sb-members_format.h"
#include "xattr_format.h"

enum bch_sb_field_type {
#define x(f, nr)	BCH_SB_FIELD_##f = nr,
@@ -545,107 +551,6 @@ struct bch_sb_field_journal_v2 {
	}			d[];
};

/* BCH_SB_FIELD_members_v1: */

#define BCH_MIN_NR_NBUCKETS	(1 << 6)

/*
 * X-macro list of IOPS measurement kinds as (name, value) pairs; expand it
 * with a local definition of x().
 */
#define BCH_IOPS_MEASUREMENTS()			\
	x(seqread,	0)			\
	x(seqwrite,	1)			\
	x(randread,	2)			\
	x(randwrite,	3)

/*
 * BCH_IOPS_<name> constants generated from BCH_IOPS_MEASUREMENTS().
 * BCH_IOPS_NR follows the last entry (value 3), so it evaluates to 4.
 */
enum bch_iops_measurement {
#define x(t, n) BCH_IOPS_##t = n,
	BCH_IOPS_MEASUREMENTS()
#undef x
	BCH_IOPS_NR
};

/*
 * X-macro list of per-member error kinds as (name, value) pairs; expand it
 * with a local definition of x().
 */
#define BCH_MEMBER_ERROR_TYPES()		\
	x(read,		0)			\
	x(write,	1)			\
	x(checksum,	2)

/*
 * BCH_MEMBER_ERROR_<name> constants generated from BCH_MEMBER_ERROR_TYPES().
 * BCH_MEMBER_ERROR_NR follows the last entry and sizes the errors[] and
 * errors_at_reset[] arrays in struct bch_member.
 */
enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
	BCH_MEMBER_ERROR_TYPES()
#undef x
	BCH_MEMBER_ERROR_NR
};

/*
 * One member (device) entry of the superblock members field; these are the
 * elements of _members[] in struct bch_sb_field_members_v1 and
 * struct bch_sb_field_members_v2.
 *
 * Multi-byte fields are stored little-endian (__le16/__le32/__le64).
 */
struct bch_member {
	__uuid_t		uuid;
	__le64			nbuckets;	/* device size */
	__le16			first_bucket;   /* index of first bucket used */
	__le16			bucket_size;	/* sectors */
	__u8			btree_bitmap_shift;
	__u8			pad[3];
	__le64			last_mount;	/* time_t */

	/* packed bitfields, see the BCH_MEMBER_* LE64_BITMASK() accessors */
	__le64			flags;
	/*
	 * NOTE(review): presumably indexed by enum bch_iops_measurement
	 * (BCH_IOPS_NR is 4) - confirm.
	 */
	__le32			iops[4];
	/* one slot per enum bch_member_error_type */
	__le64			errors[BCH_MEMBER_ERROR_NR];
	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
	__le64			errors_reset_time;
	__le64			seq;
	__le64			btree_allocated_bitmap;
	/*
	 * On recovery from a clean shutdown we don't normally read the journal,
	 * but we still want to resume writing from where we left off so we
	 * don't overwrite more than is necessary, for list journal debugging:
	 */
	__le32			last_journal_bucket;
	__le32			last_journal_bucket_offset;
};

/*
 * This limit comes from the bucket_gens array - it's a single allocation, and
 * kernel allocations are limited to INT_MAX
 */
#define BCH_MEMBER_NBUCKETS_MAX	(INT_MAX - 64)

#define BCH_MEMBER_V1_BYTES	56

/*
 * Bitfields packed into struct bch_member::flags.
 *
 * NOTE(review): the two trailing numbers look like a half-open bit range
 * [start, end) - the ranges below tile without overlapping, and bits 4-14 are
 * described as unused between a field ending at 4 and one starting at 14 -
 * confirm against the LE64_BITMASK() definition.
 */
LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags,  0,  4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags, 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags, 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY,	struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
					struct bch_member, flags, 30, 31)

/*
 * Disabled: these refer to flags[1], but struct bch_member::flags is a single
 * __le64 rather than an array.
 */
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0,  20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif

/*
 * X-macro list of member states as (name, value) pairs; expand it with a
 * local definition of x().
 */
#define BCH_MEMBER_STATES()			\
	x(rw,		0)			\
	x(ro,		1)			\
	x(failed,	2)			\
	x(spare,	3)

/*
 * BCH_MEMBER_STATE_<name> constants generated from BCH_MEMBER_STATES().
 * BCH_MEMBER_STATE_NR follows the last entry (value 3), so it evaluates to 4.
 */
enum bch_member_state {
#define x(t, n) BCH_MEMBER_STATE_##t = n,
	BCH_MEMBER_STATES()
#undef x
	BCH_MEMBER_STATE_NR
};

/*
 * v1 members field: the field header followed directly by member entries.
 *
 * NOTE(review): per the comment on _members, the on-disk entry size need not
 * equal sizeof(struct bch_member), so plain array indexing is presumably
 * unsafe - confirm how callers step through the entries (BCH_MEMBER_V1_BYTES
 * looks related).
 */
struct bch_sb_field_members_v1 {
	struct bch_sb_field	field;
	struct bch_member	_members[]; //Members are now variable size
};

/*
 * v2 members field: records the size of each member entry in member_bytes.
 * member_bytes and pad together occupy 8 bytes between the field header and
 * the first entry.
 *
 * NOTE(review): entries are presumably spaced member_bytes apart rather than
 * sizeof(struct bch_member) - confirm against the accessor code.
 */
struct bch_sb_field_members_v2 {
	struct bch_sb_field	field;
	__le16			member_bytes; //size of single member entry
	u8			pad[6];
	struct bch_member	_members[];
};

/* BCH_SB_FIELD_crypt: */

struct nonce {
@@ -694,8 +599,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);

/* BCH_SB_FIELD_replicas: */

#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
@@ -738,50 +641,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
	}
}

/*
 * v0 replicas entry: a data type, a device count, and a trailing array with
 * one byte per device. Packed, so the entry is 2 + nr_devs bytes long.
 */
struct bch_replicas_entry_v0 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			devs[];
} __packed;

/*
 * Superblock field holding v0 replicas entries after the field header.
 *
 * Each entry ends in a flexible devs[] array, so entries differ in length;
 * NOTE(review): they are presumably walked with replicas_entry_bytes() rather
 * than indexed - confirm.
 */
struct bch_sb_field_replicas_v0 {
	struct bch_sb_field	field;
	struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);

/*
 * v1 replicas entry: like struct bch_replicas_entry_v0 with an added
 * nr_required byte ahead of the device array. Packed, so the entry is
 * 3 + nr_devs bytes long.
 */
struct bch_replicas_entry_v1 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			nr_required;
	__u8			devs[];
} __packed;

/*
 * Size in bytes of the replicas entry @_i points to: the fixed part up to
 * devs[], plus one byte per device. Uses typeof(), so it works for any entry
 * type with devs and nr_devs members. @_i is evaluated more than once.
 */
#define replicas_entry_bytes(_i)					\
	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)

/*
 * Superblock field holding v1 replicas entries after the field header.
 *
 * Each entry ends in a flexible devs[] array, so entries differ in length;
 * NOTE(review): they are presumably walked with replicas_entry_bytes() rather
 * than indexed - confirm.
 */
struct bch_sb_field_replicas {
	struct bch_sb_field	field;
	struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);

/* BCH_SB_FIELD_disk_groups: */

#define BCH_SB_LABEL_SIZE		32

/*
 * One disk group: a label of BCH_SB_LABEL_SIZE bytes plus two little-endian
 * 64-bit flag words.
 *
 * NOTE(review): whether the label is NUL-terminated when it fills all
 * BCH_SB_LABEL_SIZE bytes is not visible here - confirm before treating it as
 * a C string.
 */
struct bch_disk_group {
	__u8			label[BCH_SB_LABEL_SIZE];
	__le64			flags[2];
} __packed __aligned(8);

/* Bitfields in flags[0]; no accessor is defined here for flags[1] */
LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)

/*
 * Superblock field holding an array of fixed-size struct bch_disk_group
 * entries after the field header.
 */
struct bch_sb_field_disk_groups {
	struct bch_sb_field	field;
	struct bch_disk_group	entries[];
} __packed __aligned(8);

/*
 * On clean shutdown, store btree roots and current journal sequence number in
 * the superblock:
@@ -809,27 +668,6 @@ struct bch_sb_field_clean {
	__u64			_data[];
};

/*
 * One blacklisted range of journal sequence numbers, stored little-endian.
 *
 * NOTE(review): whether @end is inclusive or exclusive is not visible here -
 * confirm against the lookup code.
 */
struct journal_seq_blacklist_entry {
	__le64			start;
	__le64			end;
};

/*
 * Superblock field holding the journal sequence blacklist entries.
 *
 * Note that the entry array itself is named start[], the same name as the
 * start member of each struct journal_seq_blacklist_entry.
 */
struct bch_sb_field_journal_seq_blacklist {
	struct bch_sb_field	field;
	struct journal_seq_blacklist_entry start[];
};

/*
 * Superblock field holding an array of error entries. Each entry pairs a
 * packed word @v (split by the BCH_SB_ERROR_ENTRY_* accessors below) with a
 * last_error_time value.
 */
struct bch_sb_field_errors {
	struct bch_sb_field	field;
	struct bch_sb_field_error_entry {
		__le64		v;
		__le64		last_error_time;
	}			entries[];
};

/*
 * Bitfields of bch_sb_field_error_entry::v: an ID and an NR field that
 * together cover the whole 64-bit word.
 *
 * NOTE(review): the ranges look half-open ([0, 16) and [16, 64)) - confirm
 * against the LE64_BITMASK() definition.
 */
LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID,	struct bch_sb_field_error_entry, v,  0, 16);
LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR,	struct bch_sb_field_error_entry, v, 16, 64);

struct bch_sb_field_ext {
	struct bch_sb_field	field;
	__le64			recovery_passes_required[2];
@@ -837,18 +675,6 @@ struct bch_sb_field_ext {
	__le64			btrees_lost_data;
};

/*
 * One downgrade table entry: a version, two 64-bit recovery_passes words, and
 * a trailing array of nr_errors 16-bit error values (errors[] is annotated
 * __counted_by(nr_errors)).
 *
 * Packed with 2-byte alignment, so recovery_passes is not naturally aligned
 * and entries differ in length depending on nr_errors.
 *
 * NOTE(review): recovery_passes is presumably a bitmask of recovery passes and
 * errors[] a list of error IDs - confirm against sb-downgrade.c.
 */
struct bch_sb_field_downgrade_entry {
	__le16			version;
	__le64			recovery_passes[2];
	__le16			nr_errors;
	__le16			errors[] __counted_by(nr_errors);
} __packed __aligned(2);

/*
 * Superblock field holding downgrade entries after the field header.
 *
 * Each entry ends in a flexible errors[] array, so entries differ in length;
 * NOTE(review): they presumably have to be walked by computed size rather than
 * indexed - confirm.
 */
struct bch_sb_field_downgrade {
	struct bch_sb_field	field;
	struct bch_sb_field_downgrade_entry entries[];
};

/* Superblock: */

/*
@@ -909,7 +735,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)

#define BCH_SB_SECTOR			8
#define BCH_SB_MEMBERS_MAX		64 /* XXX kill */

#define BCH_SB_LAYOUT_SIZE_BITS_MAX	16 /* 32 MB */

+9 −9
Original line number Diff line number Diff line
@@ -585,16 +585,17 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,

		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
				bkey_version_in_future,
				"key version number higher than recorded: %llu > %llu",
				k.k->version.lo,
				atomic64_read(&c->key_version)))
				"key version number higher than recorded %llu\n  %s",
				atomic64_read(&c->key_version),
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			atomic64_set(&c->key_version, k.k->version.lo);
	}

	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
				c, btree_bitmap_not_marked,
				"btree ptr not marked in member info btree allocated bitmap\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k),
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
		mutex_lock(&c->sb_lock);
		bch2_dev_btree_bitmap_mark(c, k);
@@ -673,8 +674,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return  (int) btree_id_to_gc_phase(l) -
		(int) btree_id_to_gc_phase(r);
	return cmp_int(gc_btree_order(l), gc_btree_order(r));
}

static int bch2_gc_btrees(struct bch_fs *c)
@@ -711,7 +711,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
static int bch2_mark_superblocks(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));
	gc_pos_set(c, gc_phase(GC_PHASE_sb));

	int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
	mutex_unlock(&c->sb_lock);
@@ -1209,7 +1209,7 @@ int bch2_check_allocations(struct bch_fs *c)
	if (ret)
		goto out;

	gc_pos_set(c, gc_phase(GC_PHASE_START));
	gc_pos_set(c, gc_phase(GC_PHASE_start));

	ret = bch2_mark_superblocks(c);
	BUG_ON(ret);
@@ -1231,7 +1231,7 @@ int bch2_check_allocations(struct bch_fs *c)

	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
	__gc_pos_set(c, gc_phase(GC_PHASE_not_running));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);
+20 −24
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
#define _BCACHEFS_BTREE_GC_H

#include "bkey.h"
#include "btree_gc_types.h"
#include "btree_types.h"

int bch2_check_topology(struct bch_fs *);
@@ -32,36 +33,15 @@ int bch2_check_allocations(struct bch_fs *);
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
	return (struct gc_pos) {
		.phase	= phase,
		.level	= 0,
		.pos	= POS_MIN,
	};
}

static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
	return   cmp_int(l.phase, r.phase) ?:
		-cmp_int(l.level, r.level) ?:
		 bpos_cmp(l.pos, r.pos);
}

static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
{
	switch (id) {
#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
	BCH_BTREE_IDS()
#undef x
	default:
		BUG();
	}
	return (struct gc_pos) { .phase	= phase, };
}

static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
					 struct bpos pos)
{
	return (struct gc_pos) {
		.phase	= btree_id_to_gc_phase(btree),
		.phase	= GC_PHASE_btree,
		.btree	= btree,
		.level	= level,
		.pos	= pos,
	};
@@ -76,6 +56,22 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
	return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
}

/*
 * Sort key for ordering btrees during gc.
 *
 * BTREE_ID_stripes maps to -1, which is lower than any non-negative btree id;
 * every other btree maps to its own id.
 */
static inline int gc_btree_order(enum btree_id btree)
{
	return btree == BTREE_ID_stripes ? -1 : (int) btree;
}

/*
 * Three-way comparison of two gc positions.
 *
 * Keys are compared in order, stopping at the first difference: phase, then
 * btree (via gc_btree_order()), then level, then pos. Returns the nonzero
 * result of the first key that differs, or the result of bpos_cmp() on the
 * positions if all earlier keys are equal.
 */
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
	int ret;

	ret = cmp_int(l.phase, r.phase);
	if (ret)
		return ret;

	ret = cmp_int(gc_btree_order(l.btree),
		      gc_btree_order(r.btree));
	if (ret)
		return ret;

	/* Negated, so the position with the higher level compares lower */
	ret = -cmp_int(l.level, r.level);
	if (ret)
		return ret;

	return bpos_cmp(l.pos, r.pos);
}

static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
	unsigned seq;
Loading