Commit 35a4474b authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'bcachefs-2024-01-21' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs updates from Kent Overstreet:
 "Some fixes, Some refactoring, some minor features:

   - Assorted prep work for disk space accounting rewrite

   - BTREE_TRIGGER_ATOMIC: after combining our trigger callbacks, this
     makes our trigger context more explicit

   - A few fixes to avoid excessive transaction restarts on
     multithreaded workloads: fstests (in addition to ktest tests) are
     now checking slowpath counters, and that's shaking out a few bugs

   - Assorted tracepoint improvements

   - Starting to break up bcachefs_format.h and move on disk types so
     they're with the code they belong to; this will make room to start
     documenting the on disk format better.

   - A few minor fixes"

* tag 'bcachefs-2024-01-21' of https://evilpiepirate.org/git/bcachefs: (46 commits)
  bcachefs: Improve inode_to_text()
  bcachefs: logged_ops_format.h
  bcachefs: reflink_format.h
  bcachefs; extents_format.h
  bcachefs: ec_format.h
  bcachefs: subvolume_format.h
  bcachefs: snapshot_format.h
  bcachefs: alloc_background_format.h
  bcachefs: xattr_format.h
  bcachefs: dirent_format.h
  bcachefs: inode_format.h
  bcachefs; quota_format.h
  bcachefs: sb-counters_format.h
  bcachefs: counters.c -> sb-counters.c
  bcachefs: comment bch_subvolume
  bcachefs: bch_snapshot::btime
  bcachefs: add missing __GFP_NOWARN
  bcachefs: opts->compression can now also be applied in the background
  bcachefs: Prep work for variable size btree node buffers
  bcachefs: grab s_umount only if snapshotting
  ...
parents 4fbbed78 249f441f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,6 @@ bcachefs-y := \
	checksum.o		\
	clock.o			\
	compress.o		\
	counters.o		\
	darray.o		\
	debug.o			\
	dirent.o		\
@@ -71,6 +70,7 @@ bcachefs-y := \
	reflink.o		\
	replicas.o		\
	sb-clean.o		\
	sb-counters.o		\
	sb-downgrade.o		\
	sb-errors.o		\
	sb-members.o		\
+45 −44
Original line number Diff line number Diff line
@@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
		bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
				 c, err, alloc_key_dirty_sectors_0,
				 "data_type %s but dirty_sectors==0",
				 bch2_data_types[a.v->data_type]);
				 bch2_data_type_str(a.v->data_type));
		break;
	case BCH_DATA_cached:
		bkey_fsck_err_on(!a.v->cached_sectors ||
@@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
	unsigned i;

	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "gen %u oldest_gen %u data_type %s",
	       a->gen, a->oldest_gen,
	       a->data_type < BCH_DATA_NR
	       ? bch2_data_types[a->data_type]
	       : "(invalid data type)");
	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
	bch2_prt_data_type(out, a->data_type);
	prt_newline(out);
	prt_printf(out, "journal_seq       %llu",	a->journal_seq);
	prt_newline(out);
@@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
	prt_printf(out, "fragmentation     %llu",	a->fragmentation_lru);
	prt_newline(out);
	prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
	prt_newline(out);

	if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
		struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
		const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);

		prt_printf(out, "backpointers:     %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
		printbuf_indent_add(out, 2);

		for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
			prt_newline(out);
			bch2_backpointer_to_text(out, &bps[i]);
		}

		printbuf_indent_sub(out, 2);
	}

	printbuf_indent_sub(out, 2);
}

@@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
		}
	}

	if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
	if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
		u64 journal_seq = trans->journal_res.seq;
		u64 bucket_journal_seq = new_a->journal_seq;
@@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
	return ret;
}

struct discard_buckets_state {
	u64		seen;
	u64		open;
	u64		need_journal_commit;
	u64		discarded;
	struct bch_dev	*ca;
	u64		need_journal_commit_this_dev;
};

static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
{
	if (s->ca == ca)
		return;

	if (s->ca && s->need_journal_commit_this_dev >
	    bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
		bch2_journal_flush_async(&c->journal, NULL);

	if (s->ca)
		percpu_ref_put(&s->ca->ref);
	if (ca)
		percpu_ref_get(&ca->ref);
	s->ca = ca;
	s->need_journal_commit_this_dev = 0;
}

static int bch2_discard_one_bucket(struct btree_trans *trans,
				   struct btree_iter *need_discard_iter,
				   struct bpos *discard_pos_done,
				   u64 *seen,
				   u64 *open,
				   u64 *need_journal_commit,
				   u64 *discarded)
				   struct discard_buckets_state *s)
{
	struct bch_fs *c = trans->c;
	struct bpos pos = need_discard_iter->pos;
@@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	int ret = 0;

	ca = bch_dev_bkey_exists(c, pos.inode);

	if (!percpu_ref_tryget(&ca->io_ref)) {
		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
		return 0;
	}

	discard_buckets_next_dev(c, s, ca);

	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
		(*open)++;
		s->open++;
		goto out;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk,
			pos.inode, pos.offset)) {
		(*need_journal_commit)++;
		s->need_journal_commit++;
		s->need_journal_commit_this_dev++;
		goto out;
	}

@@ -1732,9 +1738,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
		goto out;

	count_event(c, bucket_discard);
	(*discarded)++;
	s->discarded++;
out:
	(*seen)++;
	s->seen++;
	bch2_trans_iter_exit(trans, &iter);
	percpu_ref_put(&ca->io_ref);
	printbuf_exit(&buf);
@@ -1744,7 +1750,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
static void bch2_do_discards_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
	u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
	struct discard_buckets_state s = {};
	struct bpos discard_pos_done = POS_MAX;
	int ret;

@@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work)
	ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				   BTREE_ID_need_discard, POS_MIN, 0, k,
			bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
						&seen,
						&open,
						&need_journal_commit,
						&discarded)));
			bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));

	if (need_journal_commit * 2 > seen)
		bch2_journal_flush_async(&c->journal, NULL);
	discard_buckets_next_dev(c, &s, NULL);

	bch2_write_ref_put(c, BCH_WRITE_REF_discard);

	trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
			      bch2_err_str(ret));

	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

void bch2_do_discards(struct bch_fs *c)
+92 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H

struct bch_alloc {
	struct bch_val		v;
	__u8			fields;
	__u8			gen;
	__u8			data[];
} __packed __aligned(8);

#define BCH_ALLOC_FIELDS_V1()			\
	x(read_time,		16)		\
	x(write_time,		16)		\
	x(data_type,		8)		\
	x(dirty_sectors,	16)		\
	x(cached_sectors,	16)		\
	x(oldest_gen,		8)		\
	x(stripe,		32)		\
	x(stripe_redundancy,	8)

enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

struct bch_alloc_v2 {
	struct bch_val		v;
	__u8			nr_fields;
	__u8			gen;
	__u8			oldest_gen;
	__u8			data_type;
	__u8			data[];
} __packed __aligned(8);

#define BCH_ALLOC_FIELDS_V2()			\
	x(read_time,		64)		\
	x(write_time,		64)		\
	x(dirty_sectors,	32)		\
	x(cached_sectors,	32)		\
	x(stripe,		32)		\
	x(stripe_redundancy,	8)

struct bch_alloc_v3 {
	struct bch_val		v;
	__le64			journal_seq;
	__le32			flags;
	__u8			nr_fields;
	__u8			gen;
	__u8			oldest_gen;
	__u8			data_type;
	__u8			data[];
} __packed __aligned(8);

LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)

struct bch_alloc_v4 {
	struct bch_val		v;
	__u64			journal_seq;
	__u32			flags;
	__u8			gen;
	__u8			oldest_gen;
	__u8			data_type;
	__u8			stripe_redundancy;
	__u32			dirty_sectors;
	__u32			cached_sectors;
	__u64			io_time[2];
	__u32			stripe;
	__u32			nr_external_backpointers;
	__u64			fragmentation_lru;
} __packed __aligned(8);

#define BCH_ALLOC_V4_U64s_V0	6
#define BCH_ALLOC_V4_U64s	(sizeof(struct bch_alloc_v4) / sizeof(__u64))

BITMASK(BCH_ALLOC_V4_NEED_DISCARD,	struct bch_alloc_v4, flags,  0,  1)
BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,	struct bch_alloc_v4, flags,  1,  2)
BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,	struct bch_alloc_v4, flags,  8,  14)

#define KEY_TYPE_BUCKET_GENS_BITS	8
#define KEY_TYPE_BUCKET_GENS_NR		(1U << KEY_TYPE_BUCKET_GENS_BITS)
#define KEY_TYPE_BUCKET_GENS_MASK	(KEY_TYPE_BUCKET_GENS_NR - 1)

struct bch_bucket_gens {
	struct bch_val		v;
	u8			gens[KEY_TYPE_BUCKET_GENS_NR];
} __packed __aligned(8);

#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
+4 −3
Original line number Diff line number Diff line
@@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
	unsigned data_type = ob->data_type;
	barrier(); /* READ_ONCE() doesn't work on bitfields */

	prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
	prt_printf(out, "%zu ref %u ",
		   ob - c->open_buckets,
		   atomic_read(&ob->pin),
		   data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
		   atomic_read(&ob->pin));
	bch2_prt_data_type(out, data_type);
	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
		   ob->dev, ob->bucket, ob->gen,
		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
	if (ob->ec)
+49 −51
Original line number Diff line number Diff line
@@ -400,13 +400,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
	return ret;
}

static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
{
	return bpos_eq(l.k->p, r.k->p) &&
		bkey_bytes(l.k) == bkey_bytes(r.k) &&
		!memcmp(l.v, r.v, bkey_val_bytes(l.k));
}

struct extents_to_bp_state {
	struct bpos	bucket_start;
	struct bpos	bucket_end;
	struct bkey_buf last_flushed;
};

static int check_bp_exists(struct btree_trans *trans,
			   struct extents_to_bp_state *s,
			   struct bpos bucket,
			   struct bch_backpointer bp,
			   struct bkey_s_c orig_k,
			   struct bpos bucket_start,
			   struct bpos bucket_end,
			   struct bkey_buf *last_flushed)
			   struct bkey_s_c orig_k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter bp_iter = { NULL };
@@ -417,8 +428,8 @@ static int check_bp_exists(struct btree_trans *trans,

	bch2_bkey_buf_init(&tmp);

	if (bpos_lt(bucket, bucket_start) ||
	    bpos_gt(bucket, bucket_end))
	if (bpos_lt(bucket, s->bucket_start) ||
	    bpos_gt(bucket, s->bucket_end))
		return 0;

	if (!bch2_dev_bucket_exists(c, bucket))
@@ -433,11 +444,9 @@ static int check_bp_exists(struct btree_trans *trans,

	if (bp_k.k->type != KEY_TYPE_backpointer ||
	    memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
		if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) ||
		    bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) ||
		    memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) {
		bch2_bkey_buf_reassemble(&tmp, c, orig_k);

		if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
			if (bp.level) {
				bch2_trans_unlock(trans);
				bch2_btree_interior_updates_flush(c);
@@ -447,7 +456,7 @@ static int check_bp_exists(struct btree_trans *trans,
			if (ret)
				goto err;

			bch2_bkey_buf_copy(last_flushed, c, tmp.k);
			bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
			ret = -BCH_ERR_transaction_restart_write_buffer_flush;
			goto out;
		}
@@ -475,10 +484,8 @@ static int check_bp_exists(struct btree_trans *trans,
}

static int check_extent_to_backpointers(struct btree_trans *trans,
					struct extents_to_bp_state *s,
					enum btree_id btree, unsigned level,
					struct bpos bucket_start,
					struct bpos bucket_end,
					struct bkey_buf *last_flushed,
					struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
@@ -498,9 +505,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
		bch2_extent_ptr_to_bp(c, btree, level,
				      k, p, &bucket_pos, &bp);

		ret = check_bp_exists(trans, bucket_pos, bp, k,
				      bucket_start, bucket_end,
				      last_flushed);
		ret = check_bp_exists(trans, s, bucket_pos, bp, k);
		if (ret)
			return ret;
	}
@@ -509,10 +514,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}

static int check_btree_root_to_backpointers(struct btree_trans *trans,
					    struct extents_to_bp_state *s,
					    enum btree_id btree_id,
					    struct bpos bucket_start,
					    struct bpos bucket_end,
					    struct bkey_buf *last_flushed,
					    int *level)
{
	struct bch_fs *c = trans->c;
@@ -536,9 +539,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
	*level = b->c.level;

	k = bkey_i_to_s_c(&b->key);
	ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
				      bucket_start, bucket_end,
				      last_flushed, k);
	ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
@@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)

	si_meminfo(&i);
	mem_bytes = i.totalram * i.mem_unit;
	return div_u64(mem_bytes >> 1, btree_bytes(c));
	return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}

static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,43 +611,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}

static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
						   struct bpos bucket_start,
						   struct bpos bucket_end)
						   struct extents_to_bp_state *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	enum btree_id btree_id;
	struct bkey_s_c k;
	struct bkey_buf last_flushed;
	int ret = 0;

	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
	for (enum btree_id btree_id = 0;
	     btree_id < btree_id_nr_alive(c);
	     btree_id++) {
		int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;

		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
				check_btree_root_to_backpointers(trans, btree_id,
							bucket_start, bucket_end,
							&last_flushed, &level));
				check_btree_root_to_backpointers(trans, s, btree_id, &level));
		if (ret)
			return ret;

		while (level >= depth) {
			struct btree_iter iter;
			bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
						  level,
						  BTREE_ITER_PREFETCH);
			while (1) {
				bch2_trans_begin(trans);
				k = bch2_btree_iter_peek(&iter);

				struct bkey_s_c k = bch2_btree_iter_peek(&iter);
				if (!k.k)
					break;
				ret = bkey_err(k) ?:
					check_extent_to_backpointers(trans, btree_id, level,
								     bucket_start, bucket_end,
								     &last_flushed, k) ?:
					check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
					bch2_trans_commit(trans, NULL, NULL,
							  BCH_TRANS_COMMIT_no_enospc);
				if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
@@ -668,7 +661,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
		}
	}

	bch2_bkey_buf_exit(&last_flushed, c);
	return 0;
}

@@ -731,37 +723,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bpos start = POS_MIN, end;
	struct extents_to_bp_state s = { .bucket_start = POS_MIN };
	int ret;

	bch2_bkey_buf_init(&s.last_flushed);
	bkey_init(&s.last_flushed.k->k);

	while (1) {
		ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
		ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
		if (ret)
			break;

		if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
		if ( bpos_eq(s.bucket_start, POS_MIN) &&
		    !bpos_eq(s.bucket_end, SPOS_MAX))
			bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
				    __func__, btree_nodes_fit_in_ram(c));

		if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
		if (!bpos_eq(s.bucket_start, POS_MIN) ||
		    !bpos_eq(s.bucket_end, SPOS_MAX)) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "check_extents_to_backpointers(): ");
			bch2_bpos_to_text(&buf, start);
			bch2_bpos_to_text(&buf, s.bucket_start);
			prt_str(&buf, "-");
			bch2_bpos_to_text(&buf, end);
			bch2_bpos_to_text(&buf, s.bucket_end);

			bch_verbose(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
		if (ret || bpos_eq(end, SPOS_MAX))
		ret = bch2_check_extents_to_backpointers_pass(trans, &s);
		if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
			break;

		start = bpos_successor(end);
		s.bucket_start = bpos_successor(s.bucket_end);
	}
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&s.last_flushed, c);

	bch_err_fn(c, ret);
	return ret;
Loading