Commit 1d16c605 authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: Disk space accounting rewrite



Main part of the disk accounting rewrite.

This is a wholesale rewrite of the existing disk space accounting, which
relies on percepu counters that are sharded by journal buffer, and
rolled up and added to each journal write.

With the new scheme, every set of counters is a distinct key in the
accounting btree; this fixes scaling limitations of the old scheme,
where counters took up space in each journal entry and required multiple
percpu counters.

Now, in memory accounting requires a single set of percpu counters - not
multiple for each in flight journal buffer - and in the future we'll
probably also have counters that don't use in memory percpu counters,
they're not strictly required.

An accounting update is now a normal btree update, using the btree write
buffer path. At transaction commit time, we apply accounting updates to
the in memory counters, which are percpu counters indexed in an
eytzinger tree by the accounting key.

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 5d9667d1
Loading
Loading
Loading
Loading
+67 −12
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
@@ -760,6 +761,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
	return ret;
}

static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
						    enum bch_data_type data_type,
						    s64 delta_buckets,
						    s64 delta_sectors,
						    s64 delta_fragmented, unsigned flags)
{
	struct disk_accounting_pos acc = {
		.type = BCH_DISK_ACCOUNTING_dev_data_type,
		.dev_data_type.dev		= ca->dev_idx,
		.dev_data_type.data_type	= data_type,
	};
	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };

	return bch2_disk_accounting_mod(trans, &acc, d, 3);
}

int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
				   const struct bch_alloc_v4 *old,
				   const struct bch_alloc_v4 *new,
				   unsigned flags)
{
	s64 old_sectors = bch2_bucket_sectors(*old);
	s64 new_sectors = bch2_bucket_sectors(*new);
	if (old->data_type != new->data_type) {
		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
				 1,  new_sectors,  bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
			  bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
				-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
		if (ret)
			return ret;
	} else if (old_sectors != new_sectors) {
		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
					 0,
					 new_sectors - old_sectors,
					 bch2_bucket_sectors_fragmented(ca, *new) -
					 bch2_bucket_sectors_fragmented(ca, *old), flags);
		if (ret)
			return ret;
	}

	s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
	s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
	if (old_unstriped != new_unstriped) {
		int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
					 !!new_unstriped - !!old_unstriped,
					 new_unstriped - old_unstriped,
					 0,
					 flags);
		if (ret)
			return ret;
	}

	return 0;
}

int bch2_trigger_alloc(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
@@ -835,18 +891,17 @@ int bch2_trigger_alloc(struct btree_trans *trans,
				goto err;
		}

		/*
		 * need to know if we're getting called from the invalidate path or
		 * not:
		 */

		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
		    old_a->cached_sectors) {
			ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
			ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
					 -((s64) old_a->cached_sectors));
			if (ret)
				goto err;
		}

		ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
		if (ret)
			goto err;
	}

	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
@@ -886,19 +941,17 @@ int bch2_trigger_alloc(struct btree_trans *trans,
			}
		}

		percpu_down_read(&c->mark_lock);
		if (new_a->gen != old_a->gen) {
			rcu_read_lock();
			u8 *gen = bucket_gen(ca, new.k->p.offset);
			if (unlikely(!gen)) {
				percpu_up_read(&c->mark_lock);
				rcu_read_unlock();
				goto invalid_bucket;
			}
			*gen = new_a->gen;
			rcu_read_unlock();
		}

		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
		percpu_up_read(&c->mark_lock);

#define eval_state(_a, expr)		({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr)		!eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a)		(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
@@ -946,6 +999,8 @@ int bch2_trigger_alloc(struct btree_trans *trans,

		bucket_unlock(g);
		percpu_up_read(&c->mark_lock);

		bch2_dev_usage_update(c, ca, old_a, new_a);
	}
err:
	printbuf_exit(&buf);
+22 −5
Original line number Diff line number Diff line
@@ -82,25 +82,39 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
		bucket_data_type(bucket) != bucket_data_type(ptr);
}

static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
	return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
}

static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
{
	return a.stripe_sectors + a.dirty_sectors;
}

static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
{
	return a.data_type == BCH_DATA_cached
		? a.cached_sectors
		: bch2_bucket_sectors_dirty(a);
}

static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
						 struct bch_alloc_v4 a)
{
	int d = bch2_bucket_sectors_dirty(a);
	int d = bch2_bucket_sectors(a);

	return d ? max(0, ca->mi.bucket_size - d) : 0;
}

static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
{
	int d = a.stripe_sectors + a.dirty_sectors;

	return d ? max(0, ca->mi.bucket_size - d) : 0;
}

static inline unsigned bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
{
	return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
}
@@ -277,6 +291,9 @@ static inline bool bkey_is_alloc(const struct bkey *k)

int bch2_alloc_read(struct bch_fs *);

int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
				   const struct bch_alloc_v4 *,
				   const struct bch_alloc_v4 *, unsigned);
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
		       struct bkey_s_c, struct bkey_s,
		       enum btree_iter_update_trigger_flags);
+3 −3
Original line number Diff line number Diff line
@@ -205,6 +205,7 @@
#include <linux/zstd.h>

#include "bcachefs_format.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
#include "nocow_locking_types.h"
@@ -672,8 +673,6 @@ struct btree_trans_buf {
	struct btree_trans	*trans;
};

#define REPLICAS_DELTA_LIST_MAX	(1U << 16)

#define BCACHEFS_ROOT_SUBVOL_INUM					\
	((subvol_inum) { BCACHEFS_ROOT_SUBVOL,	BCACHEFS_ROOT_INO })

@@ -743,10 +742,11 @@ struct bch_fs {

	struct bch_dev __rcu	*devs[BCH_SB_MEMBERS_MAX];

	struct bch_accounting_mem accounting;

	struct bch_replicas_cpu replicas;
	struct bch_replicas_cpu replicas_gc;
	struct mutex		replicas_gc_lock;
	mempool_t		replicas_delta_pool;

	struct journal_entry_res btree_root_journal_res;
	struct journal_entry_res replicas_journal_res;
+0 −1
Original line number Diff line number Diff line
@@ -1138,7 +1138,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e)
	switch (e->type) {
	case BCH_JSET_ENTRY_btree_keys:
	case BCH_JSET_ENTRY_btree_root:
	case BCH_JSET_ENTRY_overwrite:
	case BCH_JSET_ENTRY_write_buffer_keys:
		return true;
	}
+6 −1
Original line number Diff line number Diff line
@@ -251,10 +251,15 @@ struct bch_replicas_usage {
	struct bch_replicas_entry_v1 r;
} __packed;

static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
{
	return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
}

static inline struct bch_replicas_usage *
replicas_usage_next(struct bch_replicas_usage *u)
{
	return (void *) u + replicas_entry_bytes(&u->r) + 8;
	return (void *) u + replicas_usage_bytes(u);
}

/*
Loading