Commit 5d9667d1 authored by Kent Overstreet
Browse files

bcachefs: btree write buffer knows how to accumulate bch_accounting keys



Teach the btree write buffer how to accumulate accounting keys - instead
of having the newer key overwrite the older key as we do with other
updates, we need to add them together.

Also, add a flag so that write buffer flush knows when journal replay is
finished flushing accounting, and teach it to hold accounting keys until
that flag is set.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 9dec2a47
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -593,6 +593,7 @@ struct bch_dev {
	x(new_fs)			\
	x(started)			\
	x(btree_running)		\
	x(accounting_replay_done)	\
	x(may_go_rw)			\
	x(rw)				\
	x(was_rw)			\
+75 −9
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_accounting.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@@ -134,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,

static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree_write_buffered_key *wb,
			       bool *write_locked, size_t *fast)
			       bool *write_locked,
			       bool *accounting_accumulated,
			       size_t *fast)
{
	struct btree_path *path;
	int ret;
@@ -147,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
	if (ret)
		return ret;

	if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
		struct bkey u;
		struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);

		if (k.k->type == KEY_TYPE_accounting)
			bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
						   bkey_s_c_to_accounting(k));
	}
	*accounting_accumulated = true;

	/*
	 * We can't clone a path that has write locks: unshare it now, before
	 * set_pos and traverse():
@@ -259,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
	struct journal *j = &c->journal;
	struct btree_write_buffer *wb = &c->btree_write_buffer;
	struct btree_iter iter = { NULL };
	size_t skipped = 0, fast = 0, slowpath = 0;
	size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
	bool write_locked = false;
	bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
	int ret = 0;

	bch2_trans_unlock(trans);
@@ -301,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)

		BUG_ON(!k->journal_seq);

		if (!accounting_replay_done &&
		    k->k.k.type == KEY_TYPE_accounting) {
			slowpath++;
			continue;
		}

		if (i + 1 < &darray_top(wb->sorted) &&
		    wb_key_eq(i, i + 1)) {
			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];

			skipped++;
			if (k->k.k.type == KEY_TYPE_accounting &&
			    n->k.k.type == KEY_TYPE_accounting)
				bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
							   bkey_i_to_s_c_accounting(&k->k));

			overwritten++;
			n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
			k->journal_seq = 0;
			continue;
@@ -340,13 +365,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
		bch2_btree_iter_set_pos(&iter, k->k.k.p);
		btree_iter_path(trans, &iter)->preserve = false;

		bool accounting_accumulated = false;
		do {
			if (race_fault()) {
				ret = -BCH_ERR_journal_reclaim_would_deadlock;
				break;
			}

			ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
			ret = wb_flush_one(trans, &iter, k, &write_locked,
					   &accounting_accumulated, &fast);
			if (!write_locked)
				bch2_trans_begin(trans);
		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -387,6 +414,13 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
			if (!i->journal_seq)
				continue;

			if (!accounting_replay_done &&
			    i->k.k.type == KEY_TYPE_accounting) {
				could_not_insert++;
				continue;
			}

			if (!could_not_insert)
				bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
							bch2_btree_write_buffer_journal_flush);

@@ -401,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
					btree_write_buffered_insert(trans, i));
			if (ret)
				goto err;

			i->journal_seq = 0;
		}

		/*
		 * If journal replay hasn't finished with accounting keys we
		 * can't flush accounting keys at all - condense them and leave
		 * them for next time.
		 *
		 * Q: Can the write buffer overflow?
		 * A: Shouldn't be any actual risk. It's just new accounting
		 * updates that the write buffer can't flush, and those are only
		 * going to be generated by interior btree node updates as
		 * journal replay has to split/rewrite nodes to make room for
		 * its updates.
		 *
		 * And for those new accounting updates, updates to the same
		 * counters get accumulated as they're flushed from the journal
		 * to the write buffer - see the patch for eytzinger tree
		 * accumulation. So we could only overflow if the number of
		 * distinct counters touched somehow was very large.
		 */
		if (could_not_insert) {
			struct btree_write_buffered_key *dst = wb->flushing.keys.data;

			darray_for_each(wb->flushing.keys, i)
				if (i->journal_seq)
					*dst++ = *i;
			wb->flushing.keys.nr = dst - wb->flushing.keys.data;
		}
	}
err:
	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
	if (ret || !could_not_insert) {
		bch2_journal_pin_drop(j, &wb->flushing.pin);
		wb->flushing.keys.nr = 0;
	}

	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
	trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
	return ret;
}

+3 −0
Original line number Diff line number Diff line
@@ -290,6 +290,8 @@ int bch2_journal_replay(struct bch_fs *c)
		k->overwritten = true;
	}

	set_bit(BCH_FS_accounting_replay_done, &c->flags);

	/*
	 * First, attempt to replay keys in sorted order. This is more
	 * efficient - better locality of btree access -  but some might fail if
@@ -1060,6 +1062,7 @@ int bch2_fs_initialize(struct bch_fs *c)
	 * set up the journal.pin FIFO and journal.cur pointer:
	 */
	bch2_fs_journal_start(&c->journal, 1);
	set_bit(BCH_FS_accounting_replay_done, &c->flags);
	bch2_journal_set_replay_done(&c->journal);

	ret = bch2_fs_read_write_early(c);