Commit 1e690efa authored by Kent Overstreet
Browse files

bcachefs: Split out journal pins by btree level



This lets us flush the journal to go read-only more effectively.

Flushing the journal and going read-only requires halting mutually
recursive processes, which strictly speaking are not guaranteed to
terminate.

Flushing btree node journal pins will kick off a btree node write, and
btree node writes on completion must do another btree update to the
parent node to update the 'sectors_written' field for that node's key.

If the parent node is full and requires a split or compaction, that's
going to generate a whole bunch of additional btree updates - alloc
info, LRU btree, and more - which then have to be flushed, and the cycle
repeats.

This process will terminate much more effectively if we tweak journal
reclaim to flush btree updates leaf to root: i.e., don't flush updates
for a given btree node (kicking off a write, and consuming space within
that node up to the next block boundary) if there might still be
unflushed updates in child nodes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 1c316eb5
Loading
Loading
Loading
Loading
+18 −19
Original line number Diff line number Diff line
@@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j,
	spin_unlock(&j->lock);
}

static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
					      journal_pin_flush_fn fn)
{
	if (fn == bch2_btree_node_flush0 ||
	    fn == bch2_btree_node_flush1)
		return JOURNAL_PIN_TYPE_btree;
	else if (fn == bch2_btree_key_cache_journal_flush)
	    fn == bch2_btree_node_flush1) {
		unsigned idx = fn == bch2_btree_node_flush1;
		struct btree *b = container_of(pin, struct btree, writes[idx].journal);

		return JOURNAL_PIN_TYPE_btree0 - b->c.level;
	} else if (fn == bch2_btree_key_cache_journal_flush)
		return JOURNAL_PIN_TYPE_key_cache;
	else
		return JOURNAL_PIN_TYPE_other;
@@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j,

	bool reclaim = __journal_pin_drop(j, dst);

	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));

	if (reclaim)
		bch2_journal_reclaim_fast(j);
@@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,

	bool reclaim = __journal_pin_drop(j, pin);

	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));

	if (reclaim)
		bch2_journal_reclaim_fast(j);
@@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
		spin_lock(&j->lock);
		/* Pin might have been dropped or rearmed: */
		if (likely(!err && !j->flush_in_progress_dropped))
			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
		j->flush_in_progress = NULL;
		j->flush_in_progress_dropped = false;
		spin_unlock(&j->lock);
@@ -869,15 +873,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,

	mutex_lock(&j->reclaim_lock);

	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
			       BIT(JOURNAL_PIN_TYPE_key_cache)|
			       BIT(JOURNAL_PIN_TYPE_other))) {
		*did_work = true;
		goto unlock;
	}

	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
			       BIT(JOURNAL_PIN_TYPE_btree))) {
	for (int type = JOURNAL_PIN_TYPE_NR - 1;
	     type >= 0;
	     --type)
		if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
			*did_work = true;
			goto unlock;
		}
+4 −1
Original line number Diff line number Diff line
@@ -53,7 +53,10 @@ struct journal_buf {
 */

enum journal_pin_type {
	JOURNAL_PIN_TYPE_btree,
	JOURNAL_PIN_TYPE_btree3,
	JOURNAL_PIN_TYPE_btree2,
	JOURNAL_PIN_TYPE_btree1,
	JOURNAL_PIN_TYPE_btree0,
	JOURNAL_PIN_TYPE_key_cache,
	JOURNAL_PIN_TYPE_other,
	JOURNAL_PIN_TYPE_NR,