Commit 35f51970 authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: Improve journal pin flushing



Running the preempt tiering tests with a lower than normal journal
reclaim delay turned up a shutdown hang - a lost wakeup, caused because
flushing a journal pin (e.g. key cache/write buffer) can generate a new
journal pin.

The "simple" fix of adding the correct wakeup didn't work because of
ordering issues; if we flush btree node pins too aggressively before
other pins have completed, we end up spinning where each flush iteration
generates new work.

So to fix this correctly:
- The list of flushed journal pins is now broken out by type, so that
  we can wait for key cache/write buffer pin flushing to complete
  before flushing dirty btree nodes

- A new closure_waitlist is added for bch2_journal_flush_pins; this one
  is only used under or when we're taking the journal lock, so it's
  pretty cheap to add rigorously correct wakeups to journal_pin_set()
  and journal_pin_drop().

Additionally, bch2_journal_seq_pins_to_text() is moved to
journal_reclaim.c, where it belongs, along with a bit of other small
renaming and refactoring.

Besides fixing the hang, the better ordering between key cache/write
buffer flushing and btree node flushing should help or fix the "unmount
taking excessively long" a few users have been noticing.

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 0c74c85b
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include "extents.h"
#include "fsck.h"
#include "inode.h"
#include "journal_reclaim.h"
#include "super.h"

#include <linux/console.h>
+4 −56
Original line number Diff line number Diff line
@@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)

static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(p->list); i++)
		INIT_LIST_HEAD(&p->list[i]);
	INIT_LIST_HEAD(&p->flushed);
	for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
		INIT_LIST_HEAD(&p->unflushed[i]);
	for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
		INIT_LIST_HEAD(&p->flushed[i]);
	atomic_set(&p->count, count);
	p->devs.nr = 0;
}
@@ -1626,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
	__bch2_journal_debug_to_text(out, j);
	spin_unlock(&j->lock);
}

bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *pin;

	spin_lock(&j->lock);
	if (!test_bit(JOURNAL_running, &j->flags)) {
		spin_unlock(&j->lock);
		return true;
	}

	*seq = max(*seq, j->pin.front);

	if (*seq >= j->pin.back) {
		spin_unlock(&j->lock);
		return true;
	}

	out->atomic++;

	pin_list = journal_seq_pin(j, *seq);

	prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
	printbuf_indent_add(out, 2);

	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
		list_for_each_entry(pin, &pin_list->list[i], list)
			prt_printf(out, "\t%px %ps\n", pin, pin->flush);

	if (!list_empty(&pin_list->flushed))
		prt_printf(out, "flushed:\n");

	list_for_each_entry(pin, &pin_list->flushed, list)
		prt_printf(out, "\t%px %ps\n", pin, pin->flush);

	printbuf_indent_sub(out, 2);

	--out->atomic;
	spin_unlock(&j->lock);

	return false;
}

void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
	u64 seq = 0;

	while (!bch2_journal_seq_pins_to_text(out, j, &seq))
		seq++;
}
+0 −2
Original line number Diff line number Diff line
@@ -430,8 +430,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u

void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);

int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
				unsigned nr);
+121 −21
Original line number Diff line number Diff line
@@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
		popped = true;
	}

	if (popped)
	if (popped) {
		bch2_journal_space_available(j);
		__closure_wake_up(&j->reclaim_flush_wait);
	}
}

bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
	pin->seq = 0;
	list_del_init(&pin->list);

	if (j->reclaim_flush_wait.list.first)
		__closure_wake_up(&j->reclaim_flush_wait);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
@@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
{
	if (fn == bch2_btree_node_flush0 ||
	    fn == bch2_btree_node_flush1)
		return JOURNAL_PIN_btree;
		return JOURNAL_PIN_TYPE_btree;
	else if (fn == bch2_btree_key_cache_journal_flush)
		return JOURNAL_PIN_key_cache;
		return JOURNAL_PIN_TYPE_key_cache;
	else
		return JOURNAL_PIN_other;
		return JOURNAL_PIN_TYPE_other;
}

static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
	atomic_inc(&pin_list->count);
	pin->seq	= seq;
	pin->flush	= flush_fn;
	list_add(&pin->list, &pin_list->list[type]);

	if (list_empty(&pin_list->unflushed[type]) &&
	    j->reclaim_flush_wait.list.first)
		__closure_wake_up(&j->reclaim_flush_wait);

	list_add(&pin->list, &pin_list->unflushed[type]);
}

void bch2_journal_pin_copy(struct journal *j,
@@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j,
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;
	unsigned i;

	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
		if (*seq > seq_to_flush && !allowed_above_seq)
			break;

		for (i = 0; i < JOURNAL_PIN_NR; i++)
			if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
			    ((1U << i) & allowed_above_seq)) {
				ret = list_first_entry_or_null(&pin_list->list[i],
		for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
			if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
			    (BIT(i) & allowed_above_seq)) {
				ret = list_first_entry_or_null(&pin_list->unflushed[i],
					struct journal_entry_pin, list);
				if (ret)
					return ret;
@@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j,
		}

		if (min_key_cache) {
			allowed_above |= 1U << JOURNAL_PIN_key_cache;
			allowed_below |= 1U << JOURNAL_PIN_key_cache;
			allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
			allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
		}

		cond_resched();
@@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j,
		j->last_flushed = jiffies;

		spin_lock(&j->lock);
		pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
		pin = journal_get_next_pin(j, seq_to_flush,
					   allowed_below,
					   allowed_above, &seq);
		if (pin) {
			BUG_ON(j->flush_in_progress);
			j->flush_in_progress = pin;
@@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j,
		spin_lock(&j->lock);
		/* Pin might have been dropped or rearmed: */
		if (likely(!err && !j->flush_in_progress_dropped))
			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
		j->flush_in_progress = NULL;
		j->flush_in_progress_dropped = false;
		spin_unlock(&j->lock);
@@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j)
	return 0;
}

static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
					unsigned types)
{
	struct journal_entry_pin_list *pin_list;
	u64 seq;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
		if (seq > seq_to_flush)
			break;

		for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
			if ((BIT(i) & types) &&
			    (!list_empty(&pin_list->unflushed[i]) ||
			     !list_empty(&pin_list->flushed[i]))) {
				spin_unlock(&j->lock);
				return true;
			}
	}
	spin_unlock(&j->lock);

	return false;
}

static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
						 unsigned types)
{
	return  journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
		journal_pins_still_flushing(j, seq_to_flush, types);
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush,
			      bool *did_work)
{
	int ret;
	int ret = 0;

	ret = bch2_journal_error(j);
	if (ret)
@@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,

	mutex_lock(&j->reclaim_lock);

	if (journal_flush_pins(j, seq_to_flush,
			       (1U << JOURNAL_PIN_key_cache)|
			       (1U << JOURNAL_PIN_other), 0, 0, 0) ||
	    journal_flush_pins(j, seq_to_flush,
			       (1U << JOURNAL_PIN_btree), 0, 0, 0))
	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
			       BIT(JOURNAL_PIN_TYPE_key_cache)|
			       BIT(JOURNAL_PIN_TYPE_other))) {
		*did_work = true;
		goto unlock;
	}

	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
			       BIT(JOURNAL_PIN_TYPE_btree))) {
		*did_work = true;
		goto unlock;
	}

	if (seq_to_flush > journal_cur_seq(j))
		bch2_journal_entry_close(j);
@@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
		!fifo_used(&j->pin);

	spin_unlock(&j->lock);
unlock:
	mutex_unlock(&j->reclaim_lock);

	return ret;
@@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
	if (!test_bit(JOURNAL_running, &j->flags))
		return false;

	closure_wait_event(&j->async_wait,
	closure_wait_event(&j->reclaim_flush_wait,
		journal_flush_done(j, seq_to_flush, &did_work));

	return did_work;
@@ -926,3 +975,54 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)

	return ret;
}

bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *pin;

	spin_lock(&j->lock);
	if (!test_bit(JOURNAL_running, &j->flags)) {
		spin_unlock(&j->lock);
		return true;
	}

	*seq = max(*seq, j->pin.front);

	if (*seq >= j->pin.back) {
		spin_unlock(&j->lock);
		return true;
	}

	out->atomic++;

	pin_list = journal_seq_pin(j, *seq);

	prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
	printbuf_indent_add(out, 2);

	prt_printf(out, "unflushed:\n");
	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
		list_for_each_entry(pin, &pin_list->unflushed[i], list)
			prt_printf(out, "\t%px %ps\n", pin, pin->flush);

	prt_printf(out, "flushed:\n");
	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
		list_for_each_entry(pin, &pin_list->flushed[i], list)
			prt_printf(out, "\t%px %ps\n", pin, pin->flush);

	printbuf_indent_sub(out, 2);

	--out->atomic;
	spin_unlock(&j->lock);

	return false;
}

void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
	u64 seq = 0;

	while (!bch2_journal_seq_pins_to_text(out, j, &seq))
		seq++;
}
+3 −0
Original line number Diff line number Diff line
@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)

int bch2_journal_flush_device_pins(struct journal *, int);

void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);

#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
Loading