Commit 387df331 authored by Kent Overstreet
Browse files

bcachefs: Start copygc, rebalance threads earlier



Previously, copygc and rebalance weren't started until the very end of
mounting, after all recovery passes have finished.

But copygc really should be started earlier, since it may be needed for
allocations to make forward progress. Additionally, we've been seeing
occasional bug reports where starting the kthread fails due to a pending
signal - i.e. we're getting timed out by systemd (during a version
upgrade), but we're not seeing the signal until mount is about to
complete.

Additionally, we now have copygc/rebalance explicitly wait for
check_snapshots to complete (if being run); they require that for
snapshot_is_ancestor() in the data move path.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent d64e8e84
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -356,6 +356,13 @@ static int bch2_copygc_thread(void *arg)

	set_freezable();

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 */
	kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
			       kthread_should_stop());

	bch2_move_stats_init(&move_stats, "copygc");
	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
			      writepoint_ptr(&c->copygc_write_point),
+7 −0
Original line number Diff line number Diff line
@@ -581,6 +581,13 @@ static int bch2_rebalance_thread(void *arg)

	set_freezable();

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 */
	kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
			       kthread_should_stop());

	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
			      writepoint_ptr(&c->rebalance_write_point),
			      true);
+4 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
#include "movinggc.h"
#include "namei.h"
#include "quota.h"
#include "rebalance.h"
@@ -1194,6 +1195,9 @@ int bch2_fs_initialize(struct bch_fs *c)

	c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;

	bch2_copygc_wakeup(c);
	bch2_rebalance_wakeup(c);

	if (enabled_qtypes(c)) {
		ret = bch2_fs_quota_read(c);
		if (ret)
+7 −0
Original line number Diff line number Diff line
@@ -266,6 +266,7 @@ int bch2_run_recovery_passes(struct bch_fs *c)
	spin_lock_irq(&c->recovery_pass_lock);

	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
		unsigned prev_done = c->recovery_pass_done;
		unsigned pass = c->curr_recovery_pass;

		c->next_recovery_pass = pass + 1;
@@ -299,6 +300,12 @@ int bch2_run_recovery_passes(struct bch_fs *c)
		}

		c->curr_recovery_pass = c->next_recovery_pass;

		if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
		    c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) {
			bch2_copygc_wakeup(c);
			bch2_rebalance_wakeup(c);
		}
	}

	spin_unlock_irq(&c->recovery_pass_lock);
+14 −36
Original line number Diff line number Diff line
@@ -418,32 +418,6 @@ bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
	return ret;
}

/*
 * Start the background data-move threads for filesystem @c: copygc first,
 * then rebalance.
 *
 * Returns 0 when both start calls succeed. If either one fails, an error is
 * logged and that call's return value is passed back to the caller; rebalance
 * is not attempted if copygc failed to start.
 */
static int bch2_fs_read_write_late(struct bch_fs *c)
{
	int ret;

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 *
	 * Ideally we'd start copygc/rebalance earlier instead of waiting for
	 * all of recovery/fsck to complete:
	 */
	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err(c, "error starting copygc thread");
		return ret;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err(c, "error starting rebalance thread");
		return ret;
	}

	return 0;
}

static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	int ret;
@@ -503,9 +477,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
		atomic_long_inc(&c->writes[i]);
	}
#endif
	if (!early) {
		ret = bch2_fs_read_write_late(c);
		if (ret)

	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting copygc thread");
		goto err;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting rebalance thread");
		goto err;
	}

@@ -1082,13 +1063,10 @@ int bch2_fs_start(struct bch_fs *c)
	wake_up(&c->ro_ref_wait);

	down_write(&c->state_lock);
	if (c->opts.read_only) {
	if (c->opts.read_only)
		bch2_fs_read_only(c);
	} else {
		ret = !test_bit(BCH_FS_rw, &c->flags)
			? bch2_fs_read_write(c)
			: bch2_fs_read_write_late(c);
	}
	else if (!test_bit(BCH_FS_rw, &c->flags))
		ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);

err: