Commit 85ab365f authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: Fix deadlock in journal write path



bch2_journal_write() was incorrectly waiting on earlier journal writes
synchronously; this usually worked because most of the time we'd be
running in the context of a thread that did a journal_buf_put(), but
sometimes we'd be running out of the same workqueue that completes those
prior journal writes.

Additionally, this makes sure to punt to a workqueue before submitting
preflushes - we really don't want to be calling submit_bio() in the main
transaction commit path.

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent adfe9357
Loading
Loading
Loading
Loading
+42 −18
Original line number Diff line number Diff line
@@ -1723,7 +1723,7 @@ static void journal_write_endio(struct bio *bio)
	percpu_ref_put(&ca->io_ref);
}

static CLOSURE_CALLBACK(do_journal_write)
static CLOSURE_CALLBACK(journal_write_submit)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
@@ -1768,6 +1768,44 @@ static CLOSURE_CALLBACK(do_journal_write)
	continue_at(cl, journal_write_done, j->wq);
}

static CLOSURE_CALLBACK(journal_write_preflush)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
		spin_lock(&j->lock);
		closure_wait(&j->async_wait, cl);
		spin_unlock(&j->lock);

		continue_at(cl, journal_write_preflush, j->wq);
		return;
	}

	if (w->separate_flush) {
		for_each_rw_member(c, ca) {
			percpu_ref_get(&ca->io_ref);

			struct journal_device *ja = &ca->journal;
			struct bio *bio = &ja->bio[w->idx]->bio;
			bio_reset(bio, ca->disk_sb.bdev,
				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}

		continue_at(cl, journal_write_submit, j->wq);
	} else {
		/*
		 * no need to punt to another work item if we're not waiting on
		 * preflushes
		 */
		journal_write_submit(&cl->work);
	}
}

static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -2033,23 +2071,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
		goto err;

	if (!JSET_NO_FLUSH(w->data))
		closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));

	if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
		for_each_rw_member(c, ca) {
			percpu_ref_get(&ca->io_ref);

			struct journal_device *ja = &ca->journal;
			struct bio *bio = &ja->bio[w->idx]->bio;
			bio_reset(bio, ca->disk_sb.bdev,
				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}
	}

	continue_at(cl, do_journal_write, j->wq);
		continue_at(cl, journal_write_preflush, j->wq);
	else
		continue_at(cl, journal_write_submit, j->wq);
	return;
no_io:
	continue_at(cl, journal_write_done, j->wq);