Commit cd63a278 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'bcachefs-2024-06-28' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Simple stuff:

   - NULL ptr/err ptr deref fixes

   - fix for getting wedged on shutdown after journal error

   - fix missing recalc_capacity() call, capacity now changes correctly
     after a device goes read only

     However, our capacity calculation still doesn't take into account
     the case where we have mixed ro/rw devices and the ro devices have
     data on them; that's going to be a more involved fix, to separate
     accounting for "capacity used on ro devices" and "capacity used on
     rw devices"

   - boring syzbot stuff

  Slightly more involved:

   - discard, invalidate workers are now per device

     this has the effect of simplifying how we take device refs in these
     paths, and the device ref cleanup fixes a longstanding race between
     the device removal path and the discard path

   - fixes for how the debugfs code takes refs on btree_trans objects:
     we have debugfs code that prints in-use btree_trans objects.

     It uses closure_get() on trans->ref, which is mainly for the cycle
     detector, but the debugfs code was using it on a closure that may
     have hit 0, which is not allowed; for performance reasons we cannot
     avoid having not-in-use transactions on the global list.

     Introduce some new primitives to fix this and make the
     synchronization here a whole lot saner"

* tag 'bcachefs-2024-06-28' of https://evilpiepirate.org/git/bcachefs:
  bcachefs: Fix kmalloc bug in __snapshot_t_mut
  bcachefs: Discard, invalidate workers are now per device
  bcachefs: Fix shift-out-of-bounds in bch2_blacklist_entries_gc
  bcachefs: slab-use-after-free Read in bch2_sb_errors_from_cpu
  bcachefs: Add missing bch2_journal_do_writes() call
  bcachefs: Fix null ptr deref in journal_pins_to_text()
  bcachefs: Add missing recalc_capacity() call
  bcachefs: Fix btree_trans list ordering
  bcachefs: Fix race between trans_put() and btree_transactions_read()
  closures: closure_get_not_zero(), closure_return_sync()
  bcachefs: Make btree_deadlock_to_text() clearer
  bcachefs: fix seqmutex_relock()
  bcachefs: Fix freeing of error pointers
parents cd17613f 64cd7de9
Loading
Loading
Loading
Loading
+139 −124
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@
#include <linux/sched/task.h>
#include <linux/sort.h>

static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);

/* Persistent alloc info: */

@@ -893,12 +893,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
		if (statechange(a->data_type == BCH_DATA_need_discard) &&
		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
		    bucket_flushed(new_a))
			bch2_discard_one_bucket_fast(c, new.k->p);
			bch2_discard_one_bucket_fast(ca, new.k->p.offset);

		if (statechange(a->data_type == BCH_DATA_cached) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
			bch2_do_invalidates(c);
			bch2_dev_do_invalidates(ca);

		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
			bch2_gc_gens_async(c);
@@ -1636,34 +1636,38 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
	return ret;
}

static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
	int ret;

	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
	mutex_lock(&ca->discard_buckets_in_flight_lock);
	darray_for_each(ca->discard_buckets_in_flight, i)
		if (i->bucket == bucket) {
			ret = -BCH_ERR_EEXIST_discard_in_flight_add;
			goto out;
		}

	ret = darray_push(&c->discard_buckets_in_flight, bucket);
	ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
			   .in_progress = in_progress,
			   .bucket	= bucket,
	}));
out:
	mutex_unlock(&c->discard_buckets_in_flight_lock);
	mutex_unlock(&ca->discard_buckets_in_flight_lock);
	return ret;
}

static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
			darray_remove_item(&c->discard_buckets_in_flight, i);
	mutex_lock(&ca->discard_buckets_in_flight_lock);
	darray_for_each(ca->discard_buckets_in_flight, i)
		if (i->bucket == bucket) {
			BUG_ON(!i->in_progress);
			darray_remove_item(&ca->discard_buckets_in_flight, i);
			goto found;
		}
	BUG();
found:
	mutex_unlock(&c->discard_buckets_in_flight_lock);
	mutex_unlock(&ca->discard_buckets_in_flight_lock);
}

struct discard_buckets_state {
@@ -1671,26 +1675,11 @@ struct discard_buckets_state {
	u64		open;
	u64		need_journal_commit;
	u64		discarded;
	struct bch_dev	*ca;
	u64		need_journal_commit_this_dev;
};

static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
{
	if (s->ca == ca)
		return;

	if (s->ca && s->need_journal_commit_this_dev >
	    bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
		bch2_journal_flush_async(&c->journal, NULL);

	if (s->ca)
		percpu_ref_put(&s->ca->io_ref);
	s->ca = ca;
	s->need_journal_commit_this_dev = 0;
}

static int bch2_discard_one_bucket(struct btree_trans *trans,
				   struct bch_dev *ca,
				   struct btree_iter *need_discard_iter,
				   struct bpos *discard_pos_done,
				   struct discard_buckets_state *s)
@@ -1704,16 +1693,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	bool discard_locked = false;
	int ret = 0;

	struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
		? s->ca
		: bch2_dev_get_ioref(c, pos.inode, WRITE);
	if (!ca) {
		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
		return 0;
	}

	discard_buckets_next_dev(c, s, ca);

	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
		s->open++;
		goto out;
@@ -1773,7 +1752,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
		goto out;
	}

	if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
	if (discard_in_flight_add(ca, iter.pos.offset, true))
		goto out;

	discard_locked = true;
@@ -1811,7 +1790,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	s->discarded++;
out:
	if (discard_locked)
		discard_in_flight_remove(c, iter.pos);
		discard_in_flight_remove(ca, iter.pos.offset);
	s->seen++;
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
@@ -1820,7 +1799,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,

static void bch2_do_discards_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
	struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
	struct bch_fs *c = ca->fs;
	struct discard_buckets_state s = {};
	struct bpos discard_pos_done = POS_MAX;
	int ret;
@@ -1831,23 +1811,41 @@ static void bch2_do_discards_work(struct work_struct *work)
	 * successful commit:
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				   BTREE_ID_need_discard, POS_MIN, 0, k,
			bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));

	discard_buckets_next_dev(c, &s, NULL);
		for_each_btree_key_upto(trans, iter,
				   BTREE_ID_need_discard,
				   POS(ca->dev_idx, 0),
				   POS(ca->dev_idx, U64_MAX), 0, k,
			bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));

	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
			      bch2_err_str(ret));

	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
	percpu_ref_put(&ca->io_ref);
}

void bch2_do_discards(struct bch_fs *c)
void bch2_dev_do_discards(struct bch_dev *ca)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
	    !queue_work(c->write_ref_wq, &c->discard_work))
	struct bch_fs *c = ca->fs;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		return;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
		goto put_ioref;

	if (queue_work(c->write_ref_wq, &ca->discard_work))
		return;

	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
put_ioref:
	percpu_ref_put(&ca->io_ref);
}

void bch2_do_discards(struct bch_fs *c)
{
	for_each_member_device(c, ca)
		bch2_dev_do_discards(ca);
}

static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
@@ -1876,68 +1874,69 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo

static void bch2_do_discards_fast_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
	struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
	struct bch_fs *c = ca->fs;

	while (1) {
		bool got_bucket = false;
		struct bpos bucket;
		struct bch_dev *ca;

		mutex_lock(&c->discard_buckets_in_flight_lock);
		darray_for_each(c->discard_buckets_in_flight, i) {
			if (i->snapshot)
				continue;
		u64 bucket;

			ca = bch2_dev_get_ioref(c, i->inode, WRITE);
			if (!ca) {
				darray_remove_item(&c->discard_buckets_in_flight, i);
		mutex_lock(&ca->discard_buckets_in_flight_lock);
		darray_for_each(ca->discard_buckets_in_flight, i) {
			if (i->in_progress)
				continue;
			}

			got_bucket = true;
			bucket = *i;
			i->snapshot = true;
			bucket = i->bucket;
			i->in_progress = true;
			break;
		}
		mutex_unlock(&c->discard_buckets_in_flight_lock);
		mutex_unlock(&ca->discard_buckets_in_flight_lock);

		if (!got_bucket)
			break;

		if (ca->mi.discard && !c->opts.nochanges)
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket.offset * ca->mi.bucket_size,
					     bucket_to_sector(ca, bucket),
					     ca->mi.bucket_size,
					     GFP_KERNEL);

		int ret = bch2_trans_do(c, NULL, NULL,
			BCH_WATERMARK_btree|
			BCH_TRANS_COMMIT_no_enospc,
					bch2_clear_bucket_needs_discard(trans, bucket));
			bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
		bch_err_fn(c, ret);

		percpu_ref_put(&ca->io_ref);
		discard_in_flight_remove(c, bucket);
		discard_in_flight_remove(ca, bucket);

		if (ret)
			break;
	}

	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
	percpu_ref_put(&ca->io_ref);
}

static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
{
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
	bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
	rcu_read_unlock();
	struct bch_fs *c = ca->fs;

	if (discard_in_flight_add(ca, bucket, false))
		return;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		return;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
		goto put_ioref;

	if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
		return;

	if (!dead &&
	    !discard_in_flight_add(c, bucket) &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
	    !queue_work(c->write_ref_wq, &c->discard_fast_work))
	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
put_ioref:
	percpu_ref_put(&ca->io_ref);
}

static int invalidate_one_bucket(struct btree_trans *trans,
@@ -2038,7 +2037,8 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter

static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
	struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
	struct bch_fs *c = ca->fs;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

@@ -2046,7 +2046,6 @@ static void bch2_do_invalidates_work(struct work_struct *work)
	if (ret)
		goto err;

	for_each_member_device(c, ca) {
	s64 nr_to_invalidate =
		should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
	struct btree_iter iter;
@@ -2076,22 +2075,34 @@ static void bch2_do_invalidates_work(struct work_struct *work)
		bch2_btree_iter_advance(&iter);
	}
	bch2_trans_iter_exit(trans, &iter);

		if (ret < 0) {
			bch2_dev_put(ca);
			break;
		}
	}
err:
	bch2_trans_put(trans);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
	percpu_ref_put(&ca->io_ref);
}

void bch2_do_invalidates(struct bch_fs *c)
void bch2_dev_do_invalidates(struct bch_dev *ca)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
	    !queue_work(c->write_ref_wq, &c->invalidate_work))
	struct bch_fs *c = ca->fs;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		return;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
		goto put_ioref;

	if (queue_work(c->write_ref_wq, &ca->invalidate_work))
		return;

	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
put_ioref:
	percpu_ref_put(&ca->io_ref);
}

void bch2_do_invalidates(struct bch_fs *c)
{
	for_each_member_device(c, ca)
		bch2_dev_do_invalidates(ca);
}

int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
@@ -2407,16 +2418,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_exit(struct bch_fs *c)
void bch2_dev_allocator_background_exit(struct bch_dev *ca)
{
	darray_exit(&ca->discard_buckets_in_flight);
}

void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
	darray_exit(&c->discard_buckets_in_flight);
	mutex_init(&ca->discard_buckets_in_flight_lock);
	INIT_WORK(&ca->discard_work, bch2_do_discards_work);
	INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
	INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
	mutex_init(&c->discard_buckets_in_flight_lock);
	INIT_WORK(&c->discard_work, bch2_do_discards_work);
	INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
+5 −1
Original line number Diff line number Diff line
@@ -275,6 +275,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
		       enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_dev_do_discards(struct bch_dev *);
void bch2_do_discards(struct bch_fs *);

static inline u64 should_invalidate_buckets(struct bch_dev *ca,
@@ -289,6 +290,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
	return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}

void bch2_dev_do_invalidates(struct bch_dev *);
void bch2_do_invalidates(struct bch_fs *);

static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
@@ -312,7 +314,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);

void bch2_fs_allocator_background_exit(struct bch_fs *);
void bch2_dev_allocator_background_exit(struct bch_dev *);
void bch2_dev_allocator_background_init(struct bch_dev *);

void bch2_fs_allocator_background_init(struct bch_fs *);

#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
+2 −2
Original line number Diff line number Diff line
@@ -621,13 +621,13 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
	avail = dev_buckets_free(ca, *usage, watermark);

	if (usage->d[BCH_DATA_need_discard].buckets > avail)
		bch2_do_discards(c);
		bch2_dev_do_discards(ca);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
		bch2_gc_gens_async(c);

	if (should_invalidate_buckets(ca, *usage))
		bch2_do_invalidates(c);
		bch2_dev_do_invalidates(ca);

	if (!avail) {
		if (cl && !waiting) {
+11 −5
Original line number Diff line number Diff line
@@ -493,6 +493,11 @@ struct io_count {
	u64			sectors[2][BCH_DATA_NR];
};

struct discard_in_flight {
	bool			in_progress:1;
	u64			bucket:63;
};

struct bch_dev {
	struct kobject		kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -554,6 +559,12 @@ struct bch_dev {
	size_t			inc_gen_really_needs_gc;
	size_t			buckets_waiting_on_journal;

	struct work_struct	invalidate_work;
	struct work_struct	discard_work;
	struct mutex		discard_buckets_in_flight_lock;
	DARRAY(struct discard_in_flight)	discard_buckets_in_flight;
	struct work_struct	discard_fast_work;

	atomic64_t		rebalance_work;

	struct journal_device	journal;
@@ -915,11 +926,6 @@ struct bch_fs {
	unsigned		write_points_nr;

	struct buckets_waiting_for_journal buckets_waiting_for_journal;
	struct work_struct	invalidate_work;
	struct work_struct	discard_work;
	struct mutex		discard_buckets_in_flight_lock;
	DARRAY(struct bpos)	discard_buckets_in_flight;
	struct work_struct	discard_fast_work;

	/* GARBAGE COLLECTION */
	struct work_struct	gc_gens_work;
+6 −13
Original line number Diff line number Diff line
@@ -3130,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)

	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
	memset(trans, 0, sizeof(*trans));
	closure_init_stack(&trans->ref);

	seqmutex_lock(&c->btree_trans_lock);
	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
@@ -3150,18 +3149,12 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
			BUG_ON(pos_task &&
			       pid == pos_task->pid &&
			       pos->locked);

			if (pos_task && pid < pos_task->pid) {
				list_add_tail(&trans->list, &pos->list);
				goto list_add_done;
			}
		}
	}
	list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:

	list_add(&trans->list, &c->btree_trans_list);
	seqmutex_unlock(&c->btree_trans_lock);
got_trans:
	trans->ref.closure_get_happened = false;
	trans->c		= c;
	trans->last_begin_time	= local_clock();
	trans->fn_idx		= fn_idx;
@@ -3200,6 +3193,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
	trans->srcu_idx		= srcu_read_lock(&c->btree_trans_barrier);
	trans->srcu_lock_time	= jiffies;
	trans->srcu_held	= true;

	closure_init_stack_release(&trans->ref);
	return trans;
}

@@ -3257,10 +3252,10 @@ void bch2_trans_put(struct btree_trans *trans)
		bch2_journal_keys_put(c);

	/*
	 * trans->ref protects trans->locking_wait.task, btree_paths arary; used
	 * trans->ref protects trans->locking_wait.task, btree_paths array; used
	 * by cycle detector
	 */
	closure_sync(&trans->ref);
	closure_return_sync(&trans->ref);
	trans->locking_wait.task = NULL;

	unsigned long *paths_allocated = trans->paths_allocated;
@@ -3385,8 +3380,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
				per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;

			if (trans) {
				closure_sync(&trans->ref);

				seqmutex_lock(&c->btree_trans_lock);
				list_del(&trans->list);
				seqmutex_unlock(&c->btree_trans_lock);
Loading