Commit c3de9b57 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'bcachefs-2024-06-22' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Lots of (mostly boring) fixes for syzbot bugs and rare(r) CI bugs.

  The LRU_TIME_BITS fix was slightly more involved; we only have 48 bits
  for the LRU position (we would prefer 64), so wraparound is possible
  for the cached data LRUs on a filesystem that has done sufficient
  (petabytes) reads; this is now handled.

  One notable user reported bugfix, where we were forgetting to
  correctly set the bucket data type, which should have been
  BCH_DATA_need_gc_gens instead of BCH_DATA_free; this was causing us to
  go emergency read-only on a filesystem that had seen heavy enough use
  to see bucket gen wraparound.

  We're now starting to fix simple (safe) errors without requiring user
  intervention - i.e. a small incremental step towards full self
  healing.

  This is currently limited to just certain allocation information
  counters, and the error is still logged in the superblock; see that
  patch for more information. ("bcachefs: Fix safe errors by default")"

* tag 'bcachefs-2024-06-22' of https://evilpiepirate.org/git/bcachefs: (22 commits)
  bcachefs: Move the ei_flags setting to after initialization
  bcachefs: Fix a UAF after write_super()
  bcachefs: Use bch2_print_string_as_lines for long err
  bcachefs: Fix I_NEW warning in race path in bch2_inode_insert()
  bcachefs: Replace bare EEXIST with private error codes
  bcachefs: Fix missing alloc_data_type_set()
  closures: Change BUG_ON() to WARN_ON()
  bcachefs: fix alignment of VMA for memory mapped files on THP
  bcachefs: Fix safe errors by default
  bcachefs: Fix bch2_trans_put()
  bcachefs: set_worker_desc() for delete_dead_snapshots
  bcachefs: Fix bch2_sb_downgrade_update()
  bcachefs: Handle cached data LRU wraparound
  bcachefs: Guard against overflowing LRU_TIME_BITS
  bcachefs: delete_dead_snapshots() doesn't need to go RW
  bcachefs: Fix early init error path in journal code
  bcachefs: Check for invalid btree IDs
  bcachefs: Fix btree ID bitmasks
  bcachefs: Fix shift overflow in read_one_super()
  bcachefs: Fix a locking bug in the do_discard_fast() path
  ...
parents da3b6ef1 bd4da046
Loading
Loading
Loading
Loading
+61 −15
Original line number Diff line number Diff line
@@ -259,6 +259,14 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
			 "invalid data type (got %u should be %u)",
			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));

	for (unsigned i = 0; i < 2; i++)
		bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
				 c, err,
				 alloc_key_io_time_bad,
				 "invalid io_time[%s]: %llu, max %llu",
				 i == READ ? "read" : "write",
				 a.v->io_time[i], LRU_TIME_MAX);

	switch (a.v->data_type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
@@ -757,8 +765,8 @@ int bch2_trigger_alloc(struct btree_trans *trans,
		alloc_data_type_set(new_a, new_a->data_type);

		if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
			new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
			new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
			new_a->io_time[READ] = bch2_current_io_time(c, READ);
			new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
			SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
		}
@@ -768,6 +776,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
			new_a->gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
			alloc_data_type_set(new_a, new_a->data_type);
		}

		if (old_a->data_type != new_a->data_type ||
@@ -781,7 +790,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,

		if (new_a->data_type == BCH_DATA_cached &&
		    !new_a->io_time[READ])
			new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
			new_a->io_time[READ] = bch2_current_io_time(c, READ);

		u64 old_lru = alloc_lru_idx_read(*old_a);
		u64 new_lru = alloc_lru_idx_read(*new_a);
@@ -882,7 +891,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
			closure_wake_up(&c->freelist_wait);

		if (statechange(a->data_type == BCH_DATA_need_discard) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
		    bucket_flushed(new_a))
			bch2_discard_one_bucket_fast(c, new.k->p);

@@ -1579,7 +1588,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
		if (ret)
			goto err;

		a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
		a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
		ret = bch2_trans_update(trans, alloc_iter,
					&a_mut->k_i, BTREE_TRIGGER_norun);
		if (ret)
@@ -1634,7 +1643,7 @@ static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
			ret = -EEXIST;
			ret = -BCH_ERR_EEXIST_discard_in_flight_add;
			goto out;
		}

@@ -1788,8 +1797,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	}

	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	alloc_data_type_set(&a->v, a->v.data_type);
write:
	alloc_data_type_set(&a->v, a->v.data_type);

	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_WATERMARK_btree|
@@ -1975,8 +1985,8 @@ static int invalidate_one_bucket(struct btree_trans *trans,
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);
	a->v.io_time[READ]	= bch2_current_io_time(c, READ);
	a->v.io_time[WRITE]	= bch2_current_io_time(c, WRITE);

	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_WATERMARK_btree|
@@ -2011,6 +2021,21 @@ static int invalidate_one_bucket(struct btree_trans *trans,
	goto out;
}

/*
 * Return the next key in @ca's LRU range starting from @iter's current
 * position.
 *
 * If the peek finds nothing before the end of the device's range and we have
 * not wrapped yet, rewind @iter to the start of the device's range, record
 * that in *@wrapped, and peek once more.  After a wrap, an empty peek is
 * returned to the caller as-is (k.k == NULL).
 */
static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
				    struct bch_dev *ca, bool *wrapped)
{
	for (;;) {
		struct bkey_s_c found =
			bch2_btree_iter_peek_upto(iter,
					lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));

		/* Got something, or already tried from the start: done */
		if (found.k || *wrapped)
			return found;

		/* Nothing past the starting point: restart from the beginning */
		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
		*wrapped = true;
	}
}

static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
@@ -2024,12 +2049,33 @@ static void bch2_do_invalidates_work(struct work_struct *work)
	for_each_member_device(c, ca) {
		s64 nr_to_invalidate =
			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
		struct btree_iter iter;
		bool wrapped = false;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
				     lru_pos(ca->dev_idx, 0,
					     ((bch2_current_io_time(c, READ) + U32_MAX) &
					      LRU_TIME_MAX)), 0);

		ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
				lru_pos(ca->dev_idx, 0, 0),
				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
				BTREE_ITER_intent, k,
			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
		while (true) {
			bch2_trans_begin(trans);

			struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
			if (!k.k)
				break;

			ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
			if (ret)
				break;

			bch2_btree_iter_advance(&iter);
		}
		bch2_trans_iter_exit(trans, &iter);

		if (ret < 0) {
			bch2_dev_put(ca);
@@ -2204,7 +2250,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
	if (ret)
		return ret;

	now = atomic64_read(&c->io_clock[rw].now);
	now = bch2_current_io_time(c, rw);
	if (a->v.io_time[rw] == now)
		goto out;

+7 −1
Original line number Diff line number Diff line
@@ -141,7 +141,13 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
	    !bch2_bucket_sectors_fragmented(ca, a))
		return 0;

	u64 d = bch2_bucket_sectors_dirty(a);
	/*
	 * avoid overflowing LRU_TIME_BITS on a corrupted fs, when
	 * bucket_sectors_dirty is (much) bigger than bucket_size
	 */
	u64 d = min(bch2_bucket_sectors_dirty(a),
		    ca->mi.bucket_size);

	return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}

+5 −0
Original line number Diff line number Diff line
@@ -1214,6 +1214,11 @@ static inline s64 bch2_current_time(const struct bch_fs *c)
	return timespec_to_bch2_time(c, now);
}

static inline u64 bch2_current_io_time(const struct bch_fs *c, int rw)
{
	return max(1ULL, (u64) atomic64_read(&c->io_clock[rw].now) & LRU_TIME_MAX);
}

static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
{
	struct stdio_redirect *stdio = c->stdio;
+9 −4
Original line number Diff line number Diff line
@@ -476,6 +476,9 @@ struct bch_lru {

#define LRU_ID_STRIPES		(1U << 16)

/*
 * Only 48 bits are available for the time portion of an LRU position;
 * LRU_TIME_MAX is the largest value that fits in those bits.
 */
#define LRU_TIME_BITS	48
#define LRU_TIME_MAX	((1ULL << LRU_TIME_BITS) - 1)

/* Optional/variable size superblock sections: */

struct bch_sb_field {
@@ -987,8 +990,9 @@ enum bch_version_upgrade_opts {

#define BCH_ERROR_ACTIONS()		\
	x(continue,		0)	\
	x(ro,			1)	\
	x(panic,		2)
	x(fix_safe,		1)	\
	x(panic,		2)	\
	x(ro,			3)

enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
@@ -1382,9 +1386,10 @@ enum btree_id {

/*
 * Maximum number of btrees that we will _ever_ have under the current scheme,
 * where we refer to them with bitfields
 * where we refer to them with 64 bit bitfields - and we also need a bit for
 * the interior btree node type:
 */
#define BTREE_ID_NR_MAX		64
#define BTREE_ID_NR_MAX		63

static inline bool btree_id_is_alloc(enum btree_id id)
{
+1 −1
Original line number Diff line number Diff line
@@ -1064,7 +1064,7 @@ void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
{
	const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
	u8 *l = k->key_start;
	u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
	u8 *h = (u8 *) ((u64 *) k->_data + f->key_u64s) - 1;

	while (l < h) {
		swap(*l, *h);
Loading