Commit 56770e24 authored by Linus Torvalds
Browse files

Merge tag 'bcachefs-2025-04-03' of git://evilpiepirate.org/bcachefs

Pull more bcachefs updates from Kent Overstreet:
 "More notable fixes:

   - Fix for striping behaviour on tiering filesystems where the number
     of replicas exceeds the durability of the destination target

   - Fix a race in device removal where deleting alloc info races with
     the discard worker

   - Some small stack usage improvements: this is just enough for KMSAN
     builds to not blow the stack, more is queued up for 6.16"

* tag 'bcachefs-2025-04-03' of git://evilpiepirate.org/bcachefs:
  bcachefs: Fix "journal stuck" during recovery
  bcachefs: backpointer_get_key: check for null from peek_slot()
  bcachefs: Fix null ptr deref in invalidate_one_bucket()
  bcachefs: Fix check_snapshot_exists() restart handling
  bcachefs: use nonblocking variant of print_string_as_lines in error path
  bcachefs: Fix scheduling while atomic from logging changes
  bcachefs: Add error handling for zlib_deflateInit2()
  bcachefs: add missing selection of XARRAY_MULTI
  bcachefs: bch_dev_usage_full
  bcachefs: Kill btree_iter.trans
  bcachefs: do_trace_key_cache_fill()
  bcachefs: Split up bch_dev.io_ref
  bcachefs: fix ref leak in btree_node_read_all_replicas
  bcachefs: Fix null ptr deref in bch2_write_endio()
  bcachefs: Fix field spanning write warning
  bcachefs: Fix striping behaviour
parents bdafff62 77ad1df8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@ config BCACHEFS_FS
	select SRCU
	select SYMBOLIC_ERRNAME
	select MIN_HEAP
	select XARRAY_MULTI
	help
	The bcachefs filesystem - a modern, copy on write filesystem, with
	support for multiple devices, compression, checksumming, etc.
+2 −2
Original line number Diff line number Diff line
@@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
	struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
	struct btree_iter iter = { NULL };
	struct btree_iter iter = {};
	struct posix_acl *acl = NULL;

	if (rcu)
@@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_iter inode_iter = { NULL };
	struct btree_iter inode_iter = {};
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl;
	umode_t mode;
+50 −45
Original line number Diff line number Diff line
@@ -610,7 +610,7 @@ int bch2_alloc_read(struct bch_fs *c)
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

@@ -631,17 +631,17 @@ int bch2_alloc_read(struct bch_fs *c)
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			if (k.k->p.offset < ca->mi.first_bucket) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
				bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
				continue;
			}

			if (k.k->p.offset >= ca->mi.nbuckets) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

@@ -1039,9 +1039,10 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
 * extents style btrees, but works on non-extents btrees:
 */
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
					    struct bpos end, struct bkey *hole)
{
	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);

	if (bkey_err(k))
		return k;
@@ -1052,9 +1053,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
		struct btree_iter iter2;
		struct bpos next;

		bch2_trans_copy_iter(&iter2, iter);
		bch2_trans_copy_iter(trans, &iter2, iter);

		struct btree_path *path = btree_iter_path(iter->trans, iter);
		struct btree_path *path = btree_iter_path(trans, iter);
		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));

@@ -1064,9 +1065,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
		 * btree node min/max is a closed interval, upto takes a half
		 * open interval:
		 */
		k = bch2_btree_iter_peek_max(&iter2, end);
		k = bch2_btree_iter_peek_max(trans, &iter2, end);
		next = iter2.pos;
		bch2_trans_iter_exit(iter->trans, &iter2);
		bch2_trans_iter_exit(trans, &iter2);

		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

@@ -1107,13 +1108,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
	return *ca != NULL;
}

static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
							struct btree_iter *iter,
							struct bch_dev **ca, struct bkey *hole)
{
	struct bch_fs *c = iter->trans->c;
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
again:
	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
	k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
	if (bkey_err(k))
		return k;

@@ -1126,7 +1128,7 @@ static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
			if (!next_bucket(c, ca, &hole_start))
				return bkey_s_c_null;

			bch2_btree_iter_set_pos(iter, hole_start);
			bch2_btree_iter_set_pos(trans, iter, hole_start);
			goto again;
		}

@@ -1167,8 +1169,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(discard_iter);
	bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(trans, discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;
@@ -1181,8 +1183,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
			goto err;
	}

	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(freespace_iter);
	bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(trans, freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;
@@ -1195,8 +1197,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
			goto err;
	}

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
	k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;
@@ -1249,9 +1251,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
	if (!ca->mi.freespace_initialized)
		return 0;

	bch2_btree_iter_set_pos(freespace_iter, start);
	bch2_btree_iter_set_pos(trans, freespace_iter, start);

	k = bch2_btree_iter_peek_slot(freespace_iter);
	k = bch2_btree_iter_peek_slot(trans, freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;
@@ -1300,9 +1302,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
	unsigned i, gens_offset, gens_end_offset;
	int ret;

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
	bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));

	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;
@@ -1435,7 +1437,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
	*gen = a->gen;
out:
fsck_err:
	bch2_set_btree_iter_dontneed(&alloc_iter);
	bch2_set_btree_iter_dontneed(trans, &alloc_iter);
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
@@ -1572,7 +1574,7 @@ int bch2_check_alloc_info(struct bch_fs *c)

		bch2_trans_begin(trans);

		k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
		k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;
@@ -1610,7 +1612,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_pos(&iter, next);
		bch2_btree_iter_set_pos(trans, &iter, next);
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
@@ -1638,7 +1640,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
			     BTREE_ITER_prefetch);
	while (1) {
		bch2_trans_begin(trans);
		k = bch2_btree_iter_peek(&iter);
		k = bch2_btree_iter_peek(trans, &iter);
		if (!k.k)
			break;

@@ -1657,7 +1659,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
			break;
		}

		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
		bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
@@ -1685,7 +1687,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
	struct printbuf buf = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
	if (!alloc_k.k)
		return 0;

@@ -1826,7 +1828,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
{
	struct bch_fs *c = trans->c;
	struct bpos pos = need_discard_iter->pos;
	struct btree_iter iter = { NULL };
	struct btree_iter iter = {};
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	struct printbuf buf = PRINTBUF;
@@ -1950,7 +1952,7 @@ static void bch2_do_discards_work(struct work_struct *work)
	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
			      bch2_err_str(ret));

	percpu_ref_put(&ca->io_ref);
	percpu_ref_put(&ca->io_ref[WRITE]);
	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

@@ -1967,7 +1969,7 @@ void bch2_dev_do_discards(struct bch_dev *ca)
	if (queue_work(c->write_ref_wq, &ca->discard_work))
		return;

	percpu_ref_put(&ca->io_ref);
	percpu_ref_put(&ca->io_ref[WRITE]);
put_write_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
@@ -2045,7 +2047,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
	trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));

	bch2_trans_put(trans);
	percpu_ref_put(&ca->io_ref);
	percpu_ref_put(&ca->io_ref[WRITE]);
	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

@@ -2065,7 +2067,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
	if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
		return;

	percpu_ref_put(&ca->io_ref);
	percpu_ref_put(&ca->io_ref[WRITE]);
put_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2082,6 +2084,9 @@ static int invalidate_one_bp(struct btree_trans *trans,
	if (ret)
		return ret;

	if (!extent_k.k)
		return 0;

	struct bkey_i *n =
		bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
				   BTREE_UPDATE_internal_snapshot_node);
@@ -2199,9 +2204,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
{
	struct bkey_s_c k;
again:
	k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
	k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
	if (!k.k && !*wrapped) {
		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
		bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
		*wrapped = true;
		goto again;
	}
@@ -2251,12 +2256,12 @@ static void bch2_do_invalidates_work(struct work_struct *work)
		if (ret)
			break;

		bch2_btree_iter_advance(&iter);
		bch2_btree_iter_advance(trans, &iter);
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	bch2_trans_put(trans);
	percpu_ref_put(&ca->io_ref);
	percpu_ref_put(&ca->io_ref[WRITE]);
	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
@@ -2274,7 +2279,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca)
	if (queue_work(c->write_ref_wq, &ca->invalidate_work))
		return;

	percpu_ref_put(&ca->io_ref);
	percpu_ref_put(&ca->io_ref[WRITE]);
put_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
@@ -2321,7 +2326,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			break;
		}

		k = bch2_get_key_or_hole(&iter, end, &hole);
		k = bch2_get_key_or_hole(trans, &iter, end, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;
@@ -2340,7 +2345,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			if (ret)
				goto bkey_err;

			bch2_btree_iter_advance(&iter);
			bch2_btree_iter_advance(trans, &iter);
		} else {
			struct bkey_i *freespace;

@@ -2360,7 +2365,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			if (ret)
				goto bkey_err;

			bch2_btree_iter_set_pos(&iter, k.k->p);
			bch2_btree_iter_set_pos(trans, &iter, k.k->p);
		}
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2506,7 +2511,7 @@ void bch2_recalc_capacity(struct bch_fs *c)

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(c, ca) {
	__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
		u64 dev_reserve = 0;

		/*
+3 −3
Original line number Diff line number Diff line
@@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
{
	u64 want_free = ca->mi.nbuckets >> 7;
	u64 free = max_t(s64, 0,
			   u.d[BCH_DATA_free].buckets
			 + u.d[BCH_DATA_need_discard].buckets
			   u.buckets[BCH_DATA_free]
			 + u.buckets[BCH_DATA_need_discard]
			 - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));

	return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
	return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
}

void bch2_dev_do_invalidates(struct bch_dev *);
+58 −21
Original line number Diff line number Diff line
@@ -327,7 +327,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
			bucket = sector_to_bucket(ca,
					round_up(bucket_to_sector(ca, bucket) + 1,
						 1ULL << ca->mi.btree_bitmap_shift));
			bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
			bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
			s->buckets_seen++;
			s->skipped_mi_btree_bitmap++;
			continue;
@@ -355,7 +355,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
					     watermark, s, cl)
			: NULL;
next:
		bch2_set_btree_iter_dontneed(&citer);
		bch2_set_btree_iter_dontneed(trans, &citer);
		bch2_trans_iter_exit(trans, &citer);
		if (ob)
			break;
@@ -417,7 +417,7 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
							 1ULL << ca->mi.btree_bitmap_shift));
				alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));

				bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
				bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
				s->skipped_mi_btree_bitmap++;
				goto next;
			}
@@ -426,7 +426,7 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
			if (ob) {
				if (!IS_ERR(ob))
					*dev_alloc_cursor = iter.pos.offset;
				bch2_set_btree_iter_dontneed(&iter);
				bch2_set_btree_iter_dontneed(trans, &iter);
				break;
			}

@@ -469,7 +469,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
	prt_printf(&buf, "watermark\t%s\n",	bch2_watermarks[watermark]);
	prt_printf(&buf, "data type\t%s\n",	__bch2_data_types[data_type]);
	prt_printf(&buf, "blocking\t%u\n",	cl != NULL);
	prt_printf(&buf, "free\t%llu\n",	usage->d[BCH_DATA_free].buckets);
	prt_printf(&buf, "free\t%llu\n",	usage->buckets[BCH_DATA_free]);
	prt_printf(&buf, "avail\t%llu\n",	dev_buckets_free(ca, *usage, watermark));
	prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
		   bch2_copygc_wait_amount(c),
@@ -524,10 +524,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
	bch2_dev_usage_read_fast(ca, usage);
	avail = dev_buckets_free(ca, *usage, watermark);

	if (usage->d[BCH_DATA_need_discard].buckets > avail)
	if (usage->buckets[BCH_DATA_need_discard] > avail)
		bch2_dev_do_discards(ca);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
	if (usage->buckets[BCH_DATA_need_gc_gens] > avail)
		bch2_gc_gens_async(c);

	if (should_invalidate_buckets(ca, *usage))
@@ -606,8 +606,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
			    unsigned l, unsigned r)
{
	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
		(stripe->next_alloc[l] < stripe->next_alloc[r]));
	return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
}

#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
@@ -626,25 +625,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
	return ret;
}

static const u64 stripe_clock_hand_rescale	= 1ULL << 62; /* trigger rescale at */
static const u64 stripe_clock_hand_max		= 1ULL << 56; /* max after rescale */
static const u64 stripe_clock_hand_inv		= 1ULL << 52; /* max increment, if a device is empty */

/*
 * Rescale the per-device clock hands in @stripe->next_alloc by subtracting a
 * common offset from every entry.
 *
 * The offset is the larger of:
 *  - the smallest nonzero clock hand, and
 *  - the largest amount by which any clock hand exceeds
 *    stripe_clock_hand_max.
 *
 * Entries smaller than the offset are clamped to 0 instead of wrapping.
 *
 * Called from bch2_dev_stripe_increment_inlined() once a clock hand has grown
 * past stripe_clock_hand_rescale.
 */
static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
{
	/*
	 * Avoid underflowing clock hands if at all possible, if clock hands go
	 * to 0 then we lose information - clock hands can be in a wide range if
	 * we have devices we rarely try to allocate from, if we generally
	 * allocate from a specified target but only sometimes have to fall back
	 * to the whole filesystem.
	 */
	u64 scale_max = U64_MAX;	/* maximum we can subtract without underflow */
	u64 scale_min = 0;		/* minimum we must subtract to avoid overflow */

	/* Single pass over all clock hands to compute both bounds: */
	for (u64 *v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
		/* Zero entries are skipped so they don't pin scale_max at 0 */
		if (*v)
			scale_max = min(scale_max, *v);
		if (*v > stripe_clock_hand_max)
			scale_min = max(scale_min, *v - stripe_clock_hand_max);
	}

	/*
	 * Taking the larger bound guarantees every clock hand ends up at or
	 * below stripe_clock_hand_max; when scale_min is the larger one, clock
	 * hands smaller than it are clamped to 0 by the loop below.
	 */
	u64 scale = max(scale_min, scale_max);

	/* Apply the offset, saturating at 0 rather than wrapping: */
	for (u64 *v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
		*v = *v < scale ? 0 : *v - scale;
}

static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
			       struct dev_stripe_state *stripe,
			       struct bch_dev_usage *usage)
{
	/*
	 * Stripe state has a per device clock hand: we allocate from the device
	 * with the smallest clock hand.
	 *
	 * When we allocate, we don't do a simple increment; we add the inverse
	 * of the device's free space. This results in round robin behavior that
	 * biases in favor of the device(s) with more free space.
	 */

	u64 *v = stripe->next_alloc + ca->dev_idx;
	u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
	u64 free_space_inv = free_space
		? div64_u64(1ULL << 48, free_space)
		: 1ULL << 48;
	u64 scale = *v / 4;
		? div64_u64(stripe_clock_hand_inv, free_space)
		: stripe_clock_hand_inv;

	if (*v + free_space_inv >= *v)
		*v += free_space_inv;
	else
		*v = U64_MAX;
	/* Saturating add, avoid overflow: */
	u64 sum = *v + free_space_inv;
	*v = sum >= *v ? sum : U64_MAX;

	for (v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
		*v = *v < scale ? 0 : *v - scale;
	if (unlikely(*v > stripe_clock_hand_rescale))
		bch2_stripe_state_rescale(stripe);
}

void bch2_dev_stripe_increment(struct bch_dev *ca,
@@ -1633,7 +1669,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	struct bch_dev_usage stats = bch2_dev_usage_read(ca);
	struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
	unsigned nr[BCH_DATA_NR];

	memset(nr, 0, sizeof(nr));
@@ -1656,7 +1692,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
	printbuf_tabstop_push(out, 16);

	prt_printf(out, "open buckets\t%i\r\n",	ca->nr_open_buckets);
	prt_printf(out, "buckets to invalidate\t%llu\r\n",	should_invalidate_buckets(ca, stats));
	prt_printf(out, "buckets to invalidate\t%llu\r\n",
		   should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
}

static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
Loading