Commit b3f391fd authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'bcachefs-2024-09-21' of git://evilpiepirate.org/bcachefs

Pull bcachefs updates from Kent Overstreet:

 - rcu_pending, btree key cache rework: this solves lock contenting in
   the key cache, eliminating the biggest source of the srcu lock hold
   time warnings, and drastically improving performance on some metadata
   heavy workloads - on multithreaded creates we're now 3-4x faster than
   xfs.

 - We're now using an rhashtable instead of the system inode hash table;
   this is another significant performance improvement on multithreaded
   metadata workloads, eliminating more lock contention.

 - for_each_btree_key_in_subvolume_upto(): new helper for iterating over
   keys within a specific subvolume, eliminating a lot of open coded
   "subvolume_get_snapshot()" and also fixing another source of srcu
   lock time warnings, by running each loop iteration in its own
   transaction (as the existing for_each_btree_key() does).

 - More work on btree_trans locking asserts; we now assert that we don't
   hold btree node locks when trans->locked is false, which is important
   because we don't use lockdep for tracking individual btree node
   locks.

 - Some cleanups and improvements in the bset.c btree node lookup code,
   from Alan.

 - Rework of btree node pinning, which we use in backpointers fsck. The
   old hacky implementation, where the shrinker just skipped over nodes
   in the pinned range, was causing OOMs; instead we now use another
   shrinker with a much higher seeks number for pinned nodes.

 - Rebalance now uses BCH_WRITE_ONLY_SPECIFIED_DEVS; this fixes an issue
   where rebalance would sometimes fall back to allocating from the full
   filesystem, which is not what we want when it's trying to move data
   to a specific target.

 - Use __GFP_ACCOUNT, GFP_RECLAIMABLE for btree node, key cache
   allocations.

 - Idmap mounts are now supported (Hongbo Li)

 - Rename whiteouts are now supported (Hongbo Li)

 - Erasure coding can now handle devices being marked as failed, or
   forcibly removed. We still need the evacuate path for erasure coding,
   but it's getting very close to ready for people to start using.

* tag 'bcachefs-2024-09-21' of git://evilpiepirate.org/bcachefs: (99 commits)
  bcachefs: return err ptr instead of null in read sb clean
  bcachefs: Remove duplicated include in backpointers.c
  bcachefs: Don't drop devices with stripe pointers
  bcachefs: bch2_ec_stripe_head_get() now checks for change in rw devices
  bcachefs: bch_fs.rw_devs_change_count
  bcachefs: bch2_dev_remove_stripes()
  bcachefs: bch2_trigger_ptr() calculates sectors even when no device
  bcachefs: improve error messages in bch2_ec_read_extent()
  bcachefs: improve error message on too few devices for ec
  bcachefs: improve bch2_new_stripe_to_text()
  bcachefs: ec_stripe_head.nr_created
  bcachefs: bch_stripe.disk_label
  bcachefs: stripe_to_mem()
  bcachefs: EIO errcode cleanup
  bcachefs: Rework btree node pinning
  bcachefs: split up btree cache counters for live, freeable
  bcachefs: btree cache counters should be size_t
  bcachefs: Don't count "skipped access bit" as touched in btree cache scan
  bcachefs: Failed devices no longer require mounting in degraded mode
  bcachefs: bch2_dev_rcu_noerror()
  ...
parents f8ffbc36 025c55a4
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -175,7 +175,7 @@ errors in our thinking by running our code and seeing what happens. If your
time is being wasted because your tools are bad or too slow - don't accept it,
fix it.

Put effort into your documentation, commmit messages, and code comments - but
Put effort into your documentation, commit messages, and code comments - but
don't go overboard. A good commit message is wonderful - but if the information
was important enough to go in a commit message, ask yourself if it would be
even better as a code comment.
+7 −0
Original line number Diff line number Diff line
@@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
	is held by another thread, spin for a short while, as long as the
	thread owning the lock is running.

config BCACHEFS_PATH_TRACEPOINTS
	bool "Extra btree_path tracepoints"
	depends on BCACHEFS_FS
	help
	Enable extra tracepoints for debugging btree_path operations; we don't
	normally want these enabled because they happen at very high rates.

config MEAN_AND_VARIANCE_UNIT_TEST
	tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
	depends on KUNIT
+1 −0
Original line number Diff line number Diff line
@@ -69,6 +69,7 @@ bcachefs-y := \
	printbuf.o		\
	quota.o			\
	rebalance.o		\
	rcu_pending.o		\
	recovery.o		\
	recovery_passes.o	\
	reflink.o		\
+1 −1
Original line number Diff line number Diff line
@@ -361,7 +361,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
	bch2_trans_begin(trans);
	acl = _acl;

	ret   = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
	ret   = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
		bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
			      BTREE_ITER_intent);
	if (ret)
+40 −5
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/jiffies.h>

static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);

@@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
	while (1) {
		if (last_updated + HZ * 10 < jiffies) {
		if (time_after(jiffies, last_updated + HZ * 10)) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
@@ -2297,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c)
	return 0;
}

/* device removal */

int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	struct bpos start	= POS(ca->dev_idx, 0);
	struct bpos end		= POS(ca->dev_idx, U64_MAX);
	int ret;

	/*
	 * We clear the LRU and need_discard btrees first so that we don't race
	 * with bch2_do_invalidates() and bch2_do_discards()
	 */
	ret =   bch2_dev_remove_stripes(c, ca->dev_idx) ?:
		bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
					BTREE_TRIGGER_norun, NULL) ?:
		bch2_dev_usage_remove(c, ca->dev_idx);
	bch_err_msg(ca, ret, "removing dev alloc info");
	return ret;
}

/* Bucket IO clocks: */

int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@@ -2432,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;
	lockdep_assert_held(&c->state_lock);

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	c->rw_devs_change_count++;

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
@@ -2467,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;
	lockdep_assert_held(&c->state_lock);

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);

	c->rw_devs_change_count++;
}

void bch2_dev_allocator_background_exit(struct bch_dev *ca)
Loading