Commit 31403dca authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS



 - Some tweaks to greatly reduce locking overhead for the list of btree
   transactions, so that it can always be enabled: leave btree_trans
   objects on the list when they're on the percpu single item freelist,
   and only check for duplicates in the same process when
   CONFIG_BCACHEFS_DEBUG is enabled

 - don't zero out the full btree_trans() unless we allocated it from
   the mempool

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent fea153a8
Loading
Loading
Loading
Loading
+0 −8
Original line number Diff line number Diff line
@@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL
	depends on BCACHEFS_FS
	select FS_POSIX_ACL

config BCACHEFS_DEBUG_TRANSACTIONS
	bool "bcachefs runtime info"
	depends on BCACHEFS_FS
	help
	This makes the list of running btree transactions available in debugfs.

	This is a highly useful debugging feature but does add a small amount of overhead.

config BCACHEFS_DEBUG
	bool "bcachefs debugging"
	depends on BCACHEFS_FS
+82 −84
Original line number Diff line number Diff line
@@ -2714,6 +2714,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)

void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
	struct bch_fs *c = trans->c;
	unsigned new_top = trans->mem_top + size;
	unsigned old_bytes = trans->mem_bytes;
	unsigned new_bytes = roundup_pow_of_two(new_top);
@@ -2721,17 +2722,19 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
	void *new_mem;
	void *p;

	trans->mem_max = max(trans->mem_max, new_top);

	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);

	struct btree_transaction_stats *s = btree_trans_stats(trans);
	if (s)
		s->max_mem = max(s->max_mem, new_bytes);

	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
	if (unlikely(!new_mem)) {
		bch2_trans_unlock(trans);

		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
			new_bytes = BTREE_TRANS_MEM_MAX;
			kfree(trans->mem);
		}
@@ -2751,7 +2754,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
	trans->mem_bytes = new_bytes;

	if (old_bytes) {
		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
	}

@@ -2860,25 +2863,6 @@ u32 bch2_trans_begin(struct btree_trans *trans)
	return trans->restart_count;
}

static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
{
	struct btree_trans *trans;

	if (IS_ENABLED(__KERNEL__)) {
		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
		if (trans)
			return trans;
	}

	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
	/*
	 * paths need to be zeroed, bch2_check_for_deadlock looks at
	 * paths in other threads
	 */
	memset(&trans->paths, 0, sizeof(trans->paths));
	return trans;
}

const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];

unsigned bch2_trans_get_fn_idx(const char *fn)
@@ -2900,22 +2884,55 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
	__acquires(&c->btree_trans_barrier)
{
	struct btree_trans *trans;
	struct btree_transaction_stats *s;

	trans = bch2_trans_alloc(c);
	if (IS_ENABLED(__KERNEL__)) {
		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
		if (trans) {
			memset(trans, 0, offsetof(struct btree_trans, updates));
			goto got_trans;
		}
	}

	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
	memset(trans, 0, sizeof(*trans));
	closure_init_stack(&trans->ref);

	seqmutex_lock(&c->btree_trans_lock);
	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
		struct btree_trans *pos;
		pid_t pid = current->pid;

		trans->locking_wait.task = current;

		list_for_each_entry(pos, &c->btree_trans_list, list) {
			struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
			/*
			 * We'd much prefer to be stricter here and completely
			 * disallow multiple btree_trans in the same thread -
			 * but the data move path calls bch2_write when we
			 * already have a btree_trans initialized.
			 */
			BUG_ON(pos_task &&
			       pid == pos_task->pid &&
			       bch2_trans_locked(pos));

			if (pos_task && pid < pos_task->pid) {
				list_add_tail(&trans->list, &pos->list);
				goto list_add_done;
			}
		}
	}
	list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
	seqmutex_unlock(&c->btree_trans_lock);
got_trans:
	trans->c		= c;
	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
		? bch2_btree_transaction_fns[fn_idx] : NULL;
	trans->last_begin_time	= local_clock();
	trans->fn_idx		= fn_idx;
	trans->locking_wait.task = current;
	trans->journal_replay_not_finished =
		unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
		atomic_inc_not_zero(&c->journal_keys.ref);
	closure_init_stack(&trans->ref);

	trans->paths_allocated	= trans->_paths_allocated;
	trans->sorted		= trans->_sorted;
	trans->paths		= trans->_paths;
@@ -2924,21 +2941,19 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)

	trans->paths_allocated[0] = 1;

	s = btree_trans_stats(trans);
	if (s && s->max_mem) {
	if (fn_idx < BCH_TRANSACTIONS_NR) {
		trans->fn = bch2_btree_transaction_fns[fn_idx];

		struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];

		if (s->max_mem) {
			unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);

			trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);

		if (!unlikely(trans->mem)) {
			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
		} else {
			if (likely(trans->mem))
				trans->mem_bytes = expected_mem_bytes;
		}
	}

	if (s) {
		trans->nr_paths_max = s->nr_max_paths;
		trans->journal_entries_size = s->journal_entries_size;
	}
@@ -2946,31 +2961,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
	trans->srcu_idx		= srcu_read_lock(&c->btree_trans_barrier);
	trans->srcu_lock_time	= jiffies;
	trans->srcu_held	= true;

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
		struct btree_trans *pos;

		seqmutex_lock(&c->btree_trans_lock);
		list_for_each_entry(pos, &c->btree_trans_list, list) {
			/*
			 * We'd much prefer to be stricter here and completely
			 * disallow multiple btree_trans in the same thread -
			 * but the data move path calls bch2_write when we
			 * already have a btree_trans initialized.
			 */
			BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
			       bch2_trans_locked(pos));

			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
				list_add_tail(&trans->list, &pos->list);
				goto list_add_done;
			}
		}
		list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
		seqmutex_unlock(&c->btree_trans_lock);
	}

	return trans;
}

@@ -3001,24 +2991,13 @@ void bch2_trans_put(struct btree_trans *trans)
	__releases(&c->btree_trans_barrier)
{
	struct bch_fs *c = trans->c;
	struct btree_transaction_stats *s = btree_trans_stats(trans);

	bch2_trans_unlock(trans);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
		seqmutex_lock(&c->btree_trans_lock);
		list_del(&trans->list);
		seqmutex_unlock(&c->btree_trans_lock);
	}

	closure_sync(&trans->ref);

	if (s)
		s->max_mem = max(s->max_mem, trans->mem_max);

	trans_for_each_update(trans, i)
		__btree_path_put(trans->paths + i->path, true);
	trans->nr_updates	= 0;
	trans->locking_wait.task = NULL;

	check_btree_paths_leaked(trans);

@@ -3047,9 +3026,17 @@ void bch2_trans_put(struct btree_trans *trans)
	/* Userspace doesn't have a real percpu implementation: */
	if (IS_ENABLED(__KERNEL__))
		trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
	if (trans)

	if (trans) {
		closure_sync(&trans->ref);

		seqmutex_lock(&c->btree_trans_lock);
		list_del(&trans->list);
		seqmutex_unlock(&c->btree_trans_lock);

		mempool_free(trans, &c->btree_trans_pool);
	}
}

static void __maybe_unused
bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
@@ -3146,15 +3133,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
	struct btree_trans *trans;
	int cpu;

	if (c->btree_trans_bufs)
		for_each_possible_cpu(cpu) {
			struct btree_trans *trans =
				per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;

			if (trans) {
				closure_sync(&trans->ref);

				seqmutex_lock(&c->btree_trans_lock);
				list_del(&trans->list);
				seqmutex_unlock(&c->btree_trans_lock);
			}
			kfree(trans);
		}
	free_percpu(c->btree_trans_bufs);

	trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
	if (trans)
		panic("%s leaked btree_trans\n", trans->fn);

	if (c->btree_trans_bufs)
		for_each_possible_cpu(cpu)
			kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
	free_percpu(c->btree_trans_bufs);

	for (s = c->btree_transaction_stats;
	     s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
	     s++) {
+2 −1
Original line number Diff line number Diff line
@@ -95,9 +95,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
	struct trans_waiting_for_lock *i;

	for (i = g->g; i != g->g + g->nr; i++) {
		struct task_struct *task = i->trans->locking_wait.task;
		if (i != g->g)
			prt_str(out, "<- ");
		prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
		prt_printf(out, "%u ", task ?task->pid : 0);
	}
	prt_newline(out);
}
+7 −4
Original line number Diff line number Diff line
@@ -386,7 +386,6 @@ struct btree_trans {

	void			*mem;
	unsigned		mem_top;
	unsigned		mem_max;
	unsigned		mem_bytes;

	btree_path_idx_t	nr_sorted;
@@ -413,8 +412,6 @@ struct btree_trans {
	unsigned long		srcu_lock_time;

	const char		*fn;
	struct closure		ref;
	struct list_head	list;
	struct btree_bkey_cached_common *locking;
	struct six_lock_waiter	locking_wait;
	int			srcu_idx;
@@ -424,7 +421,6 @@ struct btree_trans {
	u16			journal_entries_size;
	struct jset_entry	*journal_entries;

	struct btree_insert_entry updates[BTREE_ITER_MAX];
	struct btree_trans_commit_hook *hooks;
	struct journal_entry_pin *journal_pin;

@@ -435,6 +431,13 @@ struct btree_trans {
	unsigned		extra_disk_res; /* XXX kill */
	struct replicas_delta_list *fs_usage_deltas;

	/* Entries before this are zeroed out on every bch2_trans_get() call */

	struct btree_insert_entry updates[BTREE_ITER_MAX];

	struct list_head	list;
	struct closure		ref;

	unsigned long		_paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)];
	struct btree_trans_paths trans_paths;
	struct btree_path	_paths[BTREE_ITER_MAX];
+9 −9
Original line number Diff line number Diff line
@@ -592,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = {
	.read		= bch2_cached_btree_nodes_read,
};

#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
					    size_t size, loff_t *ppos)
{
@@ -608,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
restart:
	seqmutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		if (trans->locking_wait.task->pid <= i->iter)
		struct task_struct *task = READ_ONCE(trans->locking_wait.task);

		if (!task || task->pid <= i->iter)
			continue;

		closure_get(&trans->ref);
@@ -626,11 +627,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
		prt_printf(&i->buf, "backtrace:");
		prt_newline(&i->buf);
		printbuf_indent_add(&i->buf, 2);
		bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
		bch2_prt_task_backtrace(&i->buf, task);
		printbuf_indent_sub(&i->buf, 2);
		prt_newline(&i->buf);

		i->iter = trans->locking_wait.task->pid;
		i->iter = task->pid;

		closure_put(&trans->ref);

@@ -654,7 +655,6 @@ static const struct file_operations btree_transactions_ops = {
	.release	= bch2_dump_release,
	.read		= bch2_btree_transactions_read,
};
#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */

static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
				      size_t size, loff_t *ppos)
@@ -811,7 +811,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
restart:
	seqmutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		if (trans->locking_wait.task->pid <= i->iter)
		struct task_struct *task = READ_ONCE(trans->locking_wait.task);

		if (!task || task->pid <= i->iter)
			continue;

		closure_get(&trans->ref);
@@ -826,7 +828,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,

		bch2_check_for_deadlock(trans, &i->buf);

		i->iter = trans->locking_wait.task->pid;
		i->iter = task->pid;

		closure_put(&trans->ref);

@@ -873,10 +875,8 @@ void bch2_fs_debug_init(struct bch_fs *c)
	debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
			    c->btree_debug, &cached_btree_nodes_ops);

#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
	debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
			    c->btree_debug, &btree_transactions_ops);
#endif

	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
			    c->btree_debug, &journal_pins_ops);