Commit ca1e02f7 authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: Etyzinger cleanups



Pull out eytzinger.c and kill eytzinger_cmp_fn. We now provide
eytzinger0_sort and eytzinger0_sort_r, which use the standard cmp_func_t
and cmp_r_func_t callbacks.

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent bdbf953b
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@ bcachefs-y := \
	error.o			\
	extents.o		\
	extent_update.o		\
	eytzinger.o		\
	fs.o			\
	fs-common.o		\
	fs-ioctl.o		\
+234 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

#include "eytzinger.h"

/**
 * is_aligned - is this pointer & size okay for word-wide copying?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8)
 *
 * Returns true if elements can be copied using word loads and stores.
 * The size must be a multiple of the alignment, and the base address must
 * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
 *
 * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
 * to "if ((a | b) & mask)", so we do that by hand.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;

	(void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	lsbits |= (unsigned char)(uintptr_t)base;
#endif
	return (lsbits & (align - 1)) == 0;
}

/**
 * swap_words_32 - swap two elements in 32-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 4)
 *
 * Exchange the two objects in memory.  This exploits base+index addressing,
 * which basically all CPUs have, to minimize loop overhead computations.
 *
 * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
 * bottom of the loop, even though the zero flag is still valid from the
 * subtract (since the intervening mov instructions don't alter the flags).
 * Gcc 8.1.0 doesn't have that problem.
 */
static void swap_words_32(void *a, void *b, size_t n)
{
	do {
		u32 t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;
	} while (n);
}

/**
 * swap_words_64 - swap two elements in 64-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 8)
 *
 * Exchange the two objects in memory.  This exploits base+index
 * addressing, which basically all CPUs have, to minimize loop overhead
 * computations.
 *
 * We'd like to use 64-bit loads if possible.  If they're not, emulating
 * one requires base+index+4 addressing which x86 has but most other
 * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
 * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
 * x32 ABI).  Are there any cases the kernel needs to worry about?
 */
static void swap_words_64(void *a, void *b, size_t n)
{
	do {
#ifdef CONFIG_64BIT
		u64 t = *(u64 *)(a + (n -= 8));
		*(u64 *)(a + n) = *(u64 *)(b + n);
		*(u64 *)(b + n) = t;
#else
		/* Use two 32-bit transfers to avoid base+index+4 addressing */
		u32 t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;

		t = *(u32 *)(a + (n -= 4));
		*(u32 *)(a + n) = *(u32 *)(b + n);
		*(u32 *)(b + n) = t;
#endif
	} while (n);
}

/**
 * swap_bytes - swap two elements a byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size
 *
 * This is the fallback if alignment doesn't allow using larger chunks.
 */
static void swap_bytes(void *a, void *b, size_t n)
{
	do {
		char t = ((char *)a)[--n];
		((char *)a)[n] = ((char *)b)[n];
		((char *)b)[n] = t;
	} while (n);
}

/*
 * The values are arbitrary as long as they can't be confused with
 * a pointer, but small integers make for the smallest compare
 * instructions.
 */
#define SWAP_WORDS_64 (swap_r_func_t)0
#define SWAP_WORDS_32 (swap_r_func_t)1
#define SWAP_BYTES    (swap_r_func_t)2
#define SWAP_WRAPPER  (swap_r_func_t)3

struct wrapper {
	cmp_func_t cmp;
	swap_func_t swap;
};

/*
 * The function pointer is last to make tail calls most efficient if the
 * compiler decides not to inline this function.
 */
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
	if (swap_func == SWAP_WRAPPER) {
		((const struct wrapper *)priv)->swap(a, b, (int)size);
		return;
	}

	if (swap_func == SWAP_WORDS_64)
		swap_words_64(a, b, size);
	else if (swap_func == SWAP_WORDS_32)
		swap_words_32(a, b, size);
	else if (swap_func == SWAP_BYTES)
		swap_bytes(a, b, size);
	else
		swap_func(a, b, (int)size, priv);
}

#define _CMP_WRAPPER ((cmp_r_func_t)0L)

static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
{
	if (cmp == _CMP_WRAPPER)
		return ((const struct wrapper *)priv)->cmp(a, b);
	return cmp(a, b, priv);
}

static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
			 cmp_r_func_t cmp_func, const void *priv,
			 size_t l, size_t r)
{
	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
		      base + inorder_to_eytzinger0(r, n) * size,
		      cmp_func, priv);
}

static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
			   swap_r_func_t swap_func, const void *priv,
			   size_t l, size_t r)
{
	do_swap(base + inorder_to_eytzinger0(l, n) * size,
		base + inorder_to_eytzinger0(r, n) * size,
		size, swap_func, priv);
}

void eytzinger0_sort_r(void *base, size_t n, size_t size,
		       cmp_r_func_t cmp_func,
		       swap_r_func_t swap_func,
		       const void *priv)
{
	int i, c, r;

	/* called from 'sort' without swap function, let's pick the default */
	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
		swap_func = NULL;

	if (!swap_func) {
		if (is_aligned(base, size, 8))
			swap_func = SWAP_WORDS_64;
		else if (is_aligned(base, size, 4))
			swap_func = SWAP_WORDS_32;
		else
			swap_func = SWAP_BYTES;
	}

	/* heapify */
	for (i = n / 2 - 1; i >= 0; --i) {
		for (r = i; r * 2 + 1 < n; r = c) {
			c = r * 2 + 1;

			if (c + 1 < n &&
			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
				c++;

			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
				break;

			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
		}
	}

	/* sort */
	for (i = n - 1; i > 0; --i) {
		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);

		for (r = 0; r * 2 + 1 < i; r = c) {
			c = r * 2 + 1;

			if (c + 1 < i &&
			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
				c++;

			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
				break;

			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
		}
	}
}

void eytzinger0_sort(void *base, size_t n, size_t size,
		     cmp_func_t cmp_func,
		     swap_func_t swap_func)
{
	struct wrapper w = {
		.cmp  = cmp_func,
		.swap = swap_func,
	};

	return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
+37 −26
Original line number Diff line number Diff line
@@ -5,23 +5,33 @@
#include <linux/bitops.h>
#include <linux/log2.h>

#include "util.h"
#ifdef EYTZINGER_DEBUG
#define EYTZINGER_BUG_ON(cond)		BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
#endif

/*
 * Traversal for trees in eytzinger layout - a full binary tree layed out in an
 * array
 */

/*
 * One based indexing version:
 * array.
 *
 * With one based indexing each level of the tree starts at a power of two -
 * good for cacheline alignment:
 * Consider using an eytzinger tree any time you would otherwise be doing binary
 * search over an array. Binary search is a worst case scenario for branch
 * prediction and prefetching, but in an eytzinger tree every node's children
 * are adjacent in memory, thus we can prefetch children before knowing the
 * result of the comparison, assuming multiple nodes fit on a cacheline.
 *
 * Two variants are provided, for one based indexing and zero based indexing.
 *
 * Zero based indexing is more convenient, but one based indexing has better
 * alignment and thus better performance because each new level of the tree
 * starts at a power of two, and thus if element 0 was cacheline aligned, each
 * new level will be as well.
 */

static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
	EBUG_ON(child > 1);
	EYTZINGER_BUG_ON(child > 1);

	return (i << 1) + child;
}
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)

static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
	EBUG_ON(i > size);
	EYTZINGER_BUG_ON(i > size);

	if (eytzinger1_right_child(i) <= size) {
		i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)

static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
	EBUG_ON(i > size);
	EYTZINGER_BUG_ON(i > size);

	if (eytzinger1_left_child(i) <= size) {
		i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
	unsigned shift = __fls(size) - b;
	int s;

	EBUG_ON(!i || i > size);
	EYTZINGER_BUG_ON(!i || i > size);

	i  ^= 1U << b;
	i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
	unsigned shift;
	int s;

	EBUG_ON(!i || i > size);
	EYTZINGER_BUG_ON(!i || i > size);

	/*
	 * sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)

static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
	EBUG_ON(child > 1);
	EYTZINGER_BUG_ON(child > 1);

	return (i << 1) + 1 + child;
}
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
	     (_i) != -1;				\
	     (_i) = eytzinger0_next((_i), (_size)))

typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);

/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
					 eytzinger_cmp_fn cmp, const void *search)
					 cmp_func_t cmp, const void *search)
{
	unsigned i, n = 0;

@@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,

	do {
		i = n;
		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
		n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
	} while (n < nr);

	if (n & 1) {
		/* @i was greater than @search, return previous node: */

		if (i == eytzinger0_first(nr))
			return -1;

		return eytzinger0_prev(i, nr);
	} else {
		return i;
	}
}

static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size,
					 cmp_func_t cmp, const void *search)
{
	ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
	return eytzinger0_next(idx, size);
}

#define eytzinger0_find(base, nr, size, _cmp, search)			\
({									\
	void *_base		= (base);				\
@@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
	int _res;							\
									\
	while (_i < _nr &&						\
	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
	       (_res = _cmp(_search, _base + _i * _size)))		\
		_i = eytzinger0_child(_i, _res > 0);			\
	_i;								\
})

void eytzinger0_sort(void *, size_t, size_t,
		    int (*cmp_func)(const void *, const void *, size_t),
		    void (*swap_func)(void *, void *, size_t));
void eytzinger0_sort_r(void *, size_t, size_t,
		       cmp_r_func_t, swap_r_func_t, const void *);
void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);

#endif /* _EYTZINGER_H */
+1 −2
Original line number Diff line number Diff line
@@ -95,8 +95,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
	return ret ?: bch2_blacklist_table_initialize(c);
}

static int journal_seq_blacklist_table_cmp(const void *_l,
					   const void *_r, size_t size)
static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
	const struct journal_seq_blacklist_table_entry *l = _l;
	const struct journal_seq_blacklist_table_entry *r = _r;
+12 −7
Original line number Diff line number Diff line
@@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, size_t size)
static int bch2_memcmp(const void *l, const void *r,  const void *priv)
{
	size_t size = (size_t) priv;
	return memcmp(l, r, size);
}

@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,

	verify_replicas_entry(search);

#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
	unsigned i;

	sort_cmp_size(cpu_r->entries,
	sort_r(cpu_r->entries,
	       cpu_r->nr,
	       cpu_r->entry_size,
		      bch2_memcmp, NULL);
	       bch2_memcmp, NULL,
	       (void *)(size_t)cpu_r->entry_size);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
Loading