Commit d9e894d9 authored by Matthew Sakai, committed by Mike Snitzer
Browse files

dm vdo: add specialized request queueing functionality



This patch adds funnel_queue, a mostly lock-free multi-producer,
single-consumer queue. It also adds the request queue used by the dm-vdo
deduplication index, and the work_queue used by the dm-vdo data store. Both
of these are built on top of funnel queue and are intended to support the
dispatching of many short-running tasks. The work_queue also supports
priorities. Finally, this patch adds vdo_completion, the structure which is
enqueued on work_queues.

Co-developed-by: J. corwin Coburn <corwin@hurlbutnet.net>
Signed-off-by: J. corwin Coburn <corwin@hurlbutnet.net>
Co-developed-by: Michael Sclafani <dm-devel@lists.linux.dev>
Signed-off-by: Michael Sclafani <dm-devel@lists.linux.dev>
Co-developed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Signed-off-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
Co-developed-by: Ken Raeburn <raeburn@redhat.com>
Signed-off-by: Ken Raeburn <raeburn@redhat.com>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
parent 89f9b701
Loading
Loading
Loading
Loading
+140 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "completion.h"

#include <linux/kernel.h>

#include "logger.h"
#include "permassert.h"

#include "status-codes.h"
#include "types.h"
#include "vio.h"
#include "vdo.h"

/**
 * DOC: vdo completions.
 *
 * Most of vdo's data structures are lock free, each either belonging to a single "zone," or
 * divided into a number of zones whose accesses to the structure do not overlap. During normal
 * operation, at most one thread will be operating in any given zone. Each zone has a
 * vdo_work_queue which holds vdo_completions that are to be run in that zone. A completion may
 * only be enqueued on one queue or operating in a single zone at a time.
 *
 * At each step of a multi-threaded operation, the completion performing the operation is given a
 * callback, error handler, and thread id for the next step. A completion is "run" when it is
 * operating on the correct thread (as specified by its callback_thread_id). If the value of its
 * "result" field is an error (i.e. not VDO_SUCCESS), the function in its "error_handler" will be
 * invoked. If the error_handler is NULL, or there is no error, the function set as its "callback"
 * will be invoked. Generally, a completion will not be run directly, but rather will be
 * "launched." In this case, it will check whether it is operating on the correct thread. If it is,
 * it will run immediately. Otherwise, it will be enqueued on the vdo_work_queue associated with the
 * completion's "callback_thread_id". When it is dequeued, it will be on the correct thread, and
 * will get run. In some cases, the completion should get queued instead of running immediately,
 * even if it is being launched from the correct thread. This is usually in cases where there is a
 * long chain of callbacks, all on the same thread, which could overflow the stack. In such cases,
 * the completion's "requeue" field should be set to true. Doing so will skip the current thread
 * check and simply enqueue the completion.
 *
 * A completion may be "finished," in which case its "complete" field will be set to true before it
 * is next run. It is a bug to attempt to set the result or re-finish a finished completion.
 * Because a completion's fields are not safe to examine from any thread other than the one on
 * which the completion is currently operating, this field is used only to aid in detecting
 * programming errors. It can not be used for cross-thread checking on the status of an operation.
 * A completion must be "reset" before it can be reused after it has been finished. Resetting will
 * also clear any error from the result field.
 **/

/**
 * vdo_initialize_completion() - Zero a completion and set it up for first use.
 * @completion: The completion to initialize.
 * @vdo: The vdo to record in the completion.
 * @type: The type to record in the completion.
 *
 * Every field is cleared before the vdo and type are stored, and the completion is then reset.
 */
void vdo_initialize_completion(struct vdo_completion *completion,
			       struct vdo *vdo,
			       enum vdo_completion_type type)
{
	memset(completion, 0, sizeof(struct vdo_completion));
	completion->type = type;
	completion->vdo = vdo;
	vdo_reset_completion(completion);
}

/*
 * Check that a completion has not already been marked complete. A violation is reported through
 * ASSERT_LOG_ONLY(); this function itself returns normally either way.
 */
static inline void assert_incomplete(struct vdo_completion *completion)
{
	ASSERT_LOG_ONLY(!completion->complete, "completion is not complete");
}

/**
 * vdo_set_completion_result() - Record a result in a completion.
 * @completion: The completion to update.
 * @result: The result to store.
 *
 * The stored result is only replaced while it is still VDO_SUCCESS, so an error which has already
 * been recorded is never overwritten. The completion is asserted to be unfinished.
 */
void vdo_set_completion_result(struct vdo_completion *completion, int result)
{
	assert_incomplete(completion);

	/* Keep the first error; later results must not mask it. */
	if (completion->result != VDO_SUCCESS)
		return;

	completion->result = result;
}

/**
 * vdo_launch_completion_with_priority() - Run a completion now, or hand it to a work queue.
 * @completion: The completion to launch.
 * @priority: The priority at which to enqueue the completion.
 *
 * The completion is run directly only when it is not marked for requeue and the current thread is
 * the one named by its callback_thread_id field. In every other case it is enqueued at the given
 * priority.
 */
void vdo_launch_completion_with_priority(struct vdo_completion *completion,
					 enum vdo_completion_priority priority)
{
	/* The thread check is skipped entirely when a requeue was requested. */
	if (!completion->requeue &&
	    (vdo_get_callback_thread_id() == completion->callback_thread_id)) {
		vdo_run_completion(completion);
		return;
	}

	vdo_enqueue_completion(completion, priority);
}

/**
 * vdo_finish_completion() - Mark a completion as complete and then launch it.
 * @completion: The completion to finish.
 *
 * The completion is asserted to be unfinished before it is marked. A completion without a
 * callback is only marked complete; nothing is launched for it.
 */
void vdo_finish_completion(struct vdo_completion *completion)
{
	assert_incomplete(completion);
	completion->complete = true;

	if (completion->callback == NULL)
		return;

	vdo_launch_completion(completion);
}

void vdo_enqueue_completion(struct vdo_completion *completion,
			    enum vdo_completion_priority priority)
{
	struct vdo *vdo = completion->vdo;
	thread_id_t thread_id = completion->callback_thread_id;

	if (ASSERT(thread_id < vdo->thread_config.thread_count,
		   "thread_id %u (completion type %d) is less than thread count %u",
		   thread_id, completion->type,
		   vdo->thread_config.thread_count) != UDS_SUCCESS)
		BUG();

	completion->requeue = false;
	completion->priority = priority;
	completion->my_queue = NULL;
	vdo_enqueue_work_queue(vdo->threads[thread_id].queue, completion);
}

/**
 * vdo_requeue_completion_if_needed() - Move a completion to another thread unless already there.
 * @completion: The completion to requeue.
 * @callback_thread_id: The thread on which the completion is required to run.
 *
 * When the current thread is not the requested one, the completion's callback_thread_id is
 * updated and the completion is enqueued at the default priority.
 *
 * Return: True if the completion was requeued; callers may not access the completion in this case.
 */
bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
				      thread_id_t callback_thread_id)
{
	bool wrong_thread = (vdo_get_callback_thread_id() != callback_thread_id);

	if (wrong_thread) {
		completion->callback_thread_id = callback_thread_id;
		vdo_enqueue_completion(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
	}

	return wrong_thread;
}
+152 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_COMPLETION_H
#define VDO_COMPLETION_H

#include "permassert.h"

#include "status-codes.h"
#include "types.h"

/**
 * vdo_run_completion() - Invoke a completion's callback or error handler on the current thread.
 * @completion: The completion to run.
 *
 * The error handler is chosen only when the result is not VDO_SUCCESS and an error handler has
 * been set; otherwise the callback is invoked.
 *
 * Context: This function must be called from the correct callback thread.
 */
static inline void vdo_run_completion(struct vdo_completion *completion)
{
	if ((completion->result == VDO_SUCCESS) || (completion->error_handler == NULL)) {
		completion->callback(completion);
		return;
	}

	completion->error_handler(completion);
}

void vdo_set_completion_result(struct vdo_completion *completion, int result);

void vdo_initialize_completion(struct vdo_completion *completion, struct vdo *vdo,
			       enum vdo_completion_type type);

/**
 * vdo_reset_completion() - Return a completion to a clean state so it can be used again.
 * @completion: The completion to reset.
 *
 * Only the result and the complete flag are touched; the type, vdo, and parent information are
 * all kept.
 */
static inline void vdo_reset_completion(struct vdo_completion *completion)
{
	completion->complete = false;
	completion->result = VDO_SUCCESS;
}

void vdo_launch_completion_with_priority(struct vdo_completion *completion,
					 enum vdo_completion_priority priority);

/**
 * vdo_launch_completion() - Launch a completion with default priority.
 * @completion: The completion to launch.
 *
 * Equivalent to calling vdo_launch_completion_with_priority() with VDO_WORK_Q_DEFAULT_PRIORITY.
 */
static inline void vdo_launch_completion(struct vdo_completion *completion)
{
	vdo_launch_completion_with_priority(completion, VDO_WORK_Q_DEFAULT_PRIORITY);
}

/**
 * vdo_continue_completion() - Continue processing a completion.
 * @completion: The completion to continue.
 * @result: The current result (will not mask older errors).
 *
 * Continue processing a completion by setting the current result and calling
 * vdo_launch_completion(). Unlike vdo_finish_completion(), this does not mark the completion as
 * complete.
 */
static inline void vdo_continue_completion(struct vdo_completion *completion, int result)
{
	vdo_set_completion_result(completion, result);
	vdo_launch_completion(completion);
}

void vdo_finish_completion(struct vdo_completion *completion);

/**
 * vdo_fail_completion() - Set the result of a completion if it does not already have an error,
 *                         then finish it.
 * @completion: The completion to fail.
 * @result: The result to record; it is ignored if the completion already holds an error.
 */
static inline void vdo_fail_completion(struct vdo_completion *completion, int result)
{
	vdo_set_completion_result(completion, result);
	vdo_finish_completion(completion);
}

/**
 * vdo_assert_completion_type() - Assert that a completion is of the correct type.
 * @completion: The completion to check.
 * @expected: The type the completion is required to have.
 *
 * Return: VDO_SUCCESS or an error
 */
static inline int vdo_assert_completion_type(struct vdo_completion *completion,
					     enum vdo_completion_type expected)
{
	return ASSERT(expected == completion->type,
		      "completion type should be %u, not %u", expected,
		      completion->type);
}

/**
 * vdo_set_completion_callback() - Set the callback of a completion and the thread it runs on.
 * @completion: The completion to update.
 * @callback: The function to store as the completion's callback.
 * @callback_thread_id: The id of the thread on which the callback is to be run.
 *
 * Nothing is launched or enqueued; only the two fields are stored.
 */
static inline void vdo_set_completion_callback(struct vdo_completion *completion,
					       vdo_action_fn callback,
					       thread_id_t callback_thread_id)
{
	completion->callback_thread_id = callback_thread_id;
	completion->callback = callback;
}

/**
 * vdo_launch_completion_callback() - Set the callback for a completion and launch it immediately.
 * @completion: The completion to launch.
 * @callback: The callback to set before launching.
 * @callback_thread_id: The thread on which the callback is to be run.
 *
 * The completion is launched at the default priority via vdo_launch_completion().
 */
static inline void vdo_launch_completion_callback(struct vdo_completion *completion,
						  vdo_action_fn callback,
						  thread_id_t callback_thread_id)
{
	vdo_set_completion_callback(completion, callback, callback_thread_id);
	vdo_launch_completion(completion);
}

/**
 * vdo_prepare_completion() - Get a completion ready to be launched.
 * @completion: The completion to prepare.
 * @callback: The callback to store.
 * @error_handler: The error handler to store.
 * @callback_thread_id: The thread on which the callback is to be run.
 * @parent: The parent to store.
 *
 * The completion is reset first, and then its parent, error handler, callback, and callback
 * thread are all replaced. The completion is not launched.
 */
static inline void vdo_prepare_completion(struct vdo_completion *completion,
					  vdo_action_fn callback,
					  vdo_action_fn error_handler,
					  thread_id_t callback_thread_id, void *parent)
{
	vdo_reset_completion(completion);
	completion->parent = parent;
	completion->error_handler = error_handler;
	vdo_set_completion_callback(completion, callback, callback_thread_id);
}

/**
 * vdo_prepare_completion_for_requeue() - Get a completion ready to be launched, and mark it so
 *                                        that launching it will enqueue it.
 * @completion: The completion to prepare.
 * @callback: The callback to store.
 * @error_handler: The error handler to store.
 * @callback_thread_id: The thread on which the callback is to be run.
 * @parent: The parent to store.
 *
 * This does everything vdo_prepare_completion() does and also sets the completion's requeue
 * field to true.
 */
static inline void vdo_prepare_completion_for_requeue(struct vdo_completion *completion,
						      vdo_action_fn callback,
						      vdo_action_fn error_handler,
						      thread_id_t callback_thread_id,
						      void *parent)
{
	/* Preparation does not touch the requeue field, so it may be set up front. */
	completion->requeue = true;
	vdo_prepare_completion(completion, callback, error_handler,
			       callback_thread_id, parent);
}

void vdo_enqueue_completion(struct vdo_completion *completion,
			    enum vdo_completion_priority priority);


bool vdo_requeue_completion_if_needed(struct vdo_completion *completion,
				      thread_id_t callback_thread_id);

#endif /* VDO_COMPLETION_H */
+59 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef UDS_CPU_H
#define UDS_CPU_H

#include <linux/cache.h>

/**
 * uds_prefetch_address() - Hint that an address is about to be accessed, so that its data may be
 *                          moved into a CPU cache ahead of time to reduce cache-miss latency.
 *
 * @address: the address to fetch (may be invalid)
 * @for_write: must be constant at compile time--false if for reading, true if for writing
 *
 * No prefetch is issued at all when the compiler cannot see for_write as a constant.
 */
static inline void uds_prefetch_address(const void *address, bool for_write)
{
	/*
	 * Without optimization for_write is not seen as a constant, and in that case prefetching
	 * is not worth worrying about, so just skip it.
	 */
	if (!__builtin_constant_p(for_write))
		return;

	/*
	 * The second argument to the builtin must be a literal constant. clang (llvm 12 at least)
	 * will not accept for_write there even when it is constant, hence the explicit branches.
	 */
	if (for_write)
		__builtin_prefetch(address, true);
	else
		__builtin_prefetch(address, false);
}

/**
 * uds_prefetch_range() - Hint that a range of addresses is about to be accessed, so that it may
 *                        be moved into a CPU cache ahead of time to reduce cache-miss latency.
 *
 * @start: the starting address to fetch (may be invalid)
 * @size: the number of bytes in the address range
 * @for_write: must be constant at compile time--false if for reading, true if for writing
 *
 * One prefetch is issued per L1_CACHE_BYTES step, beginning at @start.
 */
static inline void uds_prefetch_range(const void *start, unsigned int size,
				      bool for_write)
{
	const char *line = (const char *) start;
	/* How far into its cache line the range begins. */
	unsigned int misalignment = ((uintptr_t) line % L1_CACHE_BYTES);
	/*
	 * The number of lines to fetch; the extra one allows for the range crossing one more
	 * cache line boundary because of its alignment.
	 */
	unsigned int remaining = (1 + ((size + misalignment) / L1_CACHE_BYTES));

	for (; remaining > 0; remaining--, line += L1_CACHE_BYTES)
		uds_prefetch_address(line, for_write);
}

#endif /* UDS_CPU_H */
+171 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "funnel-queue.h"

#include "cpu.h"
#include "memory-alloc.h"
#include "permassert.h"
#include "uds.h"

/*
 * Allocate and initialize an empty funnel queue.
 *
 * On success the new queue is stored in *queue_ptr and UDS_SUCCESS is returned. If the allocation
 * fails, its error code is returned and *queue_ptr is left untouched.
 */
int uds_make_funnel_queue(struct funnel_queue **queue_ptr)
{
	struct funnel_queue *new_queue;
	int status = uds_allocate(1, struct funnel_queue, "funnel queue", &new_queue);

	if (status != UDS_SUCCESS)
		return status;

	/*
	 * An empty queue holds only the stub entry. Starting both ends at the stub is what makes
	 * it true that neither end of the queue is ever NULL.
	 */
	new_queue->oldest = &new_queue->stub;
	new_queue->newest = &new_queue->stub;
	new_queue->stub.next = NULL;

	*queue_ptr = new_queue;
	return UDS_SUCCESS;
}

/*
 * Free a funnel queue. Only the queue structure itself is released; entries are allocated and
 * freed by the callers who enqueue them.
 */
void uds_free_funnel_queue(struct funnel_queue *queue)
{
	uds_free(queue);
}

/*
 * Find the oldest non-stub entry which can be retrieved, without removing it from the queue.
 *
 * Returns NULL when the queue is empty, and also when a producer is part way through adding an
 * entry so that the list cannot yet be followed. Note that this is not a pure query: it may move
 * queue->oldest past the stub and may put the stub back on the queue, both of which are consumer
 * side operations.
 */
static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue)
{
	/*
	 * Barrier requirements: We need a read barrier between reading a "next" field pointer
	 * value and reading anything it points to. There's an accompanying barrier in
	 * uds_funnel_queue_put() between its caller setting up the entry and making it visible.
	 */
	struct funnel_queue_entry *oldest = queue->oldest;
	struct funnel_queue_entry *next = READ_ONCE(oldest->next);

	if (oldest == &queue->stub) {
		/*
		 * When the oldest entry is the stub and it has no successor, the queue is
		 * logically empty.
		 */
		if (next == NULL)
			return NULL;
		/*
		 * The stub entry has a successor, so the stub can be dequeued and ignored without
		 * breaking the queue invariants.
		 */
		oldest = next;
		queue->oldest = oldest;
		next = READ_ONCE(oldest->next);
	}

	/*
	 * We have a non-stub candidate to dequeue. If it lacks a successor, we'll need to put the
	 * stub entry back on the queue first.
	 */
	if (next == NULL) {
		struct funnel_queue_entry *newest = READ_ONCE(queue->newest);

		if (oldest != newest) {
			/*
			 * Another thread has already swung queue->newest atomically, but not yet
			 * assigned previous->next. The queue is really still empty.
			 */
			return NULL;
		}

		/*
		 * Put the stub entry back on the queue, ensuring a successor will eventually be
		 * seen.
		 */
		uds_funnel_queue_put(queue, &queue->stub);

		/* Check again for a successor. */
		next = READ_ONCE(oldest->next);
		if (next == NULL) {
			/*
			 * We lost a race with a producer who swapped queue->newest before we did,
			 * but who hasn't yet updated previous->next. Try again later.
			 */
			return NULL;
		}
	}

	/* The candidate now has a successor, so removing it cannot leave oldest NULL. */
	return oldest;
}

/*
 * Poll a queue, removing the oldest entry if the queue is not empty. This function must only be
 * called from a single consumer thread.
 *
 * Returns the removed entry, with its next pointer cleared, or NULL if no entry can be retrieved
 * at the moment.
 */
struct funnel_queue_entry *uds_funnel_queue_poll(struct funnel_queue *queue)
{
	struct funnel_queue_entry *oldest = get_oldest(queue);

	if (oldest == NULL)
		return oldest;

	/*
	 * Dequeue the oldest entry and return it. Only one consumer thread may call this function,
	 * so no locking, atomic operations, or fences are needed; queue->oldest is owned by the
	 * consumer and oldest->next is never used by a producer thread after it is swung from NULL
	 * to non-NULL.
	 */
	queue->oldest = READ_ONCE(oldest->next);
	/*
	 * Make sure the caller sees the proper stored data for this entry. Since we've already
	 * fetched the entry pointer we stored in "queue->oldest", this also ensures that on entry
	 * to the next call we'll properly see the dependent data.
	 */
	smp_rmb();
	/*
	 * If "oldest" is a very light-weight work item, we'll be looking for the next one very
	 * soon, so prefetch it now.
	 */
	uds_prefetch_address(queue->oldest, true);
	/* Detach the entry being returned from the rest of the list. */
	WRITE_ONCE(oldest->next, NULL);
	return oldest;
}

/*
 * Check whether the funnel queue is empty or not. If the queue is in a transition state with one
 * or more entries being added such that the list view is incomplete, this function will report the
 * queue as empty.
 *
 * This shares get_oldest() with uds_funnel_queue_poll(), and get_oldest() may update
 * queue->oldest and re-enqueue the stub entry, so this check is not free of side effects on the
 * queue.
 */
bool uds_is_funnel_queue_empty(struct funnel_queue *queue)
{
	return get_oldest(queue) == NULL;
}

/*
 * Report whether the funnel queue is idle. A queue is idle only when it holds nothing to be
 * retrieved and no producer is in the middle of adding an entry. In that intermediate state
 * uds_funnel_queue_poll() may not be able to return an entry yet, but the queue still does not
 * count as idle.
 */
bool uds_is_funnel_queue_idle(struct funnel_queue *queue)
{
	/*
	 * Two conditions must both hold for the queue to be idle:
	 *
	 * The oldest entry must be the stub. If it is anything else, there is another entry in
	 * the queue, even though it can't be retrieved yet if its next pointer is still NULL.
	 *
	 * The newest entry must also be the stub. If _put() has changed newest, then either a
	 * retrievable entry is in the list, or the list is officially empty but is in the
	 * intermediate state of having an entry added. Which of those it is depends on whether
	 * the update of stub.next is visible yet, but that makes no difference to idleness. Due
	 * to the memory ordering in _put(), the update of newest is visible to us no later than
	 * the update of stub.next.
	 */
	return (queue->oldest == &queue->stub) &&
	       (READ_ONCE(queue->newest) == &queue->stub);
}
+110 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef UDS_FUNNEL_QUEUE_H
#define UDS_FUNNEL_QUEUE_H

#include <linux/atomic.h>
#include <linux/cache.h>

/*
 * A funnel queue is a simple (almost) lock-free queue that accepts entries from multiple threads
 * (multi-producer) and delivers them to a single thread (single-consumer). "Funnel" is an attempt
 * to evoke the image of requests from more than one producer being "funneled down" to a single
 * consumer.
 *
 * This is an unsynchronized but thread-safe data structure when used as intended. There is no
 * mechanism to ensure that only one thread is consuming from the queue. If more than one thread
 * attempts to consume from the queue, the resulting behavior is undefined. Clients must not
 * directly access or manipulate the internals of the queue, which are only exposed for the purpose
 * of allowing the very simple enqueue operation to be inlined.
 *
 * The implementation requires that a funnel_queue_entry structure (a link pointer) is embedded in
 * the queue entries, and pointers to those structures are used exclusively by the queue. No macros
 * are defined to template the queue, so the offset of the funnel_queue_entry in the records placed
 * in the queue must all be the same so the client can derive their structure pointer from the
 * entry pointer returned by uds_funnel_queue_poll().
 *
 * Callers are wholly responsible for allocating and freeing the entries. Entries may be freed as
 * soon as they are returned since this queue is not susceptible to the "ABA problem" present in
 * many lock-free data structures. The queue is dynamically allocated to ensure cache-line
 * alignment, but no other dynamic allocation is used.
 *
 * The algorithm is not actually 100% lock-free. There is a single point in uds_funnel_queue_put()
 * at which a preempted producer will prevent the consumer from seeing items added to the queue by
 * later producers, and only if the queue is short enough or the consumer fast enough for it to
 * reach what was the end of the queue at the time of the preemption.
 *
 * The consumer function, uds_funnel_queue_poll(), will return NULL when the queue is empty. To
 * wait for data to consume, spin (if safe) or combine the queue with a struct event_count to
 * signal the presence of new entries.
 */

/*
 * This queue link structure must be embedded in client entries. It holds nothing but the link, so
 * clients recover their own structure from the entry pointer handed back by the queue.
 */
struct funnel_queue_entry {
	/* The next (newer) entry in the queue, or NULL if there is no successor yet. */
	struct funnel_queue_entry *next;
};

/*
 * The dynamically allocated queue structure, which is allocated on a cache line boundary so the
 * producer and consumer fields in the structure will land on separate cache lines. This should be
 * considered opaque but it is exposed here so uds_funnel_queue_put() can be inlined.
 */
struct __aligned(L1_CACHE_BYTES) funnel_queue {
	/*
	 * The producers' end of the queue, an atomically exchanged pointer that will never be
	 * NULL.
	 */
	struct funnel_queue_entry *newest;

	/*
	 * The consumer's end of the queue, which is owned by the consumer and never NULL. The
	 * alignment attribute keeps it off the cache line holding the producers' pointer.
	 */
	struct funnel_queue_entry *oldest __aligned(L1_CACHE_BYTES);

	/* A dummy entry used to provide the non-NULL invariants above. */
	struct funnel_queue_entry stub;
};

int __must_check uds_make_funnel_queue(struct funnel_queue **queue_ptr);

void uds_free_funnel_queue(struct funnel_queue *queue);

/*
 * Put an entry on the end of the queue.
 *
 * The entry pointer must be to the struct funnel_queue_entry embedded in the caller's data
 * structure. The caller must be able to derive the address of the start of their data structure
 * from the pointer that is passed in here, so every entry in the queue must have the struct
 * funnel_queue_entry at the same offset within the client's structure.
 *
 * The entry's next pointer is overwritten with NULL as part of the enqueue.
 */
static inline void uds_funnel_queue_put(struct funnel_queue *queue,
					struct funnel_queue_entry *entry)
{
	struct funnel_queue_entry *previous;

	/*
	 * Barrier requirements: All stores relating to the entry ("next" pointer, containing data
	 * structure fields) must happen before the previous->next store making it visible to the
	 * consumer. Also, the entry's "next" field initialization to NULL must happen before any
	 * other producer threads can see the entry (the xchg) and try to update the "next" field.
	 *
	 * xchg implements a full barrier.
	 */
	WRITE_ONCE(entry->next, NULL);
	/* Claim the tail: the entry becomes newest and we get the entry it must follow. */
	previous = xchg(&queue->newest, entry);
	/*
	 * Preemptions between these two statements hide the rest of the queue from the consumer,
	 * preventing consumption until the following assignment runs.
	 */
	WRITE_ONCE(previous->next, entry);
}

struct funnel_queue_entry *__must_check uds_funnel_queue_poll(struct funnel_queue *queue);

bool __must_check uds_is_funnel_queue_empty(struct funnel_queue *queue);

bool __must_check uds_is_funnel_queue_idle(struct funnel_queue *queue);

#endif /* UDS_FUNNEL_QUEUE_H */
Loading