Commit 3323ddce authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull user work thread updates from Christian Brauner:
 "This contains the work generalizing the ability to create a kernel
  worker from a userspace process.

  Such user workers will run with the same credentials as the userspace
  process they were created from providing stronger security and
  accounting guarantees than the traditional override_creds() approach
  ever could've hoped for.

  The original work was heavily based on and optimized for the needs of
  io_uring which was the first user. However, as it quickly turned out
  the ability to create user workers inheriting properties from a
  userspace process is generally useful.

  The vhost subsystem currently creates workers using the kthread api.
  The consequences of using the kthread api are that RLIMITs don't work
  correctly as they are inherited from kthreadd. This leads to bugs
  where more workers are created than would be allowed by the RLIMITs of
  the userspace process in lieu of which workers are created.

  Problems like this disappear with user workers created from the
  userspace processes for which they perform the work. In addition,
  providing this api allows vhost to remove additional complexity. For
  example, cgroup and mm sharing will just work out of the box with user
  workers based on the relevant userspace process instead of manually
  ensuring the correct cgroup and mm contexts are used.

  So the vhost subsystem should simply be made to use the same mechanism
  as io_uring. To this end the original mechanism used for
  create_io_thread() is generalized into user workers:

   - Introduce PF_USER_WORKER as a generic indicator that a given task
     is a user worker, i.e., a kernel task that was created from a
     userspace process. Now a PF_IO_WORKER thread is just a specialized
     version of PF_USER_WORKER. So io_uring io workers raise both flags.

   - Make copy_process() available to core kernel code

   - Extend struct kernel_clone_args with the following bitfields
     allowing to indicate to copy_process():
       - to create a user worker (raise PF_USER_WORKER)
       - to not inherit any files from the userspace process
       - to ignore signals

  After all generic changes are in place the vhost subsystem implements
  a new dedicated vhost api based on user workers. Finally, vhost is
  switched to rely on the new api moving it off of kthreads.

  Thanks to Mike for sticking it out and making it through this rather
  arduous journey"

* tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  vhost: use vhost_tasks for worker threads
  vhost: move worker thread fields to new struct
  vhost_task: Allow vhost layer to use copy_process
  fork: allow kernel code to call copy_process
  fork: Add kernel_clone_args flag to ignore signals
  fork: add kernel_clone_args flag to not dup/clone files
  fork/vm: Move common PF_IO_WORKER behavior to new flag
  kernel: Make io_thread and kthread bit fields
  kthread: Pass in the thread's name during creation
  kernel: Allow a kernel thread's name to be set in copy_process
  csky: Remove kernel_thread declaration
parents a632b76b 6e890c5d
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -22177,7 +22177,9 @@ L: virtualization@lists.linux-foundation.org
L:	netdev@vger.kernel.org
S:	Maintained
T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
F:	kernel/vhost_task.c
F:	drivers/vhost/
F:	include/linux/sched/vhost_task.h
F:	include/linux/vhost_iotlb.h
F:	include/uapi/linux/vhost.h
+0 −2
Original line number Diff line number Diff line
@@ -72,8 +72,6 @@ struct task_struct;
/* Prepare to copy thread state - unlazy all lazy status */
#define prepare_to_copy(tsk)    do { } while (0)

extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);

unsigned long __get_wchan(struct task_struct *p);

#define KSTK_EIP(tsk)		(task_pt_regs(tsk)->pc)
+5 −0
Original line number Diff line number Diff line
@@ -13,9 +13,14 @@ config VHOST_RING
	  This option is selected by any driver which needs to access
	  the host side of a virtio ring.

config VHOST_TASK
	bool
	default n

config VHOST
	tristate
	select VHOST_IOTLB
	select VHOST_TASK
	help
	  This option is selected by any driver which needs to access
	  the core of vhost.
+60 −64
Original line number Diff line number Diff line
@@ -22,11 +22,11 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/vhost_task.h>
#include <linux/interval_tree_generic.h>
#include <linux/nospec.h>
#include <linux/kcov.h>
@@ -255,8 +255,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
		 * sure it was not in the list.
		 * test_and_set_bit() implies a memory barrier.
		 */
		llist_add(&work->node, &dev->work_list);
		wake_up_process(dev->worker);
		llist_add(&work->node, &dev->worker->work_list);
		wake_up_process(dev->worker->vtsk->task);
	}
}
EXPORT_SYMBOL_GPL(vhost_work_queue);
@@ -264,7 +264,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue);
/* A lockless hint for busy polling code to exit the loop */
bool vhost_has_work(struct vhost_dev *dev)
{
	return !llist_empty(&dev->work_list);
	return dev->worker && !llist_empty(&dev->worker->work_list);
}
EXPORT_SYMBOL_GPL(vhost_has_work);

@@ -335,22 +335,20 @@ static void vhost_vq_reset(struct vhost_dev *dev,

static int vhost_worker(void *data)
{
	struct vhost_dev *dev = data;
	struct vhost_worker *worker = data;
	struct vhost_work *work, *work_next;
	struct llist_node *node;

	kthread_use_mm(dev->mm);

	for (;;) {
		/* mb paired w/ kthread_stop */
		set_current_state(TASK_INTERRUPTIBLE);

		if (kthread_should_stop()) {
		if (vhost_task_should_stop(worker->vtsk)) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		node = llist_del_all(&dev->work_list);
		node = llist_del_all(&worker->work_list);
		if (!node)
			schedule();

@@ -360,14 +358,14 @@ static int vhost_worker(void *data)
		llist_for_each_entry_safe(work, work_next, node, node) {
			clear_bit(VHOST_WORK_QUEUED, &work->flags);
			__set_current_state(TASK_RUNNING);
			kcov_remote_start_common(dev->kcov_handle);
			kcov_remote_start_common(worker->kcov_handle);
			work->fn(work);
			kcov_remote_stop();
			if (need_resched())
				schedule();
		}
	}
	kthread_unuse_mm(dev->mm);

	return 0;
}

@@ -479,7 +477,6 @@ void vhost_dev_init(struct vhost_dev *dev,
	dev->byte_weight = byte_weight;
	dev->use_worker = use_worker;
	dev->msg_handler = msg_handler;
	init_llist_head(&dev->work_list);
	init_waitqueue_head(&dev->wait);
	INIT_LIST_HEAD(&dev->read_list);
	INIT_LIST_HEAD(&dev->pending_list);
@@ -509,31 +506,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);

struct vhost_attach_cgroups_struct {
	struct vhost_work work;
	struct task_struct *owner;
	int ret;
};

static void vhost_attach_cgroups_work(struct vhost_work *work)
{
	struct vhost_attach_cgroups_struct *s;

	s = container_of(work, struct vhost_attach_cgroups_struct, work);
	s->ret = cgroup_attach_task_all(s->owner, current);
}

static int vhost_attach_cgroups(struct vhost_dev *dev)
{
	struct vhost_attach_cgroups_struct attach;

	attach.owner = current;
	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
	vhost_work_queue(dev, &attach.work);
	vhost_dev_flush(dev);
	return attach.ret;
}

/* Caller should have device mutex */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
@@ -571,10 +543,54 @@ static void vhost_detach_mm(struct vhost_dev *dev)
	dev->mm = NULL;
}

/*
 * Release the worker attached to @dev, if there is one.
 *
 * The device's worker pointer is cleared first, then a warning is raised if
 * work is still queued on the worker's list, the backing vhost_task is
 * stopped and the vhost_worker allocation is freed. A device without a
 * worker is left untouched.
 */
static void vhost_worker_free(struct vhost_dev *dev)
{
	struct vhost_worker *old = dev->worker;

	if (old) {
		/* Unpublish the worker from the device before tearing it down. */
		dev->worker = NULL;
		WARN_ON(!llist_empty(&old->work_list));
		vhost_task_stop(old->vtsk);
		kfree(old);
	}
}

/*
 * Allocate a vhost_worker for @dev and start a vhost_task running
 * vhost_worker() on it.
 *
 * The new worker is stored in dev->worker, given a kcov handle and an empty
 * work list, and its task is named "vhost-<pid>" after the pid of the
 * calling task.
 *
 * Returns 0 on success. Returns -ENOMEM if either the worker allocation or
 * vhost_task_create() fails; in the latter case the worker is freed and
 * dev->worker is reset to NULL.
 */
static int vhost_worker_create(struct vhost_dev *dev)
{
	char comm[TASK_COMM_LEN];
	struct vhost_worker *w;
	struct vhost_task *task;

	w = kzalloc(sizeof(*w), GFP_KERNEL_ACCOUNT);
	if (!w)
		return -ENOMEM;

	dev->worker = w;
	w->kcov_handle = kcov_common_handle();
	init_llist_head(&w->work_list);
	snprintf(comm, sizeof(comm), "vhost-%d", current->pid);

	task = vhost_task_create(vhost_worker, w, comm);
	if (task) {
		/* Record the task on the worker before letting it run. */
		w->vtsk = task;
		vhost_task_start(task);
		return 0;
	}

	/* No task was created: drop the worker and detach it from the device. */
	kfree(w);
	dev->worker = NULL;
	return -ENOMEM;
}

/* Caller should have device mutex */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
	struct task_struct *worker;
	int err;

	/* Is there an owner already? */
@@ -585,36 +601,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev)

	vhost_attach_mm(dev);

	dev->kcov_handle = kcov_common_handle();
	if (dev->use_worker) {
		worker = kthread_create(vhost_worker, dev,
					"vhost-%d", current->pid);
		if (IS_ERR(worker)) {
			err = PTR_ERR(worker);
			goto err_worker;
		}

		dev->worker = worker;
		wake_up_process(worker); /* avoid contributing to loadavg */

		err = vhost_attach_cgroups(dev);
		err = vhost_worker_create(dev);
		if (err)
			goto err_cgroup;
			goto err_worker;
	}

	err = vhost_dev_alloc_iovecs(dev);
	if (err)
		goto err_cgroup;
		goto err_iovecs;

	return 0;
err_cgroup:
	if (dev->worker) {
		kthread_stop(dev->worker);
		dev->worker = NULL;
	}
err_iovecs:
	vhost_worker_free(dev);
err_worker:
	vhost_detach_mm(dev);
	dev->kcov_handle = 0;
err_mm:
	return err;
}
@@ -705,12 +706,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
	dev->iotlb = NULL;
	vhost_clear_msg(dev);
	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
	WARN_ON(!llist_empty(&dev->work_list));
	if (dev->worker) {
		kthread_stop(dev->worker);
		dev->worker = NULL;
		dev->kcov_handle = 0;
	}
	vhost_worker_free(dev);
	vhost_detach_mm(dev);
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
+8 −3
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <linux/irqbypass.h>

struct vhost_work;
struct vhost_task;
typedef void (*vhost_work_fn_t)(struct vhost_work *work);

#define VHOST_WORK_QUEUED 1
@@ -25,6 +26,12 @@ struct vhost_work {
	unsigned long		flags;
};

/* Worker state that struct vhost_dev refers to through its ->worker pointer. */
struct vhost_worker {
	/* Task created with vhost_task_create() to run vhost_worker(). */
	struct vhost_task	*vtsk;
	/* Lockless list of queued vhost_work items, drained by the worker loop. */
	struct llist_head	work_list;
	/* Value from kcov_common_handle(), passed to kcov_remote_start_common(). */
	u64			kcov_handle;
};

/* Poll a file (eventfd or socket) */
/* Note: there's nothing vhost specific about this structure. */
struct vhost_poll {
@@ -147,8 +154,7 @@ struct vhost_dev {
	struct vhost_virtqueue **vqs;
	int nvqs;
	struct eventfd_ctx *log_ctx;
	struct llist_head work_list;
	struct task_struct *worker;
	struct vhost_worker *worker;
	struct vhost_iotlb *umem;
	struct vhost_iotlb *iotlb;
	spinlock_t iotlb_lock;
@@ -158,7 +164,6 @@ struct vhost_dev {
	int iov_limit;
	int weight;
	int byte_weight;
	u64 kcov_handle;
	bool use_worker;
	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
			   struct vhost_iotlb_msg *msg);
Loading