Commit 7789c6bb authored by Daniel Borkmann, committed by Jakub Kicinski
Browse files

net: Add queue-create operation



Add a ynl netdev family operation called queue-create that creates a
new queue on a netdevice:

      name: queue-create
      attribute-set: queue
      flags: [admin-perm]
      do:
        request:
          attributes:
            - ifindex
            - type
            - lease
        reply: &queue-create-op
          attributes:
            - id

This is a generic operation so that it can be extended for various
use cases in the future. Right now it is mandatory to specify ifindex,
the queue type (which is enforced to be rx), and a lease. The newly
created queue id is returned to the caller.

A queue from a virtual device can have a lease which refers to another
queue from a physical device. This is useful for memory providers
and AF_XDP operations which take an ifindex and queue id to allow
applications to bind against virtual devices in containers. The lease
couples both queues together and allows operations to be proxied from
a virtual device in a container to the physical device.

In the future, the nested lease attribute can be lifted and made optional
for other use-cases such as dynamic queue creation for physical
netdevs. The lack of lease and the specification of the physical
device as an ifindex will imply that we need a real queue to be
allocated. Similarly, the queue type enforcement to rx can then be
lifted as well to support tx.

An early implementation had only driver-specific integration [0], but
in order for other virtual devices to reuse it, it makes sense to have
this as a generic API in core net.

For leasing queues, the virtual netdev must have real_num_rx_queues
less than num_rx_queues at the time of calling queue-create. The
queue-type must be rx as only rx queues are supported for leasing
for now. We also enforce that the queue-create ifindex must point
to a virtual device, and that the nested lease attribute's ifindex
must point to a physical device. The nested lease attribute set
contains a netns-id attribute which is optional and can specify a
netns-id relative to the caller's netns. It requires CAP_NET_ADMIN,
and if the netns-id attribute is not specified, the lease ifindex
will be retrieved from the current netns. Also, it is modeled as
an s32 type, as is done elsewhere in the stack.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
Link: https://patch.msgid.link/20260402231031.447597-2-daniel@iogearbox.net


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 9700282a
Loading
Loading
Loading
Loading
+46 −0
Original line number Diff line number Diff line
@@ -339,6 +339,15 @@ attribute-sets:
        doc: XSK information for this queue, if any.
        type: nest
        nested-attributes: xsk-info
      # Nested attribute carrying lease information for a queue; its
      # contents are defined by the separate "lease" attribute set.
      -
        name: lease
        doc: |
          A queue from a virtual device can have a lease which refers to
          another queue from a physical device. This is useful for memory
          providers and AF_XDP operations which take an ifindex and queue id
          to allow applications to bind against virtual devices in containers.
        type: nest
        nested-attributes: lease
  -
    name: qstats
    doc: |
@@ -537,6 +546,26 @@ attribute-sets:
        name: id
      -
        name: type
  # Attribute set used for nested "lease" attributes. It identifies the
  # queue being leased from: the owning netdev (ifindex, must be >= 1),
  # the queue on that netdev (nested queue-id) and a network namespace
  # id (s32, must be >= 0).
  -
    name: lease
    attributes:
      -
        name: ifindex
        doc: The netdev ifindex to lease the queue from.
        type: u32
        checks:
          min: 1
      -
        name: queue
        doc: The netdev queue to lease from.
        type: nest
        nested-attributes: queue-id
      -
        name: netns-id
        doc: The network namespace id of the netdev.
        type: s32
        checks:
          min: 0
  -
    name: dmabuf
    attributes:
@@ -686,6 +715,7 @@ operations:
            - dmabuf
            - io-uring
            - xsk
            - lease
      dump:
        request:
          attributes:
@@ -797,6 +827,22 @@ operations:
        reply:
          attributes:
            - id
    # "do"-only operation on the queue attribute set, restricted by the
    # admin-perm flag. The request carries ifindex, type and a nested
    # lease; the reply carries the id attribute.
    -
      name: queue-create
      doc: |
        Create a new queue for the given netdevice. Whether this operation
        is supported depends on the device and the driver.
      attribute-set: queue
      flags: [admin-perm]
      do:
        request:
          attributes:
            - ifindex
            - type
            - lease
        reply: &queue-create-op
          attributes:
            - id

kernel-family:
  headers: ["net/netdev_netlink.h"]
+11 −0
Original line number Diff line number Diff line
@@ -160,6 +160,7 @@ enum {
	NETDEV_A_QUEUE_DMABUF,
	NETDEV_A_QUEUE_IO_URING,
	NETDEV_A_QUEUE_XSK,
	NETDEV_A_QUEUE_LEASE,

	__NETDEV_A_QUEUE_MAX,
	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -202,6 +203,15 @@ enum {
	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
};

/*
 * Attribute IDs of the nested "lease" attribute set. IDs start at 1;
 * __NETDEV_A_LEASE_MAX is a terminator and NETDEV_A_LEASE_MAX is the
 * highest valid attribute ID.
 */
enum {
	NETDEV_A_LEASE_IFINDEX = 1,
	NETDEV_A_LEASE_QUEUE,
	NETDEV_A_LEASE_NETNS_ID,

	__NETDEV_A_LEASE_MAX,
	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
};

enum {
	NETDEV_A_DMABUF_IFINDEX = 1,
	NETDEV_A_DMABUF_QUEUES,
@@ -228,6 +238,7 @@ enum {
	NETDEV_CMD_BIND_RX,
	NETDEV_CMD_NAPI_SET,
	NETDEV_CMD_BIND_TX,
	NETDEV_CMD_QUEUE_CREATE,

	__NETDEV_CMD_MAX,
	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+20 −0
Original line number Diff line number Diff line
@@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range
};

/* Common nested types */
/*
 * Policy for the nested lease attribute set:
 *  - IFINDEX:  u32 with a minimum of 1
 *  - QUEUE:    nested, checked against netdev_queue_id_nl_policy
 *  - NETNS_ID: s32 with a minimum of 0
 * The array is sized by its highest attribute ID (NETNS_ID) plus one.
 */
const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = {
	[NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
	[NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
	[NETDEV_A_LEASE_NETNS_ID] = NLA_POLICY_MIN(NLA_S32, 0),
};

const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
	[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
	[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
@@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1]
	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
};

/*
 * NETDEV_CMD_QUEUE_CREATE - do
 *
 * Request policy:
 *  - IFINDEX: u32 with a minimum of 1
 *  - TYPE:    u32 with a maximum of 1
 *  - LEASE:   nested, checked against netdev_lease_nl_policy
 */
static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = {
	[NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
	[NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
	[NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy),
};

/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
	{
@@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
		.maxattr	= NETDEV_A_DMABUF_FD,
		.flags		= GENL_CMD_CAP_DO,
	},
	{
		.cmd		= NETDEV_CMD_QUEUE_CREATE,
		.doit		= netdev_nl_queue_create_doit,
		.policy		= netdev_queue_create_nl_policy,
		.maxattr	= NETDEV_A_QUEUE_LEASE,
		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
	},
};

static const struct genl_multicast_group netdev_nl_mcgrps[] = {
+2 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include <net/netdev_netlink.h>

/* Common nested types */
extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1];
extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];

@@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info);

enum {
	NETDEV_NLGRP_MGMT,
+5 −0
Original line number Diff line number Diff line
@@ -1120,6 +1120,11 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
	return err;
}

/*
 * "do" handler registered for NETDEV_CMD_QUEUE_CREATE.
 *
 * Currently a stub: neither @skb nor @info is examined and every
 * request is rejected.
 *
 * Return: always -EOPNOTSUPP.
 */
int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
{
	return -EOPNOTSUPP;
}

void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
	INIT_LIST_HEAD(&priv->bindings);
Loading