Commit 15089225 authored by Jakub Kicinski
Browse files

Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'

Daniel Borkmann says:

====================
netkit: Support for io_uring zero-copy and AF_XDP

Containers use virtual netdevs to route traffic from a physical netdev
in the host namespace. They do not have access to the physical netdev
in the host and thus can't use memory providers or AF_XDP that require
reconfiguring/restarting queues in the physical netdev.

This patchset adds the concept of queue leasing to virtual netdevs that
allow containers to use memory providers and AF_XDP at native speed.
Leased queues are bound to a real queue in a physical netdev and act
as a proxy.

Memory providers and AF_XDP operations take an ifindex and queue id,
so containers would pass in an ifindex for a virtual netdev and a queue
id of a leased queue, which then gets proxied to the underlying real
queue.

We have implemented support for this concept in netkit and tested the
latter against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504
(bnxt_en) 100G NICs. For more details see the individual patches.
====================

Link: https://patch.msgid.link/20260402231031.447597-1-daniel@iogearbox.net


Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents b6e39e48 65d657d8
Loading
Loading
Loading
Loading
+46 −0
Original line number Diff line number Diff line
@@ -339,6 +339,15 @@ attribute-sets:
        doc: XSK information for this queue, if any.
        type: nest
        nested-attributes: xsk-info
      -
        name: lease
        doc: |
          A queue from a virtual device can have a lease which refers to
          another queue from a physical device. This is useful for memory
          providers and AF_XDP operations which take an ifindex and queue id
          to allow applications to bind against virtual devices in containers.
        type: nest
        nested-attributes: lease
  -
    name: qstats
    doc: |
@@ -537,6 +546,26 @@ attribute-sets:
        name: id
      -
        name: type
  -
    name: lease
    attributes:
      -
        name: ifindex
        doc: The netdev ifindex to lease the queue from.
        type: u32
        checks:
          min: 1
      -
        name: queue
        doc: The netdev queue to lease from.
        type: nest
        nested-attributes: queue-id
      -
        name: netns-id
        doc: The network namespace id of the netdev.
        type: s32
        checks:
          min: 0
  -
    name: dmabuf
    attributes:
@@ -686,6 +715,7 @@ operations:
            - dmabuf
            - io-uring
            - xsk
            - lease
      dump:
        request:
          attributes:
@@ -797,6 +827,22 @@ operations:
        reply:
          attributes:
            - id
    -
      name: queue-create
      doc: |
        Create a new queue for the given netdevice. Whether this operation
        is supported depends on the device and the driver.
      attribute-set: queue
      flags: [admin-perm]
      do:
        request:
          attributes:
            - ifindex
            - type
            - lease
        reply: &queue-create-op
          attributes:
            - id

kernel-family:
  headers: ["net/netdev_netlink.h"]
+11 −0
Original line number Diff line number Diff line
@@ -825,6 +825,13 @@ definitions:
    entries:
      - name: none
      - name: default
  -
    name: netkit-pairing
    type: enum
    enum-name: netkit-pairing
    entries:
      - name: pair
      - name: single
  -
    name: ovpn-mode
    enum-name: ovpn-mode
@@ -2299,6 +2306,10 @@ attribute-sets:
      -
        name: tailroom
        type: u16
      -
        name: pairing
        type: u32
        enum: netkit-pairing
  -
    name: linkinfo-ovpn-attrs
    name-prefix: ifla-ovpn-
+6 −0
Original line number Diff line number Diff line
@@ -329,6 +329,12 @@ by setting ``request_ops_lock`` to true. Code comments and docs refer
to drivers which have ops called under the instance lock as "ops locked".
See also the documentation of the ``lock`` member of struct net_device.

There is also a case of taking two per-netdev locks in sequence when netdev
queues are leased, that is, the netdev-scope lock is taken for both the
virtual and the physical device. To prevent deadlocks, the virtual device's
lock must always be acquired before the physical device's (see
``netdev_nl_queue_create_doit``).

In the future, there will be an option for individual
drivers to opt out of using ``rtnl_lock`` and instead perform their control
operations directly under the netdev instance lock.
+349 −63
Original line number Diff line number Diff line
@@ -9,11 +9,21 @@
#include <linux/bpf_mprog.h>
#include <linux/indirect_call_wrapper.h>

#include <net/netdev_lock.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp_sock_drv.h>
#include <net/netkit.h>
#include <net/dst.h>
#include <net/tcx.h>

#define DRV_NAME "netkit"
#define NETKIT_DRV_NAME	"netkit"

#define NETKIT_NUM_RX_QUEUES_MAX  1024
#define NETKIT_NUM_TX_QUEUES_MAX  1

#define NETKIT_NUM_RX_QUEUES_REAL 1
#define NETKIT_NUM_TX_QUEUES_REAL 1

struct netkit {
	__cacheline_group_begin(netkit_fastpath);
@@ -26,6 +36,7 @@ struct netkit {

	__cacheline_group_begin(netkit_slowpath);
	enum netkit_mode mode;
	enum netkit_pairing pair;
	bool primary;
	u32 headroom;
	__cacheline_group_end(netkit_slowpath);
@@ -36,6 +47,8 @@ struct netkit_link {
	struct net_device *dev;
};

static struct rtnl_link_ops netkit_link_ops;

static __always_inline int
netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
	   enum netkit_action ret)
@@ -135,6 +148,10 @@ static int netkit_open(struct net_device *dev)
	struct netkit *nk = netkit_priv(dev);
	struct net_device *peer = rtnl_dereference(nk->peer);

	if (nk->pair == NETKIT_DEVICE_SINGLE) {
		netif_carrier_on(dev);
		return 0;
	}
	if (!peer)
		return -ENOTCONN;
	if (peer->flags & IFF_UP) {
@@ -194,16 +211,17 @@ static void netkit_set_headroom(struct net_device *dev, int headroom)

	rcu_read_lock();
	peer = rcu_dereference(nk->peer);
	if (unlikely(!peer))
		goto out;

	if (!peer) {
		nk->headroom = headroom;
		dev->needed_headroom = headroom;
	} else {
		nk2 = netkit_priv(peer);
		nk->headroom = headroom;
		headroom = max(nk->headroom, nk2->headroom);

		peer->needed_headroom = headroom;
		dev->needed_headroom = headroom;
out:
	}
	rcu_read_unlock();
}

@@ -219,9 +237,96 @@ static void netkit_get_stats(struct net_device *dev,
	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
}

static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
{
	if (!dev->netdev_ops->ndo_bpf ||
	    !dev->netdev_ops->ndo_xdp_xmit ||
	    !dev->netdev_ops->ndo_xsk_wakeup)
		return false;
	return true;
}

/* ndo_bpf handler for netkit. Only XDP_SETUP_XSK_POOL is serviced, and
 * only by forwarding it to the physical device behind the leased rx queue
 * that @xdp->xsk.queue_id refers to.
 *
 * Returns:
 *   -EOPNOTSUPP  for XDP_SETUP_PROG, for paired devices, for a queue
 *                without a lease, when the physical device lacks the
 *                needed ndo callbacks, or when a pool is being set and
 *                the physical device does not advertise
 *                NETDEV_XDP_ACT_XSK
 *   -EINVAL      for an out-of-range queue id or any other command
 *   -EBUSY       when a pool is being set and
 *                dev_get_min_mp_channel_count() on the physical device
 *                is non-zero
 *   otherwise    whatever the physical device's ndo_bpf returns
 */
static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
{
	struct netkit *nk = netkit_priv(dev);
	struct netdev_bpf xdp_lower;
	struct netdev_rx_queue *rxq;
	struct net_device *phys;
	bool create = false;
	int ret = -EBUSY;

	switch (xdp->command) {
	case XDP_SETUP_XSK_POOL:
		if (nk->pair == NETKIT_DEVICE_PAIR)
			return -EOPNOTSUPP;
		if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
			return -EINVAL;

		rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
		if (!rxq->lease)
			return -EOPNOTSUPP;

		phys = rxq->lease->dev;
		if (!netkit_xsk_supported_at_phys(phys))
			return -EOPNOTSUPP;

		/* A non-NULL pool is treated as "create"; presumably a NULL
		 * pool means teardown - confirm against the xsk core.
		 */
		create = xdp->xsk.pool;
		/* Forward a copy of the request with the queue id rewritten
		 * to the index of the leased queue on the physical device.
		 */
		memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
		xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease);
		break;
	case XDP_SETUP_PROG:
		return -EOPNOTSUPP;
	default:
		return -EINVAL;
	}

	/* Only reachable for XDP_SETUP_XSK_POOL, so phys and xdp_lower are
	 * initialized here. The physical device's instance lock is held
	 * across the feature check and the forwarded ndo_bpf call.
	 */
	netdev_lock(phys);
	if (create &&
	    (phys->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) {
		ret = -EOPNOTSUPP;
		goto out;
	}
	/* The non-create path is always forwarded; the create path is only
	 * forwarded when dev_get_min_mp_channel_count() returns zero,
	 * otherwise ret keeps its initial -EBUSY.
	 */
	if (!create || !dev_get_min_mp_channel_count(phys))
		ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
out:
	netdev_unlock(phys);
	return ret;
}

/* ndo_xsk_wakeup handler for netkit: forwards the wakeup to the physical
 * device that backs the leased rx queue @queue_id, translating the queue
 * id to the leased queue's index on that device.
 *
 * Returns -EINVAL for an out-of-range @queue_id, -EOPNOTSUPP when the
 * queue has no lease, and otherwise the physical device's
 * ndo_xsk_wakeup result.
 */
static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
{
	struct netdev_rx_queue *rxq, *rxq_lease;
	struct net_device *phys;

	if (queue_id >= dev->real_num_rx_queues)
		return -EINVAL;

	rxq = __netif_get_rx_queue(dev, queue_id);
	/* Single snapshot of the lease pointer; no lock is taken in this
	 * function.
	 */
	rxq_lease = READ_ONCE(rxq->lease);
	if (unlikely(!rxq_lease))
		return -EOPNOTSUPP;

	/* netkit_xsk already validated full xsk support, hence it's
	 * fine to call into ndo_xsk_wakeup right away given this
	 * was a prerequisite to get here in the first place. The
	 * phys xsk support cannot change without tearing down the
	 * device (which clears the lease first).
	 */
	phys = rxq_lease->dev;
	return phys->netdev_ops->ndo_xsk_wakeup(phys,
			get_netdev_rx_queue_index(rxq_lease), flags);
}

/* ndo_init handler: sets up the lockdep classes for @dev via
 * netdev_lockdep_set_classes(). Always returns 0.
 */
static int netkit_init(struct net_device *dev)
{
	netdev_lockdep_set_classes(dev);
	return 0;
}

static void netkit_uninit(struct net_device *dev);

static const struct net_device_ops netkit_netdev_ops = {
	.ndo_init		= netkit_init,
	.ndo_open		= netkit_open,
	.ndo_stop		= netkit_close,
	.ndo_start_xmit		= netkit_xmit,
@@ -232,19 +337,104 @@ static const struct net_device_ops netkit_netdev_ops = {
	.ndo_get_peer_dev	= netkit_peer_dev,
	.ndo_get_stats64	= netkit_get_stats,
	.ndo_uninit		= netkit_uninit,
	.ndo_bpf		= netkit_xsk,
	.ndo_xsk_wakeup		= netkit_xsk_wakeup,
	.ndo_features_check	= passthru_features_check,
};

static void netkit_get_drvinfo(struct net_device *dev,
			       struct ethtool_drvinfo *info)
{
	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
	strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver));
}

static const struct ethtool_ops netkit_ethtool_ops = {
	.get_drvinfo		= netkit_get_drvinfo,
};

/* ndo_queue_create handler: grows the number of real rx queues of @dev by
 * one and hands back the index of the newly exposed queue (which equals
 * the previous real rx queue count).
 *
 * Returns the new queue index on success. On failure a message is set in
 * @extack and a negative errno is returned: -EOPNOTSUPP when called on
 * the primary device of a pair, or the error reported by
 * netif_set_real_num_rx_queues().
 */
static int netkit_queue_create(struct net_device *dev,
			       struct netlink_ext_ack *extack)
{
	struct netkit *nk = netkit_priv(dev);
	u32 prev_rxqs = dev->real_num_rx_queues;
	u32 next_rxqs = prev_rxqs + 1;
	int err;

	/* A paired netkit only permits leasing on the non-primary (peer)
	 * side, the primary being the management side. A single device
	 * has no such restriction.
	 */
	if (nk->primary && nk->pair == NETKIT_DEVICE_PAIR) {
		NL_SET_ERR_MSG(extack,
			       "netkit can only lease against the peer device");
		return -EOPNOTSUPP;
	}

	err = netif_set_real_num_rx_queues(dev, next_rxqs);
	if (!err)
		return prev_rxqs;

	/* Distinguish "ran out of preallocated queues" from other errors */
	if (next_rxqs > dev->num_rx_queues)
		NL_SET_ERR_MSG(extack,
			       "netkit maximum queue limit reached");
	else
		NL_SET_ERR_MSG_FMT(extack,
				   "netkit cannot create more queues err=%d", err);
	return err;
}

/* Queue management ops for netkit; only queue creation is wired up. */
static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = {
	.ndo_queue_create	= netkit_queue_create,
};

static struct net_device *netkit_alloc(struct nlattr *tb[],
				       const char *ifname,
				       unsigned char name_assign_type,
				       unsigned int num_tx_queues,
				       unsigned int num_rx_queues)
{
	const struct rtnl_link_ops *ops = &netkit_link_ops;
	struct net_device *dev;

	if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX ||
	    num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX)
		return ERR_PTR(-EOPNOTSUPP);

	dev = alloc_netdev_mqs(ops->priv_size, ifname,
			       name_assign_type, ops->setup,
			       num_tx_queues, num_rx_queues);
	if (dev) {
		dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL;
		dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL;
	}
	return dev;
}

/* Drop the lease of every rx queue of @dev with index >= 1. Does nothing
 * when @dev has exactly one real rx queue.
 *
 * Locking: @dev's instance lock is held for the whole walk, and for each
 * queue the lock of the device owning the leased queue is taken inside
 * it around netdev_rx_queue_unlease().
 */
static void netkit_queue_unlease(struct net_device *dev)
{
	struct netdev_rx_queue *rxq, *rxq_lease;
	struct net_device *dev_lease;
	int i;

	if (dev->real_num_rx_queues == 1)
		return;

	netdev_lock(dev);
	/* Index 0 is skipped; presumably it is the device's own queue and
	 * never leased - confirm.
	 */
	for (i = 1; i < dev->real_num_rx_queues; i++) {
		rxq = __netif_get_rx_queue(dev, i);
		/* NOTE(review): rxq->lease is dereferenced without a NULL
		 * check here, unlike netkit_xsk() and netkit_xsk_wakeup().
		 * This relies on every queue with index >= 1 carrying a
		 * lease - confirm that invariant holds on all paths.
		 */
		rxq_lease = rxq->lease;
		dev_lease = rxq_lease->dev;

		netdev_lock(dev_lease);
		netdev_rx_queue_unlease(rxq, rxq_lease);
		netdev_unlock(dev_lease);
	}
	netdev_unlock(dev);
}

static void netkit_setup(struct net_device *dev)
{
	static const netdev_features_t netkit_features_hw_vlan =
@@ -275,8 +465,9 @@ static void netkit_setup(struct net_device *dev)
	dev->priv_flags |= IFF_DISABLE_NETPOLL;
	dev->lltx = true;

	dev->ethtool_ops = &netkit_ethtool_ops;
	dev->netdev_ops     = &netkit_netdev_ops;
	dev->ethtool_ops    = &netkit_ethtool_ops;
	dev->queue_mgmt_ops = &netkit_queue_mgmt_ops;

	dev->features |= netkit_features;
	dev->hw_features = netkit_features;
@@ -325,8 +516,6 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
	return 0;
}

static struct rtnl_link_ops netkit_link_ops;

static int netkit_new_link(struct net_device *dev,
			   struct rtnl_newlink_params *params,
			   struct netlink_ext_ack *extack)
@@ -335,15 +524,17 @@ static int netkit_new_link(struct net_device *dev,
	enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
	enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
	enum netkit_action policy_prim = NETKIT_PASS;
	enum netkit_action policy_peer = NETKIT_PASS;
	bool seen_peer = false, seen_scrub = false;
	struct nlattr **data = params->data;
	enum netkit_mode mode = NETKIT_L3;
	unsigned char ifname_assign_type;
	struct nlattr **tb = params->tb;
	u16 headroom = 0, tailroom = 0;
	struct ifinfomsg *ifmp = NULL;
	struct net_device *peer;
	struct net_device *peer = NULL;
	char ifname[IFNAMSIZ];
	struct netkit *nk;
	int err;
@@ -380,6 +571,13 @@ static int netkit_new_link(struct net_device *dev,
			headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
		if (data[IFLA_NETKIT_TAILROOM])
			tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
		if (data[IFLA_NETKIT_PAIRING])
			pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);

		seen_scrub = data[IFLA_NETKIT_SCRUB];
		seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
			    data[IFLA_NETKIT_PEER_SCRUB] ||
			    data[IFLA_NETKIT_PEER_POLICY];
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
@@ -392,22 +590,22 @@ static int netkit_new_link(struct net_device *dev,
	if (mode != NETKIT_L2 &&
	    (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
		return -EOPNOTSUPP;
	if (pair == NETKIT_DEVICE_SINGLE &&
	    (tb != tbp || seen_peer || seen_scrub ||
	     policy_prim != NETKIT_PASS))
		return -EOPNOTSUPP;

	if (pair == NETKIT_DEVICE_PAIR) {
		peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
					&netkit_link_ops, tbp, extack);
		if (IS_ERR(peer))
			return PTR_ERR(peer);

		netif_inherit_tso_max(peer, dev);
	if (headroom) {
		if (headroom)
			peer->needed_headroom = headroom;
		dev->needed_headroom = headroom;
	}
	if (tailroom) {
		if (tailroom)
			peer->needed_tailroom = tailroom;
		dev->needed_tailroom = tailroom;
	}

		if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
			eth_hw_addr_random(peer);
		if (ifmp && dev->ifindex)
@@ -418,6 +616,7 @@ static int netkit_new_link(struct net_device *dev,
		nk->policy = policy_peer;
		nk->scrub = scrub_peer;
		nk->mode = mode;
		nk->pair = pair;
		nk->headroom = headroom;
		bpf_mprog_bundle_init(&nk->bundle);

@@ -431,6 +630,7 @@ static int netkit_new_link(struct net_device *dev,
		err = rtnl_configure_link(peer, NULL, 0, NULL);
		if (err < 0)
			goto err_configure_peer;
	}

	if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);
@@ -438,15 +638,23 @@ static int netkit_new_link(struct net_device *dev,
		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		strscpy(dev->name, "nk%d", IFNAMSIZ);
	if (headroom)
		dev->needed_headroom = headroom;
	if (tailroom)
		dev->needed_tailroom = tailroom;

	nk = netkit_priv(dev);
	nk->primary = true;
	nk->policy = policy_prim;
	nk->scrub = scrub_prim;
	nk->mode = mode;
	nk->pair = pair;
	nk->headroom = headroom;
	bpf_mprog_bundle_init(&nk->bundle);

	if (pair == NETKIT_DEVICE_SINGLE)
		xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);

	err = register_netdevice(dev);
	if (err < 0)
		goto err_configure_peer;
@@ -455,9 +663,11 @@ static int netkit_new_link(struct net_device *dev,
		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);

	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
	if (peer)
		rcu_assign_pointer(netkit_priv(peer)->peer, dev);
	return 0;
err_configure_peer:
	if (peer)
		unregister_netdevice(peer);
	return err;
err_register_peer:
@@ -518,6 +728,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi
	nk = netkit_priv(dev);
	if (!nk->primary)
		return ERR_PTR(-EACCES);
	if (nk->pair == NETKIT_DEVICE_SINGLE)
		return ERR_PTR(-EOPNOTSUPP);
	if (which == BPF_NETKIT_PEER) {
		dev = rcu_dereference_rtnl(nk->peer);
		if (!dev)
@@ -844,6 +1056,7 @@ static void netkit_release_all(struct net_device *dev)
static void netkit_uninit(struct net_device *dev)
{
	netkit_release_all(dev);
	netkit_queue_unlease(dev);
}

static void netkit_del_link(struct net_device *dev, struct list_head *head)
@@ -856,6 +1069,14 @@ static void netkit_del_link(struct net_device *dev, struct list_head *head)
	if (peer) {
		nk = netkit_priv(peer);
		RCU_INIT_POINTER(nk->peer, NULL);
		/* Guard against the peer already being in an unregister
		 * list (e.g. same-namespace teardown where the peer is
		 * in the caller's dev_kill_list). list_move_tail() on an
		 * already-queued device would otherwise corrupt that
		 * list's iteration. This situation can occur via netkit
		 * notifier, hence guard against this scenario.
		 */
		if (!unregister_netdevice_queued(peer))
			unregister_netdevice_queue(peer, head);
	}
}
@@ -879,6 +1100,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
		{ IFLA_NETKIT_PEER_INFO,  "peer info" },
		{ IFLA_NETKIT_HEADROOM,   "headroom" },
		{ IFLA_NETKIT_TAILROOM,   "tailroom" },
		{ IFLA_NETKIT_PAIRING,    "pairing" },
	};

	if (!nk->primary) {
@@ -898,8 +1120,10 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
	}

	if (data[IFLA_NETKIT_POLICY]) {
		err = -EOPNOTSUPP;
		attr = data[IFLA_NETKIT_POLICY];
		policy = nla_get_u32(attr);
		if (nk->pair == NETKIT_DEVICE_PAIR)
			err = netkit_check_policy(policy, attr, extack);
		if (err)
			return err;
@@ -921,6 +1145,50 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
	return 0;
}

/* For a device @dev that is being unregistered, find all netkit devices
 * that hold a lease on one of @dev's rx queues and unregister them too.
 *
 * Returns early unless @dev is in NETREG_UNREGISTERING state and has a
 * parent device (presumably the latter filters for physical devices -
 * confirm).
 */
static void netkit_check_lease_unregister(struct net_device *dev)
{
	LIST_HEAD(list_kill);
	u32 q_idx;

	if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING ||
	    !dev->dev.parent)
		return;

	netdev_lock_ops(dev);
	for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) {
		/* The lookup below takes the device and queue index by
		 * pointer, so work on copies to keep dev and q_idx intact
		 * for the loop.
		 */
		struct net_device *tmp = dev;
		struct netdev_rx_queue *rxq;
		u32 tmp_q_idx = q_idx;

		rxq = __netif_get_rx_queue_lease(&tmp, &tmp_q_idx,
						 NETIF_PHYS_TO_VIRT);
		/* Only act when the lookup resolved to a different device
		 * and that device is a netkit (identified by its ops).
		 */
		if (rxq && tmp != dev &&
		    tmp->netdev_ops == &netkit_netdev_ops) {
			/* A single phys device can have multiple queues leased
			 * to one netkit device. We can only queue that netkit
			 * device once to the list_kill. Queues of that phys
			 * device can be leased with different individual netkit
			 * devices, hence we batch via list_kill.
			 */
			if (unregister_netdevice_queued(tmp))
				continue;
			netkit_del_link(tmp, &list_kill);
		}
	}
	netdev_unlock_ops(dev);
	/* The batched unregister runs after @dev's ops lock is dropped. */
	unregister_netdevice_many(&list_kill);
}

/* Netdevice notifier callback: on NETDEV_UNREGISTER, hand the affected
 * device to netkit_check_lease_unregister(). All other events are
 * ignored. Always returns NOTIFY_DONE.
 */
static int netkit_notifier(struct notifier_block *this,
			   unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UNREGISTER:
		netkit_check_lease_unregister(ndev);
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}

static size_t netkit_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
@@ -931,6 +1199,7 @@ static size_t netkit_get_size(const struct net_device *dev)
	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
	       0;
}

@@ -951,6 +1220,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
		return -EMSGSIZE;
	if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
		return -EMSGSIZE;
	if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
		return -EMSGSIZE;

	if (peer) {
		nk = netkit_priv(peer);
@@ -972,13 +1243,15 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
	[IFLA_NETKIT_TAILROOM]		= { .type = NLA_U16 },
	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
	[IFLA_NETKIT_PAIRING]		= NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
					    .reject_message = "Primary attribute is read-only" },
};

static struct rtnl_link_ops netkit_link_ops = {
	.kind		= DRV_NAME,
	.kind		= NETKIT_DRV_NAME,
	.priv_size	= sizeof(struct netkit),
	.alloc		= netkit_alloc,
	.setup		= netkit_setup,
	.newlink	= netkit_new_link,
	.dellink	= netkit_del_link,
@@ -992,26 +1265,39 @@ static struct rtnl_link_ops netkit_link_ops = {
	.maxtype	= IFLA_NETKIT_MAX,
};

static __init int netkit_init(void)
static struct notifier_block netkit_netdev_notifier = {
	.notifier_call	= netkit_notifier,
};

static __init int netkit_mod_init(void)
{
	int ret;

	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
		     (int)NETKIT_PASS != (int)TCX_PASS ||
		     (int)NETKIT_DROP != (int)TCX_DROP ||
		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);

	return rtnl_link_register(&netkit_link_ops);
	ret = rtnl_link_register(&netkit_link_ops);
	if (ret)
		return ret;
	ret = register_netdevice_notifier(&netkit_netdev_notifier);
	if (ret)
		rtnl_link_unregister(&netkit_link_ops);
	return ret;
}

static __exit void netkit_exit(void)
static __exit void netkit_mod_exit(void)
{
	unregister_netdevice_notifier(&netkit_netdev_notifier);
	rtnl_link_unregister(&netkit_link_ops);
}

module_init(netkit_init);
module_exit(netkit_exit);
module_init(netkit_mod_init);
module_exit(netkit_mod_exit);

MODULE_DESCRIPTION("BPF-programmable network device");
MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
+10 −1
Original line number Diff line number Diff line
@@ -2561,7 +2561,14 @@ struct net_device {
	 * Also protects some fields in:
	 *	struct napi_struct, struct netdev_queue, struct netdev_rx_queue
	 *
	 * Ordering: take after rtnl_lock.
	 * Ordering:
	 *
	 * - take after rtnl_lock
	 *
	 * - for the case of netdev queue leasing, the netdev-scope lock is
	 *   taken for both the virtual and the physical device; to prevent
	 *   deadlocks, the virtual device's lock must always be acquired
	 *   before the physical device's (see netdev_nl_queue_create_doit)
	 */
	struct mutex		lock;

@@ -3413,6 +3420,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
bool unregister_netdevice_queued(const struct net_device *dev);

static inline void unregister_netdevice(struct net_device *dev)
{
	unregister_netdevice_queue(dev, NULL);
Loading