Commit 9cbed5aa, authored by Chiara Meiohas, committed by Leon Romanovsky
Browse files

RDMA/nldev: Add support for RDMA monitoring



Introduce a new netlink command to allow rdma event monitoring.
The rdma events supported now are IB device
registration/unregistration and net device attachment/detachment.

Example output of rdma monitor and the commands which trigger
the events:

$ rdma monitor
$ rmmod mlx5_ib
[UNREGISTER]	dev 1 rocep8s0f1
[UNREGISTER]	dev 0 rocep8s0f0

$ modprobe mlx5_ib
[REGISTER]	dev 2 mlx5_0
[NETDEV_ATTACH]	dev 2 mlx5_0 port 1 netdev 4 eth2
[REGISTER]	dev 3 mlx5_1
[NETDEV_ATTACH]	dev 3 mlx5_1 port 1 netdev 5 eth3

$ devlink dev eswitch set pci/0000:08:00.0 mode switchdev
[UNREGISTER]	dev 2 rocep8s0f0
[REGISTER]	dev 4 mlx5_0
[NETDEV_ATTACH]	dev 4 mlx5_0 port 30 netdev 4 eth2

$ echo 4 > /sys/class/net/eth2/device/sriov_numvfs
[NETDEV_ATTACH]	dev 4 rdmap8s0f0 port 2 netdev 7 eth4
[NETDEV_ATTACH]	dev 4 rdmap8s0f0 port 3 netdev 8 eth5
[NETDEV_ATTACH]	dev 4 rdmap8s0f0 port 4 netdev 9 eth6
[NETDEV_ATTACH]	dev 4 rdmap8s0f0 port 5 netdev 10 eth7
[REGISTER]	dev 5 mlx5_0
[NETDEV_ATTACH]	dev 5 mlx5_0 port 1 netdev 11 eth8
[REGISTER]	dev 6 mlx5_0
[NETDEV_ATTACH]	dev 6 mlx5_0 port 1 netdev 12 eth9
[REGISTER]	dev 7 mlx5_0
[NETDEV_ATTACH]	dev 7 mlx5_0 port 1 netdev 13 eth10
[REGISTER]	dev 8 mlx5_0
[NETDEV_ATTACH]	dev 8 mlx5_0 port 1 netdev 14 eth11

$ echo 0 > /sys/class/net/eth2/device/sriov_numvfs
[UNREGISTER]	dev 5 rocep8s0f0v0
[UNREGISTER]	dev 6 rocep8s0f0v1
[UNREGISTER]	dev 7 rocep8s0f0v2
[UNREGISTER]	dev 8 rocep8s0f0v3
[NETDEV_DETACH]	dev 4 rdmap8s0f0 port 2
[NETDEV_DETACH]	dev 4 rdmap8s0f0 port 3
[NETDEV_DETACH]	dev 4 rdmap8s0f0 port 4
[NETDEV_DETACH]	dev 4 rdmap8s0f0 port 5

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909173025.30422-7-michaelgur@nvidia.com


Signed-off-by: Leon Romanovsky <leon@kernel.org>
parent 8d159eb2
Loading
Loading
Loading
Loading
+35 −0
Original line number Diff line number Diff line
@@ -1351,6 +1351,29 @@ static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}

/*
 * Announce a newly registered device to RDMA monitor listeners.
 *
 * A REGISTER event is sent first; after that one NETDEV_ATTACH event is
 * sent for every port that currently has a net device bound to it.
 * Notification stops at the first event that fails to be sent.
 */
static void ib_device_notify_register(struct ib_device *device)
{
	struct net_device *ndev;
	u32 port_num;
	int err;

	if (rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT))
		return;

	rdma_for_each_port(device, port_num) {
		ndev = ib_device_get_netdev(device, port_num);
		if (ndev) {
			err = rdma_nl_notify_event(device, port_num,
						   RDMA_NETDEV_ATTACH_EVENT);
			/* Drop the reference taken by ib_device_get_netdev() */
			dev_put(ndev);
			if (err)
				return;
		}
	}
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
@@ -1449,6 +1472,8 @@ int ib_register_device(struct ib_device *device, const char *name,
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_notify_register(device);
	ib_device_put(device);

	return 0;
@@ -1491,6 +1516,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
		goto out;

	disable_device(ib_dev);
	rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);
@@ -2159,6 +2185,7 @@ static void add_ndev_hash(struct ib_port_data *pdata)
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
			 u32 port)
{
	enum rdma_nl_notify_event_type etype;
	struct net_device *old_ndev;
	struct ib_port_data *pdata;
	unsigned long flags;
@@ -2190,6 +2217,14 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
	spin_unlock_irqrestore(&pdata->netdev_lock, flags);

	add_ndev_hash(pdata);

	/* Make sure that the device is registered before we send events */
	if (xa_load(&devices, ib_dev->index) != ib_dev)
		return 0;

	etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
	rdma_nl_notify_event(ib_dev, port, etype);

	return 0;
}
EXPORT_SYMBOL(ib_device_set_netdev);
+1 −0
Original line number Diff line number Diff line
@@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet)
	struct net *net = read_pnet(&rnet->net);
	struct netlink_kernel_cfg cfg = {
		.input	= rdma_nl_rcv,
		.flags = NL_CFG_F_NONROOT_RECV,
	};
	struct sock *nls;

+124 −0
Original line number Diff line number Diff line
@@ -170,6 +170,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
	[RDMA_NLDEV_ATTR_DEV_TYPE]		= { .type = NLA_U8 },
	[RDMA_NLDEV_ATTR_PARENT_NAME]		= { .type = NLA_NUL_STRING },
	[RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE]	= { .type = NLA_U8 },
	[RDMA_NLDEV_ATTR_EVENT_TYPE]		= { .type = NLA_U8 },
};

static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -2722,6 +2723,129 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
	},
};

/*
 * Fill the attributes of a netdev attach/detach monitor event.
 *
 * @msg:    message being built
 * @device: IB device the event refers to
 * @port:   port of @device the event refers to
 * @net:    net namespace the event is going to be sent in
 *
 * The device index, device name and port index are always added. The
 * netdev index and name are added only when a net device is currently
 * bound to the port. If that net device belongs to a namespace other
 * than @net, nothing is added and 0 is returned.
 *
 * Returns 0 on success or the error reported by the failing nla_put_*().
 */
static int fill_mon_netdev_association(struct sk_buff *msg,
				       struct ib_device *device, u32 port,
				       const struct net *net)
{
	struct net_device *ndev = ib_device_get_netdev(device, port);
	int err = 0;

	if (ndev && !net_eq(dev_net(ndev), net))
		goto put_ndev;

	err = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index);
	if (!err)
		err = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
				     dev_name(&device->dev));
	if (!err)
		err = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port);

	if (!err && ndev) {
		err = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX,
				  ndev->ifindex);
		if (!err)
			err = nla_put_string(msg, RDMA_NLDEV_ATTR_NDEV_NAME,
					     ndev->name);
	}

put_ndev:
	/* Called with NULL too when no netdev is bound, as before */
	dev_put(ndev);
	return err;
}

/*
 * Log a rate-limited warning for a monitor event that could not be sent.
 *
 * @device:   IB device the event was generated for
 * @port_num: port the event refers to (reported for netdev events only)
 * @type:     type of the event that failed
 */
static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num,
				    enum rdma_nl_notify_event_type type)
{
	struct net_device *netdev;

	switch (type) {
	case RDMA_REGISTER_EVENT:
		dev_warn_ratelimited(&device->dev,
				     "Failed to send RDMA monitor register device event\n");
		break;
	case RDMA_UNREGISTER_EVENT:
		dev_warn_ratelimited(&device->dev,
				     "Failed to send RDMA monitor unregister device event\n");
		break;
	case RDMA_NETDEV_ATTACH_EVENT:
		netdev = ib_device_get_netdev(device, port_num);
		if (!netdev) {
			/*
			 * ib_device_get_netdev() may return NULL (no netdev
			 * bound to the port any more); do not dereference it.
			 */
			dev_warn_ratelimited(&device->dev,
					     "Failed to send RDMA monitor netdev attach event: port %d\n",
					     port_num);
			break;
		}
		dev_warn_ratelimited(&device->dev,
				     "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n",
				     port_num, netdev->ifindex);
		dev_put(netdev);
		break;
	case RDMA_NETDEV_DETACH_EVENT:
		dev_warn_ratelimited(&device->dev,
				     "Failed to send RDMA monitor netdev detach event: port %d\n",
				     port_num);
		break;
	default:
		break;
	}
}

/*
 * Build an RDMA_NLDEV_CMD_MONITOR message for @type and multicast it to
 * the RDMA_NL_GROUP_NOTIFY group in the device's net namespace.
 *
 * @device:   IB device which triggered the event
 * @port_num: port which triggered the event, 0 if unused
 * @type:     event type
 *
 * Returns 0 on success (a multicast result of -ESRCH is also treated as
 * success), -EINVAL if the device has no net namespace, -ENOMEM if the
 * message cannot be allocated, -EMSGSIZE if the netlink header does not
 * fit, or the error from filling/sending the message.
 */
int rdma_nl_notify_event(struct ib_device *device, u32 port_num,
			  enum rdma_nl_notify_event_type type)
{
	struct sk_buff *skb;
	struct net *net;
	int ret = 0;
	void *nlh;

	net = read_pnet(&device->coredev.rdma_net);
	if (!net)
		return -EINVAL;

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;
	nlh = nlmsg_put(skb, 0, 0,
			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR),
			0, 0);
	if (!nlh) {
		/* nlmsg_put() failed; nlh must not reach nlmsg_end() */
		ret = -EMSGSIZE;
		goto err_free;
	}

	switch (type) {
	case RDMA_REGISTER_EVENT:
	case RDMA_UNREGISTER_EVENT:
		ret = fill_nldev_handle(skb, device);
		if (ret)
			goto err_free;
		break;
	case RDMA_NETDEV_ATTACH_EVENT:
	case RDMA_NETDEV_DETACH_EVENT:
		ret = fill_mon_netdev_association(skb, device,
						  port_num, net);
		if (ret)
			goto err_free;
		break;
	default:
		break;
	}

	ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type);
	if (ret)
		goto err_free;

	nlmsg_end(skb, nlh);
	ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL);
	if (ret && ret != -ESRCH) {
		skb = NULL; /* skb is freed in the netlink send-op handling */
		goto err_free;
	}
	return 0;

err_free:
	rdma_nl_notify_err_msg(device, port_num, type);
	/* skb is NULL here when the send path already consumed it */
	nlmsg_free(skb);
	return ret;
}

void __init nldev_init(void)
{
	rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
+12 −0
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@
#include <linux/netlink.h>
#include <uapi/rdma/rdma_netlink.h>

struct ib_device;

enum {
	RDMA_NLDEV_ATTR_EMPTY_STRING = 1,
	RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
@@ -110,6 +112,16 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
 */
bool rdma_nl_chk_listeners(unsigned int group);

/**
 * rdma_nl_notify_event - Prepare and send an event message
 * @ib: the IB device which triggered the event
 * @port_num: the port number which triggered the event - 0 if unused
 * @type: the event type
 *
 * Return: 0 on success or a negative error code
 */
int rdma_nl_notify_event(struct ib_device *ib, u32 port_num,
			 enum rdma_nl_notify_event_type type);

struct rdma_link_ops {
	struct list_head list;
	const char *type;
+15 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ enum {
/* Group identifiers; numbering starts at 2 and continues implicitly. */
enum {
	RDMA_NL_GROUP_IWPM = 2,
	RDMA_NL_GROUP_LS,
	/* Group that RDMA monitor events are multicast to */
	RDMA_NL_GROUP_NOTIFY,
	/* Not a group itself: keep last, one past the highest group number */
	RDMA_NL_NUM_GROUPS
};

@@ -305,6 +306,8 @@ enum rdma_nldev_command {

	RDMA_NLDEV_CMD_DELDEV,

	RDMA_NLDEV_CMD_MONITOR,

	RDMA_NLDEV_NUM_OPS
};

@@ -574,6 +577,8 @@ enum rdma_nldev_attr {

	RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE,	/* u8 */

	RDMA_NLDEV_ATTR_EVENT_TYPE,		/* u8 */

	/*
	 * Always the end
	 */
@@ -624,4 +629,14 @@ enum rdma_nl_name_assign_type {
	RDMA_NAME_ASSIGN_TYPE_USER = 1, /* Provided by user-space */
};

/*
 * Event types carried in RDMA_NLDEV_ATTR_EVENT_TYPE by the rdma
 * monitoring interface. The numeric values are spelled out because they
 * are visible to user space and must not change.
 */
enum rdma_nl_notify_event_type {
	RDMA_REGISTER_EVENT = 0,	/* IB device registered */
	RDMA_UNREGISTER_EVENT = 1,	/* IB device unregistered */
	RDMA_NETDEV_ATTACH_EVENT = 2,	/* net device attached to a port */
	RDMA_NETDEV_DETACH_EVENT = 3,	/* net device detached from a port */
};

#endif /* _UAPI_RDMA_NETLINK_H */