Commit b1137e0b authored by Alex Markuze, committed by Ilya Dryomov
Browse files

ceph: add subvolume metrics collection and reporting



Add complete infrastructure for per-subvolume I/O metrics collection
and reporting to the MDS. This enables administrators to monitor I/O
patterns at the subvolume granularity, which is useful for multi-tenant
CephFS deployments.

This patch adds:
- CEPHFS_FEATURE_SUBVOLUME_METRICS feature flag for MDS negotiation
- CEPH_SUBVOLUME_ID_NONE constant (0) for unknown/unset state
- Red-black tree based metrics tracker for efficient per-subvolume
  aggregation with kmem_cache for entry allocations
- Wire format encoding matching the MDS C++ AggregatedIOMetrics struct
- Integration with the existing CLIENT_METRICS message
- Recording of I/O operations from file read/write and writeback paths
- Debugfs interfaces for monitoring (metrics/subvolumes, metrics/metric_features)

Metrics tracked per subvolume include:
- Read/write operation counts
- Read/write byte counts
- Read/write latency sums (for average calculation)

The metrics are periodically sent to the MDS as part of the existing
metrics reporting infrastructure when the MDS advertises support for
the SUBVOLUME_METRICS feature.

CEPH_SUBVOLUME_ID_NONE enforces subvolume_id immutability. Following
the FUSE client convention, 0 means unknown/unset. Once an inode has
a valid (non-zero) subvolume_id, it should not change during the
inode's lifetime.

Signed-off-by: Alex Markuze <amarkuze@redhat.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent 4a1c5434
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
	export.o caps.o snap.o xattr.o quota.o io.o \
	mds_client.o mdsmap.o strings.o ceph_frag.o \
	debugfs.o util.o metric.o
	debugfs.o util.o metric.o subvolume_metrics.o

ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+14 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include "subvolume_metrics.h"
#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>
@@ -259,6 +260,10 @@ static void finish_netfs_read(struct ceph_osd_request *req)
					osd_data->length), false);
	}
	if (err > 0) {
		ceph_subvolume_metrics_record_io(fsc->mdsc, ceph_inode(inode),
						 false, err,
						 req->r_start_latency,
						 req->r_end_latency);
		subreq->transferred = err;
		err = 0;
	}
@@ -823,6 +828,10 @@ static int write_folio_nounlock(struct folio *folio,

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);
	if (err >= 0 && len > 0)
		ceph_subvolume_metrics_record_io(fsc->mdsc, ci, true, len,
						 req->r_start_latency,
						 req->r_end_latency);
	fscrypt_free_bounce_page(bounce_page);
	ceph_osdc_put_request(req);
	if (err == 0)
@@ -963,6 +972,11 @@ static void writepages_finish(struct ceph_osd_request *req)
	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, rc);

	if (rc >= 0 && len > 0)
		ceph_subvolume_metrics_record_io(mdsc, ci, true, len,
						 req->r_start_latency,
						 req->r_end_latency);

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);

	osd_data = osd_req_op_extent_osd_data(req, 0);
+157 −0
Original line number Diff line number Diff line
@@ -9,11 +9,13 @@
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/ktime.h>
#include <linux/atomic.h>

#include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
#include <linux/ceph/decode.h>

#include "super.h"

@@ -21,6 +23,36 @@

#include "mds_client.h"
#include "metric.h"
#include "subvolume_metrics.h"

/**
 * struct ceph_session_feature_desc - Maps feature bits to names for debugfs
 * @bit: Feature bit number from enum ceph_feature_type (see mds_client.h);
 *       handed to test_bit() against a copy of the session's s_features
 * @name: Human-readable feature name for debugfs output
 *
 * Element type of ceph_session_feature_table. metric_features_show() walks
 * that table and prints one "yes"/"no" line per entry for the session
 * referenced by mdsc->metric.session.
 */
struct ceph_session_feature_desc {
	unsigned int bit;
	const char *name;
};

/*
 * Session feature bits reported by metric_features_show(), in the order
 * they are printed.  Each entry pairs a CEPHFS_FEATURE_* bit number with
 * the label shown in debugfs.
 */
static const struct ceph_session_feature_desc ceph_session_feature_table[] = {
	{ .bit = CEPHFS_FEATURE_METRIC_COLLECT,
	  .name = "METRIC_COLLECT" },
	{ .bit = CEPHFS_FEATURE_REPLY_ENCODING,
	  .name = "REPLY_ENCODING" },
	{ .bit = CEPHFS_FEATURE_RECLAIM_CLIENT,
	  .name = "RECLAIM_CLIENT" },
	{ .bit = CEPHFS_FEATURE_LAZY_CAP_WANTED,
	  .name = "LAZY_CAP_WANTED" },
	{ .bit = CEPHFS_FEATURE_MULTI_RECONNECT,
	  .name = "MULTI_RECONNECT" },
	{ .bit = CEPHFS_FEATURE_DELEG_INO,
	  .name = "DELEG_INO" },
	{ .bit = CEPHFS_FEATURE_ALTERNATE_NAME,
	  .name = "ALTERNATE_NAME" },
	{ .bit = CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
	  .name = "NOTIFY_SESSION_STATE" },
	{ .bit = CEPHFS_FEATURE_OP_GETVXATTR,
	  .name = "OP_GETVXATTR" },
	{ .bit = CEPHFS_FEATURE_32BITS_RETRY_FWD,
	  .name = "32BITS_RETRY_FWD" },
	{ .bit = CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
	  .name = "NEW_SNAPREALM_INFO" },
	{ .bit = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
	  .name = "HAS_OWNER_UIDGID" },
	{ .bit = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
	  .name = "MDS_AUTH_CAPS_CHECK" },
	{ .bit = CEPHFS_FEATURE_SUBVOLUME_METRICS,
	  .name = "SUBVOLUME_METRICS" },
};

static int mdsmap_show(struct seq_file *s, void *p)
{
@@ -360,6 +392,59 @@ static int status_show(struct seq_file *s, void *p)
	return 0;
}

static int subvolume_metrics_show(struct seq_file *s, void *p)
{
	struct ceph_fs_client *fsc = s->private;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_subvol_metric_snapshot *snapshot = NULL;
	u32 nr = 0;
	u64 total_sent = 0;
	u64 nonzero_sends = 0;
	u32 i;

	if (!mdsc) {
		seq_puts(s, "mds client unavailable\n");
		return 0;
	}

	mutex_lock(&mdsc->subvol_metrics_last_mutex);
	if (mdsc->subvol_metrics_last && mdsc->subvol_metrics_last_nr) {
		nr = mdsc->subvol_metrics_last_nr;
		snapshot = kmemdup_array(mdsc->subvol_metrics_last, nr,
					 sizeof(*snapshot), GFP_KERNEL);
		if (!snapshot)
			nr = 0;
	}
	total_sent = mdsc->subvol_metrics_sent;
	nonzero_sends = mdsc->subvol_metrics_nonzero_sends;
	mutex_unlock(&mdsc->subvol_metrics_last_mutex);

	seq_puts(s, "Last sent subvolume metrics:\n");
	if (!nr) {
		seq_puts(s, "  (none)\n");
	} else {
		seq_puts(s, "  subvol_id          rd_ops    wr_ops    rd_bytes       wr_bytes       rd_lat_us      wr_lat_us\n");
		for (i = 0; i < nr; i++) {
			const struct ceph_subvol_metric_snapshot *e = &snapshot[i];

			seq_printf(s, "  %-18llu %-9llu %-9llu %-14llu %-14llu %-14llu %-14llu\n",
				   e->subvolume_id,
				   e->read_ops, e->write_ops,
				   e->read_bytes, e->write_bytes,
				   e->read_latency_us, e->write_latency_us);
		}
	}
	kfree(snapshot);

	seq_puts(s, "\nStatistics:\n");
	seq_printf(s, "  entries_sent:      %llu\n", total_sent);
	seq_printf(s, "  non_zero_sends:    %llu\n", nonzero_sends);

	seq_puts(s, "\nPending (unsent) subvolume metrics:\n");
	ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s);
	return 0;
}

DEFINE_SHOW_ATTRIBUTE(mdsmap);
DEFINE_SHOW_ATTRIBUTE(mdsc);
DEFINE_SHOW_ATTRIBUTE(caps);
@@ -369,7 +454,72 @@ DEFINE_SHOW_ATTRIBUTE(metrics_file);
DEFINE_SHOW_ATTRIBUTE(metrics_latency);
DEFINE_SHOW_ATTRIBUTE(metrics_size);
DEFINE_SHOW_ATTRIBUTE(metrics_caps);
DEFINE_SHOW_ATTRIBUTE(subvolume_metrics);

static int metric_features_show(struct seq_file *s, void *p)
{
	struct ceph_fs_client *fsc = s->private;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	unsigned long session_features = 0;
	bool have_session = false;
	bool metric_collect = false;
	bool subvol_support = false;
	bool metrics_enabled = false;
	bool subvol_enabled = false;
	int i;

	if (!mdsc) {
		seq_puts(s, "mds client unavailable\n");
		return 0;
	}

	mutex_lock(&mdsc->mutex);
	if (mdsc->metric.session) {
		have_session = true;
		session_features = mdsc->metric.session->s_features;
	}
	mutex_unlock(&mdsc->mutex);

	if (have_session) {
		metric_collect =
			test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
				 &session_features);
		subvol_support =
			test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
				 &session_features);
	}

	metrics_enabled = !disable_send_metrics && have_session && metric_collect;
	subvol_enabled = metrics_enabled && subvol_support;

	seq_printf(s,
		   "metrics_enabled: %s (disable_send_metrics=%d, session=%s, metric_collect=%s)\n",
		   metrics_enabled ? "yes" : "no",
		   disable_send_metrics ? 1 : 0,
		   have_session ? "yes" : "no",
		   metric_collect ? "yes" : "no");
	seq_printf(s, "subvolume_metrics_enabled: %s\n",
		   subvol_enabled ? "yes" : "no");
	seq_printf(s, "session_feature_bits: 0x%lx\n", session_features);

	if (!have_session) {
		seq_puts(s, "(no active MDS session for metrics)\n");
		return 0;
	}

	for (i = 0; i < ARRAY_SIZE(ceph_session_feature_table); i++) {
		const struct ceph_session_feature_desc *desc =
			&ceph_session_feature_table[i];
		bool set = test_bit(desc->bit, &session_features);

		seq_printf(s, "  %-24s : %s\n", desc->name,
			   set ? "yes" : "no");
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(metric_features);

/*
 * debugfs
@@ -404,6 +554,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
	debugfs_remove(fsc->debugfs_caps);
	debugfs_remove(fsc->debugfs_status);
	debugfs_remove(fsc->debugfs_mdsc);
	debugfs_remove(fsc->debugfs_subvolume_metrics);
	debugfs_remove_recursive(fsc->debugfs_metrics_dir);
	doutc(fsc->client, "done\n");
}
@@ -468,6 +619,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
			    &metrics_size_fops);
	debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
			    &metrics_caps_fops);
	debugfs_create_file("metric_features", 0400, fsc->debugfs_metrics_dir,
			    fsc, &metric_features_fops);
	fsc->debugfs_subvolume_metrics =
		debugfs_create_file("subvolumes", 0400,
				    fsc->debugfs_metrics_dir, fsc,
				    &subvolume_metrics_fops);
	doutc(fsc->client, "done\n");
}

+64 −4
Original line number Diff line number Diff line
@@ -19,6 +19,25 @@
#include "cache.h"
#include "io.h"
#include "metric.h"
#include "subvolume_metrics.h"

/*
 * Forward one completed I/O to the subvolume metrics tracker of the
 * inode's mds client.
 *
 * Zero-length transfers are dropped here.  Callers are still expected to
 * filter on their own result: reads only when ret > 0 (so EOF is not
 * counted as an operation), writes only when ret >= 0 && len > 0.
 */
static inline void ceph_record_subvolume_io(struct inode *inode, bool is_write,
					    ktime_t start, ktime_t end,
					    size_t bytes)
{
	if (bytes)
		ceph_subvolume_metrics_record_io(ceph_sb_to_mdsc(inode->i_sb),
						 ceph_inode(inode),
						 is_write, bytes, start, end);
}

static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{
@@ -1140,6 +1159,15 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
					 req->r_start_latency,
					 req->r_end_latency,
					 read_len, ret);
		/*
		 * Only record subvolume metrics for actual bytes read.
		 * ret == 0 means EOF (no data), not an I/O operation.
		 */
		if (ret > 0)
			ceph_record_subvolume_io(inode, false,
						 req->r_start_latency,
						 req->r_end_latency,
						 ret);

		if (ret > 0)
			objver = req->r_version;
@@ -1385,12 +1413,23 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)

	/* r_start_latency == 0 means the request was not submitted */
	if (req->r_start_latency) {
		if (aio_req->write)
		if (aio_req->write) {
			ceph_update_write_metrics(metric, req->r_start_latency,
						  req->r_end_latency, len, rc);
		else
			if (rc >= 0 && len)
				ceph_record_subvolume_io(inode, true,
							 req->r_start_latency,
							 req->r_end_latency,
							 len);
		} else {
			ceph_update_read_metrics(metric, req->r_start_latency,
						 req->r_end_latency, len, rc);
			if (rc > 0)
				ceph_record_subvolume_io(inode, false,
							 req->r_start_latency,
							 req->r_end_latency,
							 rc);
		}
	}

	put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
@@ -1614,12 +1653,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
		ceph_osdc_start_request(req->r_osdc, req);
		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);

		if (write)
		if (write) {
			ceph_update_write_metrics(metric, req->r_start_latency,
						  req->r_end_latency, len, ret);
		else
			if (ret >= 0 && len)
				ceph_record_subvolume_io(inode, true,
							 req->r_start_latency,
							 req->r_end_latency,
							 len);
		} else {
			ceph_update_read_metrics(metric, req->r_start_latency,
						 req->r_end_latency, len, ret);
			if (ret > 0)
				ceph_record_subvolume_io(inode, false,
							 req->r_start_latency,
							 req->r_end_latency,
							 ret);
		}

		size = i_size_read(inode);
		if (!write) {
@@ -1872,6 +1922,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
						 req->r_start_latency,
						 req->r_end_latency,
						 read_len, ret);
			if (ret > 0)
				ceph_record_subvolume_io(inode, false,
							 req->r_start_latency,
							 req->r_end_latency,
							 ret);

			/* Ok if object is not already present */
			if (ret == -ENOENT) {
@@ -2036,6 +2091,11 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,

		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
					  req->r_end_latency, len, ret);
		if (ret >= 0 && write_len)
			ceph_record_subvolume_io(inode, true,
						 req->r_start_latency,
						 req->r_end_latency,
						 write_len);
		ceph_osdc_put_request(req);
		if (ret != 0) {
			doutc(cl, "osd write returned %d\n", ret);
+33 −1
Original line number Diff line number Diff line
@@ -68,6 +68,21 @@ static void ceph_cap_reclaim_work(struct work_struct *work);

static const struct ceph_connection_operations mds_con_ops;

/*
 * Make @session the one referenced by mdsc->metric.session.
 *
 * Takes a reference on @session through ceph_get_mds_session(), stores the
 * result, releases the previously stored session (if there was one) and
 * then calls metric_schedule_delayed().  Does nothing when @mdsc or
 * @session is NULL or when disable_send_metrics is set.
 *
 * NOTE(review): the visible caller is handle_session(); confirm that the
 * locking there covers mdsc->metric.session.
 */
static void ceph_metric_bind_session(struct ceph_mds_client *mdsc,
				     struct ceph_mds_session *session)
{
	struct ceph_mds_session *prev;

	if (mdsc && session && !disable_send_metrics) {
		prev = mdsc->metric.session;
		mdsc->metric.session = ceph_get_mds_session(session);
		if (prev)
			ceph_put_mds_session(prev);

		metric_schedule_delayed(&mdsc->metric);
	}
}

/*
 * mds reply parsing
@@ -4347,6 +4362,11 @@ static void handle_session(struct ceph_mds_session *session,
		}
		mdsc->s_cap_auths_num = cap_auths_num;
		mdsc->s_cap_auths = cap_auths;

		session->s_features = features;
		if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
			     &session->s_features))
			ceph_metric_bind_session(mdsc, session);
	}
	if (op == CEPH_SESSION_CLOSE) {
		ceph_get_mds_session(session);
@@ -4373,7 +4393,11 @@ static void handle_session(struct ceph_mds_session *session,
			pr_info_client(cl, "mds%d reconnect success\n",
				       session->s_mds);

		session->s_features = features;
		if (test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
			     &session->s_features))
			ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, true);
		else
			ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, false);
		if (session->s_state == CEPH_MDS_SESSION_OPEN) {
			pr_notice_client(cl, "mds%d is already opened\n",
					 session->s_mds);
@@ -5616,6 +5640,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
	err = ceph_metric_init(&mdsc->metric);
	if (err)
		goto err_mdsmap;
	ceph_subvolume_metrics_init(&mdsc->subvol_metrics);
	mutex_init(&mdsc->subvol_metrics_last_mutex);
	mdsc->subvol_metrics_last = NULL;
	mdsc->subvol_metrics_last_nr = 0;
	mdsc->subvol_metrics_sent = 0;
	mdsc->subvol_metrics_nonzero_sends = 0;

	spin_lock_init(&mdsc->dentry_list_lock);
	INIT_LIST_HEAD(&mdsc->dentry_leases);
@@ -6149,6 +6179,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
	ceph_mdsc_stop(mdsc);

	ceph_metric_destroy(&mdsc->metric);
	ceph_subvolume_metrics_destroy(&mdsc->subvol_metrics);
	kfree(mdsc->subvol_metrics_last);

	fsc->mdsc = NULL;
	kfree(mdsc);
Loading