Commit ac2dc6d5 authored by Linus Torvalds
Browse files

Merge tag 'ceph-for-7.1-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "We have a series from Alex which extends CephFS client metrics with
  support for per-subvolume data I/O performance and latency tracking
  (metadata operations aren't included) and a good variety of fixes and
  cleanups across RBD and CephFS"

* tag 'ceph-for-7.1-rc1' of https://github.com/ceph/ceph-client:
  ceph: add subvolume metrics collection and reporting
  ceph: parse subvolume_id from InodeStat v9 and store in inode
  ceph: handle InodeStat v8 versioned field in reply parsing
  libceph: Fix slab-out-of-bounds access in auth message processing
  rbd: fix null-ptr-deref when device_add_disk() fails
  crush: cleanup in crush_do_rule() method
  ceph: clear s_cap_reconnect when ceph_pagelist_encode_32() fails
  ceph: only d_add() negative dentries when they are unhashed
  libceph: update outdated comment in ceph_sock_write_space()
  libceph: Remove obsolete session key alignment logic
  ceph: fix num_ops off-by-one when crypto allocation fails
  libceph: Prevent potential null-ptr-deref in ceph_handle_auth_reply()
parents ff9726d7 b1137e0b
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -7165,7 +7165,7 @@ static ssize_t do_rbd_add(const char *buf, size_t count)

	rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
	if (rc)
		goto err_out_cleanup_disk;
		goto err_out_device;

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
@@ -7179,8 +7179,8 @@ static ssize_t do_rbd_add(const char *buf, size_t count)
	module_put(THIS_MODULE);
	return rc;

err_out_cleanup_disk:
	rbd_free_disk(rbd_dev);
err_out_device:
	device_del(&rbd_dev->dev);
err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
+1 −1
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
	export.o caps.o snap.o xattr.o quota.o io.o \
	mds_client.o mdsmap.o strings.o ceph_frag.o \
	debugfs.o util.o metric.o
	debugfs.o util.o metric.o subvolume_metrics.o

ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+18 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include "subvolume_metrics.h"
#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>
@@ -259,6 +260,10 @@ static void finish_netfs_read(struct ceph_osd_request *req)
					osd_data->length), false);
	}
	if (err > 0) {
		ceph_subvolume_metrics_record_io(fsc->mdsc, ceph_inode(inode),
						 false, err,
						 req->r_start_latency,
						 req->r_end_latency);
		subreq->transferred = err;
		err = 0;
	}
@@ -823,6 +828,10 @@ static int write_folio_nounlock(struct folio *folio,

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);
	if (err >= 0 && len > 0)
		ceph_subvolume_metrics_record_io(fsc->mdsc, ci, true, len,
						 req->r_start_latency,
						 req->r_end_latency);
	fscrypt_free_bounce_page(bounce_page);
	ceph_osdc_put_request(req);
	if (err == 0)
@@ -963,6 +972,11 @@ static void writepages_finish(struct ceph_osd_request *req)
	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, rc);

	if (rc >= 0 && len > 0)
		ceph_subvolume_metrics_record_io(mdsc, ci, true, len,
						 req->r_start_latency,
						 req->r_end_latency);

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);

	osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -1365,6 +1379,10 @@ void ceph_process_folio_batch(struct address_space *mapping,
		rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
				folio);
		if (rc) {
			/* Did we just begin a new contiguous op? Nevermind! */
			if (ceph_wbc->len == 0)
				ceph_wbc->num_ops--;

			folio_redirty_for_writepage(wbc, folio);
			folio_unlock(folio);
			break;
+157 −0
Original line number Diff line number Diff line
@@ -9,11 +9,13 @@
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/ktime.h>
#include <linux/atomic.h>

#include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
#include <linux/ceph/decode.h>

#include "super.h"

@@ -21,6 +23,36 @@

#include "mds_client.h"
#include "metric.h"
#include "subvolume_metrics.h"

/**
 * struct ceph_session_feature_desc - Maps feature bits to names for debugfs
 * @bit: Feature bit number from enum ceph_feature_type (see mds_client.h).
 *       This is a bit index, not a mask: it is handed to test_bit()
 *       against a copy of the session's s_features word.
 * @name: Human-readable feature name for debugfs output, printed with
 *        a "%s" conversion next to a yes/no flag.
 *
 * Used by metric_features_show() to display negotiated session features;
 * one descriptor is emitted per output line.
 */
struct ceph_session_feature_desc {
	unsigned int bit;
	const char *name;
};

/*
 * Feature bits reported by metric_features_show(), listed in the order
 * in which they are printed.  Each entry pairs a CEPHFS_FEATURE_* bit
 * number with the label shown for it.
 */
static const struct ceph_session_feature_desc ceph_session_feature_table[] = {
	{ .bit = CEPHFS_FEATURE_METRIC_COLLECT,
	  .name = "METRIC_COLLECT" },
	{ .bit = CEPHFS_FEATURE_REPLY_ENCODING,
	  .name = "REPLY_ENCODING" },
	{ .bit = CEPHFS_FEATURE_RECLAIM_CLIENT,
	  .name = "RECLAIM_CLIENT" },
	{ .bit = CEPHFS_FEATURE_LAZY_CAP_WANTED,
	  .name = "LAZY_CAP_WANTED" },
	{ .bit = CEPHFS_FEATURE_MULTI_RECONNECT,
	  .name = "MULTI_RECONNECT" },
	{ .bit = CEPHFS_FEATURE_DELEG_INO,
	  .name = "DELEG_INO" },
	{ .bit = CEPHFS_FEATURE_ALTERNATE_NAME,
	  .name = "ALTERNATE_NAME" },
	{ .bit = CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
	  .name = "NOTIFY_SESSION_STATE" },
	{ .bit = CEPHFS_FEATURE_OP_GETVXATTR,
	  .name = "OP_GETVXATTR" },
	{ .bit = CEPHFS_FEATURE_32BITS_RETRY_FWD,
	  .name = "32BITS_RETRY_FWD" },
	{ .bit = CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
	  .name = "NEW_SNAPREALM_INFO" },
	{ .bit = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
	  .name = "HAS_OWNER_UIDGID" },
	{ .bit = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
	  .name = "MDS_AUTH_CAPS_CHECK" },
	{ .bit = CEPHFS_FEATURE_SUBVOLUME_METRICS,
	  .name = "SUBVOLUME_METRICS" },
};

static int mdsmap_show(struct seq_file *s, void *p)
{
@@ -360,6 +392,59 @@ static int status_show(struct seq_file *s, void *p)
	return 0;
}

static int subvolume_metrics_show(struct seq_file *s, void *p)
{
	struct ceph_fs_client *fsc = s->private;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_subvol_metric_snapshot *snapshot = NULL;
	u32 nr = 0;
	u64 total_sent = 0;
	u64 nonzero_sends = 0;
	u32 i;

	if (!mdsc) {
		seq_puts(s, "mds client unavailable\n");
		return 0;
	}

	mutex_lock(&mdsc->subvol_metrics_last_mutex);
	if (mdsc->subvol_metrics_last && mdsc->subvol_metrics_last_nr) {
		nr = mdsc->subvol_metrics_last_nr;
		snapshot = kmemdup_array(mdsc->subvol_metrics_last, nr,
					 sizeof(*snapshot), GFP_KERNEL);
		if (!snapshot)
			nr = 0;
	}
	total_sent = mdsc->subvol_metrics_sent;
	nonzero_sends = mdsc->subvol_metrics_nonzero_sends;
	mutex_unlock(&mdsc->subvol_metrics_last_mutex);

	seq_puts(s, "Last sent subvolume metrics:\n");
	if (!nr) {
		seq_puts(s, "  (none)\n");
	} else {
		seq_puts(s, "  subvol_id          rd_ops    wr_ops    rd_bytes       wr_bytes       rd_lat_us      wr_lat_us\n");
		for (i = 0; i < nr; i++) {
			const struct ceph_subvol_metric_snapshot *e = &snapshot[i];

			seq_printf(s, "  %-18llu %-9llu %-9llu %-14llu %-14llu %-14llu %-14llu\n",
				   e->subvolume_id,
				   e->read_ops, e->write_ops,
				   e->read_bytes, e->write_bytes,
				   e->read_latency_us, e->write_latency_us);
		}
	}
	kfree(snapshot);

	seq_puts(s, "\nStatistics:\n");
	seq_printf(s, "  entries_sent:      %llu\n", total_sent);
	seq_printf(s, "  non_zero_sends:    %llu\n", nonzero_sends);

	seq_puts(s, "\nPending (unsent) subvolume metrics:\n");
	ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s);
	return 0;
}

DEFINE_SHOW_ATTRIBUTE(mdsmap);
DEFINE_SHOW_ATTRIBUTE(mdsc);
DEFINE_SHOW_ATTRIBUTE(caps);
@@ -369,7 +454,72 @@ DEFINE_SHOW_ATTRIBUTE(metrics_file);
DEFINE_SHOW_ATTRIBUTE(metrics_latency);
DEFINE_SHOW_ATTRIBUTE(metrics_size);
DEFINE_SHOW_ATTRIBUTE(metrics_caps);
DEFINE_SHOW_ATTRIBUTE(subvolume_metrics);

static int metric_features_show(struct seq_file *s, void *p)
{
	struct ceph_fs_client *fsc = s->private;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	unsigned long session_features = 0;
	bool have_session = false;
	bool metric_collect = false;
	bool subvol_support = false;
	bool metrics_enabled = false;
	bool subvol_enabled = false;
	int i;

	if (!mdsc) {
		seq_puts(s, "mds client unavailable\n");
		return 0;
	}

	mutex_lock(&mdsc->mutex);
	if (mdsc->metric.session) {
		have_session = true;
		session_features = mdsc->metric.session->s_features;
	}
	mutex_unlock(&mdsc->mutex);

	if (have_session) {
		metric_collect =
			test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
				 &session_features);
		subvol_support =
			test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS,
				 &session_features);
	}

	metrics_enabled = !disable_send_metrics && have_session && metric_collect;
	subvol_enabled = metrics_enabled && subvol_support;

	seq_printf(s,
		   "metrics_enabled: %s (disable_send_metrics=%d, session=%s, metric_collect=%s)\n",
		   metrics_enabled ? "yes" : "no",
		   disable_send_metrics ? 1 : 0,
		   have_session ? "yes" : "no",
		   metric_collect ? "yes" : "no");
	seq_printf(s, "subvolume_metrics_enabled: %s\n",
		   subvol_enabled ? "yes" : "no");
	seq_printf(s, "session_feature_bits: 0x%lx\n", session_features);

	if (!have_session) {
		seq_puts(s, "(no active MDS session for metrics)\n");
		return 0;
	}

	for (i = 0; i < ARRAY_SIZE(ceph_session_feature_table); i++) {
		const struct ceph_session_feature_desc *desc =
			&ceph_session_feature_table[i];
		bool set = test_bit(desc->bit, &session_features);

		seq_printf(s, "  %-24s : %s\n", desc->name,
			   set ? "yes" : "no");
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(metric_features);

/*
 * debugfs
@@ -404,6 +554,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
	debugfs_remove(fsc->debugfs_caps);
	debugfs_remove(fsc->debugfs_status);
	debugfs_remove(fsc->debugfs_mdsc);
	debugfs_remove(fsc->debugfs_subvolume_metrics);
	debugfs_remove_recursive(fsc->debugfs_metrics_dir);
	doutc(fsc->client, "done\n");
}
@@ -468,6 +619,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
			    &metrics_size_fops);
	debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
			    &metrics_caps_fops);
	debugfs_create_file("metric_features", 0400, fsc->debugfs_metrics_dir,
			    fsc, &metric_features_fops);
	fsc->debugfs_subvolume_metrics =
		debugfs_create_file("subvolumes", 0400,
				    fsc->debugfs_metrics_dir, fsc,
				    &subvolume_metrics_fops);
	doutc(fsc->client, "done\n");
}

+4 −2
Original line number Diff line number Diff line
@@ -769,6 +769,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				d_drop(dentry);
				err = -ENOENT;
			} else {
				if (d_unhashed(dentry))
					d_add(dentry, NULL);
			}
		}
@@ -840,6 +841,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
			spin_unlock(&ci->i_ceph_lock);
			doutc(cl, " dir %llx.%llx complete, -ENOENT\n",
			      ceph_vinop(dir));
			if (d_unhashed(dentry))
				d_add(dentry, NULL);
			di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
			return NULL;
Loading