Commit e76e0e3f authored by Darrick J. Wong
Browse files

xfs: convey externally discovered fsdax media errors to the health monitor



Connect the fsdax media failure notification code to the health monitor
so that xfs can send media error events to the xfs_healer daemon.

Later on we'll add the ability for the xfs_scrub media scan (phase 6) to
report the errors that it finds to the kernel so that those are also
logged by xfs_healer.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
parent 74c4795e
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -1014,6 +1014,11 @@ struct xfs_rtgroup_geometry {
#define XFS_HEALTH_MONITOR_DOMAIN_INODE		(3)
#define XFS_HEALTH_MONITOR_DOMAIN_RTGROUP	(4)

/* disk events: the event concerns the data, realtime, or log device */
#define XFS_HEALTH_MONITOR_DOMAIN_DATADEV	(5)
#define XFS_HEALTH_MONITOR_DOMAIN_RTDEV		(6)
#define XFS_HEALTH_MONITOR_DOMAIN_LOGDEV	(7)

/* Health monitor event types */

/* status of the monitor itself */
@@ -1031,6 +1036,9 @@ struct xfs_rtgroup_geometry {
/* filesystem shutdown */
#define XFS_HEALTH_MONITOR_TYPE_SHUTDOWN	(6)

/* media errors: a range of a device was reported as failed */
#define XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR	(7)

/* lost events */
struct xfs_health_monitor_lost {
	__u64	count;
@@ -1071,6 +1079,12 @@ struct xfs_health_monitor_shutdown {
	__u32	reasons;
};

/*
 * Disk media errors: describes the range of a device that was affected by
 * a media error.  The range is [daddr, daddr + bbcount).
 */
struct xfs_health_monitor_media {
	/* disk address at which the affected range starts */
	__u64	daddr;
	/*
	 * Length of the affected range, in the same units as daddr.
	 * NOTE(review): presumably 512-byte basic blocks -- confirm.
	 */
	__u64	bbcount;
};

struct xfs_health_monitor_event {
	/* XFS_HEALTH_MONITOR_DOMAIN_* */
	__u32	domain;
@@ -1092,6 +1106,7 @@ struct xfs_health_monitor_event {
		struct xfs_health_monitor_group group;
		struct xfs_health_monitor_inode inode;
		struct xfs_health_monitor_shutdown shutdown;
		struct xfs_health_monitor_media media;
	} e;

	/* zeroes */
+66 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_fsops.h"
#include "xfs_notify_failure.h"

#include <linux/anon_inodes.h>
#include <linux/eventpoll.h>
@@ -208,6 +209,19 @@ xfs_healthmon_merge_events(
		/* yes, we can race to shutdown */
		existing->flags |= new->flags;
		return true;

	case XFS_HEALTHMON_MEDIA_ERROR:
		/* physically adjacent errors can merge */
		if (existing->daddr + existing->bbcount == new->daddr) {
			existing->bbcount += new->bbcount;
			return true;
		}
		if (new->daddr + new->bbcount == existing->daddr) {
			existing->daddr = new->daddr;
			existing->bbcount += new->bbcount;
			return true;
		}
		return false;
	}

	return false;
@@ -522,6 +536,48 @@ xfs_healthmon_report_shutdown(
	xfs_healthmon_put(hm);
}

/*
 * Translate the device on which a media error was seen into the health
 * monitor domain used to report it.  Only the data, log, and realtime
 * devices are recognised; any other value trips the assertion and the
 * function falls back to returning 0.
 */
static inline enum xfs_healthmon_domain
media_error_domain(
	enum xfs_device			fdev)
{
	if (fdev == XFS_DEV_DATA)
		return XFS_HEALTHMON_DATADEV;
	if (fdev == XFS_DEV_LOG)
		return XFS_HEALTHMON_LOGDEV;
	if (fdev == XFS_DEV_RT)
		return XFS_HEALTHMON_RTDEV;

	/* not a device we know how to report on */
	ASSERT(0);
	return 0;
}

/*
 * Queue a media error event for the health monitor attached to @mp.
 *
 * @fdev selects the reporting domain (data, log, or realtime device) and
 * [@daddr, @daddr + @bbcount) is the affected range on that device.  If no
 * health monitor can be obtained for this mount, nothing is reported.
 */
void
xfs_healthmon_report_media(
	struct xfs_mount		*mp,
	enum xfs_device			fdev,
	xfs_daddr_t			daddr,
	uint64_t			bbcount)
{
	/* Resolve the domain up front, before we look for a monitor. */
	const enum xfs_healthmon_domain	domain = media_error_domain(fdev);
	struct xfs_healthmon_event	media_event = {
		.type			= XFS_HEALTHMON_MEDIA_ERROR,
		.domain			= domain,
		.daddr			= daddr,
		.bbcount		= bbcount,
	};
	struct xfs_healthmon		*hm;

	hm = xfs_healthmon_get(mp);
	if (hm) {
		trace_xfs_healthmon_report_media(hm, fdev, &media_event);

		xfs_healthmon_push(hm, &media_event);
		/* pairs with the xfs_healthmon_get() above */
		xfs_healthmon_put(hm);
	}
}

static inline void
xfs_healthmon_reset_outbuf(
	struct xfs_healthmon		*hm)
@@ -574,6 +630,9 @@ static const unsigned int domain_map[] = {
	[XFS_HEALTHMON_AG]		= XFS_HEALTH_MONITOR_DOMAIN_AG,
	[XFS_HEALTHMON_INODE]		= XFS_HEALTH_MONITOR_DOMAIN_INODE,
	[XFS_HEALTHMON_RTGROUP]		= XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
	[XFS_HEALTHMON_DATADEV]		= XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
	[XFS_HEALTHMON_RTDEV]		= XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
	[XFS_HEALTHMON_LOGDEV]		= XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
};

static const unsigned int type_map[] = {
@@ -584,6 +643,7 @@ static const unsigned int type_map[] = {
	[XFS_HEALTHMON_HEALTHY]		= XFS_HEALTH_MONITOR_TYPE_HEALTHY,
	[XFS_HEALTHMON_UNMOUNT]		= XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
	[XFS_HEALTHMON_SHUTDOWN]	= XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
	[XFS_HEALTHMON_MEDIA_ERROR]	= XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
};

/* Render event as a V0 structure */
@@ -635,6 +695,12 @@ xfs_healthmon_format_v0(
		hme.e.inode.ino = event->ino;
		hme.e.inode.gen = event->gen;
		break;
	case XFS_HEALTHMON_DATADEV:
	case XFS_HEALTHMON_LOGDEV:
	case XFS_HEALTHMON_RTDEV:
		hme.e.media.daddr = event->daddr;
		hme.e.media.bbcount = event->bbcount;
		break;
	default:
		break;
	}
+16 −0
Original line number Diff line number Diff line
@@ -79,6 +79,9 @@ enum xfs_healthmon_type {
	XFS_HEALTHMON_SICK,	/* runtime corruption observed */
	XFS_HEALTHMON_CORRUPT,	/* fsck reported corruption */
	XFS_HEALTHMON_HEALTHY,	/* fsck reported healthy structure */

	/* media errors */
	XFS_HEALTHMON_MEDIA_ERROR,
};

enum xfs_healthmon_domain {
@@ -89,6 +92,11 @@ enum xfs_healthmon_domain {
	XFS_HEALTHMON_AG,	/* allocation group metadata */
	XFS_HEALTHMON_INODE,	/* inode metadata */
	XFS_HEALTHMON_RTGROUP,	/* realtime group metadata */

	/* media errors */
	XFS_HEALTHMON_DATADEV,
	XFS_HEALTHMON_RTDEV,
	XFS_HEALTHMON_LOGDEV,
};

struct xfs_healthmon_event {
@@ -126,6 +134,11 @@ struct xfs_healthmon_event {
		struct {
			unsigned int	flags;
		};
		/* media errors */
		struct {
			xfs_daddr_t	daddr;
			uint64_t	bbcount;
		};
	};
};

@@ -141,6 +154,9 @@ void xfs_healthmon_report_inode(struct xfs_inode *ip,

void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags);

/*
 * Report a media error covering bbcount units starting at daddr on the
 * device identified by fdev.
 */
void xfs_healthmon_report_media(struct xfs_mount *mp, enum xfs_device fdev,
		xfs_daddr_t daddr, uint64_t bbcount);

long xfs_ioc_health_monitor(struct file *file,
		struct xfs_health_monitor __user *arg);

+12 −5
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include "xfs_notify_failure.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_healthmon.h"

#include <linux/mm.h>
#include <linux/dax.h>
@@ -219,6 +220,8 @@ xfs_dax_notify_logdev_failure(
	if (error)
		return error;

	xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen);

	/*
	 * In the pre-remove case the failure notification is attempting to
	 * trigger a force unmount.  The expectation is that the device is
@@ -252,16 +255,20 @@ xfs_dax_notify_dev_failure(
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	xfs_healthmon_report_media(mp,
			type == XG_TYPE_RTG ?  XFS_DEV_RT : XFS_DEV_DATA,
			daddr, bblen);

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
+1 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@
#include "xfs_zone_priv.h"
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_notify_failure.h"

/*
 * We include this last to have the helpers above available for the trace
Loading