Commit 94119eeb authored by Kent Overstreet
Browse files

bcachefs: Add IO error counts to bch_member



We now track IO errors per device since filesystem creation.

IO error counts can be viewed in sysfs, or with the 'bcachefs
show-super' command.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 5394fe94
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -502,6 +502,8 @@ struct bch_dev {
	 * Committed by bch2_write_super() -> bch_fs_mi_update()
	 */
	struct bch_member_cpu	mi;
	atomic64_t		errors[BCH_MEMBER_ERROR_NR];

	__uuid_t		uuid;
	char			name[BDEVNAME_SIZE];

+15 −0
Original line number Diff line number Diff line
@@ -1268,6 +1268,18 @@ enum bch_iops_measurement {
	BCH_IOPS_NR
};

/*
 * Per-device IO error categories, written as an x-macro list of
 * (name, index) pairs: read, write and checksum errors.
 *
 * The enum generated from this list indexes the errors[] counter arrays in
 * struct bch_dev and struct bch_member.
 *
 * NOTE(review): the indices size and index __le64 arrays in struct
 * bch_member, so they are presumably part of the on-disk format - confirm
 * before renumbering or reordering entries.
 */
#define BCH_MEMBER_ERROR_TYPES()		\
	x(read,		0)			\
	x(write,	1)			\
	x(checksum,	2)

/*
 * Expands each x(t, n) entry above into BCH_MEMBER_ERROR_<t> = n.
 * BCH_MEMBER_ERROR_NR follows the last entry, so it evaluates to one more
 * than the highest index (3 with the current list) and is used as the
 * array length for the per-type counters.
 */
enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
	BCH_MEMBER_ERROR_TYPES()
#undef x
	BCH_MEMBER_ERROR_NR
};

struct bch_member {
	__uuid_t		uuid;
	__le64			nbuckets;	/* device size */
@@ -1278,6 +1290,9 @@ struct bch_member {

	__le64			flags;
	__le32			iops[4];
	__le64			errors[BCH_MEMBER_ERROR_NR];
	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
	__le64			errors_reset_time;
};

#define BCH_MEMBER_V1_BYTES	56
+16 −7
Original line number Diff line number Diff line
@@ -934,8 +934,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
	while (b->written < (ptr_written ?: btree_sectors(c))) {
		unsigned sectors;
		struct nonce nonce;
		struct bch_csum csum;
		bool first = !b->written;
		bool csum_bad;

		if (!b->written) {
			i = &b->data->keys;
@@ -946,9 +946,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
				     BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);
			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);

			btree_err_on(bch2_crc_cmp(csum, b->data->csum),
			csum_bad = bch2_crc_cmp(b->data->csum,
				csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
				     "invalid checksum");

@@ -976,9 +980,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
				     BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);
			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
			csum_bad = bch2_crc_cmp(bne->csum,
				csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(bch2_crc_cmp(csum, bne->csum),
			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
				     "invalid checksum");

@@ -1168,7 +1175,8 @@ static void btree_node_read_work(struct work_struct *work)
start:
		printbuf_reset(&buf);
		bch2_btree_pos_to_text(&buf, c, b);
		bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
		bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				   "btree read error %s for %s",
				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
		if (rb->have_ioref)
			percpu_ref_put(&ca->io_ref);
@@ -1749,7 +1757,8 @@ static void btree_node_write_endio(struct bio *bio)
	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
+5 −1
Original line number Diff line number Diff line
@@ -373,7 +373,11 @@ static void ec_block_endio(struct bio *bio)
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
	if (bch2_dev_io_err_on(bio->bi_status, ca,
			       bio_data_dir(bio)
			       ? BCH_MEMBER_ERROR_write
			       : BCH_MEMBER_ERROR_read,
			       "erasure coding %s error: %s",
			       bio_data_dir(bio) ? "write" : "read",
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);
+2 −1
Original line number Diff line number Diff line
@@ -56,8 +56,9 @@ void bch2_io_error_work(struct work_struct *work)
	up_write(&c->state_lock);
}

void bch2_io_error(struct bch_dev *ca)
/*
 * Record one IO error of the given type against device @ca by atomically
 * incrementing the matching entry of ca->errors[].
 *
 * @ca:   device the error is accounted to
 * @type: error category; used directly as the index into ca->errors[],
 *        with no range check performed here
 *
 * NOTE(review): the queue_work() call below is commented out, so this
 * function does not schedule ca->io_error_work - confirm that is intended.
 */
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
	atomic64_inc(&ca->errors[type]);
	//queue_work(system_long_wq, &ca->io_error_work);
}

Loading