Commit 7d9f8468 authored by Kent Overstreet's avatar Kent Overstreet
Browse files

bcachefs: Data update path won't accidentaly grow replicas



Previously, there was a bug where if an extent had greater durability
than required (because we needed to move a durability=1 pointer and
ended up putting it on a durability 2 device), we would submit a write
for replicas=2 - the durability of the pointer being rewritten - instead
of the number of replicas required to bring it back up to the
data_replicas option.

This, plus the allocation path sometimes allocating on a greater
durability device than requested, meant that extents could continue
having more and more replicas added as they were being rewritten.

Signed-off-by: default avatarKent Overstreet <kent.overstreet@linux.dev>
parent 0af8a06a
Loading
Loading
Loading
Loading
+82 −10
Original line number Diff line number Diff line
@@ -356,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update)
	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
}

void bch2_update_unwritten_extent(struct btree_trans *trans,
static void bch2_update_unwritten_extent(struct btree_trans *trans,
				  struct data_update *update)
{
	struct bch_fs *c = update->op.c;
@@ -436,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
	}
}

int bch2_extent_drop_ptrs(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k,
			  struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts.kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
		struct bch_extent_ptr *ptr;

		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts.kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize(c, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}

int bch2_data_update_init(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct moving_context *ctxt,
			  struct data_update *m,
			  struct write_point_specifier wp,
@@ -452,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans,
	const struct bch_extent_ptr *ptr;
	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
	unsigned ptrs_locked = 0;
	int ret;
	int ret = 0;

	bch2_bkey_buf_init(&m->k);
	bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -478,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans,
	bkey_for_each_ptr(ptrs, ptr)
		percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);

	unsigned durability_have = 0, durability_removing = 0;

	i = 0;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		bool locked;
@@ -489,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans,
				reserve_sectors += k.k->size;

			m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
		} else if (!p.ptr.cached) {
			durability_removing += bch2_extent_ptr_desired_durability(c, &p);
		} else if (!p.ptr.cached &&
			   !((1U << i) & m->data_opts.kill_ptrs)) {
			bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
			durability_have += bch2_extent_ptr_durability(c, &p);
		}

		/*
@@ -529,6 +578,29 @@ int bch2_data_update_init(struct btree_trans *trans,
		i++;
	}

	/*
	 * If current extent durability is less than io_opts.data_replicas,
	 * we're not trying to rereplicate the extent up to data_replicas here -
	 * unless extra_replicas was specified
	 *
	 * Increasing replication is an explicit operation triggered by
	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
	 */
	if (durability_have >= io_opts.data_replicas) {
		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
		m->data_opts.rewrite_ptrs = 0;
		/* if iter == NULL, it's just a promote */
		if (iter)
			ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		goto done;
	}

	m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
		m->data_opts.extra_replicas;
	m->op.nr_replicas_required = m->op.nr_replicas;

	BUG_ON(!m->op.nr_replicas);

	if (reserve_sectors) {
		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
				m->data_opts.extra_replicas
@@ -538,14 +610,11 @@ int bch2_data_update_init(struct btree_trans *trans,
			goto err;
	}

	m->op.nr_replicas += m->data_opts.extra_replicas;
	m->op.nr_replicas_required = m->op.nr_replicas;

	BUG_ON(!m->op.nr_replicas);
	if (bkey_extent_is_unwritten(k)) {
		bch2_update_unwritten_extent(trans, m);
		goto done;
	}

	/* Special handling required: */
	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_unwritten_extent_update;
	return 0;
err:
	i = 0;
@@ -560,6 +629,9 @@ int bch2_data_update_init(struct btree_trans *trans,
	bch2_bkey_buf_exit(&m->k, c);
	bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
	return ret;
done:
	bch2_data_update_exit(m);
	return ret ?: -BCH_ERR_data_update_done;
}

void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+7 −2
Original line number Diff line number Diff line
@@ -32,9 +32,14 @@ int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *,
				struct bch_extent_crc_unpacked);

int bch2_extent_drop_ptrs(struct btree_trans *,
			  struct btree_iter *,
			  struct bkey_s_c,
			  struct data_update_opts);

void bch2_data_update_exit(struct data_update *);
void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct moving_context *,
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
			  struct moving_context *,
			  struct data_update *,
			  struct write_point_specifier,
			  struct bch_io_opts, struct data_update_opts,
+1 −1
Original line number Diff line number Diff line
@@ -162,7 +162,7 @@
	x(BCH_ERR_fsck,			fsck_repair_unimplemented)		\
	x(BCH_ERR_fsck,			fsck_repair_impossible)			\
	x(0,				restart_recovery)			\
	x(0,				unwritten_extent_update)		\
	x(0,				data_update_done)			\
	x(EINVAL,			device_state_not_allowed)		\
	x(EINVAL,			member_info_missing)			\
	x(EINVAL,			mismatched_block_size)			\
+1 −1
Original line number Diff line number Diff line
@@ -209,7 +209,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, &op->write,
	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
+5 −53
Original line number Diff line number Diff line
@@ -229,49 +229,6 @@ void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

static int bch2_extent_drop_ptrs(struct btree_trans *trans,
				 struct btree_iter *iter,
				 struct bkey_s_c k,
				 struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts.kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
		struct bch_extent_ptr *ptr;

		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts.kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize(c, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}

int bch2_move_extent(struct moving_context *ctxt,
		     struct move_bucket_in_flight *bucket_in_flight,
		     struct btree_iter *iter,
@@ -341,19 +298,11 @@ int bch2_move_extent(struct moving_context *ctxt,
	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io		= move_read_endio;

	ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
				    io_opts, data_opts, iter->btree_id, k);
	if (ret && ret != -BCH_ERR_unwritten_extent_update)
	if (ret)
		goto err_free_pages;

	if (ret == -BCH_ERR_unwritten_extent_update) {
		bch2_update_unwritten_extent(trans, &io->write);
		move_free(io);
		return 0;
	}

	BUG_ON(ret);

	io->write.op.end_io = move_write_done;

	if (ctxt->rate)
@@ -397,6 +346,9 @@ int bch2_move_extent(struct moving_context *ctxt,
err_free:
	kfree(io);
err:
	if (ret == -BCH_ERR_data_update_done)
		return 0;

	this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
	trace_move_extent_alloc_mem_fail2(c, k);
	return ret;