Commit 7a7d17b2 authored by Kent Overstreet, committed by Kent Overstreet
Browse files

bcachefs: Whiteouts for snapshots



This patch adds KEY_TYPE_whiteout, a new type of whiteout for snapshots,
used when we're deleting and the key being deleted is in an ancestor
snapshot - and updates the transaction update/commit path to use it.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
parent 8c6d298a
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -327,7 +327,7 @@ static inline void bkey_init(struct bkey *k)
*/
#define BCH_BKEY_TYPES()				\
	x(deleted,		0)			\
	x(discard,		1)			\
	x(whiteout,		1)			\
	x(error,		2)			\
	x(cookie,		3)			\
	x(hash_whiteout,	4)			\
@@ -361,7 +361,7 @@ struct bch_deleted {
	struct bch_val		v;
};

struct bch_discard {
struct bch_whiteout {
	struct bch_val		v;
};

+1 −1
Original line number Diff line number Diff line
@@ -63,7 +63,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)

#define bkey_whiteout(_k)				\
	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)

enum bkey_lr_packed {
	BKEY_PACKED_BOTH,
+19 −7
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
	.key_invalid = deleted_key_invalid,		\
}

#define bch2_bkey_ops_discard (struct bkey_ops) {	\
#define bch2_bkey_ops_whiteout (struct bkey_ops) {	\
	.key_invalid = deleted_key_invalid,		\
}

@@ -101,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)

static unsigned bch2_key_types_allowed[] = {
	[BKEY_TYPE_extents] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_whiteout)|
		(1U << KEY_TYPE_error)|
		(1U << KEY_TYPE_cookie)|
		(1U << KEY_TYPE_extent)|
@@ -108,30 +110,43 @@ static unsigned bch2_key_types_allowed[] = {
		(1U << KEY_TYPE_reflink_p)|
		(1U << KEY_TYPE_inline_data),
	[BKEY_TYPE_inodes] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_whiteout)|
		(1U << KEY_TYPE_inode)|
		(1U << KEY_TYPE_inode_generation),
	[BKEY_TYPE_dirents] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_whiteout)|
		(1U << KEY_TYPE_hash_whiteout)|
		(1U << KEY_TYPE_dirent),
	[BKEY_TYPE_xattrs] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_whiteout)|
		(1U << KEY_TYPE_cookie)|
		(1U << KEY_TYPE_hash_whiteout)|
		(1U << KEY_TYPE_xattr),
	[BKEY_TYPE_alloc] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_alloc)|
		(1U << KEY_TYPE_alloc_v2),
	[BKEY_TYPE_quotas] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_quota),
	[BKEY_TYPE_stripes] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_stripe),
	[BKEY_TYPE_reflink] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_reflink_v)|
		(1U << KEY_TYPE_indirect_inline_data),
	[BKEY_TYPE_subvolumes] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_subvolume),
	[BKEY_TYPE_snapshots] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_snapshot),
	[BKEY_TYPE_btree] =
		(1U << KEY_TYPE_deleted)|
		(1U << KEY_TYPE_btree_ptr)|
		(1U << KEY_TYPE_btree_ptr_v2),
};
@@ -139,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = {
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
				enum btree_node_type type)
{
	unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
		bch2_key_types_allowed[type] ;

	if (k.k->u64s < BKEY_U64s)
		return "u64s too small";

	if (!(key_types_allowed & (1U << k.k->type)))
	if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
		return "invalid key type for this btree";

	if (type == BKEY_TYPE_btree &&
	    bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
		return "value too big";

	if (btree_node_type_is_extents(type)) {
		if ((k.k->size == 0) != bkey_deleted(k.k))
	if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
		if (k.k->size == 0)
			return "bad size field";

		if (k.k->size > k.k->p.offset)
+105 −8
Original line number Diff line number Diff line
@@ -1002,21 +1002,24 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
		goto next;
	}

	if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k)))
	if (!bkey_cmp(k.k->p, start))
		goto next;

	while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
		bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
		bool back_split  = bkey_cmp(k.k->p, insert->k.p) > 0;

		/*
		 * If we're going to be splitting a compressed extent, note it
		 * so that __bch2_trans_commit() can increase our disk
		 * reservation:
		 */
		if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
		    bkey_cmp(k.k->p, insert->k.p) > 0 &&
		if (((front_split && back_split) ||
		     ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
		    (compressed_sectors = bch2_bkey_sectors_compressed(k)))
			trans->extra_journal_res += compressed_sectors;

		if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
		if (front_split) {
			update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
			if ((ret = PTR_ERR_OR_ZERO(update)))
				goto err;
@@ -1027,6 +1030,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,

			bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
					     BTREE_ITER_NOT_EXTENTS|
					     BTREE_ITER_ALL_SNAPSHOTS|
					     BTREE_ITER_INTENT);
			ret   = bch2_btree_iter_traverse(&update_iter) ?:
				bch2_trans_update(trans, &update_iter, update,
						  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
						  flags);
			bch2_trans_iter_exit(trans, &update_iter);

			if (ret)
				goto err;
		}

		if (k.k->p.snapshot != insert->k.p.snapshot &&
		    (front_split || back_split)) {
			update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
			if ((ret = PTR_ERR_OR_ZERO(update)))
				goto err;

			bkey_reassemble(update, k);

			bch2_cut_front(start, update);
			bch2_cut_back(insert->k.p, update);

			bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
					     BTREE_ITER_NOT_EXTENTS|
					     BTREE_ITER_ALL_SNAPSHOTS|
					     BTREE_ITER_INTENT);
			ret   = bch2_btree_iter_traverse(&update_iter) ?:
				bch2_trans_update(trans, &update_iter, update,
@@ -1038,12 +1067,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
		}

		if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
			ret = bch2_btree_delete_at(trans, &iter, flags);
			update = bch2_trans_kmalloc(trans, sizeof(*update));
			if ((ret = PTR_ERR_OR_ZERO(update)))
				goto err;

			bkey_init(&update->k);
			update->k.p = k.k->p;

			if (insert->k.p.snapshot != k.k->p.snapshot) {
				update->k.p.snapshot = insert->k.p.snapshot;
				update->k.type = KEY_TYPE_whiteout;
			}

			bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
					     BTREE_ITER_NOT_EXTENTS|
					     BTREE_ITER_INTENT);
			ret   = bch2_btree_iter_traverse(&update_iter) ?:
				bch2_trans_update(trans, &update_iter, update,
						  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
						  flags);
			bch2_trans_iter_exit(trans, &update_iter);

			if (ret)
				goto err;
		}

		if (bkey_cmp(k.k->p, insert->k.p) > 0) {
		if (back_split) {
			update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
			if ((ret = PTR_ERR_OR_ZERO(update)))
				goto err;
@@ -1051,10 +1100,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
			bkey_reassemble(update, k);
			bch2_cut_front(insert->k.p, update);

			ret = bch2_trans_update(trans, &iter, update, flags);
			bch2_trans_copy_iter(&update_iter, &iter);
			update_iter.pos = update->k.p;
			ret   = bch2_trans_update(trans, &update_iter, update,
						  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
						  flags);
			bch2_trans_iter_exit(trans, &update_iter);

			if (ret)
				goto err;

			goto out;
		}
next:
@@ -1086,6 +1140,39 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
	return ret;
}

/*
 * Decide whether a deletion at @pos has to be recorded as a whiteout rather
 * than a plain deleted key.
 *
 * Keys at the same position as @pos are scanned across all snapshots,
 * starting from snapshot ID pos.snapshot + 1.  The scan stops at the first key
 * whose snapshot bch2_snapshot_is_ancestor() accepts for our snapshot; a
 * whiteout is needed only if that key is not itself a whiteout/deleted key.
 *
 * Returns 1 if a whiteout is required and 0 if not.  Any other value is
 * whatever for_each_btree_key() stored in ret (presumably a negative error
 * code - the caller treats ret < 0 as failure).
 */
static int need_whiteout_for_snapshot(struct btree_trans *trans,
				      enum btree_id btree_id, struct bpos pos)
{
	const u32 our_snapshot = pos.snapshot;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/* Nothing to check when this snapshot has no parent. */
	if (!bch2_snapshot_parent(trans->c, our_snapshot))
		return 0;

	/* Begin the scan just past our own snapshot ID. */
	pos.snapshot = our_snapshot + 1;

	for_each_btree_key(trans, iter, btree_id, pos,
			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
		/* Walked off the position we care about: no ancestor key. */
		if (bkey_cmp(k.k->p, pos) != 0)
			break;

		if (bch2_snapshot_is_ancestor(trans->c, our_snapshot,
					      k.k->p.snapshot)) {
			/* Only a live ancestor key needs to be masked. */
			ret = bkey_whiteout(k.k) ? 0 : 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
		      struct bkey_i *k, enum btree_update_flags flags)
{
@@ -1118,6 +1205,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
		       btree_insert_entry_cmp(i - 1, i) >= 0);
#endif

	if (bkey_deleted(&n.k->k) &&
	    (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
		int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
		if (unlikely(ret < 0))
			return ret;

		if (ret)
			n.k->k.type = KEY_TYPE_whiteout;
	}

	/*
	 * Pending updates are kept sorted: first, find position of new update,
	 * then delete/trim any updates the new update overwrites: