Commit f0c3f88b authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet
Browse files

bcachefs: Run insert triggers before overwrite triggers



Currently, btree triggers are run in natural key order, which presents a
problem for fallocate in INSERT_RANGE mode: since we're moving existing
extents to higher offsets, the trigger for deleting the old extent runs
before the trigger that adds the new extent, potentially leading to
indirect extents being deleted that shouldn't be when the delete causes
the refcount to hit 0.

This changes the order we run triggers so that for a given btree, we run
all insert triggers before overwrite triggers, nicely sidestepping this
issue.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
parent c714614b
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -338,7 +338,8 @@ struct btree_insert_entry {
	enum btree_id		btree_id:8;
	u8			level;
	bool			cached:1;
	bool			trans_triggers_run:1;
	bool			insert_trigger_run:1;
	bool			overwrite_trigger_run:1;
	struct bkey_i		*k;
	struct btree_path	*path;
	unsigned long		ip_allocated;
+107 −26
Original line number Diff line number Diff line
@@ -816,10 +816,112 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
	return 0;
}

/*
 * Run the transactional triggers for the updates queued in @trans.
 *
 * Updates are processed one btree ID at a time. Within a btree ID, every
 * insert trigger is run first (the pass is repeated until it runs nothing
 * new), and only then are the overwrite triggers run, in the same fashion.
 *
 * Entries flagged BTREE_TRIGGER_NORUN, and entries whose bkey_type is not
 * in BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS, are skipped entirely.
 *
 * Returns 0 on success, otherwise the first nonzero value returned by
 * bch2_trans_mark_key(). A return of -EINTR is additionally reported via
 * trace_trans_restart_mark().
 */
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
	/* Placeholder key standing in for "no key"; its position is set per entry */
	struct bkey		_deleted = KEY(0, 0, 0);
	struct bkey_s_c		deleted = (struct bkey_s_c) { &_deleted, NULL };
	struct bkey_s_c		old;
	struct bkey		unpacked;
	struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
	bool trans_trigger_run;
	unsigned btree_id = 0;
	int ret = 0;

	/*
	 * For a given btree, this algorithm runs insert triggers before
	 * overwrite triggers: this is so that when extents are being moved
	 * (e.g. by FALLOC_FL_INSERT_RANGE), we don't drop references before
	 * they are re-added.
	 */
	for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
		/*
		 * Skip forward past entries belonging to lower btree IDs.
		 * NOTE(review): this assumes trans->updates is kept ordered by
		 * btree_id - confirm against bch2_trans_update().
		 */
		while (btree_id_start < trans->updates + trans->nr_updates &&
		       btree_id_start->btree_id < btree_id)
			btree_id_start++;

		/*
		 * Running triggers will append more updates to the list of updates as
		 * we're walking it:
		 */
		/* Pass 1: insert triggers, repeated until a full walk runs none */
		do {
			trans_trigger_run = false;

			for (i = btree_id_start;
			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
			     i++) {
				if (i->insert_trigger_run ||
				    (i->flags & BTREE_TRIGGER_NORUN) ||
				    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
					continue;

				/* An overwrite trigger must never run ahead of its insert trigger */
				BUG_ON(i->overwrite_trigger_run);

				/*
				 * The entry is marked as done before the trigger is
				 * invoked.
				 * NOTE(review): presumably because the trigger can
				 * change the update list, after which @i may no longer
				 * refer to this entry - confirm.
				 */
				i->insert_trigger_run = true;
				trans_trigger_run = true;

				old = bch2_btree_path_peek_slot(i->path, &unpacked);
				_deleted.p = i->path->pos;

				if (old.k->type == i->k->k.type &&
				    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
					/*
					 * Old and new keys have the same type and that type
					 * wants both at once: a single call covers insert
					 * and overwrite, so the overwrite pass skips it.
					 */
					i->overwrite_trigger_run = true;
					ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
							BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
				} else {
					/* Insert only: the old side is the placeholder key */
					ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
							BTREE_TRIGGER_INSERT|i->flags);
				}

				if (ret == -EINTR)
					trace_trans_restart_mark(trans->ip, _RET_IP_,
							i->btree_id, &i->path->pos);
				if (ret)
					return ret;
			}
		} while (trans_trigger_run);

		/* Pass 2: overwrite triggers, repeated until a full walk runs none */
		do {
			trans_trigger_run = false;

			for (i = btree_id_start;
			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
			     i++) {
				if (i->overwrite_trigger_run ||
				    (i->flags & BTREE_TRIGGER_NORUN) ||
				    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
					continue;

				/* Every entry reaching this pass must already have had its insert trigger run */
				BUG_ON(!i->insert_trigger_run);

				i->overwrite_trigger_run = true;
				trans_trigger_run = true;

				old = bch2_btree_path_peek_slot(i->path, &unpacked);
				_deleted.p = i->path->pos;

				/* Overwrite only: the new side is the placeholder key */
				ret = bch2_trans_mark_key(trans, old, deleted,
						BTREE_TRIGGER_OVERWRITE|i->flags);

				if (ret == -EINTR)
					trace_trans_restart_mark(trans->ip, _RET_IP_,
							i->btree_id, &i->path->pos);
				if (ret)
					return ret;
			}
		} while (trans_trigger_run);
	}

	/*
	 * Sanity check: every entry that is eligible for triggers must have had
	 * both its insert and its overwrite trigger run by now.
	 */
	trans_for_each_update(trans, i)
		BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
		       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
		       (!i->insert_trigger_run || !i->overwrite_trigger_run));

	return 0;
}

int __bch2_trans_commit(struct btree_trans *trans)
{
	struct btree_insert_entry *i = NULL;
	bool trans_trigger_run;
	unsigned u64s;
	int ret = 0;

@@ -854,30 +956,9 @@ int __bch2_trans_commit(struct btree_trans *trans)
					i->btree_id, i->k->k.p);
#endif

	/*
	 * Running triggers will append more updates to the list of updates as
	 * we're walking it:
	 */
	do {
		trans_trigger_run = false;

		trans_for_each_update(trans, i) {
			if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
			    !i->trans_triggers_run) {
				i->trans_triggers_run = true;
				trans_trigger_run = true;

				ret = bch2_trans_mark_update(trans, i->path,
							     i->k, i->flags);
				if (unlikely(ret)) {
					if (ret == -EINTR)
						trace_trans_restart_mark(trans->ip, _RET_IP_,
								i->btree_id, &i->path->pos);
	ret = bch2_trans_commit_run_triggers(trans);
	if (ret)
		goto out;
				}
			}
		}
	} while (trans_trigger_run);

	trans_for_each_update(trans, i) {
		BUG_ON(!i->path->should_be_locked);
@@ -1297,7 +1378,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,

	if (i < trans->updates + trans->nr_updates &&
	    !btree_insert_entry_cmp(&n, i)) {
		BUG_ON(i->trans_triggers_run);
		BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);

		/*
		 * This is a hack to ensure that inode creates update the btree,
+0 −35
Original line number Diff line number Diff line
@@ -1882,41 +1882,6 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
	}
}

int bch2_trans_mark_update(struct btree_trans *trans,
			   struct btree_path *path,
			   struct bkey_i *new,
			   unsigned flags)
{
	struct bkey		_deleted = KEY(0, 0, 0);
	struct bkey_s_c		deleted = (struct bkey_s_c) { &_deleted, NULL };
	struct bkey_s_c		old;
	struct bkey		unpacked;
	int ret;

	_deleted.p = path->pos;

	if (unlikely(flags & BTREE_TRIGGER_NORUN))
		return 0;

	if (!btree_node_type_needs_gc(path->btree_id))
		return 0;

	old = bch2_btree_path_peek_slot(path, &unpacked);

	if (old.k->type == new->k.type &&
	    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
		ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
	} else {
		ret   = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new),
				BTREE_TRIGGER_INSERT|flags) ?:
			bch2_trans_mark_key(trans, old, deleted,
				BTREE_TRIGGER_OVERWRITE|flags);
	}

	return ret;
}

static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
				    struct bch_dev *ca, size_t b,
				    enum bch_data_type type,
+0 −2
Original line number Diff line number Diff line
@@ -233,8 +233,6 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *,

int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
			struct bkey_s_c, unsigned);
int bch2_trans_mark_update(struct btree_trans *, struct btree_path *,
			   struct bkey_i *, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);

int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,