Commit 9d861787 authored by Kent Overstreet
Browse files

bcachefs: bch2_inode_or_descendents_is_open()



fsck can now correctly check if inodes in interior snapshot nodes are
open/in use.

- Tweak the vfs inode rhashtable so that the subvolume ID isn't hashed,
  meaning inums in different subvolumes will hash to the same slot. Note
  that this is a hack, and will cause problems if anyone ever has the
  same file in many different snapshots open all at the same time.

- Then check whether any of those subvolumes is a descendant of the
  snapshot ID being checked.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 84878e82
Loading
Loading
Loading
Loading
+91 −15
Original line number Diff line number Diff line
@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
	return a.subvol == b.subvol && a.inum == b.inum;
}

/*
 * Key hash for the VFS inode table.
 *
 * @data points at a subvol_inum lookup key. Only its inode number is fed to
 * jhash(); the subvolume ID and @len are deliberately ignored, so the same
 * inode number in different subvolumes produces the same hash.
 */
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
	const subvol_inum *key = data;

	return jhash(&key->inum, sizeof(key->inum), seed);
}

/*
 * Object hash for the VFS inode table.
 *
 * @data points at a struct bch_inode_info. Its ei_inum key is hashed through
 * bch2_vfs_inode_hash_fn(), so a hashed object and a lookup key for it land
 * on the same hash value.
 */
static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
	const struct bch_inode_info *ei = data;

	return bch2_vfs_inode_hash_fn(&ei->ei_inum, sizeof(ei->ei_inum), seed);
}

static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
				 const void *obj)
{
@@ -170,32 +184,93 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
	.head_offset		= offsetof(struct bch_inode_info, hash),
	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
	.key_len		= sizeof(subvol_inum),
	.hashfn			= bch2_vfs_inode_hash_fn,
	.obj_hashfn		= bch2_vfs_inode_obj_hash_fn,
	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
	.automatic_shrinking	= true,
};

static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}
	struct bch_fs *c = trans->c;
	struct rhashtable *ht = &c->vfs_inodes_table;
	subvol_inum inum = (subvol_inum) { .inum = p.offset };
	DARRAY(u32) subvols;
	int ret = 0;

bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
	if (!test_bit(BCH_FS_started, &c->flags))
		return false;

	subvol_inum inum = {
		.subvol = snapshot_t(c, p.snapshot)->subvol,
		.inum	= p.offset,
	};
	darray_init(&subvols);
restart_from_top:

	/* snapshot tree interior node, can't safely delete while online (yet) */
	if (!inum.subvol) {
		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
		return true;
	/*
	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
	 * subvolumes in which the given inode number is open.
	 *
	 * For this to work, we don't include the subvolume ID in the key that
	 * we hash - all inodes with the same inode number regardless of
	 * subvolume will hash to the same slot.
	 *
	 * This will be less than ideal if the same file is ever open
	 * simultaneously in many different snapshots:
	 */
	rcu_read_lock();
	struct rhash_lock_head __rcu *const *bkt;
	struct rhash_head *he;
	unsigned int hash;
	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
	hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
	bkt = rht_bucket(tbl, hash);
	do {
		struct bch_inode_info *inode;

		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
			if (inode->ei_inum.inum == inum.inum) {
				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
						      GFP_NOWAIT|__GFP_NOWARN);
				if (ret) {
					rcu_read_unlock();
					ret = darray_make_room(&subvols, 1);
					if (ret)
						goto err;
					subvols.nr = 0;
					goto restart_from_top;
				}
			}
		}
		/* An object might have been moved to a different hash chain,
		 * while we walk along it - better check and retry.
		 */
	} while (he != RHT_NULLS_MARKER(bkt));

	/* Ensure we see any new tables. */
	smp_rmb();

	tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	if (unlikely(tbl))
		goto restart;
	rcu_read_unlock();

	darray_for_each(subvols, i) {
		u32 snap;
		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
		if (ret)
			goto err;

	return __bch2_inode_hash_find(c, inum) != NULL;
		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
		if (ret)
			break;
	}
err:
	darray_exit(&subvols);
	return ret;
}

/*
 * Look up @inum in the filesystem's VFS inode hash table.
 *
 * Returns whatever rhashtable_lookup_fast() yields for the key: the matching
 * bch_inode_info, or NULL when no such entry is hashed.
 */
static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
	struct rhashtable *table = &c->vfs_inodes_table;

	return rhashtable_lookup_fast(table, &inum, bch2_vfs_inodes_params);
}

static void __wait_on_freeing_inode(struct bch_fs *c,
@@ -271,7 +346,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,

	set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
	if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
					&inode->ei_inum,
					&inode->hash,
					bch2_vfs_inodes_params))) {
		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+3 −3
Original line number Diff line number Diff line
@@ -146,6 +146,8 @@ struct bch_inode_info *
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);

/*
 * Check whether inode number @p.offset is currently hashed in the VFS inode
 * table under any subvolume whose snapshot passes
 * bch2_snapshot_is_ancestor(c, snap, p.snapshot).
 *
 * Returns a negative error code on failure, nonzero if such an inode was
 * found, and 0 otherwise. Callers must test for < 0 before treating the
 * result as a boolean.
 */
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);

int bch2_fs_quota_transfer(struct bch_fs *,
			   struct bch_inode_info *,
			   struct bch_qid,
@@ -179,8 +181,6 @@ void bch2_inode_update_after_write(struct btree_trans *,
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
				  inode_set_fn, void *, unsigned);

bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);

int bch2_setattr_nonsize(struct mnt_idmap *,
			 struct bch_inode_info *,
			 struct iattr *);
@@ -198,7 +198,7 @@ int bch2_vfs_init(void);

#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })

static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }

static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
					       snapshot_id_list *s) {}
+6 −1
Original line number Diff line number Diff line
@@ -1213,7 +1213,11 @@ static int check_inode(struct btree_trans *trans,
			if (ret)
				goto err;
		} else {
			if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
				      "inode %llu%u unlinked and not open",
				      u.bi_inum, u.bi_snapshot)) {
@@ -1221,6 +1225,7 @@ static int check_inode(struct btree_trans *trans,
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

+3 −2
Original line number Diff line number Diff line
@@ -1244,8 +1244,9 @@ static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpo
	if (!unlinked)
		return 0;

	if (bch2_inode_is_open(trans->c, pos))
		return 0;
	ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
	if (ret)
		return ret < 0 ? ret : 0;

	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
	if (ret)