Commit e3c94a53 authored by Linus Torvalds
Browse files
Pull btrfs fixes from David Sterba:

 - fix a few races related to inode link count

 - fix inode leak on failure to add link to inode

 - move transaction aborts closer to where they happen

* tag 'for-6.17-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: avoid load/store tearing races when checking if an inode was logged
  btrfs: fix race between setting last_dir_index_offset and inode logging
  btrfs: fix race between logging inode and checking if it was logged before
  btrfs: simplify error handling logic for btrfs_link()
  btrfs: fix inode leak on failure to add link to inode
  btrfs: abort transaction on failure to add link to inode
parents b320789d 986bf6ed
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -248,7 +248,7 @@ struct btrfs_inode {
		u64 new_delalloc_bytes;
		/*
		 * The offset of the last dir index key that was logged.
		 * This is used only for directories.
		 * This is used only for directories. Protected by 'log_mutex'.
		 */
		u64 last_dir_index_offset;
	};
+22 −22
Original line number Diff line number Diff line
@@ -6805,7 +6805,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
	struct fscrypt_name fname;
	u64 index;
	int ret;
	int drop_inode = 0;

	/* do not allow sys_link's with other subvols of the same device */
	if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
@@ -6837,44 +6836,44 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,

	/* There are several dir indexes for this inode, clear the cache. */
	BTRFS_I(inode)->dir_index = 0ULL;
	inc_nlink(inode);
	inode_inc_iversion(inode);
	inode_set_ctime_current(inode);
	ihold(inode);
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     &fname.disk_name, 1, index);
	if (ret)
		goto fail;

	if (ret) {
		drop_inode = 1;
	} else {
		struct dentry *parent = dentry->d_parent;

	/* Link added now we update the inode item with the new link count. */
	inc_nlink(inode);
	ret = btrfs_update_inode(trans, BTRFS_I(inode));
		if (ret)
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	if (inode->i_nlink == 1) {
		/*
			 * If new hard link count is 1, it's a file created
			 * with open(2) O_TMPFILE flag.
		 * If the new hard link count is 1, it's a file created with the
		 * open(2) O_TMPFILE flag.
		 */
		ret = btrfs_orphan_del(trans, BTRFS_I(inode));
			if (ret)
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto fail;
		}
		d_instantiate(dentry, inode);
		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
	}

	/* Grab reference for the new dentry passed to d_instantiate(). */
	ihold(inode);
	d_instantiate(dentry, inode);
	btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);

fail:
	fscrypt_free_filename(&fname);
	if (trans)
		btrfs_end_transaction(trans);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return ret;
}
@@ -7830,6 +7829,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	/* new_delalloc_bytes and last_dir_index_offset are in a union. */
	ei->new_delalloc_bytes = 0;
	ei->defrag_bytes = 0;
	ei->disk_i_size = 0;
+53 −25
Original line number Diff line number Diff line
@@ -3340,6 +3340,31 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
	return 0;
}

/*
 * Record that @inode was not logged in the transaction of @trans, unless its
 * ->logged_trans was already set by someone else.
 *
 * With inode->lock held:
 *  - if ->logged_trans is 0, it is set to trans->transid - 1 and false is
 *    returned;
 *  - if ->logged_trans already equals trans->transid, nothing is changed and
 *    true is returned;
 *  - for any other value, nothing is changed and false is returned.
 *
 * The update is restricted to the ->logged_trans == 0 case to prevent races
 * with concurrent logging: the caller may have seen the inode as not logged,
 * but a concurrent task may log it afterwards and set ->logged_trans to
 * trans->transid. Overwriting that with a lower value would make subsequent
 * rename, unlink and link operations skip logging new names and removing old
 * names from the log.
 */
static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans,
				     struct btrfs_inode *inode)
{
	bool logged_in_trans;

	spin_lock(&inode->lock);
	if (inode->logged_trans != 0) {
		/* Already set by someone; only report, never lower it. */
		logged_in_trans = (inode->logged_trans == trans->transid);
	} else {
		inode->logged_trans = trans->transid - 1;
		logged_in_trans = false;
	}
	spin_unlock(&inode->lock);

	return logged_in_trans;
}

/*
 * Check if an inode was logged in the current transaction. This correctly deals
 * with the case where the inode was logged but has a logged_trans of 0, which
@@ -3357,15 +3382,32 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
	struct btrfs_key key;
	int ret;

	if (inode->logged_trans == trans->transid)
	/*
	 * Quick lockless call, since once ->logged_trans is set to the current
	 * transaction, we never set it to a lower value anywhere else.
	 */
	if (data_race(inode->logged_trans) == trans->transid)
		return 1;

	/*
	 * If logged_trans is not 0, then we know the inode logged was not logged
	 * in this transaction, so we can return false right away.
	 * If logged_trans is not 0 and not trans->transid, then we know the
	 * inode was not logged in this transaction, so we can return false
	 * right away. We take the lock to avoid a race caused by load/store
	 * tearing with a concurrent btrfs_log_inode() call or a concurrent task
	 * in this function further below - an update to trans->transid can be
	 * torn into two 32-bit updates for example, in which case we could
	 * see a positive value that is not trans->transid and assume the inode
	 * was not logged when it was.
	 */
	if (inode->logged_trans > 0)
	spin_lock(&inode->lock);
	if (inode->logged_trans == trans->transid) {
		spin_unlock(&inode->lock);
		return 1;
	} else if (inode->logged_trans > 0) {
		spin_unlock(&inode->lock);
		return 0;
	}
	spin_unlock(&inode->lock);

	/*
	 * If no log tree was created for this root in this transaction, then
@@ -3374,10 +3416,8 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
	 * transaction's ID, to avoid the search below in a future call in case
	 * a log tree gets created after this.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
		inode->logged_trans = trans->transid - 1;
		return 0;
	}
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
		return mark_inode_as_not_logged(trans, inode);

	/*
	 * We have a log tree and the inode's logged_trans is 0. We can't tell
@@ -3431,8 +3471,7 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
		 * Set logged_trans to a value greater than 0 and less than the
		 * current transaction to avoid doing the search in future calls.
		 */
		inode->logged_trans = trans->transid - 1;
		return 0;
		return mark_inode_as_not_logged(trans, inode);
	}

	/*
@@ -3440,20 +3479,9 @@ static int inode_logged(const struct btrfs_trans_handle *trans,
	 * the current transaction's ID, to avoid future tree searches as long as
	 * the inode is not evicted again.
	 */
	spin_lock(&inode->lock);
	inode->logged_trans = trans->transid;

	/*
	 * If it's a directory, then we must set last_dir_index_offset to the
	 * maximum possible value, so that the next attempt to log the inode does
	 * not skip checking if dir index keys found in modified subvolume tree
	 * leaves have been logged before, otherwise it would result in attempts
	 * to insert duplicate dir index keys in the log tree. This must be done
	 * because last_dir_index_offset is an in-memory only field, not persisted
	 * in the inode item or any other on-disk structure, so its value is lost
	 * once the inode is evicted.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_dir_index_offset = (u64)-1;
	spin_unlock(&inode->lock);

	return 1;
}
@@ -4045,7 +4073,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,

/*
 * If the inode was logged before and it was evicted, then its
 * last_dir_index_offset is (u64)-1, so we don't the value of the last index
 * last_dir_index_offset is 0, so we don't know the value of the last index
 * key offset. If that's the case, search for it and update the inode. This
 * is to avoid lookups in the log tree every time we try to insert a dir index
 * key from a leaf changed in the current transaction, and to allow us to always
@@ -4061,7 +4089,7 @@ static int update_last_dir_index_offset(struct btrfs_inode *inode,

	lockdep_assert_held(&inode->log_mutex);

	if (inode->last_dir_index_offset != (u64)-1)
	if (inode->last_dir_index_offset != 0)
		return 0;

	if (!ctx->logged_before) {