Commit 9aa29a20 authored by Filipe Manana's avatar Filipe Manana Committed by David Sterba
Browse files

btrfs: move the direct IO code into its own file



The direct IO code is over a thousand lines and it's currently spread
between file.c and inode.c, which makes it not easy to locate some parts
of it sometimes. Also inode.c is about 11 thousand lines and file.c about
4 thousand lines, both too big. So move all the direct IO code into a
dedicated file, so that it's easy to locate all its code and reduce the
sizes of inode.c and file.c.

This is a pure move of code without any other changes except export a
a couple functions from inode.c (get_extent_allocation_hint() and
create_io_em()) because they are used in inode.c and the new direct-io.c
file, and a couple functions from file.c (btrfs_buffered_write() and
btrfs_write_check()) because they are used both in file.c and in the new
direct-io.c file.

Reviewed-by: default avatarBoris Burkov <boris@bur.io>
Reviewed-by: default avatarQu Wenruo <wqu@suse.com>
Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 0d9b7e16
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
	   subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
	   lru_cache.o raid-stripe-tree.o fiemap.o
	   lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+5 −4
Original line number Diff line number Diff line
@@ -610,10 +610,6 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			       const struct btrfs_ioctl_encoded_io_args *encoded);

ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
		       size_t done_before);
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
				  size_t done_before);
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);

extern const struct dentry_operations btrfs_dentry_operations;
@@ -630,5 +626,10 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
			      const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				     u64 num_bytes);
struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
				      const struct btrfs_file_extent *file_extent,
				      int type);

#endif

fs/btrfs/direct-io.c

0 → 100644
+1052 −0

File added.

Preview size limit exceeded, changes collapsed.

fs/btrfs/direct-io.h

0 → 100644
+14 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_DIRECT_IO_H
#define BTRFS_DIRECT_IO_H

#include <linux/types.h>

int __init btrfs_init_dio(void);
void __cold btrfs_destroy_dio(void);

ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from);
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to);

#endif /* BTRFS_DIRECT_IO_H */
+3 −284
Original line number Diff line number Diff line
@@ -17,8 +17,8 @@
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
@@ -1140,8 +1140,7 @@ static void update_time_for_write(struct inode *inode)
		inode_inc_iversion(inode);
}

static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
			     size_t count)
int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
@@ -1187,8 +1186,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
	return 0;
}

static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
					       struct iov_iter *i)
ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos;
@@ -1451,194 +1449,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
	return num_written ? num_written : ret;
}

static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	return 0;
}

static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, from, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in generic_write_check()
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	/*
	 * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
	 * iocb, and that needs to lock the inode. So unlock it before calling
	 * iomap_dio_complete() to avoid a deadlock.
	 */
	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	if (IS_ERR_OR_NULL(dio))
		ret = PTR_ERR_OR_ZERO(dio);
	else
		ret = iomap_dio_complete(dio);

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto relock;
		}
	}

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			const struct btrfs_ioctl_encoded_io_args *encoded)
{
@@ -3914,97 +3724,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
	return generic_file_open(inode, filp);
}

static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true of our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during he page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret = 0;
Loading