xfs: introduce new file range exchange ioctl (9a64d9b3) · Commits · git / linux-net

fs/xfs/Makefile

+1 −0

Original line number	Diff line number	Diff line
		@@ -67,6 +67,7 @@ xfs-y += xfs_aops.o \
		xfs_dir2_readdir.o \
		xfs_discard.o \
		xfs_error.o \
		xfs_exchrange.o \
		xfs_export.o \
		xfs_extent_busy.o \
		xfs_file.o \

fs/xfs/libxfs/xfs_fs.h

+41 −0

Original line number	Diff line number	Diff line
		@@ -772,6 +772,46 @@ struct xfs_scrub_metadata {
		# define XFS_XATTR_LIST_MAX 65536
		#endif

		/*
		* Exchange part of file1 with part of the file that this ioctl that is being
		* called against (which we'll call file2). Filesystems must be able to
		* restart and complete the operation even after the system goes down.
		*/
		struct xfs_exchange_range {
		__s32 file1_fd;
		__u32 pad; /* must be zeroes */
		__u64 file1_offset; /* file1 offset, bytes */
		__u64 file2_offset; /* file2 offset, bytes */
		__u64 length; /* bytes to exchange */

		__u64 flags; /* see XFS_EXCHANGE_RANGE_* below */
		};

		/*
		* Exchange file data all the way to the ends of both files, and then exchange
		* the file sizes. This flag can be used to replace a file's contents with a
		* different amount of data. length will be ignored.
		*/
		#define XFS_EXCHANGE_RANGE_TO_EOF (1ULL << 0)

		/* Flush all changes in file data and file metadata to disk before returning. */
		#define XFS_EXCHANGE_RANGE_DSYNC (1ULL << 1)

		/* Dry run; do all the parameter verification but do not change anything. */
		#define XFS_EXCHANGE_RANGE_DRY_RUN (1ULL << 2)

		/*
		* Exchange only the parts of the two files where the file allocation units
		* mapped to file1's range have been written to. This can accelerate
		* scatter-gather atomic writes with a temp file if all writes are aligned to
		* the file allocation unit.
		*/
		#define XFS_EXCHANGE_RANGE_FILE1_WRITTEN (1ULL << 3)

		#define XFS_EXCHANGE_RANGE_ALL_FLAGS (XFS_EXCHANGE_RANGE_TO_EOF \| \
		XFS_EXCHANGE_RANGE_DSYNC \| \
		XFS_EXCHANGE_RANGE_DRY_RUN \| \
		XFS_EXCHANGE_RANGE_FILE1_WRITTEN)

		/*
		* ioctl commands that are used by Linux filesystems
		@@ -843,6 +883,7 @@ struct xfs_scrub_metadata {
		#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
		#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
		#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
		#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range)
		/* XFS_IOC_GETFSUUID ---------- deprecated 140 */

fs/xfs/xfs_exchrange.c

0 → 100644

+339 −0

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0-or-later
		/*
		* Copyright (c) 2020-2024 Oracle. All Rights Reserved.
		* Author: Darrick J. Wong <djwong@kernel.org>
		*/
		#include "xfs.h"
		#include "xfs_shared.h"
		#include "xfs_format.h"
		#include "xfs_log_format.h"
		#include "xfs_trans_resv.h"
		#include "xfs_mount.h"
		#include "xfs_defer.h"
		#include "xfs_inode.h"
		#include "xfs_trans.h"
		#include "xfs_exchrange.h"
		#include <linux/fsnotify.h>

		/*
		* Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
		* This part deals with struct file objects and byte ranges and does not deal
		* with XFS-specific data structures such as xfs_inodes and block ranges. This
		* separation may some day facilitate porting to another filesystem.
		*
		* The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
		* file1 with the same number of bytes starting at fxr.file2_offset in file2.
		* Implementations must call xfs_exchange_range_prep to prepare the two
		* files prior to taking locks; and they must update the inode change and mod
		* times of both files as part of the metadata update. The timestamp update
		* and freshness checks must be done atomically as part of the data exchange
		* operation to ensure correctness of the freshness check.
		* xfs_exchange_range_finish must be called after the operation completes
		* successfully but before locks are dropped.
		*/

		/* Verify that we have security clearance to perform this operation. */
		static int
		xfs_exchange_range_verify_area(
		struct xfs_exchrange *fxr)
		{
		int ret;

		ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
		true);
		if (ret)
		return ret;

		return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
		true);
		}

		/*
		* Performs necessary checks before doing a range exchange, having stabilized
		* mutable inode attributes via i_rwsem.
		*/
		static inline int
		xfs_exchange_range_checks(
		struct xfs_exchrange *fxr,
		unsigned int alloc_unit)
		{
		struct inode *inode1 = file_inode(fxr->file1);
		struct inode *inode2 = file_inode(fxr->file2);
		uint64_t allocmask = alloc_unit - 1;
		int64_t test_len;
		uint64_t blen;
		loff_t size1, size2, tmp;
		int error;

		/* Don't touch certain kinds of inodes */
		if (IS_IMMUTABLE(inode1) \|\| IS_IMMUTABLE(inode2))
		return -EPERM;
		if (IS_SWAPFILE(inode1) \|\| IS_SWAPFILE(inode2))
		return -ETXTBSY;

		size1 = i_size_read(inode1);
		size2 = i_size_read(inode2);

		/* Ranges cannot start after EOF. */
		if (fxr->file1_offset > size1 \|\| fxr->file2_offset > size2)
		return -EINVAL;

		/*
		* If the caller said to exchange to EOF, we set the length of the
		* request large enough to cover everything to the end of both files.
		*/
		if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
		size2 - fxr->file2_offset);

		error = xfs_exchange_range_verify_area(fxr);
		if (error)
		return error;
		}

		/*
		* The start of both ranges must be aligned to the file allocation
		* unit.
		*/
		if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) \|\|
		!IS_ALIGNED(fxr->file2_offset, alloc_unit))
		return -EINVAL;

		/* Ensure offsets don't wrap. */
		if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) \|\|
		check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
		return -EINVAL;

		/*
		* We require both ranges to end within EOF, unless we're exchanging
		* to EOF.
		*/
		if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
		(fxr->file1_offset + fxr->length > size1 \|\|
		fxr->file2_offset + fxr->length > size2))
		return -EINVAL;

		/*
		* Make sure we don't hit any file size limits. If we hit any size
		* limits such that test_length was adjusted, we abort the whole
		* operation.
		*/
		test_len = fxr->length;
		error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
		&test_len);
		if (error)
		return error;
		error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
		&test_len);
		if (error)
		return error;
		if (test_len != fxr->length)
		return -EINVAL;

		/*
		* If the user wanted us to exchange up to the infile's EOF, round up
		* to the next allocation unit boundary for this check. Do the same
		* for the outfile.
		*
		* Otherwise, reject the range length if it's not aligned to an
		* allocation unit.
		*/
		if (fxr->file1_offset + fxr->length == size1)
		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
		else if (fxr->file2_offset + fxr->length == size2)
		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
		else if (!IS_ALIGNED(fxr->length, alloc_unit))
		return -EINVAL;
		else
		blen = fxr->length;

		/* Don't allow overlapped exchanges within the same file. */
		if (inode1 == inode2 &&
		fxr->file2_offset + blen > fxr->file1_offset &&
		fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

		/*
		* Ensure that we don't exchange a partial EOF block into the middle of
		* another file.
		*/
		if ((fxr->length & allocmask) == 0)
		return 0;

		blen = fxr->length;
		if (fxr->file2_offset + blen < size2)
		blen &= ~allocmask;

		if (fxr->file1_offset + blen < size1)
		blen &= ~allocmask;

		return blen == fxr->length ? 0 : -EINVAL;
		}

		/*
		* Check that the two inodes are eligible for range exchanges, the ranges make
		* sense, and then flush all dirty data. Caller must ensure that the inodes
		* have been locked against any other modifications.
		*/
		static inline int
		xfs_exchange_range_prep(
		struct xfs_exchrange *fxr,
		unsigned int alloc_unit)
		{
		struct inode *inode1 = file_inode(fxr->file1);
		struct inode *inode2 = file_inode(fxr->file2);
		bool same_inode = (inode1 == inode2);
		int error;

		/* Check that we don't violate system file offset limits. */
		error = xfs_exchange_range_checks(fxr, alloc_unit);
		if (error \|\| fxr->length == 0)
		return error;

		/* Wait for the completion of any pending IOs on both files */
		inode_dio_wait(inode1);
		if (!same_inode)
		inode_dio_wait(inode2);

		error = filemap_write_and_wait_range(inode1->i_mapping,
		fxr->file1_offset,
		fxr->file1_offset + fxr->length - 1);
		if (error)
		return error;

		error = filemap_write_and_wait_range(inode2->i_mapping,
		fxr->file2_offset,
		fxr->file2_offset + fxr->length - 1);
		if (error)
		return error;

		/*
		* If the files or inodes involved require synchronous writes, amend
		* the request to force the filesystem to flush all data and metadata
		* to disk after the operation completes.
		*/
		if (((fxr->file1->f_flags \| fxr->file2->f_flags) & O_SYNC) \|\|
		IS_SYNC(inode1) \|\| IS_SYNC(inode2))
		fxr->flags \|= XFS_EXCHANGE_RANGE_DSYNC;

		return 0;
		}

		/*
		* Finish a range exchange operation, if it was successful. Caller must ensure
		* that the inodes are still locked against any other modifications.
		*/
		static inline int
		xfs_exchange_range_finish(
		struct xfs_exchrange *fxr)
		{
		int error;

		error = file_remove_privs(fxr->file1);
		if (error)
		return error;
		if (file_inode(fxr->file1) == file_inode(fxr->file2))
		return 0;

		return file_remove_privs(fxr->file2);
		}

		/* Exchange parts of two files. */
		static int
		xfs_exchange_range(
		struct xfs_exchrange *fxr)
		{
		struct inode *inode1 = file_inode(fxr->file1);
		struct inode *inode2 = file_inode(fxr->file2);
		int ret;

		BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
		XFS_EXCHANGE_RANGE_PRIV_FLAGS);

		/* Both files must be on the same mount/filesystem. */
		if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
		return -EXDEV;

		if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
		return -EINVAL;

		/* Userspace requests only honored for regular files. */
		if (S_ISDIR(inode1->i_mode) \|\| S_ISDIR(inode2->i_mode))
		return -EISDIR;
		if (!S_ISREG(inode1->i_mode) \|\| !S_ISREG(inode2->i_mode))
		return -EINVAL;

		/* Both files must be opened for read and write. */
		if (!(fxr->file1->f_mode & FMODE_READ) \|\|
		!(fxr->file1->f_mode & FMODE_WRITE) \|\|
		!(fxr->file2->f_mode & FMODE_READ) \|\|
		!(fxr->file2->f_mode & FMODE_WRITE))
		return -EBADF;

		/* Neither file can be opened append-only. */
		if ((fxr->file1->f_flags & O_APPEND) \|\|
		(fxr->file2->f_flags & O_APPEND))
		return -EBADF;

		/*
		* If we're not exchanging to EOF, we can check the areas before
		* stabilizing both files' i_size.
		*/
		if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
		ret = xfs_exchange_range_verify_area(fxr);
		if (ret)
		return ret;
		}

		/* Update cmtime if the fd/inode don't forbid it. */
		if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
		fxr->flags \|= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
		if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
		fxr->flags \|= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;

		file_start_write(fxr->file2);
		ret = -EOPNOTSUPP; /* XXX call out to lower level code */
		file_end_write(fxr->file2);
		if (ret)
		return ret;

		fsnotify_modify(fxr->file1);
		if (fxr->file2 != fxr->file1)
		fsnotify_modify(fxr->file2);
		return 0;
		}

		/* Collect exchange-range arguments from userspace. */
		long
		xfs_ioc_exchange_range(
		struct file *file,
		struct xfs_exchange_range __user *argp)
		{
		struct xfs_exchrange fxr = {
		.file2 = file,
		};
		struct xfs_exchange_range args;
		struct fd file1;
		int error;

		if (copy_from_user(&args, argp, sizeof(args)))
		return -EFAULT;
		if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
		return -EINVAL;
		if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
		return -EINVAL;

		fxr.file1_offset = args.file1_offset;
		fxr.file2_offset = args.file2_offset;
		fxr.length = args.length;
		fxr.flags = args.flags;

		file1 = fdget(args.file1_fd);
		if (!file1.file)
		return -EBADF;
		fxr.file1 = file1.file;

		error = xfs_exchange_range(&fxr);
		fdput(file1);
		return error;
		}

fs/xfs/xfs_exchrange.h

0 → 100644

+30 −0

Original line number	Diff line number	Diff line
		/* SPDX-License-Identifier: GPL-2.0-or-later */
		/*
		* Copyright (c) 2020-2024 Oracle. All Rights Reserved.
		* Author: Darrick J. Wong <djwong@kernel.org>
		*/
		#ifndef __XFS_EXCHRANGE_H__
		#define __XFS_EXCHRANGE_H__

		/* Update the mtime/cmtime of file1 and file2 */
		#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63)
		#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62)

		#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 \| \
		__XFS_EXCHANGE_RANGE_UPD_CMTIME2)

		struct xfs_exchrange {
		struct file *file1;
		struct file *file2;

		loff_t file1_offset;
		loff_t file2_offset;
		u64 length;

		u64 flags; /* XFS_EXCHANGE_RANGE flags */
		};

		long xfs_ioc_exchange_range(struct file *file,
		struct xfs_exchange_range __user *argp);

		#endif /* __XFS_EXCHRANGE_H__ */

fs/xfs/xfs_ioctl.c

+4 −0

Original line number	Diff line number	Diff line
		@@ -40,6 +40,7 @@
		#include "xfs_xattr.h"
		#include "xfs_rtbitmap.h"
		#include "xfs_file.h"
		#include "xfs_exchrange.h"

		#include <linux/mount.h>
		#include <linux/namei.h>
		@@ -2170,6 +2171,9 @@ xfs_file_ioctl(
		return error;
		}

		case XFS_IOC_EXCHANGE_RANGE:
		return xfs_ioc_exchange_range(filp, arg);

		default:
		return -ENOTTY;
		}