Commit 058dd70c authored by Christoph Hellwig's avatar Christoph Hellwig
Browse files

xfs: implement buffered writes to zoned RT devices



Implement buffered writes including page faults and block zeroing for
zoned RT devices.  Buffered writes to zoned RT devices are split into
three phases:

 1) a reservation for the worst case data block usage is taken before
    acquiring the iolock.  When there are enough free blocks but not
    enough available ones, garbage collection is kicked off to free the
    space before continuing with the write.  If there isn't enough
    freeable space, the block reservation is reduced and a short write
    will happen as expected by normal Linux write semantics.
 2) with the iolock held, the generic iomap buffered write code is
    called, which through the iomap_begin operation usually just inserts
    delalloc extents for the range in a single iteration.  Only for
    overwrites of existing data that are not block aligned, or for zeroing
    operations, is the existing extent mapping read to fill out the srcmap
    and to figure out if zeroing is required.
 3) the ->map_blocks callback to the generic iomap writeback code
    calls into the zoned space allocator to actually allocate on-disk
    space for the range before kicking off the writeback.

Note that because all writes are out of place, truncate or hole punches
that are not aligned to block size boundaries need to allocate space.
For block zeroing from truncate, ->setattr is called with the iolock
(aka i_rwsem) already held, so a hacky deviation from the above
scheme is needed.  In this case the space reservation is taken with
the iolock held, but is required not to block and can dip into the
reserved block pool.  This can lead to -ENOSPC when truncating a
file, which is unfortunate.  But fixing the calling conventions in
the VFS is probably much easier with code requiring it already in
mainline.

Similarly, because all writes are out of place, the zoned allocator can't
support unwritten extents and thus the FALLOC_FL_ALLOCATE_RANGE mode of
fallocate.  Other fallocate modes that would reserve space but don't
need to in order to provide proper semantics do work, but do not
reserve space.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
parent 080d01c4
Loading
Loading
Loading
Loading
+158 −9
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * Copyright (c) 2016-2025 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
@@ -20,6 +20,8 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"

struct xfs_writepage_ctx {
	struct iomap_writepage_ctx ctx;
@@ -77,6 +79,26 @@ xfs_setfilesize(
	return xfs_trans_commit(tp);
}

static void
xfs_ioend_put_open_zones(
	struct iomap_ioend	*ioend)
{
	struct iomap_ioend *tmp;

	/*
	 * Put the open zone for all ioends merged into this one (if any).
	 */
	list_for_each_entry(tmp, &ioend->io_list, io_list)
		xfs_open_zone_put(tmp->io_private);

	/*
	 * The main ioend might not have an open zone if the submission failed
	 * before xfs_zone_alloc_and_submit got called.
	 */
	if (ioend->io_private)
		xfs_open_zone_put(ioend->io_private);
}

/*
 * IO write completion.
 */
@@ -86,6 +108,7 @@ xfs_end_ioend(
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	bool			is_zoned = xfs_is_zoned_inode(ip);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	unsigned int		nofs_flag;
@@ -116,9 +139,10 @@ xfs_end_ioend(
	error = blk_status_to_errno(ioend->io_bio.bi_status);
	if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_IOEND_SHARED) {
			ASSERT(!is_zoned);
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
					offset + size);
					offset + size, NULL);
		}
		goto done;
	}
@@ -126,7 +150,10 @@ xfs_end_ioend(
	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	if (ioend->io_flags & IOMAP_IOEND_SHARED)
	if (is_zoned)
		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
				ioend->io_private, NULLFSBLOCK);
	else if (ioend->io_flags & IOMAP_IOEND_SHARED)
		error = xfs_reflink_end_cow(ip, offset, size);
	else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
@@ -134,6 +161,8 @@ xfs_end_ioend(
	if (!error && xfs_ioend_is_append(ioend))
		error = xfs_setfilesize(ip, offset, size);
done:
	if (is_zoned)
		xfs_ioend_put_open_zones(ioend);
	iomap_finish_ioends(ioend, error);
	memalloc_nofs_restore(nofs_flag);
}
@@ -176,17 +205,27 @@ xfs_end_io(
	}
}

STATIC void
static void
xfs_end_bio(
	struct bio		*bio)
{
	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned long		flags;

	/*
	 * For Appends record the actually written block number and set the
	 * boundary flag if needed.
	 */
	if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
		ioend->io_sector = bio->bi_iter.bi_sector;
		xfs_mark_rtg_boundary(ioend);
	}

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	if (list_empty(&ip->i_ioend_list))
		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
					 &ip->i_ioend_work));
	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -463,7 +502,7 @@ xfs_discard_folio(
	 * folio itself and not the start offset that is passed in.
	 */
	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
				folio_pos(folio) + folio_size(folio));
				folio_pos(folio) + folio_size(folio), NULL);
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -472,15 +511,125 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
	.discard_folio		= xfs_discard_folio,
};

/*
 * Writeback context for zoned inodes: the generic iomap writeback context
 * plus the open zone that is carried across ioend submissions.
 */
struct xfs_zoned_writepage_ctx {
	/* generic iomap context; XFS_ZWPC() maps a pointer to it back to us */
	struct iomap_writepage_ctx	ctx;
	/* handed by reference to xfs_zone_alloc_and_submit(); may stay NULL */
	struct xfs_open_zone		*open_zone;
};

/*
 * Convert a pointer to the embedded generic writeback context back into the
 * zoned writeback context that contains it.
 */
static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
	struct xfs_zoned_writepage_ctx	*zwpc;

	zwpc = container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
	return zwpc;
}

/*
 * ->map_blocks callback for writeback to zoned inodes.
 *
 * Looks up the COW fork extent covering @offset.  If no extent covers it, a
 * hole mapping is stored in wpc->iomap.  Otherwise the covered part of the
 * extent is removed from the COW fork and the range is reported as
 * IOMAP_MAPPED with IOMAP_F_ANON_WRITE set; no block address is filled in
 * here.
 *
 * Returns 0 on success, or -EIO if the file system has been shut down.
 */
static int
xfs_zoned_map_blocks(
	struct iomap_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset,
	unsigned int		len)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_filblks_t		count_fsb;
	struct xfs_bmbt_irec	imap, del;
	struct xfs_iext_cursor	icur;

	if (xfs_is_shutdown(mp))
		return -EIO;

	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

	/*
	 * All dirty data must be covered by delalloc extents.  But truncate can
	 * remove delalloc extents underneath us or reduce their size.
	 * Returning a hole tells iomap to not write back any data from this
	 * range, which is the right thing to do in that case.
	 *
	 * Otherwise just tell iomap to treat ranges previously covered by a
	 * delalloc extent as mapped.  The actual block allocation will be done
	 * just before submitting the bio.
	 *
	 * This implies we never map outside folios that are locked or marked
	 * as under writeback, and thus there is no need to check the fork
	 * sequence count here.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	if (imap.br_startoff > offset_fsb) {
		/* Nothing covers offset_fsb: report a hole up to the next extent. */
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
		return 0;
	}

	/* Clamp the mapped range to the end of the extent that was found. */
	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
	count_fsb = end_fsb - offset_fsb;

	/* Remove just the part of the extent that is being written back. */
	del = imap;
	xfs_trim_extent(&del, offset_fsb, count_fsb);
	xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
			XFS_BMAPI_REMAP);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * IOMAP_F_ANON_WRITE is the only flag reported.  A previous store of
	 * IOMAP_F_DIRTY to the same field was unconditionally overwritten by
	 * this assignment before anything read it, so it has been dropped.
	 */
	wpc->iomap.type = IOMAP_MAPPED;
	wpc->iomap.flags = IOMAP_F_ANON_WRITE;
	wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
	wpc->iomap.offset = offset;
	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);

	trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
	return 0;
}

static int
xfs_zoned_submit_ioend(
	struct iomap_writepage_ctx *wpc,
	int			status)
{
	wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
	if (status)
		return status;
	xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
	return 0;
}

/*
 * Writeback operations passed to iomap_writepages() for zoned inodes.
 * Folio discard is shared with the non-zoned operations.
 */
static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
	.map_blocks		= xfs_zoned_map_blocks,
	.submit_ioend		= xfs_zoned_submit_ioend,
	.discard_folio		= xfs_discard_folio,
};

/*
 * Write back dirty pages of @mapping.
 *
 * XFS_ITRUNCATED is cleared on the inode first.  Zoned inodes then use the
 * zoned writeback operations with a context that carries an open zone; if an
 * open zone is still attached once iomap_writepages() returns, it is put
 * here.  All other inodes use the regular writeback operations.
 *
 * Returns the result of iomap_writepages().
 */
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	if (xfs_is_zoned_inode(ip)) {
		struct xfs_zoned_writepage_ctx	xc = { };
		int				error;

		error = iomap_writepages(mapping, wbc, &xc.ctx,
					 &xfs_zoned_writeback_ops);
		/* An open zone may be left attached to the context; drop it. */
		if (xc.open_zone)
			xfs_open_zone_put(xc.open_zone);
		return error;
	} else {
		struct xfs_writepage_ctx	wpc = { };

		return iomap_writepages(mapping, wbc, &wpc.ctx,
				&xfs_writeback_ops);
	}
}

STATIC int
+25 −7
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"

/* Kernel only BMAP related definitions and functions */

@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_off_t		start_byte,
	xfs_off_t		end_byte)
	xfs_off_t		end_byte,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
			continue;
		}

		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
		if (xfs_is_zoned_inode(ip) && ac) {
			/*
			 * In a zoned buffered write context we need to return
			 * the punched delalloc allocations to the allocation
			 * context.  This allows reusing them in the following
			 * iomap iterations.
			 */
			xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
					&del, XFS_BMAPI_REMAP);
			ac->reserved_blocks += del.br_blockcount;
		} else {
			xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
					&del, 0);
		}

		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}
@@ -582,7 +598,7 @@ xfs_free_eofblocks(
		if (ip->i_delayed_blks) {
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
				round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
				LLONG_MAX);
				LLONG_MAX, NULL);
		}
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
@@ -825,7 +841,8 @@ int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
	xfs_off_t		len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		startoffset_fsb;
@@ -880,7 +897,7 @@ xfs_free_file_space(
		return 0;
	if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
	error = xfs_zero_range(ip, offset, len, NULL);
	error = xfs_zero_range(ip, offset, len, ac, NULL);
	if (error)
		return error;

@@ -968,7 +985,8 @@ int
xfs_collapse_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
	xfs_off_t		len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
@@ -981,7 +999,7 @@ xfs_collapse_file_space(

	trace_xfs_collapse_file_space(ip);

	error = xfs_free_file_space(ip, offset, len);
	error = xfs_free_file_space(ip, offset, len, ac);
	if (error)
		return error;

+7 −5
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
struct xfs_zone_alloc_ctx;

#ifdef CONFIG_XFS_RT
int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */

void	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
		xfs_off_t start_byte, xfs_off_t end_byte);
		xfs_off_t start_byte, xfs_off_t end_byte,
		struct xfs_zone_alloc_ctx *ac);

struct kgetbmap {
	__s64		bmv_offset;	/* file offset of segment in blocks */
@@ -56,9 +58,9 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
		xfs_off_t len);
int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
			    xfs_off_t len);
		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
				xfs_off_t len);
		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
		xfs_off_t len);

+216 −25
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_zone_alloc.h"

#include <linux/dax.h>
#include <linux/falloc.h>
@@ -360,7 +361,8 @@ xfs_file_write_zero_eof(
	struct iov_iter		*from,
	unsigned int		*iolock,
	size_t			count,
	bool			*drained_dio)
	bool			*drained_dio,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	loff_t			isize;
@@ -414,7 +416,7 @@ xfs_file_write_zero_eof(
	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
	error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

	return error;
@@ -431,7 +433,8 @@ STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock)
	unsigned int		*iolock,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	size_t			count = iov_iter_count(from);
@@ -481,7 +484,7 @@ xfs_file_write_checks(
	 */
	if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
				&drained_dio);
				&drained_dio, ac);
		if (error == 1)
			goto restart;
		if (error)
@@ -491,6 +494,48 @@ xfs_file_write_checks(
	return kiocb_modified(iocb);
}

/*
 * Take the block reservation for a buffered write to a zoned inode.
 *
 * The byte count of @from is first clamped by generic_write_check_limits()
 * and then converted to a pessimistic number of file system blocks.
 * XFS_ZR_NOWAIT is added to @flags for IOCB_NOWAIT writes.
 *
 * Returns the error from generic_write_check_limits() if it fails, otherwise
 * the result of xfs_zoned_space_reserve().
 */
static ssize_t
xfs_zoned_write_space_reserve(
	struct xfs_inode		*ip,
	struct kiocb			*iocb,
	struct iov_iter			*from,
	unsigned int			flags,
	struct xfs_zone_alloc_ctx	*ac)
{
	struct xfs_mount		*mp = ip->i_mount;
	loff_t				nbytes = iov_iter_count(from);
	int				ret;

	/*
	 * Apply the rlimit and LFS boundary up front, so that the reservation
	 * is not (potentially vastly) larger than what can be written.
	 *
	 * The generic write path repeats this check later and may see a
	 * different limit.  A limit that grew in the meantime is ignored in
	 * favour of our earlier, smaller one; a limit that shrank is honoured
	 * and the surplus reservation is given back once the write finishes.
	 */
	ret = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &nbytes);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_NOWAIT)
		flags |= XFS_ZR_NOWAIT;

	/*
	 * Round the byte count up to file system blocks without looking at
	 * the start offset, which isn't stable for O_APPEND until the iolock
	 * is held.  That costs one extra block (the "+ 1"), which is often
	 * not needed.  Two more blocks (the "+ 2") cover zeroing of the old
	 * EOF block and of the new start block when those are unaligned.
	 *
	 * Whatever is not used is returned after the write.
	 */
	return xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(mp, nbytes) + 1 + 2,
			flags, ac);
}

static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
@@ -597,7 +642,7 @@ xfs_file_dio_write_aligned(
	ret = xfs_ilock_iocb_for_write(iocb, &iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

@@ -675,7 +720,7 @@ xfs_file_dio_write_unaligned(
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock);
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out_unlock;

@@ -749,7 +794,7 @@ xfs_file_dax_write(
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

@@ -793,7 +838,7 @@ xfs_file_buffered_write(
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock);
	ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
	if (ret)
		goto out;

@@ -840,6 +885,67 @@ xfs_file_buffered_write(
	return ret;
}

/*
 * Buffered write path for zoned inodes.
 *
 * The block reservation is taken before the iolock, the iter is trimmed to
 * what the reservation covers once ki_pos is known, and the reservation
 * context is unreserved again on every exit path after it was set up.
 *
 * Returns the result of the write (synced through generic_write_sync() when
 * positive), or the error from whichever setup step failed.
 */
STATIC ssize_t
xfs_file_buffered_write_zoned(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	bool			cleared_space = false;
	struct xfs_zone_alloc_ctx ac = { };
	ssize_t			ret;

	/* Reserve before locking; nothing to undo if this fails. */
	ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
	if (ret < 0)
		return ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		goto out_unreserve;

	ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
	if (ret)
		goto out_unlock;

	/*
	 * Truncate the iter to the length that we were actually able to
	 * allocate blocks for.  This needs to happen after
	 * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
	 * writes.
	 */
	iov_iter_truncate(from,
			XFS_FSB_TO_B(mp, ac.reserved_blocks) -
			(iocb->ki_pos & mp->m_blockmask));
	/* Nothing left to write; ret is still 0 from the checks above. */
	if (!iov_iter_count(from))
		goto out_unlock;

retry:
	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops, &ac);
	/* cleared_space limits the -ENOSPC recovery to a single retry. */
	if (ret == -ENOSPC && !cleared_space) {
		/*
		 * Kick off writeback to convert delalloc space and release the
		 * usually too pessimistic indirect block reservations.
		 */
		xfs_flush_inodes(mp);
		cleared_space = true;
		goto retry;
	}

out_unlock:
	xfs_iunlock(ip, iolock);
out_unreserve:
	xfs_zoned_space_unreserve(ip, &ac);
	/* Only a positive byte count is accounted and synced. */
	if (ret > 0) {
		XFS_STATS_ADD(mp, xs_write_bytes, ret);
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
@@ -887,6 +993,8 @@ xfs_file_write_iter(
			return ret;
	}

	if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
	return xfs_file_buffered_write(iocb, from);
}

@@ -941,7 +1049,8 @@ static int
xfs_falloc_collapse_range(
	struct file		*file,
	loff_t			offset,
	loff_t			len)
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	loff_t			new_size = i_size_read(inode) - len;
@@ -957,7 +1066,7 @@ xfs_falloc_collapse_range(
	if (offset + len >= i_size_read(inode))
		return -EINVAL;

	error = xfs_collapse_file_space(XFS_I(inode), offset, len);
	error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;
	return xfs_falloc_setsize(file, new_size);
@@ -1013,7 +1122,8 @@ xfs_falloc_zero_range(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	unsigned int		blksize = i_blocksize(inode);
@@ -1026,7 +1136,7 @@ xfs_falloc_zero_range(
	if (error)
		return error;

	error = xfs_free_file_space(XFS_I(inode), offset, len);
	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
	if (error)
		return error;

@@ -1097,22 +1207,18 @@ xfs_falloc_allocate_range(
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
xfs_file_fallocate(
__xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
	loff_t			len,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
@@ -1133,16 +1239,16 @@ xfs_file_fallocate(

	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len);
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len);
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
	case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
	case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len);
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
	case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1163,6 +1269,54 @@ xfs_file_fallocate(
	return error;
}

/*
 * fallocate for zoned inodes: wrap __xfs_file_fallocate() in a two block
 * reservation taken with XFS_ZR_RESERVED.
 *
 * Returns the reservation error if no space could be reserved, otherwise the
 * result of __xfs_file_fallocate().
 */
static long
xfs_file_zoned_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct xfs_inode	*ip = XFS_I(file_inode(file));
	struct xfs_zone_alloc_ctx ac = { };
	int			ret;

	ret = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
	if (!ret) {
		ret = __xfs_file_fallocate(file, mode, offset, len, &ac);
		xfs_zoned_space_unreserve(ip, &ac);
	}
	return ret;
}

/*
 * fallocate entry point: validate the request, then dispatch to the zoned
 * or the regular implementation.
 *
 * Returns -EINVAL for anything but regular files and -EOPNOTSUPP for mode
 * bits outside XFS_FALLOC_FL_SUPPORTED.
 */
static long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	const int		zoned_modes = FALLOC_FL_PUNCH_HOLE |
					      FALLOC_FL_ZERO_RANGE |
					      FALLOC_FL_COLLAPSE_RANGE;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	/*
	 * On zoned file systems, zeroing the first and last block of a hole
	 * punch means allocating a new block, because the remaining data and
	 * the new zeroes are rewritten out of place.  The reservation for
	 * those blocks has to be taken before the iolock, and it dips into
	 * the reserved pool because punching a hole is expected to work even
	 * on a completely full file system.
	 */
	if (!xfs_is_zoned_inode(XFS_I(inode)) || !(mode & zoned_modes))
		return __xfs_file_fallocate(file, mode, offset, len, NULL);
	return xfs_file_zoned_fallocate(file, mode, offset, len);
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
@@ -1488,9 +1642,10 @@ xfs_dax_read_fault(
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
xfs_write_fault(
__xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
	unsigned int		order,
	struct xfs_zone_alloc_ctx *ac)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
@@ -1528,13 +1683,49 @@ xfs_write_fault(
		ret = xfs_dax_fault_locked(vmf, order, true);
	else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
				NULL);
				ac);
	xfs_iunlock(ip, lock_mode);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

/*
 * Write fault on a zoned inode: reserve enough blocks to cover the faulting
 * folio, run the common write fault code with that reservation, then
 * unreserve the context again.
 *
 * A failed reservation is translated into a fault code by vmf_fs_error().
 */
static vm_fault_t
xfs_write_fault_zoned(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	struct folio		*folio = page_folio(vmf->page);
	unsigned int		len = folio_size(folio);
	struct xfs_zone_alloc_ctx ac = { };
	vm_fault_t		ret;
	int			error;

	/*
	 * Truncation is not taken into account here, so this may reserve more
	 * than is needed.  The excess is bounded by less than a folio and is
	 * released again right away, so that is acceptable.
	 */
	error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
			&ac);
	if (error < 0)
		return vmf_fs_error(error);

	ret = __xfs_write_fault(vmf, order, &ac);
	xfs_zoned_space_unreserve(ip, &ac);
	return ret;
}

/*
 * Write fault entry point: zoned inodes go through the variant that takes a
 * space reservation first, everything else faults without an allocation
 * context.
 */
static vm_fault_t
xfs_write_fault(
	struct vm_fault		*vmf,
	unsigned int		order)
{
	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));

	if (!xfs_is_zoned_inode(ip))
		return __xfs_write_fault(vmf, order, NULL);
	return xfs_write_fault_zoned(vmf, order);
}

static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
+183 −3

File changed.

Preview size limit exceeded, changes collapsed.

Loading