xfs: implement buffered writes to zoned RT devices (058dd70c) · Commits · git / linux-net

fs/xfs/xfs_aops.c

+158 −9

Original line number	Diff line number	Diff line
		// SPDX-License-Identifier: GPL-2.0
		/*
		* Copyright (c) 2000-2005 Silicon Graphics, Inc.
		* Copyright (c) 2016-2018 Christoph Hellwig.
		* Copyright (c) 2016-2025 Christoph Hellwig.
		* All Rights Reserved.
		*/
		#include "xfs.h"
		@@ -20,6 +20,8 @@
		#include "xfs_errortag.h"
		#include "xfs_error.h"
		#include "xfs_icache.h"
		#include "xfs_zone_alloc.h"
		#include "xfs_rtgroup.h"

		struct xfs_writepage_ctx {
		struct iomap_writepage_ctx ctx;
		@@ -77,6 +79,26 @@ xfs_setfilesize(
		return xfs_trans_commit(tp);
		}

		static void
		xfs_ioend_put_open_zones(
		struct iomap_ioend *ioend)
		{
		struct iomap_ioend *tmp;

		/*
		* Put the open zone for all ioends merged into this one (if any).
		*/
		list_for_each_entry(tmp, &ioend->io_list, io_list)
		xfs_open_zone_put(tmp->io_private);

		/*
		* The main ioend might not have an open zone if the submission failed
		* before xfs_zone_alloc_and_submit got called.
		*/
		if (ioend->io_private)
		xfs_open_zone_put(ioend->io_private);
		}

		/*
		* IO write completion.
		*/
		@@ -86,6 +108,7 @@ xfs_end_ioend(
		{
		struct xfs_inode *ip = XFS_I(ioend->io_inode);
		struct xfs_mount *mp = ip->i_mount;
		bool is_zoned = xfs_is_zoned_inode(ip);
		xfs_off_t offset = ioend->io_offset;
		size_t size = ioend->io_size;
		unsigned int nofs_flag;
		@@ -116,9 +139,10 @@ xfs_end_ioend(
		error = blk_status_to_errno(ioend->io_bio.bi_status);
		if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_IOEND_SHARED) {
		ASSERT(!is_zoned);
		xfs_reflink_cancel_cow_range(ip, offset, size, true);
		xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
		offset + size);
		offset + size, NULL);
		}
		goto done;
		}
		@@ -126,7 +150,10 @@ xfs_end_ioend(
		/*
		* Success: commit the COW or unwritten blocks if needed.
		*/
		if (ioend->io_flags & IOMAP_IOEND_SHARED)
		if (is_zoned)
		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
		ioend->io_private, NULLFSBLOCK);
		else if (ioend->io_flags & IOMAP_IOEND_SHARED)
		error = xfs_reflink_end_cow(ip, offset, size);
		else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
		@@ -134,6 +161,8 @@ xfs_end_ioend(
		if (!error && xfs_ioend_is_append(ioend))
		error = xfs_setfilesize(ip, offset, size);
		done:
		if (is_zoned)
		xfs_ioend_put_open_zones(ioend);
		iomap_finish_ioends(ioend, error);
		memalloc_nofs_restore(nofs_flag);
		}
		@@ -176,17 +205,27 @@ xfs_end_io(
		}
		}

		STATIC void
		static void
		xfs_end_bio(
		struct bio *bio)
		{
		struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
		struct xfs_inode *ip = XFS_I(ioend->io_inode);
		struct xfs_mount *mp = ip->i_mount;
		unsigned long flags;

		/*
		* For Appends record the actually written block number and set the
		* boundary flag if needed.
		*/
		if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
		ioend->io_sector = bio->bi_iter.bi_sector;
		xfs_mark_rtg_boundary(ioend);
		}

		spin_lock_irqsave(&ip->i_ioend_lock, flags);
		if (list_empty(&ip->i_ioend_list))
		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
		&ip->i_ioend_work));
		list_add_tail(&ioend->io_list, &ip->i_ioend_list);
		spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
		@@ -463,7 +502,7 @@ xfs_discard_folio(
		* folio itself and not the start offset that is passed in.
		*/
		xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
		folio_pos(folio) + folio_size(folio));
		folio_pos(folio) + folio_size(folio), NULL);
		}

		static const struct iomap_writeback_ops xfs_writeback_ops = {
		@@ -472,15 +511,125 @@ static const struct iomap_writeback_ops xfs_writeback_ops = {
		.discard_folio = xfs_discard_folio,
		};

		/*
		 * Writeback context used for zoned inodes: the generic iomap writeback
		 * context plus the open zone pointer that xfs_zoned_submit_ioend() passes
		 * to xfs_zone_alloc_and_submit().
		 */
		struct xfs_zoned_writepage_ctx {
		/* Embedded generic context; XFS_ZWPC() maps it back with container_of(). */
		struct iomap_writepage_ctx ctx;
		/* Zero-initialized by xfs_vm_writepages(), which puts it afterwards if set. */
		struct xfs_open_zone *open_zone;
		};

		/*
		 * Map a generic iomap writeback context back to the zoned XFS context that
		 * embeds it.
		 */
		static inline struct xfs_zoned_writepage_ctx *
		XFS_ZWPC(struct iomap_writepage_ctx *ctx)
		{
			struct xfs_zoned_writepage_ctx	*zwpc;

			zwpc = container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
			return zwpc;
		}

		/*
		 * Writeback ->map_blocks implementation for zoned inodes.
		 *
		 * @wpc:	writeback context whose iomap is filled in
		 * @inode:	inode under writeback
		 * @offset:	byte offset of the range to map
		 * @len:	length of the range in bytes
		 *
		 * Looks up the COW fork extent at @offset.  If nothing covers @offset a hole
		 * mapping is returned.  Otherwise the covered part of the extent is handed
		 * to xfs_bmap_del_extent_delay() with XFS_BMAPI_REMAP and the range is
		 * reported as IOMAP_MAPPED on the RT device.
		 *
		 * Returns 0 on success, or -EIO if the file system is shut down.
		 */
		static int
		xfs_zoned_map_blocks(
			struct iomap_writepage_ctx *wpc,
			struct inode		*inode,
			loff_t			offset,
			unsigned int		len)
		{
			struct xfs_inode	*ip = XFS_I(inode);
			struct xfs_mount	*mp = ip->i_mount;
			xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
			xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
			xfs_filblks_t		count_fsb;
			struct xfs_bmbt_irec	imap, del;
			struct xfs_iext_cursor	icur;

			if (xfs_is_shutdown(mp))
				return -EIO;

			XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

			/*
			 * All dirty data must be covered by delalloc extents. But truncate can
			 * remove delalloc extents underneath us or reduce their size.
			 * Returning a hole tells iomap to not write back any data from this
			 * range, which is the right thing to do in that case.
			 *
			 * Otherwise just tell iomap to treat ranges previously covered by a
			 * delalloc extent as mapped. The actual block allocation will be done
			 * just before submitting the bio.
			 *
			 * This implies we never map outside folios that are locked or marked
			 * as under writeback, and thus there is no need to check the fork
			 * sequence count here.
			 */
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
				imap.br_startoff = end_fsb;	/* fake a hole past EOF */
			if (imap.br_startoff > offset_fsb) {
				/* Nothing covers offset_fsb: report a hole up to the next extent. */
				imap.br_blockcount = imap.br_startoff - offset_fsb;
				imap.br_startoff = offset_fsb;
				imap.br_startblock = HOLESTARTBLOCK;
				imap.br_state = XFS_EXT_NORM;
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
				return 0;
			}

			/* Clamp the mapping to the end of the extent that was found. */
			end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
			count_fsb = end_fsb - offset_fsb;

			del = imap;
			xfs_trim_extent(&del, offset_fsb, count_fsb);
			xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
					XFS_BMAPI_REMAP);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);

			/*
			 * NOTE(review): this function used to store IOMAP_F_DIRTY into
			 * iomap.flags first and then overwrite it with IOMAP_F_ANON_WRITE
			 * before anything could read it.  The dead store is dropped here so
			 * the effective value is unchanged; confirm whether
			 * IOMAP_F_DIRTY | IOMAP_F_ANON_WRITE was intended instead.
			 */
			wpc->iomap.type = IOMAP_MAPPED;
			wpc->iomap.flags = IOMAP_F_ANON_WRITE;
			wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
			wpc->iomap.offset = offset;
			wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);

			trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
			return 0;
		}

		static int
		xfs_zoned_submit_ioend(
		struct iomap_writepage_ctx *wpc,
		int status)
		{
		wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
		if (status)
		return status;
		xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
		return 0;
		}

		/*
		 * Writeback operations that xfs_vm_writepages() hands to iomap_writepages()
		 * for zoned inodes.
		 */
		static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
		.map_blocks = xfs_zoned_map_blocks,
		.submit_ioend = xfs_zoned_submit_ioend,
		.discard_folio = xfs_discard_folio,
		};

		/*
		 * Write back dirty pages of @mapping.
		 *
		 * XFS_ITRUNCATED is cleared once up front.  Zoned inodes then run
		 * iomap_writepages() with the zoned writeback ops and a zoned context, and
		 * put the open zone left in that context afterwards (if any).  All other
		 * inodes use the regular writeback ops.
		 *
		 * Returns the result of iomap_writepages().
		 */
		STATIC int
		xfs_vm_writepages(
			struct address_space	*mapping,
			struct writeback_control *wbc)
		{
			struct xfs_inode	*ip = XFS_I(mapping->host);

			xfs_iflags_clear(ip, XFS_ITRUNCATED);

			if (xfs_is_zoned_inode(ip)) {
				struct xfs_zoned_writepage_ctx	xc = { };
				int				error;

				error = iomap_writepages(mapping, wbc, &xc.ctx,
						&xfs_zoned_writeback_ops);
				/* Drop the open zone that writeback left in the context. */
				if (xc.open_zone)
					xfs_open_zone_put(xc.open_zone);
				return error;
			} else {
				struct xfs_writepage_ctx	wpc = { };

				return iomap_writepages(mapping, wbc, &wpc.ctx,
						&xfs_writeback_ops);
			}
		}

		STATIC int

fs/xfs/xfs_bmap_util.c

+25 −7

Original line number	Diff line number	Diff line
		@@ -30,6 +30,7 @@
		#include "xfs_reflink.h"
		#include "xfs_rtbitmap.h"
		#include "xfs_rtgroup.h"
		#include "xfs_zone_alloc.h"

		/* Kernel only BMAP related definitions and functions */

		@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
		struct xfs_inode *ip,
		int whichfork,
		xfs_off_t start_byte,
		xfs_off_t end_byte)
		xfs_off_t end_byte,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct xfs_mount *mp = ip->i_mount;
		struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
		@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
		continue;
		}

		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del, 0);
		if (xfs_is_zoned_inode(ip) && ac) {
		/*
		* In a zoned buffered write context we need to return
		* the punched delalloc allocations to the allocation
		* context. This allows reusing them in the following
		* iomap iterations.
		*/
		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
		&del, XFS_BMAPI_REMAP);
		ac->reserved_blocks += del.br_blockcount;
		} else {
		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
		&del, 0);
		}

		if (!xfs_iext_get_extent(ifp, &icur, &got))
		break;
		}
		@@ -582,7 +598,7 @@ xfs_free_eofblocks(
		if (ip->i_delayed_blks) {
		xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
		round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
		LLONG_MAX);
		LLONG_MAX, NULL);
		}
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
		@@ -825,7 +841,8 @@ int
		xfs_free_file_space(
		struct xfs_inode *ip,
		xfs_off_t offset,
		xfs_off_t len)
		xfs_off_t len,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct xfs_mount *mp = ip->i_mount;
		xfs_fileoff_t startoffset_fsb;
		@@ -880,7 +897,7 @@ xfs_free_file_space(
		return 0;
		if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
		error = xfs_zero_range(ip, offset, len, NULL);
		error = xfs_zero_range(ip, offset, len, ac, NULL);
		if (error)
		return error;

		@@ -968,7 +985,8 @@ int
		xfs_collapse_file_space(
		struct xfs_inode *ip,
		xfs_off_t offset,
		xfs_off_t len)
		xfs_off_t len,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct xfs_mount *mp = ip->i_mount;
		struct xfs_trans *tp;
		@@ -981,7 +999,7 @@ xfs_collapse_file_space(

		trace_xfs_collapse_file_space(ip);

		error = xfs_free_file_space(ip, offset, len);
		error = xfs_free_file_space(ip, offset, len, ac);
		if (error)
		return error;

fs/xfs/xfs_bmap_util.h

+7 −5

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@ struct xfs_inode;
		struct xfs_mount;
		struct xfs_trans;
		struct xfs_bmalloca;
		struct xfs_zone_alloc_ctx;

		#ifdef CONFIG_XFS_RT
		int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
		@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
		#endif /* CONFIG_XFS_RT */

		void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
		xfs_off_t start_byte, xfs_off_t end_byte);
		xfs_off_t start_byte, xfs_off_t end_byte,
		struct xfs_zone_alloc_ctx *ac);

		struct kgetbmap {
		__s64 bmv_offset; /* file offset of segment in blocks */
		@@ -56,9 +58,9 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
		int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
		xfs_off_t len);
		int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
		xfs_off_t len);
		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
		int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
		xfs_off_t len);
		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
		int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
		xfs_off_t len);

fs/xfs/xfs_file.c

+216 −25

Original line number	Diff line number	Diff line
		@@ -25,6 +25,7 @@
		#include "xfs_iomap.h"
		#include "xfs_reflink.h"
		#include "xfs_file.h"
		#include "xfs_zone_alloc.h"

		#include <linux/dax.h>
		#include <linux/falloc.h>
		@@ -360,7 +361,8 @@ xfs_file_write_zero_eof(
		struct iov_iter *from,
		unsigned int *iolock,
		size_t count,
		bool *drained_dio)
		bool *drained_dio,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
		loff_t isize;
		@@ -414,7 +416,7 @@ xfs_file_write_zero_eof(
		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

		return error;
		@@ -431,7 +433,8 @@ STATIC ssize_t
		xfs_file_write_checks(
		struct kiocb *iocb,
		struct iov_iter *from,
		unsigned int *iolock)
		unsigned int *iolock,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct inode *inode = iocb->ki_filp->f_mapping->host;
		size_t count = iov_iter_count(from);
		@@ -481,7 +484,7 @@ xfs_file_write_checks(
		*/
		if (iocb->ki_pos > i_size_read(inode)) {
		error = xfs_file_write_zero_eof(iocb, from, iolock, count,
		&drained_dio);
		&drained_dio, ac);
		if (error == 1)
		goto restart;
		if (error)
		@@ -491,6 +494,48 @@ xfs_file_write_checks(
		return kiocb_modified(iocb);
		}

		/*
		 * Reserve zoned space for a buffered write before any locks are taken.
		 *
		 * @ip:		inode being written to
		 * @iocb:	kiocb of the write; IOCB_NOWAIT adds XFS_ZR_NOWAIT to @flags
		 * @from:	source iterator, only used for its byte count
		 * @flags:	XFS_ZR_* flags passed on to xfs_zoned_space_reserve()
		 * @ac:		allocation context that receives the reservation
		 *
		 * Returns the error from generic_write_check_limits() if that fails,
		 * otherwise the result of xfs_zoned_space_reserve().
		 */
		static ssize_t
		xfs_zoned_write_space_reserve(
			struct xfs_inode		*ip,
			struct kiocb			*iocb,
			struct iov_iter			*from,
			unsigned int			flags,
			struct xfs_zone_alloc_ctx	*ac)
		{
			loff_t				count = iov_iter_count(from);
			int				error;

			if (iocb->ki_flags & IOCB_NOWAIT)
				flags |= XFS_ZR_NOWAIT;

			/*
			 * Check the rlimit and LFS boundary first so that we don't over-reserve
			 * by possibly a lot.
			 *
			 * The generic write path will redo this check later, and it might have
			 * changed by then. If it got expanded we'll stick to our earlier
			 * smaller limit, and if it is decreased the new smaller limit will be
			 * used and our extra space reservation will be returned after finishing
			 * the write.
			 */
			error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
			if (error)
				return error;

			/*
			 * Sloppily round up count to file system blocks.
			 *
			 * This will often reserve an extra block, but that avoids having to look
			 * at the start offset, which isn't stable for O_APPEND until taking the
			 * iolock. Also we need to reserve a block each for zeroing the old
			 * EOF block and the new start block if they are unaligned.
			 *
			 * Any remaining block will be returned after the write.
			 */
			return xfs_zoned_space_reserve(ip,
					XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
		}

		static int
		xfs_dio_write_end_io(
		struct kiocb *iocb,
		@@ -597,7 +642,7 @@ xfs_file_dio_write_aligned(
		ret = xfs_ilock_iocb_for_write(iocb, &iolock);
		if (ret)
		return ret;
		ret = xfs_file_write_checks(iocb, from, &iolock);
		ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
		if (ret)
		goto out_unlock;

		@@ -675,7 +720,7 @@ xfs_file_dio_write_unaligned(
		goto out_unlock;
		}

		ret = xfs_file_write_checks(iocb, from, &iolock);
		ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
		if (ret)
		goto out_unlock;

		@@ -749,7 +794,7 @@ xfs_file_dax_write(
		ret = xfs_ilock_iocb(iocb, iolock);
		if (ret)
		return ret;
		ret = xfs_file_write_checks(iocb, from, &iolock);
		ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
		if (ret)
		goto out;

		@@ -793,7 +838,7 @@ xfs_file_buffered_write(
		if (ret)
		return ret;

		ret = xfs_file_write_checks(iocb, from, &iolock);
		ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
		if (ret)
		goto out;

		@@ -840,6 +885,67 @@ xfs_file_buffered_write(
		return ret;
		}

		/*
		 * Buffered write path for zoned inodes.
		 *
		 * Space is reserved greedily before the iolock is taken, the iterator is
		 * then cut down to what the reservation can back, and the reservation is
		 * handed back once the write is done.  A first -ENOSPC from the iomap write
		 * triggers one inode flush followed by a single retry.
		 *
		 * Returns the number of bytes written (after generic_write_sync()) or a
		 * negative errno.
		 */
		STATIC ssize_t
		xfs_file_buffered_write_zoned(
			struct kiocb		*iocb,
			struct iov_iter		*from)
		{
			struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
			struct xfs_mount	*mp = ip->i_mount;
			struct xfs_zone_alloc_ctx ac = { };
			unsigned int		iolock = XFS_IOLOCK_EXCL;
			bool			flushed = false;
			ssize_t			ret;

			ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
			if (ret < 0)
				return ret;

			ret = xfs_ilock_iocb(iocb, iolock);
			if (ret)
				goto out_unreserve;

			ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
			if (ret)
				goto out_unlock;

			/*
			 * Limit the iter to the bytes we actually hold blocks for.  This has
			 * to wait until xfs_file_write_checks has run, because that is what
			 * assigns ki_pos for O_APPEND writes.
			 */
			iov_iter_truncate(from,
					XFS_FSB_TO_B(mp, ac.reserved_blocks) -
					(iocb->ki_pos & mp->m_blockmask));
			if (!iov_iter_count(from))
				goto out_unlock;

			for (;;) {
				trace_xfs_file_buffered_write(iocb, from);
				ret = iomap_file_buffered_write(iocb, from,
						&xfs_buffered_write_iomap_ops, &ac);
				if (ret != -ENOSPC || flushed)
					break;

				/*
				 * Kick off writeback to convert delalloc space and release
				 * the usually too pessimistic indirect block reservations,
				 * then try exactly once more.
				 */
				xfs_flush_inodes(mp);
				flushed = true;
			}

		out_unlock:
			xfs_iunlock(ip, iolock);
		out_unreserve:
			xfs_zoned_space_unreserve(ip, &ac);
			if (ret > 0) {
				XFS_STATS_ADD(mp, xs_write_bytes, ret);
				ret = generic_write_sync(iocb, ret);
			}
			return ret;
		}

		STATIC ssize_t
		xfs_file_write_iter(
		struct kiocb *iocb,
		@@ -887,6 +993,8 @@ xfs_file_write_iter(
		return ret;
		}

		if (xfs_is_zoned_inode(ip))
		return xfs_file_buffered_write_zoned(iocb, from);
		return xfs_file_buffered_write(iocb, from);
		}

		@@ -941,7 +1049,8 @@ static int
		xfs_falloc_collapse_range(
		struct file *file,
		loff_t offset,
		loff_t len)
		loff_t len,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct inode *inode = file_inode(file);
		loff_t new_size = i_size_read(inode) - len;
		@@ -957,7 +1066,7 @@ xfs_falloc_collapse_range(
		if (offset + len >= i_size_read(inode))
		return -EINVAL;

		error = xfs_collapse_file_space(XFS_I(inode), offset, len);
		error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
		if (error)
		return error;
		return xfs_falloc_setsize(file, new_size);
		@@ -1013,7 +1122,8 @@ xfs_falloc_zero_range(
		struct file *file,
		int mode,
		loff_t offset,
		loff_t len)
		loff_t len,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct inode *inode = file_inode(file);
		unsigned int blksize = i_blocksize(inode);
		@@ -1026,7 +1136,7 @@ xfs_falloc_zero_range(
		if (error)
		return error;

		error = xfs_free_file_space(XFS_I(inode), offset, len);
		error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
		if (error)
		return error;

		@@ -1097,22 +1207,18 @@ xfs_falloc_allocate_range(
		FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

		STATIC long
		xfs_file_fallocate(
		__xfs_file_fallocate(
		struct file *file,
		int mode,
		loff_t offset,
		loff_t len)
		loff_t len,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct inode *inode = file_inode(file);
		struct xfs_inode *ip = XFS_I(inode);
		long error;
		uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

		if (!S_ISREG(inode->i_mode))
		return -EINVAL;
		if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

		xfs_ilock(ip, iolock);
		error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
		if (error)
		@@ -1133,16 +1239,16 @@ xfs_file_fallocate(

		switch (mode & FALLOC_FL_MODE_MASK) {
		case FALLOC_FL_PUNCH_HOLE:
		error = xfs_free_file_space(ip, offset, len);
		error = xfs_free_file_space(ip, offset, len, ac);
		break;
		case FALLOC_FL_COLLAPSE_RANGE:
		error = xfs_falloc_collapse_range(file, offset, len);
		error = xfs_falloc_collapse_range(file, offset, len, ac);
		break;
		case FALLOC_FL_INSERT_RANGE:
		error = xfs_falloc_insert_range(file, offset, len);
		break;
		case FALLOC_FL_ZERO_RANGE:
		error = xfs_falloc_zero_range(file, mode, offset, len);
		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
		break;
		case FALLOC_FL_UNSHARE_RANGE:
		error = xfs_falloc_unshare_range(file, mode, offset, len);
		@@ -1163,6 +1269,54 @@ xfs_file_fallocate(
		return error;
		}

		/*
		 * fallocate for zoned inodes: wrap __xfs_file_fallocate() in a two block
		 * XFS_ZR_RESERVED space reservation that is returned afterwards.
		 *
		 * Returns the reservation error if reserving fails, otherwise the result
		 * of __xfs_file_fallocate().
		 */
		static long
		xfs_file_zoned_fallocate(
			struct file		*file,
			int			mode,
			loff_t			offset,
			loff_t			len)
		{
			struct xfs_zone_alloc_ctx ac = { };
			struct xfs_inode	*ip = XFS_I(file_inode(file));
			/* long, matching this function and __xfs_file_fallocate() */
			long			error;

			error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
			if (error)
				return error;
			error = __xfs_file_fallocate(file, mode, offset, len, &ac);
			xfs_zoned_space_unreserve(ip, &ac);
			return error;
		}

		/*
		 * fallocate entry point.
		 *
		 * Rejects non-regular files with -EINVAL and unsupported mode bits with
		 * -EOPNOTSUPP.  Hole punch, zero range and collapse range on zoned inodes
		 * go through xfs_file_zoned_fallocate(); everything else calls
		 * __xfs_file_fallocate() without an allocation context.
		 */
		static long
		xfs_file_fallocate(
			struct file		*file,
			int			mode,
			loff_t			offset,
			loff_t			len)
		{
			struct inode		*inode = file_inode(file);

			if (!S_ISREG(inode->i_mode))
				return -EINVAL;
			if (mode & ~XFS_FALLOC_FL_SUPPORTED)
				return -EOPNOTSUPP;

			/*
			 * For zoned file systems, zeroing the first and last block of a hole
			 * punch requires allocating a new block to rewrite the remaining data
			 * and new zeroes out of place. Get a reservation for those before
			 * taking the iolock. Dip into the reserved pool because we are
			 * expected to be able to punch a hole even on a completely full
			 * file system.
			 */
			if (xfs_is_zoned_inode(XFS_I(inode)) &&
			    (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
				     FALLOC_FL_COLLAPSE_RANGE)))
				return xfs_file_zoned_fallocate(file, mode, offset, len);
			return __xfs_file_fallocate(file, mode, offset, len, NULL);
		}

		STATIC int
		xfs_file_fadvise(
		struct file *file,
		@@ -1488,9 +1642,10 @@ xfs_dax_read_fault(
		* i_lock (XFS - extent map serialisation)
		*/
		static vm_fault_t
		xfs_write_fault(
		__xfs_write_fault(
		struct vm_fault *vmf,
		unsigned int order)
		unsigned int order,
		struct xfs_zone_alloc_ctx *ac)
		{
		struct inode *inode = file_inode(vmf->vma->vm_file);
		struct xfs_inode *ip = XFS_I(inode);
		@@ -1528,13 +1683,49 @@ xfs_write_fault(
		ret = xfs_dax_fault_locked(vmf, order, true);
		else
		ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
		NULL);
		ac);
		xfs_iunlock(ip, lock_mode);

		sb_end_pagefault(inode->i_sb);
		return ret;
		}

		/*
		 * Write fault handling for zoned inodes: reserve enough blocks for the
		 * faulting folio, run the common write fault code with that reservation
		 * and hand the reservation back afterwards.
		 */
		static vm_fault_t
		xfs_write_fault_zoned(
			struct vm_fault		*vmf,
			unsigned int		order)
		{
			struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
			struct folio		*folio = page_folio(vmf->page);
			unsigned int		len = folio_size(folio);
			struct xfs_zone_alloc_ctx ac = { };
			vm_fault_t		ret;
			int			error;

			/*
			 * Truncation is not checked for here, so this may reserve too much.
			 *
			 * The excess is bounded by less than a folio and is released right
			 * away, so that is acceptable.
			 */
			error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
					&ac);
			if (error < 0)
				return vmf_fs_error(error);

			ret = __xfs_write_fault(vmf, order, &ac);
			xfs_zoned_space_unreserve(ip, &ac);
			return ret;
		}

		/*
		 * Write fault dispatcher: zoned inodes take the path that sets up a space
		 * reservation first, all others run the common code without an allocation
		 * context.
		 */
		static vm_fault_t
		xfs_write_fault(
			struct vm_fault		*vmf,
			unsigned int		order)
		{
			struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));

			if (!xfs_is_zoned_inode(ip))
				return __xfs_write_fault(vmf, order, NULL);
			return xfs_write_fault_zoned(vmf, order);
		}

		static inline bool
		xfs_is_write_fault(
		struct vm_fault *vmf)

fs/xfs/xfs_iomap.c

+183 −3

File changed.

Preview size limit exceeded, changes collapsed.