Unverified Commit 867f8567 authored by Christian Brauner
Browse files

Merge patch series "iomap: fix zero padding data issue in concurrent append writes"

Long Li <leo.lilong@huawei.com> says:

This patch series fixes zero padding data issues in concurrent append
write scenarios. A detailed problem description and solution can be
found in patch 2. Patch 1 is introduced as preparation for the fix in
patch 2, eliminating the need to resample inode size for io_size
trimming and avoiding issues caused by inode size changes during
concurrent writeback and truncate operations.

* patches from https://lore.kernel.org/r/20241209114241.3725722-1-leo.lilong@huawei.com:
  iomap: fix zero padding data issue in concurrent append writes
  iomap: pass byte granular end position to iomap_add_to_ioend

Link: https://lore.kernel.org/r/20241209114241.3725722-1-leo.lilong@huawei.com


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 930e7c20 51d20d1d
Loading
Loading
Loading
Loading
+57 −9
Original line number Diff line number Diff line
@@ -1774,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 */
static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, loff_t pos, unsigned len)
		struct inode *inode, loff_t pos, loff_t end_pos,
		unsigned len)
{
	struct iomap_folio_state *ifs = folio->private;
	size_t poff = offset_in_folio(folio, pos);
@@ -1793,15 +1794,60 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,

	if (ifs)
		atomic_add(len, &ifs->write_bytes_pending);

	/*
	 * Clamp io_offset and io_size to the incore EOF so that ondisk
	 * file size updates in the ioend completion are byte-accurate.
	 * This avoids recovering files with zeroed tail regions when
	 * writeback races with appending writes:
	 *
	 *    Thread 1:                  Thread 2:
	 *    ------------               -----------
	 *    write [A, A+B]
	 *    update inode size to A+B
	 *    submit I/O [A, A+BS]
	 *                               write [A+B, A+B+C]
	 *                               update inode size to A+B+C
	 *    <I/O completes, updates disk size to min(A+B+C, A+BS)>
	 *    <power failure>
	 *
	 *  After reboot:
	 *    1) with A+B+C < A+BS, the file has zero padding in range
	 *       [A+B, A+B+C]
	 *
	 *    |<     Block Size (BS)   >|
	 *    |DDDDDDDDDDDD0000000000000|
	 *    ^           ^        ^
	 *    A          A+B     A+B+C
	 *                       (EOF)
	 *
	 *    2) with A+B+C > A+BS, the file has zero padding in range
	 *       [A+B, A+BS]
	 *
	 *    |<     Block Size (BS)   >|<     Block Size (BS)    >|
	 *    |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
	 *    ^           ^             ^           ^
	 *    A          A+B           A+BS       A+B+C
	 *                             (EOF)
	 *
	 *    D = Valid Data
	 *    0 = Zero Padding
	 *
	 * Note that this defeats the ability to chain the ioends of
	 * appending writes.
	 */
	wpc->ioend->io_size += len;
	if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
		wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;

	wbc_account_cgroup_owner(wbc, folio, len);
	return 0;
}

static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
		struct writeback_control *wbc, struct folio *folio,
		struct inode *inode, u64 pos, unsigned dirty_len,
		unsigned *count)
		struct inode *inode, u64 pos, u64 end_pos,
		unsigned dirty_len, unsigned *count)
{
	int error;

@@ -1826,7 +1872,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
			break;
		default:
			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
					map_len);
					end_pos, map_len);
			if (!error)
				(*count)++;
			break;
@@ -1897,11 +1943,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
		 *    remaining memory is zeroed when mapped, and writes to that
		 *    region are not written out to the file.
		 *
		 * Also adjust the writeback range to skip all blocks entirely
		 * beyond i_size.
		 * Also adjust the end_pos to the end of file and skip writeback
		 * for all blocks entirely beyond i_size.
		 */
		folio_zero_segment(folio, poff, folio_size(folio));
		*end_pos = round_up(isize, i_blocksize(inode));
		*end_pos = isize;
	}

	return true;
@@ -1914,6 +1960,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
	struct inode *inode = folio->mapping->host;
	u64 pos = folio_pos(folio);
	u64 end_pos = pos + folio_size(folio);
	u64 end_aligned = 0;
	unsigned count = 0;
	int error = 0;
	u32 rlen;
@@ -1955,9 +2002,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
	/*
	 * Walk through the folio to find dirty areas to write back.
	 */
	while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
	end_aligned = round_up(end_pos, i_blocksize(inode));
	while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
				pos, rlen, &count);
				pos, end_pos, rlen, &count);
		if (error)
			break;
		pos += rlen;
+1 −1
Original line number Diff line number Diff line
@@ -335,7 +335,7 @@ struct iomap_ioend {
	u16			io_type;
	u16			io_flags;	/* IOMAP_F_* */
	struct inode		*io_inode;	/* file being written to */
	size_t			io_size;	/* size of the extent */
	size_t			io_size;	/* size of data within eof */
	loff_t			io_offset;	/* offset in the file */
	sector_t		io_sector;	/* start sector of ioend */
	struct bio		io_bio;		/* MUST BE LAST! */