Commit 0b3bb205 authored by Linus Torvalds
Browse files
Pull vfs fixes from Christian Brauner:

 - kthread: consolidate kthread exit paths to prevent use-after-free

 - iomap:
    - don't mark folio uptodate if read IO has bytes pending
    - don't report direct-io retries to fserror
    - reject delalloc mappings during writeback

 - ns: tighten visibility checks

 - netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict
   sequence

* tag 'vfs-7.0-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  iomap: reject delalloc mappings during writeback
  iomap: don't mark folio uptodate if read IO has bytes pending
  selftests: fix mntns iteration selftests
  nstree: tighten permission checks for listing
  nsfs: tighten permission checks for handle opening
  nsfs: tighten permission checks for ns iteration ioctls
  netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence
  kthread: consolidate kthread exit paths to prevent use-after-free
  iomap: don't report direct-io retries to fserror
parents ecc64d2d d320f160
Loading
Loading
Loading
Loading
+12 −3
Original line number Diff line number Diff line
@@ -80,18 +80,27 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
{
	struct iomap_folio_state *ifs = folio->private;
	unsigned long flags;
	bool uptodate = true;
	bool mark_uptodate = true;

	if (folio_test_uptodate(folio))
		return;

	if (ifs) {
		spin_lock_irqsave(&ifs->state_lock, flags);
		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
		/*
		 * If a read with bytes pending is in progress, we must not call
		 * folio_mark_uptodate(). The read completion path
		 * (iomap_read_end()) will call folio_end_read(), which uses XOR
		 * semantics to set the uptodate bit. If we set it here, the XOR
		 * in folio_end_read() will clear it, leaving the folio not
		 * uptodate.
		 */
		mark_uptodate = ifs_set_range_uptodate(folio, ifs, off, len) &&
				!ifs->read_bytes_pending;
		spin_unlock_irqrestore(&ifs->state_lock, flags);
	}

	if (uptodate)
	if (mark_uptodate)
		folio_mark_uptodate(folio);
}

+14 −1
Original line number Diff line number Diff line
@@ -87,6 +87,19 @@ static inline enum fserror_type iomap_dio_err_type(const struct iomap_dio *dio)
	return FSERR_DIRECTIO_READ;
}

static inline bool should_report_dio_fserror(const struct iomap_dio *dio)
{
	switch (dio->error) {
	case 0:
	case -EAGAIN:
	case -ENOTBLK:
		/* don't send fsnotify for success or magic retry codes */
		return false;
	default:
		return true;
	}
}

ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
@@ -96,7 +109,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);
	if (dio->error)
	if (should_report_dio_fserror(dio))
		fserror_report_io(file_inode(iocb->ki_filp),
				  iomap_dio_err_type(dio), offset, dio->size,
				  dio->error, GFP_NOFS);
+7 −6
Original line number Diff line number Diff line
@@ -215,17 +215,18 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
	WARN_ON_ONCE(!folio->private && map_len < dirty_len);

	switch (wpc->iomap.type) {
	case IOMAP_INLINE:
		WARN_ON_ONCE(1);
		return -EIO;
	case IOMAP_UNWRITTEN:
		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
		break;
	case IOMAP_MAPPED:
		break;
	case IOMAP_HOLE:
		return map_len;
	default:
		break;
		WARN_ON_ONCE(1);
		return -EIO;
	}

	if (wpc->iomap.type == IOMAP_UNWRITTEN)
		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		ioend_flags |= IOMAP_IOEND_SHARED;
	if (folio_test_dropbehind(folio))
+212 −16
Original line number Diff line number Diff line
@@ -9,6 +9,202 @@
#include <linux/uio.h>
#include "internal.h"

/*
 * Perform the cleanup rituals after an unbuffered write is complete.
 */
static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);

	_enter("R=%x", wreq->debug_id);

	/* Okay, declare that all I/O is complete. */
	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

	if (!wreq->error)
		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);

	if (wreq->origin == NETFS_DIO_WRITE &&
	    wreq->mapping->nrpages) {
		/* mmap may have got underfoot and we may now have folios
		 * locally covering the region we just wrote.  Attempt to
		 * discard the folios, but leave in place any modified locally.
		 * ->write_iter() is prevented from interfering by the DIO
		 * counter.
		 */
		pgoff_t first = wreq->start >> PAGE_SHIFT;
		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;

		invalidate_inode_pages2_range(wreq->mapping, first, last);
	}

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_end(wreq->inode);

	_debug("finished");
	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

	if (wreq->iocb) {
		size_t written = umin(wreq->transferred, wreq->len);

		wreq->iocb->ki_pos += written;
		if (wreq->iocb->ki_complete) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
		}
		wreq->iocb = VFS_PTR_POISON;
	}

	netfs_clear_subrequests(wreq);
}

/*
 * Collect the result of a single unbuffered write subrequest: unlink it from
 * the request, account the bytes it transferred, advance the source iterator
 * and the collection points, and then drop a ref on the subrequest.
 */
static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
					   struct netfs_io_stream *stream,
					   struct netfs_io_subrequest *subreq)
{
	trace_netfs_collect_sreq(wreq, subreq);

	/* Unlink the subrequest from the request under the request lock. */
	spin_lock(&wreq->lock);
	list_del_init(&subreq->rreq_link);
	spin_unlock(&wreq->lock);

	/* Step the source buffer past what was written and account for it. */
	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
	wreq->transferred += subreq->transferred;

	/* Read everything needed from subreq before dropping the reference. */
	stream->collected_to = subreq->start + subreq->transferred;
	wreq->collected_to = stream->collected_to;
	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);

	trace_netfs_collect_stream(wreq, stream);
	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
}

/*
 * Write data to the server without going through the pagecache and without
 * writing it to the local cache.  We dispatch the subrequests serially and
 * wait for each to complete before dispatching the next, lest we leave a gap
 * in the data written due to a failure such as ENOSPC.  We could, however,
 * attempt to do preparation such as content encryption for the next subreq
 * whilst the current is in progress.
 *
 * Returns the error of an issued subrequest that failed, -EINTR or
 * -ERESTARTSYS if a signal was pending on a request that has no iocb, and 0
 * otherwise.  A failure to prepare a subrequest is recorded in wreq->error
 * only; the return value is 0 in that case, as it is when there is nothing
 * left to write.  netfs_unbuffered_write_done() is always called before
 * returning.
 */
static int netfs_unbuffered_write(struct netfs_io_request *wreq)
{
	struct netfs_io_subrequest *subreq = NULL;
	struct netfs_io_stream *stream = &wreq->io_streams[0];
	/* Must be initialised: the loop can break out (preparation failure or
	 * an empty iterator) before anything else is assigned to ret, and ret
	 * is then both traced and returned.
	 */
	int ret = 0;

	_enter("%llx", wreq->len);

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_begin(wreq->inode);

	stream->collected_to = wreq->start;

	for (;;) {
		bool retry = false;

		/* subreq is NULL unless we are going round again to retry. */
		if (!subreq) {
			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
			subreq = stream->construct;
			stream->construct = NULL;
			stream->front = NULL;
		}

		/* Check if (re-)preparation failed. */
		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
			netfs_write_subrequest_terminated(subreq, subreq->error);
			wreq->error = subreq->error;
			break;
		}

		/* Clamp to what is left of the request; stop if that's nothing. */
		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
		if (!iov_iter_count(&subreq->io_iter))
			break;

		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
					       stream->sreq_max_len,
					       stream->sreq_max_segs);
		iov_iter_truncate(&subreq->io_iter, subreq->len);
		stream->submit_extendable_to = subreq->len;

		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
		stream->issue_write(subreq);

		/* Async, need to wait. */
		netfs_wait_for_in_progress_stream(wreq, stream);

		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
			retry = true;
		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			ret = subreq->error;
			wreq->error = ret;
			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
			subreq = NULL;
			break;
		}
		ret = 0;

		if (!retry) {
			netfs_unbuffered_write_collect(wreq, stream, subreq);
			subreq = NULL;
			if (wreq->transferred >= wreq->len)
				break;
			/* Only requests without an iocb are interrupted here. */
			if (!wreq->iocb && signal_pending(current)) {
				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
				break;
			}
			continue;
		}

		/* We need to retry the last subrequest, so first reset the
		 * iterator, taking into account what, if anything, we managed
		 * to transfer.
		 *
		 * NOTE(review): the buffer iterator is advanced by
		 * subreq->transferred here, but wreq->transferred is not, and
		 * subreq->start/len below are derived from wreq->transferred.
		 * Confirm that a partially completed subrequest cannot leave
		 * the iterator and the file position out of step.
		 */
		subreq->error = -EAGAIN;
		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
		if (subreq->transferred > 0)
			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);

		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
		    wreq->netfs_ops->retry_request)
			wreq->netfs_ops->retry_request(wreq, stream);

		/* Reset the subrequest so that it can be reissued. */
		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
		subreq->io_iter		= wreq->buffer.iter;
		subreq->start		= wreq->start + wreq->transferred;
		subreq->len		= wreq->len   - wreq->transferred;
		subreq->transferred	= 0;
		subreq->retry_count	+= 1;
		stream->sreq_max_len	= UINT_MAX;
		stream->sreq_max_segs	= INT_MAX;

		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
		stream->prepare_write(subreq);

		__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
		netfs_stat(&netfs_n_wh_retry_write_subreq);
	}

	netfs_unbuffered_write_done(wreq);
	_leave(" = %d", ret);
	return ret;
}

/*
 * Work item handler for an unbuffered write: run the write for the request
 * that embeds @work, then put a ref on the request.  The return value of
 * netfs_unbuffered_write() is not used here.
 */
static void netfs_unbuffered_write_async(struct work_struct *work)
{
	struct netfs_io_request *req;

	req = container_of(work, struct netfs_io_request, work);

	netfs_unbuffered_write(req);
	netfs_put_request(req, netfs_rreq_trace_put_complete);
}

/*
 * Perform an unbuffered write where we may have to do an RMW operation on an
 * encrypted file.  This can also be used for direct I/O writes.
@@ -70,35 +266,35 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
			 */
			wreq->buffer.iter = *iter;
		}

		wreq->len = iov_iter_count(&wreq->buffer.iter);
	}

	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
	if (async)
		__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);

	/* Copy the data into the bounce buffer and encrypt it. */
	// TODO

	/* Dispatch the write. */
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
	if (async)

	if (async) {
		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
		wreq->iocb = iocb;
	wreq->len = iov_iter_count(&wreq->buffer.iter);
	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
		queue_work(system_dfl_wq, &wreq->work);
		ret = -EIOCBQUEUED;
	} else {
		ret = netfs_unbuffered_write(wreq);
		if (ret < 0) {
			_debug("begin = %zd", ret);
		goto out;
		} else {
			iocb->ki_pos += wreq->transferred;
			ret = wreq->transferred ?: wreq->error;
		}

	if (!async) {
		ret = netfs_wait_for_write(wreq);
		if (ret > 0)
			iocb->ki_pos += ret;
	} else {
		ret = -EIOCBQUEUED;
		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
	}

out:
	netfs_put_request(wreq, netfs_rreq_trace_put_return);
	return ret;

+3 −1
Original line number Diff line number Diff line
@@ -198,6 +198,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
						struct file *file,
						loff_t start,
						enum netfs_io_origin origin);
void netfs_prepare_write(struct netfs_io_request *wreq,
			 struct netfs_io_stream *stream,
			 loff_t start);
void netfs_reissue_write(struct netfs_io_stream *stream,
			 struct netfs_io_subrequest *subreq,
			 struct iov_iter *source);
@@ -212,7 +215,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
			       struct folio **writethrough_cache);
ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
			       struct folio *writethrough_cache);
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);

/*
 * write_retry.c
Loading