Commit 71ee2fde authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull vfs pipe updates from Christian Brauner:

 - Introduce struct file_operations pipeanon_fops

 - Don't update {a,c,m}time for anonymous pipes to avoid the performance
   costs associated with it

 - Change pipe_write() to never add a zero-sized buffer

 - Limit the slots in pipe_resize_ring()

 - Use pipe_buf() to retrieve the pipe buffer everywhere

 - Drop an always true check in anon_pipe_write()

 - Cache 2 pages instead of 1

 - Avoid spurious calls to prepare_to_wait_event() in ___wait_event()

* tag 'vfs-6.15-rc1.pipe' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  fs/splice: Use pipe_buf() helper to retrieve pipe buffer
  fs/pipe: Use pipe_buf() helper to retrieve pipe buffer
  kernel/watch_queue: Use pipe_buf() to retrieve the pipe buffer
  fs/pipe: Limit the slots in pipe_resize_ring()
  wait: avoid spurious calls to prepare_to_wait_event() in ___wait_event()
  pipe: cache 2 pages instead of 1
  pipe: drop an always true check in anon_pipe_write()
  pipe: change pipe_write() to never add a zero-sized buffer
  pipe: don't update {a,c,m}time for anonymous pipes
  pipe: introduce struct file_operations pipeanon_fops
parents fd101da6 3732d8f1
Loading
Loading
Loading
Loading
+103 −78
Original line number Diff line number Diff line
@@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
	pipe_lock(pipe2);
}

static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe)
{
	for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
		if (pipe->tmp_page[i]) {
			struct page *page = pipe->tmp_page[i];
			pipe->tmp_page[i] = NULL;
			return page;
		}
	}

	return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
}

static void anon_pipe_put_page(struct pipe_inode_info *pipe,
			       struct page *page)
{
	if (page_count(page) == 1) {
		for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
			if (!pipe->tmp_page[i]) {
				pipe->tmp_page[i] = page;
				return;
			}
		}
	}

	put_page(page);
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
	anon_pipe_put_page(pipe, page);
}

static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
@@ -247,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
@@ -274,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
		/* Read ->head with a barrier vs post_one_notification() */
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
		if (pipe->note_loss) {
@@ -301,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			struct pipe_buffer *buf = pipe_buf(pipe, tail);
			size_t chars = buf->len;
			size_t written;
			int error;
@@ -359,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
			break;
		}
		mutex_unlock(&pipe->mutex);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(wake_writer))
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
@@ -390,7 +389,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		wake_writer = false;
		wake_next_reader = true;
		mutex_lock(&pipe->mutex);
	}
@@ -403,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	return ret;
}

static ssize_t
fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	int ret = anon_pipe_read(iocb, to);
	if (ret > 0)
		file_accessed(filp);
		file_accessed(iocb->ki_filp);
	return ret;
}

@@ -424,7 +429,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe)
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
@@ -471,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
@@ -503,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf;
			struct page *page = pipe->tmp_page;
			struct page *page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
			page = anon_pipe_get_page(pipe);
			if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
				if (!ret)
					ret = -ENOMEM;
				break;
			}
				pipe->tmp_page = page;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				anon_pipe_put_page(pipe, page);
				if (!ret)
					ret = -EFAULT;
				break;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer.  If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			pipe->head = head + 1;

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf = pipe_buf(pipe, head);
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->len = copied;
			ret += copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;
		}

		/* Wait for buffer space to become available. */
		if ((filp->f_flags & O_NONBLOCK) ||
@@ -602,12 +596,22 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
	return ret;
}

static ssize_t
fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	int ret = anon_pipe_write(iocb, from);
	if (ret > 0) {
		struct file *filp = iocb->ki_filp;
		if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
			int err = file_update_time(filp);
			if (err)
				ret = err;
			sb_end_write(file_inode(filp)->i_sb);
		}
	}
	return ret;
}

@@ -853,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
	if (pipe->watch_queue)
		put_watch_queue(pipe->watch_queue);
#endif
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
		if (pipe->tmp_page[i])
			__free_page(pipe->tmp_page[i]);
	}
	kfree(pipe->bufs);
	kfree(pipe);
}
@@ -874,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static const struct file_operations pipeanon_fops;

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
@@ -891,7 +899,7 @@ static struct inode * get_pipe_inode(void)
	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;
	inode->i_fop = &pipeanon_fops;

	/*
	 * Mark the inode dirty from the very beginning,
@@ -934,7 +942,7 @@ int create_pipe_files(struct file **res, int flags)

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
				&pipeanon_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
@@ -945,7 +953,7 @@ int create_pipe_files(struct file **res, int flags)
	f->f_pipe = 0;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
				  &pipeanon_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
@@ -1109,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe)

static int fifo_open(struct inode *inode, struct file *filp)
{
	bool is_pipe = inode->i_fop == &pipeanon_fops;
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_pipe = 0;
@@ -1234,8 +1242,19 @@ static int fifo_open(struct inode *inode, struct file *filp)

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.read_iter	= fifo_pipe_read,
	.write_iter	= fifo_pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};

static const struct file_operations pipeanon_fops = {
	.open		= fifo_open,
	.read_iter	= anon_pipe_read,
	.write_iter	= anon_pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
@@ -1271,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	/* nr_slots larger than limits of pipe->{head,tail} */
	if (unlikely(nr_slots > (pipe_index_t)-1u))
		return -EINVAL;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
@@ -1390,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
	if (!pipe)
		return NULL;
	if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops)
		return NULL;
	if (for_splice && pipe_has_watch_queue(pipe))
		return NULL;
+14 −26
Original line number Diff line number Diff line
@@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
	unsigned int spd_pages = spd->nr_pages;
	unsigned int tail = pipe->tail;
	unsigned int head = pipe->head;
	unsigned int mask = pipe->ring_size - 1;
	ssize_t ret = 0;
	int page_nr = 0;

@@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
	}

	while (!pipe_full(head, tail, pipe->max_usage)) {
		struct pipe_buffer *buf = &pipe->bufs[head & mask];
		struct pipe_buffer *buf = pipe_buf(pipe, head);

		buf->page = spd->pages[page_nr];
		buf->offset = spd->partial[page_nr].offset;
@@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	int ret;

	if (unlikely(!pipe->readers)) {
@@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
	} else if (pipe_full(head, tail, pipe->max_usage)) {
		ret = -EAGAIN;
	} else {
		pipe->bufs[head & mask] = *buf;
		*pipe_buf(pipe, head) = *buf;
		pipe->head = head + 1;
		return buf->len;
	}
@@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	int ret;

	while (!pipe_empty(head, tail)) {
		struct pipe_buffer *buf = &pipe->bufs[tail & mask];
		struct pipe_buffer *buf = pipe_buf(pipe, tail);

		sd->len = buf->len;
		if (sd->len > sd->total_len)
@@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	struct pipe_buffer *buf = &pipe->bufs[tail & mask];
	struct pipe_buffer *buf = pipe_buf(pipe, tail);

	if (unlikely(!buf->len)) {
		pipe_buf_release(pipe, buf);
@@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
	while (sd.total_len) {
		struct kiocb kiocb;
		struct iov_iter from;
		unsigned int head, tail, mask;
		unsigned int head, tail;
		size_t left;
		int n;

@@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		/* build the vector */
		left = sd.total_len;
		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			struct pipe_buffer *buf = pipe_buf(pipe, tail);
			size_t this_len = buf->len;

			/* zero-length bvecs are not supported, skip them */
@@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
		/* dismiss the fully eaten buffers, adjust the partial one */
		tail = pipe->tail;
		while (ret) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			struct pipe_buffer *buf = pipe_buf(pipe, tail);
			if (ret >= buf->len) {
				ret -= buf->len;
				buf->len = 0;
@@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
	pipe_lock(pipe);

	while (len > 0) {
		unsigned int head, tail, mask, bc = 0;
		unsigned int head, tail, bc = 0;
		size_t remain = len;

		/*
@@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			struct pipe_buffer *buf = pipe_buf(pipe, tail);
			size_t seg;

			if (!buf->len) {
@@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
		len -= ret;
		tail = pipe->tail;
		while (ret > 0) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			struct pipe_buffer *buf = pipe_buf(pipe, tail);
			size_t seg = min_t(size_t, ret, buf->len);

			buf->offset += seg;
@@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;
	bool input_wakeup = false;

@@ -1747,9 +1740,7 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		size_t o_len;
@@ -1792,8 +1783,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			goto retry;
		}

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];
		ibuf = pipe_buf(ipipe, i_tail);
		obuf = pipe_buf(opipe, o_head);

		if (len >= ibuf->len) {
			/*
@@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	ssize_t ret = 0;

	/*
@@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		if (!opipe->readers) {
@@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
		    pipe_full(o_head, o_tail, opipe->max_usage))
			break;

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];
		ibuf = pipe_buf(ipipe, i_tail);
		obuf = pipe_buf(opipe, o_head);

		/*
		 * Get a reference to this pipe buffer,
+1 −1
Original line number Diff line number Diff line
@@ -108,7 +108,7 @@ struct pipe_inode_info {
#ifdef CONFIG_WATCH_QUEUE
	bool note_loss;
#endif
	struct page *tmp_page;
	struct page *tmp_page[2];
	struct fasync_struct *fasync_readers;
	struct fasync_struct *fasync_writers;
	struct pipe_buffer *bufs;
+3 −0
Original line number Diff line number Diff line
@@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
		}								\
										\
		cmd;								\
										\
		if (condition)							\
			break;							\
	}									\
	finish_wait(&wq_head, &__wq_entry);					\
__out:	__ret;									\
+3 −4
Original line number Diff line number Diff line
@@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue,
	struct pipe_inode_info *pipe = wqueue->pipe;
	struct pipe_buffer *buf;
	struct page *page;
	unsigned int head, tail, mask, note, offset, len;
	unsigned int head, tail, note, offset, len;
	bool done = false;

	spin_lock_irq(&pipe->rd_wait.lock);

	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	if (pipe_full(head, tail, pipe->ring_size))
@@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
	memcpy(p + offset, n, len);
	kunmap_atomic(p);

	buf = &pipe->bufs[head & mask];
	buf = pipe_buf(pipe, head);
	buf->page = page;
	buf->private = (unsigned long)wqueue;
	buf->ops = &watch_queue_pipe_buf_ops;
@@ -147,7 +146,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
	return done;

lost:
	buf = &pipe->bufs[(head - 1) & mask];
	buf = pipe_buf(pipe, head - 1);
	buf->flags |= PIPE_BUF_FLAG_LOSS;
	goto out;
}