Commit 5076a604 authored by Darrick J. Wong's avatar Darrick J. Wong
Browse files

xfs: support in-memory buffer cache targets



Allow the buffer cache to target in-memory files by making it possible
to have a buftarg that maps pages from private shmem files.  As the
previous patch alludes, the in-memory buftarg contains its own cache,
points to a shmem file, and does not point to a block_device.

The next few patches will make it possible to construct an xfs_btree in
pageable memory by using this buftarg.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
parent e7b58f7c
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -128,6 +128,9 @@ config XFS_LIVE_HOOKS
	bool
	select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL

config XFS_MEMORY_BUFS
	bool

config XFS_ONLINE_SCRUB
	bool "XFS online metadata check support"
	default n
@@ -135,6 +138,7 @@ config XFS_ONLINE_SCRUB
	depends on TMPFS && SHMEM
	select XFS_LIVE_HOOKS
	select XFS_DRAIN_INTENTS
	select XFS_MEMORY_BUFS
	help
	  If you say Y here you will be able to check metadata on a
	  mounted XFS filesystem.  This feature is intended to reduce
+1 −0
Original line number Diff line number Diff line
@@ -137,6 +137,7 @@ endif

xfs-$(CONFIG_XFS_DRAIN_INTENTS)	+= xfs_drain.o
xfs-$(CONFIG_XFS_LIVE_HOOKS)	+= xfs_hooks.o
xfs-$(CONFIG_XFS_MEMORY_BUFS)	+= xfs_buf_mem.o

# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
+86 −46
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_buf_mem.h"

struct kmem_cache *xfs_buf_cache;

@@ -318,7 +319,9 @@ xfs_buf_free(

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES)
	if (xfs_buftarg_is_mem(bp->b_target))
		xmbuf_unmap_page(bp);
	else if (bp->b_flags & _XBF_PAGES)
		xfs_buf_free_pages(bp);
	else if (bp->b_flags & _XBF_KMEM)
		kfree(bp->b_addr);
@@ -634,18 +637,20 @@ xfs_buf_find_insert(
	if (error)
		goto out_drop_pag;

	if (xfs_buftarg_is_mem(new_bp->b_target)) {
		error = xmbuf_map_page(new_bp);
	} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
		   xfs_buf_alloc_kmem(new_bp, flags) < 0) {
		/*
	 * For buffers that fit entirely within a single page, first attempt to
	 * allocate the memory from the heap to minimise memory usage. If we
	 * can't get heap memory for these small buffers, we fall back to using
	 * the page allocator.
		 * For buffers that fit entirely within a single page, first
		 * attempt to allocate the memory from the heap to minimise
		 * memory usage. If we can't get heap memory for these small
		 * buffers, we fall back to using the page allocator.
		 */
	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
		error = xfs_buf_alloc_pages(new_bp, flags);
	}
	if (error)
		goto out_free_buf;
	}

	spin_lock(&bch->bc_lock);
	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
@@ -688,6 +693,8 @@ xfs_buftarg_get_pag(
{
	struct xfs_mount		*mp = btp->bt_mount;

	if (xfs_buftarg_is_mem(btp))
		return NULL;
	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}

@@ -696,7 +703,9 @@ xfs_buftarg_buf_cache(
	struct xfs_buftarg		*btp,
	struct xfs_perag		*pag)
{
	if (pag)
		return &pag->pag_bcache;
	return btp->bt_cache;
}

/*
@@ -926,6 +935,13 @@ xfs_buf_readahead_map(
{
	struct xfs_buf		*bp;

	/*
	 * Currently we don't have a good means or justification for performing
	 * xmbuf_map_page asynchronously, so we don't do readahead.
	 */
	if (xfs_buftarg_is_mem(target))
		return;

	xfs_buf_read_map(target, map, nmaps,
		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
		     __this_address);
@@ -991,6 +1007,9 @@ xfs_buf_get_uncached(
	if (error)
		return error;

	if (xfs_buftarg_is_mem(bp->b_target))
		error = xmbuf_map_page(bp);
	else
		error = xfs_buf_alloc_pages(bp, flags);
	if (error)
		goto fail_free_buf;
@@ -1633,6 +1652,12 @@ _xfs_buf_ioapply(
	/* we only use the buffer cache for meta-data */
	op |= REQ_META;

	/* in-memory targets are directly mapped, no IO required. */
	if (xfs_buftarg_is_mem(bp->b_target)) {
		xfs_buf_ioend(bp);
		return;
	}

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
@@ -1988,19 +2013,24 @@ xfs_buftarg_shrink_count(
}

void
xfs_free_buftarg(
xfs_destroy_buftarg(
	struct xfs_buftarg	*btp)
{
	shrinker_free(btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);
}

void
xfs_free_buftarg(
	struct xfs_buftarg	*btp)
{
	xfs_destroy_buftarg(btp);
	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
	/* the main block device is closed by kill_block_super */
	if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
		bdev_release(btp->bt_bdev_handle);

	kfree(btp);
}

@@ -2023,6 +2053,45 @@ xfs_setsize_buftarg(
	return 0;
}

/*
 * Set up the parts of a buffer target that this helper owns: the logical
 * sector size fields, the IO error ratelimit state, the buffer LRU, the
 * bt_io_count percpu counter, and a registered shrinker named
 * "xfs-buf:<descr>".
 *
 * Returns 0 on success.  If any step fails, whatever was set up before it is
 * torn down again and -ENOMEM is returned.
 */
int
xfs_init_buftarg(
	struct xfs_buftarg		*btp,
	size_t				logical_sectorsize,
	const char			*descr)
{
	/* Record the logical sector size and the mask derived from it. */
	btp->bt_logical_sectorsize = logical_sectorsize;
	btp->bt_logical_sectormask = logical_sectorsize - 1;

	/*
	 * Throttle buffer IO error messages to DEFAULT_RATELIMIT_BURST per
	 * 30 second window so that repeated errors do not flood the log.
	 */
	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
			     DEFAULT_RATELIMIT_BURST);

	if (list_lru_init(&btp->bt_lru) != 0)
		return -ENOMEM;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL) != 0)
		goto err_lru;

	btp->bt_shrinker =
		shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
	if (btp->bt_shrinker == NULL)
		goto err_io_count;

	/* Wire up the callbacks before the shrinker becomes visible. */
	btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker->private_data = btp;
	shrinker_register(btp->bt_shrinker);

	return 0;

err_io_count:
	percpu_counter_destroy(&btp->bt_io_count);
err_lru:
	list_lru_destroy(&btp->bt_lru);
	return -ENOMEM;
}

struct xfs_buftarg *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
@@ -2049,41 +2118,12 @@ xfs_alloc_buftarg(
	 */
	if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
		goto error_free;

	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

	/*
	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
	 * per 30 seconds so as to not spam logs too much on repeated errors.
	 */
	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
			     DEFAULT_RATELIMIT_BURST);

	if (list_lru_init(&btp->bt_lru))
	if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
			mp->m_super->s_id))
		goto error_free;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
		goto error_lru;

	btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
					  mp->m_super->s_id);
	if (!btp->bt_shrinker)
		goto error_pcpu;

	btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker->private_data = btp;

	shrinker_register(btp->bt_shrinker);

	return btp;

error_pcpu:
	percpu_counter_destroy(&btp->bt_io_count);
error_lru:
	list_lru_destroy(&btp->bt_lru);
error_free:
	kfree(btp);
	return NULL;
+9 −0
Original line number Diff line number Diff line
@@ -109,6 +109,7 @@ struct xfs_buftarg {
	struct bdev_handle	*bt_bdev_handle;
	struct block_device	*bt_bdev;
	struct dax_device	*bt_daxdev;
	struct file		*bt_file;
	u64			bt_dax_part_off;
	struct xfs_mount	*bt_mount;
	unsigned int		bt_meta_sectorsize;
@@ -122,6 +123,9 @@ struct xfs_buftarg {

	struct percpu_counter	bt_io_count;
	struct ratelimit_state	bt_ioerror_rl;

	/* built-in cache, if we're not using the perag one */
	struct xfs_buf_cache	bt_cache[];
};

#define XB_PAGES	2
@@ -387,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);

/* for xfs_buf_mem.c only: */
int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
		const char *descr);
void xfs_destroy_buftarg(struct xfs_buftarg *btp);

#endif	/* __XFS_BUF_H__ */

fs/xfs/xfs_buf_mem.c

0 → 100644
+189 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_buf.h"
#include "xfs_buf_mem.h"
#include "xfs_trace.h"
#include <linux/shmem_fs.h>

/*
 * Buffer Cache for In-Memory Files
 * ================================
 *
 * Online fsck wants to create ephemeral ordered recordsets.  The existing
 * btree infrastructure can do this, but we need the buffer cache to target
 * memory instead of block devices.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xmbuf mechanism uses an unlinked shmem file to
 * store our staging data.  This file is not installed in the file descriptor
 * table so that user programs cannot access the data, which means that the
 * xmbuf must be freed with xmbuf_free.
 *
 * xmbufs assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * The only supported block size is PAGE_SIZE, and we cannot use highmem.
 */

/*
 * shmem files used to back an in-memory buffer cache must not be exposed to
 * userspace.  Upper layers must coordinate access to the one handle returned
 * by the constructor, so establish a separate lock class for xmbufs to avoid
 * confusing lockdep.
 *
 * xmbuf_alloc assigns this class to the i_rwsem of the backing shmem inode.
 */
static struct lock_class_key xmbuf_i_mutex_key;

/*
 * Allocate a buffer cache target for a memory-backed file and set up the
 * buffer target.
 *
 * @mp is stored as the buftarg's mount.  @descr is used as the name of the
 * backing shmem file and is passed on to xfs_init_buftarg.  On success the
 * new buftarg is returned through @btpp and 0 is returned; the caller
 * releases it with xmbuf_free.  On failure a negative errno is returned,
 * everything allocated here is released again, and @btpp is left untouched.
 */
int
xmbuf_alloc(
	struct xfs_mount	*mp,
	const char		*descr,
	struct xfs_buftarg	**btpp)
{
	struct file		*file;
	struct inode		*inode;
	struct xfs_buftarg	*btp;
	int			error;

	/* One trailing xfs_buf_cache: memory buftargs carry their own cache. */
	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
	if (!btp)
		return -ENOMEM;

	file = shmem_kernel_file_setup(descr, 0, 0);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_btp;
	}
	inode = file_inode(file);

	/* private file, private locking */
	lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);

	/*
	 * We don't want to bother with kmapping data during repair, so don't
	 * allow highmem pages to back this mapping.
	 */
	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);

	/* ensure all writes are below EOF to avoid pagecache zeroing */
	i_size_write(inode, inode->i_sb->s_maxbytes);

	error = xfs_buf_cache_init(btp->bt_cache);
	if (error)
		goto out_file;

	/* Initialize buffer target */
	btp->bt_mount = mp;
	btp->bt_dev = (dev_t)-1U;
	btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
	btp->bt_file = file;
	btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
	btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;

	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
	if (error)
		goto out_bcache;

	/*
	 * Only trace the buftarg once it is fully set up.  Before this point
	 * bt_mount and bt_file are still NULL from kzalloc, so a tracepoint
	 * that looks at either of them would have nothing valid to read.
	 */
	trace_xmbuf_create(btp);

	*btpp = btp;
	return 0;

out_bcache:
	xfs_buf_cache_destroy(btp->bt_cache);
out_file:
	fput(file);
out_free_btp:
	kfree(btp);
	return error;
}

/*
 * Tear down a memory-backed buffer cache target: destroy the common buftarg
 * state and the built-in buffer cache, drop the reference to the backing
 * file, and free the buftarg itself.
 */
void
xmbuf_free(
	struct xfs_buftarg	*btp)
{
	struct file		*backing;

	ASSERT(xfs_buftarg_is_mem(btp));
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);

	trace_xmbuf_free(btp);

	/* Remember the backing file; it is released after the cache is gone. */
	backing = btp->bt_file;

	xfs_destroy_buftarg(btp);
	xfs_buf_cache_destroy(btp->bt_cache);
	fput(backing);
	kfree(btp);
}

/*
 * Directly map a shmem page into the buffer cache.
 *
 * The buffer must have exactly one map, be XMBUF_BLOCKSIZE bytes long, and
 * start at a page-aligned byte offset; anything else is rejected with
 * -ENOMEM.  Errors from shmem_get_folio are passed through, and -EIO is
 * returned if the mapping reports a writeback error.
 *
 * On success the page is left dirty and unlocked, is recorded as the buffer's
 * only page, and b_addr points at it.  The reference obtained from
 * shmem_get_folio is kept until xmbuf_unmap_page.
 */
int
xmbuf_map_page(
	struct xfs_buf		*bp)
{
	struct inode		*inode = file_inode(bp->b_target->bt_file);
	struct folio		*folio = NULL;
	struct page		*page;
	loff_t                  pos = BBTOB(xfs_buf_daddr(bp));
	int			error;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	if (bp->b_map_count != 1)
		return -ENOMEM;
	if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
		return -ENOMEM;
	if (offset_in_page(pos) != 0) {
		/*
		 * A misaligned buffer address is a caller bug.  Assert the
		 * invariant itself so that debug builds flag it; asserting
		 * offset_in_page(pos) here would always be true and could
		 * never fire.
		 */
		ASSERT(offset_in_page(pos) == 0);
		return -ENOMEM;
	}

	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
	if (error)
		return error;

	/* Don't hand out a page if the mapping has seen a writeback error. */
	if (filemap_check_wb_err(inode->i_mapping, 0)) {
		folio_unlock(folio);
		folio_put(folio);
		return -EIO;
	}

	page = folio_file_page(folio, pos >> PAGE_SHIFT);

	/*
	 * Mark the page dirty so that it won't be reclaimed once we drop the
	 * (potentially last) reference in xmbuf_unmap_page.
	 */
	set_page_dirty(page);
	unlock_page(page);

	/* The buffer is backed by this single page, addressed directly. */
	bp->b_addr = page_address(page);
	bp->b_pages = bp->b_page_array;
	bp->b_pages[0] = page;
	bp->b_page_count = 1;
	return 0;
}

/*
 * Undo xmbuf_map_page: drop the buffer's reference to its single shmem page
 * and reset the buffer's page bookkeeping so nothing points at it any more.
 */
void
xmbuf_unmap_page(
	struct xfs_buf		*bp)
{
	struct page		*mapped;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	mapped = bp->b_pages[0];
	put_page(mapped);

	/* Clear the slot before detaching the page array itself. */
	bp->b_pages[0] = NULL;
	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
}
Loading