Commit ca3d643a authored by Christoph Hellwig, committed by Carlos Maiolino
Browse files

xfs: cache open zone in inode->i_private



The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones.  One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cached open zone.

So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private.  This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.

Fixes: 4e4d5207 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
parent a8c861f4
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -236,7 +236,6 @@ typedef struct xfs_mount {
	bool			m_update_sb;	/* sb needs update in mount */
	unsigned int		m_max_open_zones;
	unsigned int		m_zonegc_low_space;
	struct xfs_mru_cache	*m_zone_cache;  /* Inode to open zone cache */

	/* max_atomic_write mount option value */
	unsigned long long	m_awu_max_bytes;
+6 −0
Original line number Diff line number Diff line
@@ -786,6 +786,12 @@ xfs_fs_evict_inode(

	truncate_inode_pages_final(&inode->i_data);
	clear_inode(inode);

	if (IS_ENABLED(CONFIG_XFS_RT) &&
	    S_ISREG(inode->i_mode) && inode->i_private) {
		xfs_open_zone_put(inode->i_private);
		inode->i_private = NULL;
	}
}

static void
+45 −84
Original line number Diff line number Diff line
@@ -26,14 +26,22 @@
#include "xfs_trace.h"
#include "xfs_mru_cache.h"

void
xfs_open_zone_put(
	struct xfs_open_zone	*oz)
static void
xfs_open_zone_free_rcu(
	struct callback_head	*cb)
{
	if (atomic_dec_and_test(&oz->oz_ref)) {
	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);

	xfs_rtgroup_rele(oz->oz_rtg);
	kfree(oz);
}

void
xfs_open_zone_put(
	struct xfs_open_zone	*oz)
{
	if (atomic_dec_and_test(&oz->oz_ref))
		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}

static inline uint32_t
@@ -756,98 +764,55 @@ xfs_mark_rtg_boundary(
		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}

/*
 * Cache the last zone written to for an inode so that it is considered first
 * for subsequent writes.
 */
struct xfs_zone_cache_item {
	struct xfs_mru_cache_elem	mru;
	struct xfs_open_zone		*oz;
};

static inline struct xfs_zone_cache_item *
xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
{
	return container_of(mru, struct xfs_zone_cache_item, mru);
}

static void
xfs_zone_cache_free_func(
	void				*data,
	struct xfs_mru_cache_elem	*mru)
{
	struct xfs_zone_cache_item	*item = xfs_zone_cache_item(mru);

	xfs_open_zone_put(item->oz);
	kfree(item);
}

/*
 * Check if we have a cached last open zone available for the inode and
 * if yes return a reference to it.
 */
static struct xfs_open_zone *
xfs_cached_zone(
	struct xfs_mount		*mp,
xfs_get_cached_zone(
	struct xfs_inode	*ip)
{
	struct xfs_mru_cache_elem	*mru;
	struct xfs_open_zone	*oz;

	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
	if (!mru)
		return NULL;
	oz = xfs_zone_cache_item(mru)->oz;
	rcu_read_lock();
	oz = VFS_I(ip)->i_private;
	if (oz) {
		/*
		 * GC only steals open zones at mount time, so no GC zones
		 * should end up in the cache.
		 */
		ASSERT(!oz->oz_is_gc);
		ASSERT(atomic_read(&oz->oz_ref) > 0);
		atomic_inc(&oz->oz_ref);
		if (!atomic_inc_not_zero(&oz->oz_ref))
			oz = NULL;
	}
	xfs_mru_cache_done(mp->m_zone_cache);
	rcu_read_unlock();

	return oz;
}

/*
 * Update the last used zone cache for a given inode.
 * Stash our zone in the inode so that it is reused for future allocations.
 *
 * The caller must have a reference on the open zone.
 * The open_zone structure will be pinned until either the inode is freed or
 * until the cached open zone is replaced with a different one because the
 * current one was full when we tried to use it.  This means we keep any
 * open zone around forever as long as any inode that used it for the last
 * write is cached, which slightly increases the memory use of cached inodes
 * that were ever written to, but significantly simplifies the cached zone
 * lookup.  Because the open_zone is clearly marked as full when all data
 * in the underlying RTG was written, the caching is always safe.
 */
static void
xfs_zone_cache_create_association(
xfs_set_cached_zone(
	struct xfs_inode	*ip,
	struct xfs_open_zone	*oz)
{
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_zone_cache_item	*item = NULL;
	struct xfs_mru_cache_elem	*mru;
	struct xfs_open_zone	*old_oz;

	ASSERT(atomic_read(&oz->oz_ref) > 0);
	atomic_inc(&oz->oz_ref);

	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
	if (mru) {
		/*
		 * If we have an association already, update it to point to the
		 * new zone.
		 */
		item = xfs_zone_cache_item(mru);
		xfs_open_zone_put(item->oz);
		item->oz = oz;
		xfs_mru_cache_done(mp->m_zone_cache);
		return;
	}

	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item) {
		xfs_open_zone_put(oz);
		return;
	}
	item->oz = oz;
	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
	old_oz = xchg(&VFS_I(ip)->i_private, oz);
	if (old_oz)
		xfs_open_zone_put(old_oz);
}

static void
@@ -891,15 +856,14 @@ xfs_zone_alloc_and_submit(
	 * the inode is still associated with a zone and use that if so.
	 */
	if (!*oz)
		*oz = xfs_cached_zone(mp, ip);
		*oz = xfs_get_cached_zone(ip);

	if (!*oz) {
select_zone:
		*oz = xfs_select_zone(mp, write_hint, pack_tight);
		if (!*oz)
			goto out_error;

		xfs_zone_cache_create_association(ip, *oz);
		xfs_set_cached_zone(ip, *oz);
	}

	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -977,6 +941,12 @@ xfs_free_open_zones(
		xfs_open_zone_put(oz);
	}
	spin_unlock(&zi->zi_open_zones_lock);

	/*
	 * Wait for all open zones to be freed so that they drop the group
	 * references:
	 */
	rcu_barrier();
}

struct xfs_init_zones {
@@ -1290,14 +1260,6 @@ xfs_mount_zones(
	error = xfs_zone_gc_mount(mp);
	if (error)
		goto out_free_zone_info;

	/*
	 * Set up a mru cache to track inode to open zone for data placement
	 * purposes. The magic values for group count and life time is the
	 * same as the defaults for file streams, which seems sane enough.
	 */
	xfs_mru_cache_create(&mp->m_zone_cache, mp,
			5000, 10, xfs_zone_cache_free_func);
	return 0;

out_free_zone_info:
@@ -1311,5 +1273,4 @@ xfs_unmount_zones(
{
	xfs_zone_gc_unmount(mp);
	xfs_free_zone_info(mp->m_zone_info);
	xfs_mru_cache_destroy(mp->m_zone_cache);
}
+2 −0
Original line number Diff line number Diff line
@@ -44,6 +44,8 @@ struct xfs_open_zone {
	 * the life time of an open zone.
	 */
	struct xfs_rtgroup	*oz_rtg;

	struct rcu_head		oz_rcu;
};

/*