mirror of https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
This was done entirely with mindless brute force, using
git grep -l '\<k[vmz]*alloc_objs*(.*, GFP_KERNEL)' |
xargs sed -i 's/\(alloc_objs*(.*\), GFP_KERNEL)/\1)/'
to convert the new alloc_obj() users that had a simple GFP_KERNEL
argument to just drop that argument.
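For example (illustrative, not a specific hunk from this patch), the script
turns a call like

    zi = kzalloc_obj(*zi, GFP_KERNEL);

into

    zi = kzalloc_obj(*zi);

while calls that pass other flags, such as the GFP_NOFS | __GFP_NOFAIL
allocation in xfs_init_open_zone() below, are left untouched.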
Note that due to the extreme simplicity of the scripting, any slightly
more complex cases spread over multiple lines would not be triggered:
they definitely exist, but this covers the vast bulk of the cases, and
the resulting diff is also then easier to check automatically.
For the same reason the 'flex' versions will be done as a separate
conversion.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_error.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_iomap.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_refcount.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
#include "xfs_mru_cache.h"

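/*
 * RCU callback to free an open zone once the last reference has been
 * dropped: release the rtgroup reference and free the structure.
 */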
static void
xfs_open_zone_free_rcu(
	struct callback_head *cb)
{
	struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);

	xfs_rtgroup_rele(oz->oz_rtg);
	kfree(oz);
}

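/*
 * Drop a reference to an open zone.  The last reference frees it via RCU.
 */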
void
xfs_open_zone_put(
	struct xfs_open_zone *oz)
{
	if (atomic_dec_and_test(&oz->oz_ref))
		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}

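/*
 * Map a used block count to one of the XFS_ZONE_USED_BUCKETS buckets that
 * track how full a zone is.
 */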
static inline uint32_t
xfs_zone_bucket(
	struct xfs_mount *mp,
	uint32_t used_blocks)
{
	return XFS_ZONE_USED_BUCKETS * used_blocks /
			mp->m_groups[XG_TYPE_RTG].blocks;
}

static inline void
xfs_zone_add_to_bucket(
	struct xfs_zone_info *zi,
	xfs_rgnumber_t rgno,
	uint32_t to_bucket)
{
	__set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
	zi->zi_used_bucket_entries[to_bucket]++;
}

static inline void
xfs_zone_remove_from_bucket(
	struct xfs_zone_info *zi,
	xfs_rgnumber_t rgno,
	uint32_t from_bucket)
{
	__clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
	zi->zi_used_bucket_entries[from_bucket]--;
}

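/*
 * Re-account a zone in the used bucket tracking after @freed blocks were
 * freed in @rtg, queueing a zone reset and/or waking GC where needed.
 */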
static void
xfs_zone_account_reclaimable(
	struct xfs_rtgroup *rtg,
	uint32_t freed)
{
	struct xfs_group *xg = &rtg->rtg_group;
	struct xfs_mount *mp = rtg_mount(rtg);
	struct xfs_zone_info *zi = mp->m_zone_info;
	uint32_t used = rtg_rmap(rtg)->i_used_blocks;
	xfs_rgnumber_t rgno = rtg_rgno(rtg);
	uint32_t from_bucket = xfs_zone_bucket(mp, used + freed);
	uint32_t to_bucket = xfs_zone_bucket(mp, used);
	bool was_full = (used + freed == rtg_blocks(rtg));

	/*
	 * This can be called from log recovery, where the zone_info structure
	 * hasn't been allocated yet.  Skip all work as xfs_mount_zones will
	 * add the zones to the right buckets before the file system becomes
	 * active.
	 */
	if (!zi)
		return;

	if (!used) {
		/*
		 * The zone is now empty, remove it from the bottom bucket and
		 * trigger a reset.
		 */
		trace_xfs_zone_emptied(rtg);

		spin_lock(&zi->zi_used_buckets_lock);
		if (!was_full)
			xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		spin_lock(&zi->zi_reset_list_lock);
		xg->xg_next_reset = zi->zi_reset_list;
		zi->zi_reset_list = xg;
		spin_unlock(&zi->zi_reset_list_lock);

		if (zi->zi_gc_thread)
			wake_up_process(zi->zi_gc_thread);
	} else if (was_full) {
		/*
		 * The zone transitioned from full, mark it as reclaimable and
		 * wake up GC, which might be waiting for zones to reclaim.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
			wake_up_process(zi->zi_gc_thread);
	} else if (to_bucket != from_bucket) {
		/*
		 * Move the zone to a new bucket if it dropped below the
		 * threshold.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);
	}
}

/*
 * Check if we have any zones that can be reclaimed by looking at the entry
 * counters for the zone buckets.
 */
bool
xfs_zoned_have_reclaimable(
	struct xfs_zone_info *zi)
{
	int i;

	spin_lock(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		if (zi->zi_used_bucket_entries[i]) {
			spin_unlock(&zi->zi_used_buckets_lock);
			return true;
		}
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	return false;
}

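/*
 * An open zone has been fully written: detach it from the open zone
 * tracking, drop the open reference, and account any unused blocks as
 * reclaimable.
 */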
static void
xfs_open_zone_mark_full(
	struct xfs_open_zone *oz)
{
	struct xfs_rtgroup *rtg = oz->oz_rtg;
	struct xfs_mount *mp = rtg_mount(rtg);
	struct xfs_zone_info *zi = mp->m_zone_info;
	uint32_t used = rtg_rmap(rtg)->i_used_blocks;

	trace_xfs_zone_full(rtg);

	WRITE_ONCE(rtg->rtg_open_zone, NULL);

	spin_lock(&zi->zi_open_zones_lock);
	if (oz->oz_is_gc) {
		ASSERT(current == zi->zi_gc_thread);
		zi->zi_open_gc_zone = NULL;
	} else {
		zi->zi_nr_open_zones--;
		list_del_init(&oz->oz_entry);
	}
	spin_unlock(&zi->zi_open_zones_lock);
	xfs_open_zone_put(oz);

	wake_up_all(&zi->zi_zone_wait);
	if (used < rtg_blocks(rtg))
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
}

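/*
 * Account freshly written blocks in the rmap inode's used block counter and
 * in the open zone's written counter, and mark the zone full once every
 * block in it has been written.
 */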
static void
xfs_zone_record_blocks(
	struct xfs_trans *tp,
	struct xfs_open_zone *oz,
	xfs_fsblock_t fsbno,
	xfs_filblks_t len)
{
	struct xfs_mount *mp = tp->t_mountp;
	struct xfs_rtgroup *rtg = oz->oz_rtg;
	struct xfs_inode *rmapip = rtg_rmap(rtg);

	trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
	rmapip->i_used_blocks += len;
	ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(rtg))
		xfs_open_zone_mark_full(oz);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}

/*
 * Called for blocks that have been written to disk, but not actually linked to
 * an inode, which can happen when garbage collection races with user data
 * writes to a file.
 */
static void
xfs_zone_skip_blocks(
	struct xfs_open_zone *oz,
	xfs_filblks_t len)
{
	struct xfs_rtgroup *rtg = oz->oz_rtg;

	trace_xfs_zone_skip_blocks(oz, 0, len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(rtg))
		xfs_open_zone_mark_full(oz);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);

	xfs_add_frextents(rtg_mount(rtg), len);
}

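/*
 * Map a completed zoned write into the data fork: remove the old mapping
 * (if any), record the newly written blocks in the zone, and map the new
 * extent in, unless a racing write already replaced the old data.
 */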
static int
xfs_zoned_map_extent(
	struct xfs_trans *tp,
	struct xfs_inode *ip,
	struct xfs_bmbt_irec *new,
	struct xfs_open_zone *oz,
	xfs_fsblock_t old_startblock)
{
	struct xfs_bmbt_irec data;
	int nmaps = 1;
	int error;

	/* Grab the corresponding mapping in the data fork. */
	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
			&nmaps, 0);
	if (error)
		return error;

	/*
	 * Cap the update to the existing extent in the data fork because we
	 * can only overwrite one extent at a time.
	 */
	ASSERT(new->br_blockcount >= data.br_blockcount);
	new->br_blockcount = data.br_blockcount;

	/*
	 * If a data write raced with this GC write, keep the existing data in
	 * the data fork, mark our newly written GC extent as reclaimable, then
	 * move on to the next extent.
	 *
	 * Note that this can also happen when racing with operations that do
	 * not actually invalidate the data, but just move it to a different
	 * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
	 * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE).  If the
	 * data was just moved around, GC fails to free the zone, but the zone
	 * becomes a GC candidate again as soon as all previous GC I/O has
	 * finished and these blocks will be moved out eventually.
	 */
	if (old_startblock != NULLFSBLOCK &&
	    old_startblock != data.br_startblock)
		goto skip;

	trace_xfs_reflink_cow_remap_from(ip, new);
	trace_xfs_reflink_cow_remap_to(ip, &data);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_REFLINK_END_COW_CNT);
	if (error)
		return error;

	if (data.br_startblock != HOLESTARTBLOCK) {
		ASSERT(data.br_startblock != DELAYSTARTBLOCK);
		ASSERT(!isnullstartblock(data.br_startblock));

		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
		if (xfs_is_reflink_inode(ip)) {
			xfs_refcount_decrease_extent(tp, true, &data);
		} else {
			error = xfs_free_extent_later(tp, data.br_startblock,
					data.br_blockcount, NULL,
					XFS_AG_RESV_NONE,
					XFS_FREE_EXTENT_REALTIME);
			if (error)
				return error;
		}
	}

	xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);

	/* Map the new blocks into the data fork. */
	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
	return 0;

skip:
	trace_xfs_reflink_cow_remap_skip(ip, new);
	xfs_zone_skip_blocks(oz, new->br_blockcount);
	return 0;
}

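/*
 * I/O completion handler for zoned writes: walk the byte range that was just
 * written and map each extent into the data fork, one transaction at a time.
 */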
int
xfs_zoned_end_io(
	struct xfs_inode *ip,
	xfs_off_t offset,
	xfs_off_t count,
	xfs_daddr_t daddr,
	struct xfs_open_zone *oz,
	xfs_fsblock_t old_startblock)
{
	struct xfs_mount *mp = ip->i_mount;
	xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
	struct xfs_bmbt_irec new = {
		.br_startoff = XFS_B_TO_FSBT(mp, offset),
		.br_startblock = xfs_daddr_to_rtb(mp, daddr),
		.br_state = XFS_EXT_NORM,
	};
	unsigned int resblks =
		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	struct xfs_trans *tp;
	int error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	while (new.br_startoff < end_fsb) {
		new.br_blockcount = end_fsb - new.br_startoff;

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
		if (error)
			return error;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
		if (error)
			xfs_trans_cancel(tp);
		else
			error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		new.br_startoff += new.br_blockcount;
		new.br_startblock += new.br_blockcount;
		if (old_startblock != NULLFSBLOCK)
			old_startblock += new.br_blockcount;
	}

	return 0;
}

/*
 * "Free" blocks allocated in a zone.
 *
 * Just decrement the used blocks counter and report the space as freed.
 */
int
xfs_zone_free_blocks(
	struct xfs_trans *tp,
	struct xfs_rtgroup *rtg,
	xfs_fsblock_t fsbno,
	xfs_filblks_t len)
{
	struct xfs_mount *mp = tp->t_mountp;
	struct xfs_inode *rmapip = rtg_rmap(rtg);

	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);

	if (len > rmapip->i_used_blocks) {
		xfs_err(mp,
			"trying to free more blocks (%lld) than used counter (%u).",
			len, rmapip->i_used_blocks);
		ASSERT(len <= rmapip->i_used_blocks);
		xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		return -EFSCORRUPTED;
	}

	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);

	rmapip->i_used_blocks -= len;
	/*
	 * Don't add open zones to the reclaimable buckets.  The I/O completion
	 * for writing the last block will take care of accounting for already
	 * unused blocks instead.
	 */
	if (!READ_ONCE(rtg->rtg_open_zone))
		xfs_zone_account_reclaimable(rtg, len);
	xfs_add_frextents(mp, len);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
	return 0;
}

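/*
 * Set up an open zone structure for @rtg, starting at @write_pointer, and
 * publish it through rtg->rtg_open_zone.
 */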
static struct xfs_open_zone *
xfs_init_open_zone(
	struct xfs_rtgroup *rtg,
	xfs_rgblock_t write_pointer,
	enum rw_hint write_hint,
	bool is_gc)
{
	struct xfs_open_zone *oz;

	oz = kzalloc_obj(*oz, GFP_NOFS | __GFP_NOFAIL);
	spin_lock_init(&oz->oz_alloc_lock);
	atomic_set(&oz->oz_ref, 1);
	oz->oz_rtg = rtg;
	oz->oz_allocated = write_pointer;
	oz->oz_written = write_pointer;
	oz->oz_write_hint = write_hint;
	oz->oz_is_gc = is_gc;

	/*
	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
	 * inode, but we don't really want to take that here because we are
	 * under the zone_list_lock.  Ensure the pointer is only set for a
	 * fully initialized open zone structure so that a racy lookup finding
	 * it is fine.
	 */
	WRITE_ONCE(rtg->rtg_open_zone, oz);
	return oz;
}

/*
 * Find a completely free zone, open it, and return a reference.
 */
struct xfs_open_zone *
xfs_open_zone(
	struct xfs_mount *mp,
	enum rw_hint write_hint,
	bool is_gc)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	XA_STATE(xas, &mp->m_groups[XG_TYPE_RTG].xa, 0);
	struct xfs_group *xg;

	/*
	 * Pick the free zone with the lowest index.  Zones at the beginning of
	 * the address space typically provide higher bandwidth than those at
	 * the end of the address space on HDDs.
	 */
	xas_lock(&xas);
	xas_for_each_marked(&xas, xg, ULONG_MAX, XFS_RTG_FREE)
		if (atomic_inc_not_zero(&xg->xg_active_ref))
			goto found;
	xas_unlock(&xas);
	return NULL;

found:
	xas_clear_mark(&xas, XFS_RTG_FREE);
	atomic_dec(&zi->zi_nr_free_zones);
	xas_unlock(&xas);

	set_current_state(TASK_RUNNING);
	return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
}

static struct xfs_open_zone *
xfs_try_open_zone(
	struct xfs_mount *mp,
	enum rw_hint write_hint)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	struct xfs_open_zone *oz;

	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
		return NULL;
	if (atomic_read(&zi->zi_nr_free_zones) <
	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
		return NULL;

	/*
	 * Increment the open zone count to reserve our slot before dropping
	 * zi_open_zones_lock.
	 */
	zi->zi_nr_open_zones++;
	spin_unlock(&zi->zi_open_zones_lock);
	oz = xfs_open_zone(mp, write_hint, false);
	spin_lock(&zi->zi_open_zones_lock);
	if (!oz) {
		zi->zi_nr_open_zones--;
		return NULL;
	}

	atomic_inc(&oz->oz_ref);
	list_add_tail(&oz->oz_entry, &zi->zi_open_zones);

	/*
	 * If this was the last free zone, other waiters might be waiting
	 * on us to write to it as well.
	 */
	wake_up_all(&zi->zi_zone_wait);

	if (xfs_zoned_need_gc(mp))
		wake_up_process(zi->zi_gc_thread);

	trace_xfs_zone_opened(oz->oz_rtg);
	return oz;
}

enum xfs_zone_alloc_score {
	/* Any open zone will do it, we're desperate */
	XFS_ZONE_ALLOC_ANY = 0,

	/* It better fit somehow */
	XFS_ZONE_ALLOC_OK = 1,

	/* Only reuse a zone if it fits really well. */
	XFS_ZONE_ALLOC_GOOD = 2,
};

/*
 * Life time hint co-location matrix.  Fields not set default to 0
 * aka XFS_ZONE_ALLOC_ANY.
 */
static const unsigned int
xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
	[WRITE_LIFE_NOT_SET] = {
		[WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_NONE] = {
		[WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_SHORT] = {
		[WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD,
	},
	[WRITE_LIFE_MEDIUM] = {
		[WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD,
	},
	[WRITE_LIFE_LONG] = {
		[WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
		[WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_EXTREME] = {
		[WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
		[WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
	},
};

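/*
 * Try to take a reference on @oz for a new write: the zone must have space
 * left and its life time hint must score at least @goodness against the
 * file's hint.
 */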
static bool
xfs_try_use_zone(
	struct xfs_zone_info *zi,
	enum rw_hint file_hint,
	struct xfs_open_zone *oz,
	unsigned int goodness)
{
	if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;

	if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
		return false;

	if (!atomic_inc_not_zero(&oz->oz_ref))
		return false;

	/*
	 * If we have a hint set for the data, use that for the zone even if
	 * some data was written already without any hint set, but don't change
	 * the temperature after that as that would make little sense without
	 * tracking per-temperature class written block counts, which is
	 * probably overkill anyway.
	 */
	if (file_hint != WRITE_LIFE_NOT_SET &&
	    oz->oz_write_hint == WRITE_LIFE_NOT_SET)
		oz->oz_write_hint = file_hint;

	/*
	 * If we couldn't match by inode or life time we just pick the first
	 * zone with enough space above.  For that we want the least busy zone
	 * for some definition of "least" busy.  For now this simple LRU
	 * algorithm that rotates every zone to the end of the list will do it,
	 * even if it isn't exactly cache friendly.
	 */
	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
	return true;
}

static struct xfs_open_zone *
xfs_select_open_zone_lru(
	struct xfs_zone_info *zi,
	enum rw_hint file_hint,
	unsigned int goodness)
{
	struct xfs_open_zone *oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, goodness))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static struct xfs_open_zone *
xfs_select_open_zone_mru(
	struct xfs_zone_info *zi,
	enum rw_hint file_hint)
{
	struct xfs_open_zone *oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
{
	if (xfs_has_nolifetime(ip->i_mount))
		return WRITE_LIFE_NOT_SET;
	return VFS_I(ip)->i_write_hint;
}

/*
 * Try to tightly pack small files that are written back after they were closed
 * instead of trying to open new zones for them or spread them to the least
 * recently used zone.  This optimizes the data layout for workloads that untar
 * or copy a lot of small files.  Right now this does not separate multiple such
 * streams.
 */
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
	struct xfs_mount *mp = ip->i_mount;
	size_t zone_capacity =
		XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);

	/*
	 * Do not pack writes for files that already use a full zone, to avoid
	 * fragmentation.
	 */
	if (i_size_read(VFS_I(ip)) >= zone_capacity)
		return false;

	return !inode_is_open_for_write(VFS_I(ip)) &&
		!(ip->i_diflags & XFS_DIFLAG_APPEND);
}

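/*
 * Pick an open zone for a new write without sleeping: prefer a good life
 * time hint match, then tight packing for small closed files, then opening a
 * new zone, and finally fall back to any open zone with space left.
 */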
static struct xfs_open_zone *
xfs_select_zone_nowait(
	struct xfs_mount *mp,
	enum rw_hint write_hint,
	bool pack_tight)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	struct xfs_open_zone *oz = NULL;

	if (xfs_is_shutdown(mp))
		return NULL;

	/*
	 * Try to fill up open zones with matching temperature if available.  It
	 * is better to try to co-locate data when this is favorable, so we can
	 * activate empty zones when it is statistically better to separate
	 * data.
	 */
	spin_lock(&zi->zi_open_zones_lock);
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
	if (oz)
		goto out_unlock;

	if (pack_tight)
		oz = xfs_select_open_zone_mru(zi, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * See if we can open a new zone and use that so that data for different
	 * files is mixed as little as possible.
	 */
	oz = xfs_try_open_zone(mp, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * Try to find a zone that is an ok match to co-locate data with.
	 */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
	if (oz)
		goto out_unlock;

	/*
	 * Pick the least recently used zone, regardless of hint match.
	 */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

static struct xfs_open_zone *
xfs_select_zone(
	struct xfs_mount *mp,
	enum rw_hint write_hint,
	bool pack_tight)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	DEFINE_WAIT(wait);
	struct xfs_open_zone *oz;

	oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
	if (oz)
		return oz;

	for (;;) {
		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
		oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
		if (oz || xfs_is_shutdown(mp))
			break;
		schedule();
	}
	finish_wait(&zi->zi_zone_wait, &wait);
	return oz;
}

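/*
 * Carve up to @count_fsb blocks out of the open zone by advancing its
 * allocation pointer.  Returns the allocated size in bytes and sets @sector
 * and @is_seq for the caller to build the bio.
 */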
static unsigned int
xfs_zone_alloc_blocks(
	struct xfs_open_zone *oz,
	xfs_filblks_t count_fsb,
	sector_t *sector,
	bool *is_seq)
{
	struct xfs_rtgroup *rtg = oz->oz_rtg;
	struct xfs_mount *mp = rtg_mount(rtg);
	xfs_rgblock_t allocated;

	spin_lock(&oz->oz_alloc_lock);
	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
			(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
	if (!count_fsb) {
		spin_unlock(&oz->oz_alloc_lock);
		return 0;
	}
	allocated = oz->oz_allocated;
	oz->oz_allocated += count_fsb;
	spin_unlock(&oz->oz_alloc_lock);

	trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);

	*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
	if (!*is_seq)
		*sector += XFS_FSB_TO_BB(mp, allocated);
	return XFS_FSB_TO_B(mp, count_fsb);
}

void
xfs_mark_rtg_boundary(
	struct iomap_ioend *ioend)
{
	struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
	sector_t sector = ioend->io_bio.bi_iter.bi_sector;

	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}

/*
 * Check if we have a cached last open zone available for the inode and
 * if yes return a reference to it.
 */
static struct xfs_open_zone *
xfs_get_cached_zone(
	struct xfs_inode *ip)
{
	struct xfs_open_zone *oz;

	rcu_read_lock();
	oz = VFS_I(ip)->i_private;
	if (oz) {
		/*
		 * GC only steals open zones at mount time, so no GC zones
		 * should end up in the cache.
		 */
		ASSERT(!oz->oz_is_gc);
		if (!atomic_inc_not_zero(&oz->oz_ref))
			oz = NULL;
	}
	rcu_read_unlock();

	return oz;
}

/*
 * Stash our zone in the inode so that it is reused for future allocations.
 *
 * The open_zone structure will be pinned until either the inode is freed or
 * until the cached open zone is replaced with a different one because the
 * current one was full when we tried to use it.  This means we keep any
 * open zone around forever as long as any inode that used it for the last
 * write is cached, which slightly increases the memory use of cached inodes
 * that were ever written to, but significantly simplifies the cached zone
 * lookup.  Because the open_zone is clearly marked as full when all data
 * in the underlying RTG was written, the caching is always safe.
 */
static void
xfs_set_cached_zone(
	struct xfs_inode *ip,
	struct xfs_open_zone *oz)
{
	struct xfs_open_zone *old_oz;

	atomic_inc(&oz->oz_ref);
	old_oz = xchg(&VFS_I(ip)->i_private, oz);
	if (old_oz)
		xfs_open_zone_put(old_oz);
}

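/*
 * Point the bio at the allocated sector and submit it, using a zone append
 * operation for sequential write required zones.
 */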
static void
xfs_submit_zoned_bio(
	struct iomap_ioend *ioend,
	struct xfs_open_zone *oz,
	bool is_seq)
{
	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
	ioend->io_private = oz;
	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */

	if (is_seq) {
		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
	} else {
		xfs_mark_rtg_boundary(ioend);
	}

	submit_bio(&ioend->io_bio);
}

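/*
 * Allocate zoned space for an ioend and submit it, splitting the ioend into
 * multiple bios if it does not fit into the zone picked for it.
 */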
void
xfs_zone_alloc_and_submit(
	struct iomap_ioend *ioend,
	struct xfs_open_zone **oz)
{
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	struct xfs_mount *mp = ip->i_mount;
	enum rw_hint write_hint = xfs_inode_write_hint(ip);
	bool pack_tight = xfs_zoned_pack_tight(ip);
	unsigned int alloc_len;
	struct iomap_ioend *split;
	bool is_seq;

	if (xfs_is_shutdown(mp))
		goto out_error;

	/*
	 * If we don't have a locally cached zone in this write context, see if
	 * the inode is still associated with a zone and use that if so.
	 */
	if (!*oz)
		*oz = xfs_get_cached_zone(ip);

	if (!*oz) {
select_zone:
		*oz = xfs_select_zone(mp, write_hint, pack_tight);
		if (!*oz)
			goto out_error;
		xfs_set_cached_zone(ip, *oz);
	}

	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
			&ioend->io_sector, &is_seq);
	if (!alloc_len) {
		xfs_open_zone_put(*oz);
		goto select_zone;
	}

	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
		if (IS_ERR(split))
			goto out_split_error;
		alloc_len -= split->io_bio.bi_iter.bi_size;
		xfs_submit_zoned_bio(split, *oz, is_seq);
		if (!alloc_len) {
			xfs_open_zone_put(*oz);
			goto select_zone;
		}
	}

	xfs_submit_zoned_bio(ioend, *oz, is_seq);
	return;

out_split_error:
	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
out_error:
	bio_io_error(&ioend->io_bio);
}

/*
 * Wake up all threads waiting for a zoned space allocation when the file
 * system is shut down.
 */
void
xfs_zoned_wake_all(
	struct xfs_mount *mp)
{
	/*
	 * Don't wake up if there is no m_zone_info.  This is complicated by the
	 * fact that unmount can't atomically clear m_zone_info and thus we need
	 * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE
	 * during log recovery so we can't entirely rely on that either.
	 */
	if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
		wake_up_all(&mp->m_zone_info->zi_zone_wait);
}

/*
 * Check if @rgbno in @rtg is a potentially valid block.  It might still be
 * unused, but that information is only found in the rmap.
 */
bool
xfs_zone_rgbno_is_valid(
	struct xfs_rtgroup *rtg,
	xfs_rgnumber_t rgbno)
{
	lockdep_assert_held(&rtg_rmap(rtg)->i_lock);

	if (rtg->rtg_open_zone)
		return rgbno < rtg->rtg_open_zone->oz_allocated;
	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
			rtg_rgno(rtg), XFS_RTG_FREE);
}

static void
xfs_free_open_zones(
	struct xfs_zone_info *zi)
{
	struct xfs_open_zone *oz;

	spin_lock(&zi->zi_open_zones_lock);
	while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
			struct xfs_open_zone, oz_entry))) {
		list_del(&oz->oz_entry);
		xfs_open_zone_put(oz);
	}
	spin_unlock(&zi->zi_open_zones_lock);

	/*
	 * Wait for all open zones to be freed so that they drop the group
	 * references:
	 */
	rcu_barrier();
}

struct xfs_init_zones {
	uint32_t zone_size;
	uint32_t zone_capacity;
	uint64_t available;
	uint64_t reclaimable;
};

/*
 * For sequential write required zones, we restart writing at the hardware
 * write pointer returned by xfs_validate_blk_zone().
 *
 * For conventional zones or conventional devices we have to query the rmap to
 * find the highest recorded block and set the write pointer to the block after
 * that.  In case of a power loss this misses blocks where the data I/O has
 * completed but not recorded in the rmap yet, and it also rewrites blocks if
 * the most recently written ones got deleted again before unmount, but this is
 * the best we can do without hardware support.
 */
static int
xfs_query_write_pointer(
	struct xfs_init_zones *iz,
	struct xfs_rtgroup *rtg,
	xfs_rgblock_t *write_pointer)
{
	struct xfs_mount *mp = rtg_mount(rtg);
	struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
	sector_t start = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	xfs_rgblock_t highest_rgbno;
	struct blk_zone zone = {};
	int error;

	if (bdev_is_zoned(bdev)) {
		error = blkdev_get_zone_info(bdev, start, &zone);
		if (error)
			return error;
		if (zone.start != start) {
			xfs_warn(mp, "mismatched zone start: 0x%llx/0x%llx.",
				zone.start, start);
			return -EFSCORRUPTED;
		}

		if (!xfs_validate_blk_zone(mp, &zone, rtg_rgno(rtg),
				iz->zone_size, iz->zone_capacity,
				write_pointer))
			return -EFSCORRUPTED;

		/*
		 * Use the hardware write pointer returned by
		 * xfs_validate_blk_zone for sequential write required zones,
		 * else fall through to the rmap-based estimation below.
		 */
		if (zone.cond != BLK_ZONE_COND_NOT_WP)
			return 0;
	}

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);

	if (highest_rgbno == NULLRGBLOCK)
		*write_pointer = 0;
	else
		*write_pointer = highest_rgbno + 1;
	return 0;
}

static int
xfs_init_zone(
	struct xfs_init_zones *iz,
	struct xfs_rtgroup *rtg,
	xfs_rgblock_t write_pointer)
{
	struct xfs_mount *mp = rtg_mount(rtg);
	struct xfs_zone_info *zi = mp->m_zone_info;
	uint32_t used = rtg_rmap(rtg)->i_used_blocks;
	int error;

	if (write_pointer > rtg->rtg_extents) {
		xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
			rtg_rgno(rtg), write_pointer);
		return -EFSCORRUPTED;
	}

	if (used > rtg->rtg_extents) {
		xfs_warn(mp,
			"zone %u has used counter (0x%x) larger than zone capacity (0x%llx).",
			rtg_rgno(rtg), used, rtg->rtg_extents);
		return -EFSCORRUPTED;
	}

	if (used > write_pointer) {
		xfs_warn(mp,
			"zone %u has used counter (0x%x) larger than write pointer (0x%x).",
			rtg_rgno(rtg), used, write_pointer);
		return -EFSCORRUPTED;
	}

	if (write_pointer == 0 && used != 0) {
		xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
			rtg_rgno(rtg), used);
		return -EFSCORRUPTED;
	}

	/*
	 * If there are no used blocks, but the zone is not in the empty state
	 * yet, we lost power before the zone reset.  In that case finish the
	 * work here.
	 */
	if (write_pointer == rtg_blocks(rtg) && used == 0) {
		error = xfs_zone_gc_reset_sync(rtg);
		if (error)
			return error;
		write_pointer = 0;
	}

	if (write_pointer == 0) {
		/* zone is empty */
		atomic_inc(&zi->zi_nr_free_zones);
		xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
		iz->available += rtg_blocks(rtg);
	} else if (write_pointer < rtg_blocks(rtg)) {
		/* zone is open */
		struct xfs_open_zone *oz;

		atomic_inc(&rtg_group(rtg)->xg_active_ref);
		oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
				false);
		list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
		zi->zi_nr_open_zones++;

		iz->available += (rtg_blocks(rtg) - write_pointer);
		iz->reclaimable += write_pointer - used;
	} else if (used < rtg_blocks(rtg)) {
		/* zone fully written, but has freed blocks */
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
		iz->reclaimable += (rtg_blocks(rtg) - used);
	}

	return 0;
}

/*
 * Calculate the max open zone limit based on the number of backing zones
 * available.
 */
static inline uint32_t
xfs_max_open_zones(
	struct xfs_mount *mp)
{
	unsigned int max_open, max_open_data_zones;

	/*
	 * We need two zones for every open data zone, one in reserve as we
	 * don't reclaim open zones.  One data zone and its spare is included
	 * in XFS_MIN_ZONES to support at least one user data writer.
	 */
	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;

	/*
	 * Cap the max open limit to 1/4 of available space.  Without this we'd
	 * run out of easy reclaim targets too quickly and storage devices don't
	 * handle huge numbers of concurrent write streams overly well.
	 */
	max_open = min(max_open, mp->m_sb.sb_rgcount / 4);

	return max(XFS_MIN_OPEN_ZONES, max_open);
}

/*
 * Normally we use the open zone limit that the device reports.  If there is
 * none let the user pick one from the command line.
 *
 * If the device doesn't report an open zone limit and there is no override,
 * allow holding about a quarter of the zones open.  In theory we could allow
 * all to be open, but at that point we run into GC deadlocks because we can't
 * reclaim open zones.
 *
 * When used on conventional SSDs a lower open limit is advisable as we'll
 * otherwise overwhelm the FTL just as much as a conventional block allocator.
 *
 * Note: To debug the open zone management code, force max_open to 1 here.
 */
static int
xfs_calc_open_zones(
	struct xfs_mount *mp)
{
	struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
	unsigned int bdev_open_zones = bdev_max_open_zones(bdev);

	if (!mp->m_max_open_zones) {
		if (bdev_open_zones)
			mp->m_max_open_zones = bdev_open_zones;
		else
			mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
	}

	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
		xfs_notice(mp, "need at least %u open zones.",
			XFS_MIN_OPEN_ZONES);
		return -EIO;
	}

	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
		mp->m_max_open_zones = bdev_open_zones;
		xfs_info(mp, "limiting open zones to %u due to hardware limit.",
			bdev_open_zones);
	}

	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
		mp->m_max_open_zones = xfs_max_open_zones(mp);
		xfs_info(mp,
			"limiting open zones to %u due to total zone count (%u)",
			mp->m_max_open_zones, mp->m_sb.sb_rgcount);
	}

	return 0;
}

static unsigned long *
xfs_alloc_bucket_bitmap(
	struct xfs_mount *mp)
{
	return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
			sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
}

static struct xfs_zone_info *
xfs_alloc_zone_info(
	struct xfs_mount *mp)
{
	struct xfs_zone_info *zi;
	int i;

	zi = kzalloc_obj(*zi);
	if (!zi)
		return NULL;
	INIT_LIST_HEAD(&zi->zi_open_zones);
	INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
	spin_lock_init(&zi->zi_reset_list_lock);
	spin_lock_init(&zi->zi_open_zones_lock);
	spin_lock_init(&zi->zi_reservation_lock);
	init_waitqueue_head(&zi->zi_zone_wait);
	spin_lock_init(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
		if (!zi->zi_used_bucket_bitmap[i])
			goto out_free_bitmaps;
	}
	return zi;

out_free_bitmaps:
	while (--i >= 0)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
	return NULL;
}

static void
xfs_free_zone_info(
	struct xfs_zone_info *zi)
{
	int i;

	xfs_free_open_zones(zi);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
}

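/*
 * Validate the zoned geometry, set up the zone_info structure, and populate
 * it from the on-disk state of every zone at mount time.
 */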
int
xfs_mount_zones(
	struct xfs_mount *mp)
{
	struct xfs_init_zones iz = {
		.zone_capacity = mp->m_groups[XG_TYPE_RTG].blocks,
		.zone_size = xfs_rtgroup_raw_size(mp),
	};
	struct xfs_rtgroup *rtg = NULL;
	int error;

	if (!mp->m_rtdev_targp) {
		xfs_notice(mp, "RT device missing.");
		return -EINVAL;
	}

	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
		xfs_notice(mp, "invalid flag combination.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rextsize != 1) {
		xfs_notice(mp, "zoned file systems do not support rextsize.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
		xfs_notice(mp,
			"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
		return -EFSCORRUPTED;
	}

	error = xfs_calc_open_zones(mp);
	if (error)
		return error;

	mp->m_zone_info = xfs_alloc_zone_info(mp);
	if (!mp->m_zone_info)
		return -ENOMEM;

	xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
		mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones);
	trace_xfs_zones_mount(mp);

	/*
	 * The writeback code switches between inodes regularly to provide
	 * fairness.  The default lower bound is 4MiB, but for zoned file
	 * systems we want to increase that, both to reduce seeks and, more
	 * importantly, so that workloads that write files in a multiple of the
	 * zone size do not get fragmented and require garbage collection when
	 * they shouldn't.  Increase it to the zone size, capped by the max
	 * extent length.
	 *
	 * Note that because s_min_writeback_pages is a superblock field, this
	 * value also gets applied to non-zoned files on the data device if
	 * there are any.  On a typical zoned setup all data is on the RT
	 * device because using the more efficient sequential write required
	 * zones is the reason for using the zone allocator, and either the RT
	 * device and the (meta)data device are on the same block device, or
	 * the (meta)data device is on a fast SSD while the data on the RT
	 * device is on an SMR HDD.  In any combination of the above cases
	 * enforcing the higher min_writeback_pages for non-RT inodes is
	 * either a noop or beneficial.
	 */
	mp->m_super->s_min_writeback_pages =
		XFS_FSB_TO_B(mp, min(iz.zone_capacity, XFS_MAX_BMBT_EXTLEN)) >>
			PAGE_SHIFT;

	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
		xfs_rgblock_t write_pointer;

		error = xfs_query_write_pointer(&iz, rtg, &write_pointer);
		if (!error)
			error = xfs_init_zone(&iz, rtg, write_pointer);
		if (error) {
			xfs_rtgroup_rele(rtg);
			goto out_free_zone_info;
		}
	}

	xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
			iz.available + iz.reclaimable);

	/*
	 * The user may configure GC to free up a percentage of unused blocks.
	 * By default this is 0.  GC will always trigger at the minimum level
	 * for keeping max_open_zones available for data placement.
	 */
	mp->m_zonegc_low_space = 0;

	error = xfs_zone_gc_mount(mp);
	if (error)
		goto out_free_zone_info;
	return 0;

out_free_zone_info:
	xfs_free_zone_info(mp->m_zone_info);
	return error;
}

void
xfs_unmount_zones(
	struct xfs_mount *mp)
{
	xfs_zone_gc_unmount(mp);
	xfs_free_zone_info(mp->m_zone_info);
}