Commit d32e907d authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'xfs-fixes-6.16-rc5' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Carlos Maiolino:

 - Fix umount hang with unflushable inodes (and add new tracepoint used
   for debugging this)

 - Fix ABBA deadlock in xfs_reclaim_inode() vs xfs_ifree_cluster()

 - Fix dquot buffer pin deadlock

* tag 'xfs-fixes-6.16-rc5' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: add FALLOC_FL_ALLOCATE_RANGE to supported flags mask
  xfs: fix unmount hang with unflushable inodes stuck in the AIL
  xfs: factor out stale buffer item completion
  xfs: rearrange code in xfs_buf_item.c
  xfs: add tracepoints for stale pinned inode state debug
  xfs: avoid dquot buffer pin deadlock
  xfs: catch stale AGF/AGI metadata
  xfs: xfs_ifree_cluster vs xfs_iflush_shutdown_abort deadlock
  xfs: actually use the xfs_growfs_check_rtgeom tracepoint
  xfs: Improve error handling in xfs_mru_cache_create()
  xfs: move xfs_submit_zoned_bio a bit
  xfs: use xfs_readonly_buftarg in xfs_remount_rw
  xfs: remove NULL pointer checks in xfs_mru_cache_insert
  xfs: check for shutdown before going to sleep in xfs_select_zone
parents b4911fb0 9e9b4667
Loading
Loading
Loading
Loading
+33 −8
Original line number Diff line number Diff line
@@ -3444,16 +3444,41 @@ xfs_alloc_read_agf(

		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
	}

#ifdef DEBUG
	else if (!xfs_is_shutdown(mp)) {
		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
		ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
		ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
	/*
	 * It's possible for the AGF to be out of sync if the block device is
	 * silently dropping writes. This can happen in fstests with dmflakey
	 * enabled, which allows the buffer to be cleaned and reclaimed by
	 * memory pressure and then re-read from disk here. We will get a
	 * stale version of the AGF from disk, and nothing good can happen from
	 * here. Hence if we detect this situation, immediately shut down the
	 * filesystem.
	 *
	 * This can also happen if we are already in the middle of a forced
	 * shutdown, so don't bother checking if we are already shut down.
	 */
	if (!xfs_is_shutdown(pag_mount(pag))) {
		bool	ok = true;

		ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
		ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
		ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
		ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
		ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
		ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
		ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);

		if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
			xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
			xfs_trans_brelse(tp, agfbp);
			xfs_force_shutdown(pag_mount(pag),
					SHUTDOWN_CORRUPT_ONDISK);
			return -EFSCORRUPTED;
		}
#endif
	}
#endif /* DEBUG */

	if (agfbpp)
		*agfbpp = agfbp;
	else
+27 −4
Original line number Diff line number Diff line
@@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi(
		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	}

#ifdef DEBUG
	/*
	 * It's possible for these to be out of sync if
	 * we are in the middle of a forced shutdown.
	 * It's possible for the AGI to be out of sync if the block device is
	 * silently dropping writes. This can happen in fstests with dmflakey
	 * enabled, which allows the buffer to be cleaned and reclaimed by
	 * memory pressure and then re-read from disk here. We will get a
	 * stale version of the AGI from disk, and nothing good can happen from
	 * here. Hence if we detect this situation, immediately shut down the
	 * filesystem.
	 *
	 * This can also happen if we are already in the middle of a forced
	 * shutdown, so don't bother checking if we are already shut down.
	 */
	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
		xfs_is_shutdown(pag_mount(pag)));
	if (!xfs_is_shutdown(pag_mount(pag))) {
		bool	ok = true;

		ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount);
		ok &= pag->pagi_count == be32_to_cpu(agi->agi_count);

		if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
			xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
			xfs_trans_brelse(tp, agibp);
			xfs_force_shutdown(pag_mount(pag),
					SHUTDOWN_CORRUPT_ONDISK);
			return -EFSCORRUPTED;
		}
	}
#endif /* DEBUG */

	if (agibpp)
		*agibpp = agibp;
	else
+0 −38
Original line number Diff line number Diff line
@@ -2082,44 +2082,6 @@ xfs_buf_delwri_submit(
	return error;
}

/*
 * Write back one buffer that is sitting on a delwri queue and wait for it.
 *
 * The buffer is submitted for write and waited on here; when this function
 * returns the buffer is unlocked and still on the delwri queue it started on.
 *
 * The buffer locking and queue management logic between _delwri_pushbuf() and
 * _delwri_queue() guarantee that the buffer cannot be queued to another list
 * before returning.
 *
 * Returns the result of xfs_buf_iowait() on the submitted buffer. Note that
 * @buffer_list is not referenced by the body of this function.
 */
int
xfs_buf_delwri_pushbuf(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	int			ret;

	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);

	/*
	 * With the buffer locked, clear the delwri-queued and async flags and
	 * set the write flag before handing the buffer to xfs_buf_submit().
	 */
	xfs_buf_lock(bp);
	bp->b_flags = (bp->b_flags & ~(_XBF_DELWRI_Q | XBF_ASYNC)) |
			XBF_WRITE;
	xfs_buf_submit(bp);

	/*
	 * The buffer is locked and under I/O at this point, yet it has not
	 * been taken off its delwri queue. Once the I/O wait returns, put the
	 * DELWRI_Q flag back and unlock the buffer so the caller sees it
	 * exactly where it was.
	 */
	ret = xfs_buf_iowait(bp);
	bp->b_flags |= _XBF_DELWRI_Q;
	xfs_buf_unlock(bp);

	return ret;
}

void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
	/*
+0 −1
Original line number Diff line number Diff line
@@ -326,7 +326,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);

static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
+179 −116
Original line number Diff line number Diff line
@@ -32,6 +32,61 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

/*
 * Attach a format array with @count entries to the buf log item @bip.
 *
 * The item must not already have a format array (bli_formats is asserted to
 * be NULL). A single entry is served from the __bli_format structure embedded
 * in the item; any other count gets a separately allocated, zeroed array.
 */
static void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	/*
	 * kcalloc() zeroes the allocation just like kzalloc(), but it also
	 * checks the count * size multiplication for overflow instead of
	 * leaving that arithmetic open coded in the size argument.
	 */
	bip->bli_formats = kcalloc(count, sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}

/*
 * Release the format array attached to @bip, if it was allocated separately.
 *
 * When bli_formats points at the __bli_format structure embedded in the item
 * there is nothing to free and the pointer is left untouched.
 */
static void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	/* The embedded format lives and dies with the item itself. */
	if (bip->bli_formats == &bip->__bli_format)
		return;

	kfree(bip->bli_formats);
	bip->bli_formats = NULL;
}

static void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
static void
xfs_buf_item_relse(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_buf		*bp = bip->bli_buf;

	trace_xfs_buf_item_relse(bp, _RET_IP_);

	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
	ASSERT(atomic_read(&bip->bli_refcount) == 0);

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
@@ -389,6 +444,42 @@ xfs_buf_item_pin(
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * Run the completion processing a stale BLI needs once its final reference
 * has gone away. The buffer is referenced and locked on entry, and it is
 * handed back to the caller in that same state so that the caller can finish
 * off processing of the buffer.
 */
static void
xfs_buf_item_finish_stale(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	struct xfs_buf		*bp = bip->bli_buf;

	ASSERT(bip->bli_flags & XFS_BLI_STALE);
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(bp->b_flags & XBF_STALE);
	ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
	ASSERT(list_empty(&lip->li_trans));
	ASSERT(!bp->b_transp);

	if (!(bip->bli_flags & XFS_BLI_STALE_INODE)) {
		/*
		 * The item may or may not be in the AIL at this point;
		 * xfs_trans_ail_delete() copes with either case no matter
		 * which context called us.
		 */
		xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
		xfs_buf_item_relse(bip);
		ASSERT(bp->b_log_item == NULL);
		return;
	}

	/* Stale inode cluster buffer: finish the item, then the buffer. */
	xfs_buf_item_done(bp);
	xfs_buf_inode_iodone(bp);
	ASSERT(list_empty(&bp->b_li_list));
}

/*
 * This is called to unpin the buffer associated with the buf log item which was
 * previously pinned with a call to xfs_buf_item_pin().  We enter this function
@@ -438,13 +529,6 @@ xfs_buf_item_unpin(
	}

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
@@ -455,22 +539,7 @@ xfs_buf_item_unpin(
		 * processing is complete.
		 */
		xfs_buf_rele(bp);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		return;
	}
@@ -543,43 +612,42 @@ xfs_buf_item_push(
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
void
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	ASSERT(xfs_buf_islocked(bip->bli_buf));

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;
		return;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
			xlog_is_shutdown(lip->li_log);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;
	/* If the BLI is in the AIL, then it is still dirty and in use */
	if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
		ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
		return;
	}

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state.  For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 * In shutdown conditions, we can be asked to free a dirty BLI that
	 * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
	 * instead of inserting it into the AIL at checkpoint IO completion. If
	 * there's another bli reference (e.g. a btree cursor holds a clean
	 * reference) and it is released via xfs_trans_brelse(), we can get here
	 * with that aborted, dirty BLI. In this case, it is safe to free the
	 * dirty BLI immediately, as it is not in the AIL and there are no
	 * other references to it.
	 *
	 * We should never get here with a stale BLI via that path as
	 * xfs_trans_brelse() specifically holds onto stale buffers rather than
	 * releasing them.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
	ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
			test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
	xfs_buf_item_relse(bip);
}

/*
@@ -600,6 +668,15 @@ xfs_buf_item_put(
 * if necessary but do not unlock the buffer.  This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 *
 * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
 * perform a completion abort of any objects attached to the buffer for IO
 * tracking purposes. This generally only happens in shutdown situations,
 * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
 * completion processing. However, because transaction completion can race with
 * checkpoint completion during a shutdown, this release context may end up
 * being the last active reference to the BLI and so needs to perform this
 * cleanup.
 */
STATIC void
xfs_buf_item_release(
@@ -607,18 +684,19 @@ xfs_buf_item_release(
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif

	trace_xfs_buf_item_release(bip);

	ASSERT(xfs_buf_islocked(bp));

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
@@ -634,16 +712,56 @@ xfs_buf_item_release(
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/* If there are other references, then we have nothing to do. */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		goto out_release;

	/*
	 * Stale buffer completion frees the BLI, unlocks and releases the
	 * buffer. Neither the BLI or buffer are safe to reference after this
	 * call, so there's nothing more we need to do here.
	 *
	 * If we get here with a stale buffer and references to the BLI remain,
	 * we must not unlock the buffer as the last BLI reference owns lock
	 * context, not us.
	 */
	if (stale) {
		xfs_buf_item_finish_stale(bip);
		xfs_buf_relse(bp);
		ASSERT(!hold);
		return;
	}

	/*
	 * Dirty or clean, aborted items are done and need to be removed from
	 * the AIL and released. This frees the BLI, but leaves the buffer
	 * locked and referenced.
	 */
	if (aborted || xlog_is_shutdown(lip->li_log)) {
		ASSERT(list_empty(&bip->bli_buf->b_li_list));
		xfs_buf_item_done(bp);
		goto out_release;
	}

	/*
	 * Clean, unreferenced BLIs can be immediately freed, leaving the buffer
	 * locked and referenced.
	 *
	 * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
	 */
	if (!dirty)
		xfs_buf_item_relse(bip);
	else
		ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));

	/* Not safe to reference the BLI from here */
out_release:
	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 * If we get here with a stale buffer, we must not unlock the
	 * buffer as the last BLI reference owns lock context, not us.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
	if (stale || hold)
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}

@@ -729,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_push	= xfs_buf_item_push,
};

/*
 * Attach a format array with @count entries to the buf log item @bip.
 *
 * The item must not already have a format array (bli_formats is asserted to
 * be NULL). A single entry is served from the __bli_format structure embedded
 * in the item; any other count gets a separately allocated, zeroed array.
 */
STATIC void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	/*
	 * kcalloc() zeroes the allocation just like kzalloc(), but it also
	 * checks the count * size multiplication for overflow instead of
	 * leaving that arithmetic open coded in the size argument.
	 */
	bip->bli_formats = kcalloc(count, sizeof(struct xfs_buf_log_format),
				GFP_KERNEL | __GFP_NOFAIL);
}

/*
 * Release the format array attached to @bip, if it was allocated separately.
 *
 * When bli_formats points at the __bli_format structure embedded in the item
 * there is nothing to free and the pointer is left untouched.
 */
STATIC void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	/* The embedded format lives and dies with the item itself. */
	if (bip->bli_formats == &bip->__bli_format)
		return;

	kfree(bip->bli_formats);
	bip->bli_formats = NULL;
}

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
@@ -976,34 +1067,6 @@ xfs_buf_item_dirty_format(
	return false;
}

STATIC void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kvfree(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_cache, bip);
}

/*
 * Called when the buf log item attached to @bp is no longer needed.
 *
 * The item must not be in the AIL (asserted). If the item still has a
 * non-zero reference count nothing is torn down; otherwise the buffer's
 * b_log_item pointer is cleared, xfs_buf_rele() is called on the buffer and
 * the item is freed.
 */
void
xfs_buf_item_relse(
	struct xfs_buf	*bp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	trace_xfs_buf_item_relse(bp, _RET_IP_);
	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

	/* Only tear the item down once nothing holds a reference to it. */
	if (atomic_read(&bip->bli_refcount) == 0) {
		bp->b_log_item = NULL;
		xfs_buf_rele(bp);
		xfs_buf_item_free(bip);
	}
}

void
xfs_buf_item_done(
	struct xfs_buf		*bp)
@@ -1023,5 +1086,5 @@ xfs_buf_item_done(
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp);
	xfs_buf_item_relse(bp->b_log_item);
}
Loading