drm/i915/guc: Close deregister-context race against CT-loss (2f2cc53b) · Commits · git / linux-nf

drivers/gpu/drm/i915/gem/i915_gem_pm.c

+10 −0

Original line number	Diff line number	Diff line
		@@ -28,6 +28,13 @@ void i915_gem_suspend(struct drm_i915_private *i915)
		GEM_TRACE("%s\n", dev_name(i915->drm.dev));

		intel_wakeref_auto(&i915->runtime_pm.userfault_wakeref, 0);
		/*
		* On rare occasions, we've observed the fence completion triggers
		* free_engines asynchronously via rcu_call. Ensure those are done.
		* This path is only called on suspend, so it's an acceptable cost.
		*/
		rcu_barrier();

		flush_workqueue(i915->wq);

		/*
		@@ -160,6 +167,9 @@ void i915_gem_suspend_late(struct drm_i915_private *i915)
		* machine in an unusable condition.
		*/

		/* Like i915_gem_suspend, flush tasks staged from fence triggers */
		rcu_barrier();

		for_each_gt(gt, i915, i)
		intel_gt_suspend_late(gt);

drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

+68 −5

Original line number	Diff line number	Diff line
		@@ -236,6 +236,13 @@ set_context_destroyed(struct intel_context *ce)
		ce->guc_state.sched_state \|= SCHED_STATE_DESTROYED;
		}

		static inline void
		clr_context_destroyed(struct intel_context *ce)
		{
		lockdep_assert_held(&ce->guc_state.lock);
		ce->guc_state.sched_state &= ~SCHED_STATE_DESTROYED;
		}

		static inline bool context_pending_disable(struct intel_context *ce)
		{
		return ce->guc_state.sched_state & SCHED_STATE_PENDING_DISABLE;
		@@ -613,6 +620,8 @@ static int guc_submission_send_busy_loop(struct intel_guc *guc,
		u32 g2h_len_dw,
		bool loop)
		{
		int ret;

		/*
		* We always loop when a send requires a reply (i.e. g2h_len_dw > 0),
		* so we don't handle the case where we don't get a reply because we
		@@ -623,7 +632,11 @@ static int guc_submission_send_busy_loop(struct intel_guc *guc,
		if (g2h_len_dw)
		atomic_inc(&guc->outstanding_submission_g2h);

		return intel_guc_send_busy_loop(guc, action, len, g2h_len_dw, loop);
		ret = intel_guc_send_busy_loop(guc, action, len, g2h_len_dw, loop);
		if (ret)
		atomic_dec(&guc->outstanding_submission_g2h);

		return ret;
		}

		int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
		@@ -3288,12 +3301,13 @@ static void guc_context_close(struct intel_context *ce)
		spin_unlock_irqrestore(&ce->guc_state.lock, flags);
		}

		static inline void guc_lrc_desc_unpin(struct intel_context *ce)
		static inline int guc_lrc_desc_unpin(struct intel_context *ce)
		{
		struct intel_guc *guc = ce_to_guc(ce);
		struct intel_gt *gt = guc_to_gt(guc);
		unsigned long flags;
		bool disabled;
		int ret;

		GEM_BUG_ON(!intel_gt_pm_is_awake(gt));
		GEM_BUG_ON(!ctx_id_mapped(guc, ce->guc_id.id));
		@@ -3304,18 +3318,41 @@ static inline void guc_lrc_desc_unpin(struct intel_context *ce)
		spin_lock_irqsave(&ce->guc_state.lock, flags);
		disabled = submission_disabled(guc);
		if (likely(!disabled)) {
		/*
		* Take a gt-pm ref and change context state to be destroyed.
		* NOTE: a G2H IRQ that comes after will put this gt-pm ref back
		*/
		__intel_gt_pm_get(gt);
		set_context_destroyed(ce);
		clr_context_registered(ce);
		}
		spin_unlock_irqrestore(&ce->guc_state.lock, flags);

		if (unlikely(disabled)) {
		release_guc_id(guc, ce);
		__guc_context_destroy(ce);
		return;
		return 0;
		}

		deregister_context(ce, ce->guc_id.id);
		/*
		* GuC is active, lets destroy this context, but at this point we can still be racing
		* with suspend, so we undo everything if the H2G fails in deregister_context so
		* that GuC reset will find this context during clean up.
		*/
		ret = deregister_context(ce, ce->guc_id.id);
		if (ret) {
		spin_lock(&ce->guc_state.lock);
		set_context_registered(ce);
		clr_context_destroyed(ce);
		spin_unlock(&ce->guc_state.lock);
		/*
		* As gt-pm is awake at function entry, intel_wakeref_put_async merely decrements
		* the wakeref immediately but per function spec usage call this after unlock.
		*/
		intel_wakeref_put_async(&gt->wakeref);
		}

		return ret;
		}

		static void __guc_context_destroy(struct intel_context *ce)
		@@ -3383,7 +3420,22 @@ static void deregister_destroyed_contexts(struct intel_guc *guc)
		if (!ce)
		break;

		guc_lrc_desc_unpin(ce);
		if (guc_lrc_desc_unpin(ce)) {
		/*
		* This means GuC's CT link severed mid-way which could happen
		* in suspend-resume corner cases. In this case, put the
		* context back into the destroyed_contexts list which will
		* get picked up on the next context deregistration event or
		* purged in a GuC sanitization event (reset/unload/wedged/...).
		*/
		spin_lock_irqsave(&guc->submission_state.lock, flags);
		list_add_tail(&ce->destroyed_link,
		&guc->submission_state.destroyed_contexts);
		spin_unlock_irqrestore(&guc->submission_state.lock, flags);
		/* Bail now since the list might never be emptied if h2gs fail */
		break;
		}

		}
		}

		@@ -3394,6 +3446,17 @@ static void destroyed_worker_func(struct work_struct *w)
		struct intel_gt *gt = guc_to_gt(guc);
		intel_wakeref_t wakeref;

		/*
		* In rare cases we can get here via async context-free fence-signals that
		* come very late in suspend flow or very early in resume flows. In these
		* cases, GuC won't be ready but just skipping it here is fine as these
		* pending-destroy-contexts get destroyed totally at GuC reset time at the
		* end of suspend.. OR.. this worker can be picked up later on the next
		* context destruction trigger after resume-completes
		*/
		if (!intel_guc_is_ready(guc))
		return;

		with_intel_gt_pm(gt, wakeref)
		deregister_destroyed_contexts(guc);
		}