Commit 7c6e68c7 authored by Andrey Grodzovsky's avatar Andrey Grodzovsky Committed by Alex Deucher
Browse files

drm/amdgpu: Avoid HW GPU reset for RAS.



Problem:
Under certain conditions, when some IP blocks take a RAS error,
we can get into a situation where a GPU reset is not possible
due to issues in RAS in SMU/PSP.

Temporary fix until proper solution in PSP/SMU is ready:
When an uncorrectable error happens, the DF will unconditionally
broadcast error event packets to all its clients/slaves upon
receiving the fatal error event and freeze all its outbound queues;
the err_event_athub interrupt will then be triggered.
In such a case we use this interrupt
to issue a GPU reset. The GPU reset code is modified for this case to avoid a HW
reset: it only stops the schedulers, detaches the fences of all in-progress and
not-yet-scheduled jobs, sets an error code on them and signals them.
Also reject any new incoming job submissions from user space.
All this is done to notify the applications of the problem.

v2:
Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev
Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c
Remove print param from amdgpu_ras_query_error_count

v3:
Update based on the previous bug-fixing patch to properly call amdgpu_amdkfd_pre_reset
for other XGMI hive members.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 12ffa55d
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#include "amdgpu_trace.h"
#include "amdgpu_gmc.h"
#include "amdgpu_gem.h"
#include "amdgpu_ras.h"

static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
				      struct drm_amdgpu_cs_chunk_fence *data,
@@ -1290,6 +1291,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
	bool reserved_buffers = false;
	int i, r;

	if (amdgpu_ras_intr_triggered())
		return -EHWPOISON;

	if (!adev->accel_working)
		return -EBUSY;

+28 −10
Original line number Diff line number Diff line
@@ -3736,25 +3736,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
	/* Block kfd: SRIOV would do it separately */
	if (!amdgpu_sriov_vf(adev))
                amdgpu_amdkfd_pre_reset(adev);

	return true;
}

/*
 * Release the per-device reset lock for @adev.
 *
 * As shown here: runs the KFD post-reset hook unless the device is an
 * SR-IOV VF, calls amdgpu_vf_error_trans_all(), resets mp1_state and
 * in_gpu_reset, and finally drops adev->lock_reset.
 */
static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
	/*unlock kfd: SRIOV would do it separately */
	if (!amdgpu_sriov_vf(adev))
                amdgpu_amdkfd_post_reset(adev);
	amdgpu_vf_error_trans_all(adev);
	/* Reset mp1_state and in_gpu_reset while lock_reset is still held. */
	adev->mp1_state = PP_MP1_STATE_NONE;
	adev->in_gpu_reset = 0;
	mutex_unlock(&adev->lock_reset);
}


/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
@@ -3774,11 +3767,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool in_ras_intr = amdgpu_ras_intr_triggered();

	need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);

	dev_info(adev->dev, "GPU reset begin!\n");
	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");

	cancel_delayed_work_sync(&adev->delayed_init_work);

@@ -3805,9 +3799,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		return 0;
	}

	/* Block kfd: SRIOV would do it separately */
	if (!amdgpu_sriov_vf(adev))
                amdgpu_amdkfd_pre_reset(adev);

	/* Build list of devices to reset */
	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			/*unlock kfd: SRIOV would do it separately */
			if (!amdgpu_sriov_vf(adev))
		                amdgpu_amdkfd_post_reset(adev);
			amdgpu_device_unlock_adev(adev);
			return -ENODEV;
		}
@@ -3825,8 +3826,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (tmp_adev != adev)
		if (tmp_adev != adev) {
			amdgpu_device_lock_adev(tmp_adev, false);
			if (!amdgpu_sriov_vf(tmp_adev))
			                amdgpu_amdkfd_pre_reset(tmp_adev);
		}

		/*
		 * Mark these ASICs to be reseted as untracked first
		 * And add them back after reset completed
@@ -3834,7 +3839,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		amdgpu_unregister_gpu_instance(tmp_adev);

		/* disable ras on ALL IPs */
		if (amdgpu_device_ip_need_full_reset(tmp_adev))
		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -3844,10 +3849,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (in_ras_intr)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
	}


	if (in_ras_intr)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
@@ -3906,6 +3917,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

	/* Post ASIC reset for all devs .*/
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

@@ -3932,7 +3944,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		/*unlock kfd: SRIOV would do it separately */
		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
	                amdgpu_amdkfd_post_reset(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

+5 −0
Original line number Diff line number Diff line
@@ -42,6 +42,8 @@

#include "amdgpu_amdkfd.h"

#include "amdgpu_ras.h"

/*
 * KMS wrapper.
 * - 3.0.0 - initial driver
@@ -1098,6 +1100,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = dev->dev_private;

	if (amdgpu_ras_intr_triggered())
		return;

	/* if we are running in a VM, make sure the device
	 * torn down properly on reboot/shutdown.
	 * unfortunately we can't detect certain
+38 −0
Original line number Diff line number Diff line
@@ -246,6 +246,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
	return fence;
}

/*
 * Map a job-queue node back to the drm_sched_job that embeds it.
 *
 * NOTE(review): the caller below passes the result of spsc_queue_pop()
 * straight in and tests the outcome against NULL. That presumably relies on
 * queue_node sitting at offset 0 of struct drm_sched_job so that a NULL node
 * maps to a NULL job - confirm against the scheduler headers.
 */
#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

/**
 * amdgpu_job_stop_all_jobs_on_sched - fail every job known to a scheduler
 *
 * @sched: scheduler whose queued and already-submitted jobs are failed
 *
 * Walks every run queue of @sched from the highest priority index down to
 * DRM_SCHED_PRIORITY_MIN and drains the job queue of each entity on it. For
 * each drained job the "scheduled" fence is signaled, then the "finished"
 * fence gets -EHWPOISON as its error and is signaled too. Afterwards every
 * job on @sched->ring_mirror_list has its "finished" fence marked with
 * -EHWPOISON and signaled.
 *
 * NOTE(review): the ring_mirror_list walk takes no lock here; presumably the
 * caller has stopped the scheduler beforehand - verify against the call site.
 * NOTE(review): jobs popped from the entity queues are not freed in this
 * function; confirm who owns them afterwards.
 */
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		/*
		 * This is the address of an array element inside @sched, so
		 * it can never be NULL and needs no check.
		 */
		struct drm_sched_rq *rq = &sched->sched_rq[i];

		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			/* Drain this entity's queue until the pop yields nothing. */
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				/*
				 * The job never ran: signal "scheduled" first,
				 * then record the error on "finished" before
				 * signaling it.
				 */
				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		/* The error must be set before the fence is signaled. */
		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
+3 −0
Original line number Diff line number Diff line
@@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
		      void *owner, struct dma_fence **f);
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence);

void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);

#endif
Loading