Commit f1403342, authored by Christian König, committed by Alex Deucher
Browse files

drm/amdgpu: revert "fix system hang issue during GPU reset"



The whole approach wasn't thought through till the end.

We already had a reset lock like this in the past, and it caused the same problems as this one.

Completely revert the patch for now and add individual trylock protection to the hardware access functions as necessary.

This reverts commit df9c8d1a.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 05f39286
Loading
Loading
Loading
Loading
+2 −7
Original line number Original line Diff line number Diff line
@@ -949,9 +949,9 @@ struct amdgpu_device {
	bool                            in_suspend;
	bool                            in_suspend;
	bool				in_hibernate;
	bool				in_hibernate;


	atomic_t                        in_gpu_reset;
	bool                            in_gpu_reset;
	enum pp_mp1_state               mp1_state;
	enum pp_mp1_state               mp1_state;
	struct rw_semaphore	reset_sem;
	struct mutex  lock_reset;
	struct amdgpu_doorbell_index doorbell_index;
	struct amdgpu_doorbell_index doorbell_index;


	struct mutex			notifier_lock;
	struct mutex			notifier_lock;
@@ -1266,9 +1266,4 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
       return adev->gmc.tmz_enabled;
       return adev->gmc.tmz_enabled;
}
}


static inline bool amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->in_gpu_reset) ? true : false;
}

#endif
#endif
+3 −37
Original line number Original line Diff line number Diff line
@@ -244,14 +244,11 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
	if (cp_mqd_gfx9)
	if (cp_mqd_gfx9)
		bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;
		bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9;


	if (!down_read_trylock(&adev->reset_sem))
		return -EIO;

	r = amdgpu_bo_create(adev, &bp, &bo);
	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r) {
	if (r) {
		dev_err(adev->dev,
		dev_err(adev->dev,
			"failed to allocate BO for amdkfd (%d)\n", r);
			"failed to allocate BO for amdkfd (%d)\n", r);
		goto err;
		return r;
	}
	}


	/* map the buffer */
	/* map the buffer */
@@ -286,7 +283,6 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,


	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unreserve(bo);


	up_read(&adev->reset_sem);
	return 0;
	return 0;


allocate_mem_kmap_bo_failed:
allocate_mem_kmap_bo_failed:
@@ -295,25 +291,19 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unreserve(bo);
allocate_mem_reserve_bo_failed:
allocate_mem_reserve_bo_failed:
	amdgpu_bo_unref(&bo);
	amdgpu_bo_unref(&bo);
err:

	up_read(&adev->reset_sem);
	return r;
	return r;
}
}


void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
	struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;


	down_read(&adev->reset_sem);

	amdgpu_bo_reserve(bo, true);
	amdgpu_bo_reserve(bo, true);
	amdgpu_bo_kunmap(bo);
	amdgpu_bo_kunmap(bo);
	amdgpu_bo_unpin(bo);
	amdgpu_bo_unpin(bo);
	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unreserve(bo);
	amdgpu_bo_unref(&(bo));
	amdgpu_bo_unref(&(bo));

	up_read(&adev->reset_sem);
}
}


int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,
@@ -345,14 +335,9 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size,


void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj)
{
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;
	struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;


	down_read(&adev->reset_sem);

	amdgpu_bo_unref(&bo);
	amdgpu_bo_unref(&bo);

	up_read(&adev->reset_sem);
}
}


uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
@@ -626,15 +611,8 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
	job->vmid = vmid;
	job->vmid = vmid;


	if (!down_read_trylock(&adev->reset_sem)) {
		ret = -EIO;
		goto err_ib_sched;
	}

	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);


	up_read(&adev->reset_sem);

	if (ret) {
	if (ret) {
		DRM_ERROR("amdgpu: failed to schedule IB.\n");
		DRM_ERROR("amdgpu: failed to schedule IB.\n");
		goto err_ib_sched;
		goto err_ib_sched;
@@ -670,9 +648,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;


	if (!down_read_trylock(&adev->reset_sem))
		return -EIO;

	if (adev->family == AMDGPU_FAMILY_AI) {
	if (adev->family == AMDGPU_FAMILY_AI) {
		int i;
		int i;


@@ -682,8 +657,6 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
	}
	}


	up_read(&adev->reset_sem);

	return 0;
	return 0;
}
}


@@ -692,18 +665,11 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
	const uint32_t flush_type = 0;
	const uint32_t flush_type = 0;
	bool all_hub = false;
	bool all_hub = false;
	int ret = -EIO;


	if (adev->family == AMDGPU_FAMILY_AI)
	if (adev->family == AMDGPU_FAMILY_AI)
		all_hub = true;
		all_hub = true;


	if (down_read_trylock(&adev->reset_sem)) {
	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
		ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev,
					pasid, flush_type, all_hub);
		up_read(&adev->reset_sem);
	}

	return ret;
}
}


bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
+1 −1
Original line number Original line Diff line number Diff line
@@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	uint32_t temp;
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);
	struct v10_compute_mqd *m = get_mqd(mqd);


	if (amdgpu_in_reset(adev))
	if (adev->in_gpu_reset)
		return -EIO;
		return -EIO;


#if 0
#if 0
+1 −1
Original line number Original line Diff line number Diff line
@@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	unsigned long flags, end_jiffies;
	unsigned long flags, end_jiffies;
	int retry;
	int retry;


	if (amdgpu_in_reset(adev))
	if (adev->in_gpu_reset)
		return -EIO;
		return -EIO;


	acquire_queue(kgd, pipe_id, queue_id);
	acquire_queue(kgd, pipe_id, queue_id);
+1 −1
Original line number Original line Diff line number Diff line
@@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
	int retry;
	int retry;
	struct vi_mqd *m = get_mqd(mqd);
	struct vi_mqd *m = get_mqd(mqd);


	if (amdgpu_in_reset(adev))
	if (adev->in_gpu_reset)
		return -EIO;
		return -EIO;


	acquire_queue(kgd, pipe_id, queue_id);
	acquire_queue(kgd, pipe_id, queue_id);
Loading