drm/amdgpu: race issue when jobs on 2 ring timeout (91fb309d) · Commits · jan.koester / Linux

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+59 −10

Original line number	Diff line number	Diff line
		@@ -4461,6 +4461,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
		up_write(&adev->reset_sem);
		}

		/*
		* to lockup a list of amdgpu devices in a hive safely, if not a hive
		* with multiple nodes, it will be similar as amdgpu_device_lock_adev.
		*
		* unlock won't require roll back.
		*/
		static int amdgpu_device_lock_hive_adev(struct amdgpu_device adev, struct amdgpu_hive_info hive)
		{
		struct amdgpu_device *tmp_adev = NULL;

		if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
		dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
		return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive))
		goto roll_back;
		}
		} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

		return 0;
		roll_back:
		if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		* if the lockup iteration break in the middle of a hive,
		* it may means there may has a race issue,
		* or a hive device locked up independently.
		* we may be in trouble and may not, so will try to roll back
		* the lock and give out a warnning.
		*/
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		amdgpu_device_unlock_adev(tmp_adev);
		}
		}
		return -EAGAIN;
		}

		static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
		{
		struct pci_dev *p = NULL;
		@@ -4574,11 +4614,29 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
		job ? job->base.id : -1, hive->hive_id);
		amdgpu_put_xgmi_hive(hive);
		if (job)
		drm_sched_increase_karma(&job->base);
		return 0;
		}
		mutex_lock(&hive->hive_lock);
		}

		/*
		* Lock the device(s) before operating on the linked list.
		* If the device lock cannot be taken, do not touch the linked list,
		* since others may be iterating over it.
		*/
		r = amdgpu_device_lock_hive_adev(adev, hive);
		if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
		job ? job->base.id : -1);

		/* even though we skipped this reset, we still need to set the job to guilty */
		if (job)
		drm_sched_increase_karma(&job->base);
		goto skip_recovery;
		}

		/*
		* Build list of devices to reset.
		* In case we are in XGMI hive mode, resort the device list
		@@ -4586,8 +4644,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		*/
		INIT_LIST_HEAD(&device_list);
		if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive)
		return -ENODEV;
		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
		list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
		device_list_handle = &hive->device_list;
		@@ -4598,13 +4654,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

		/* block all schedulers and reset given job's ring */
		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
		dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
		job ? job->base.id : -1);
		r = 0;
		goto skip_recovery;
		}

		/*
		* Try to put the audio codec into suspend state
		* before gpu reset started.
		@@ -4742,7 +4791,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		amdgpu_put_xgmi_hive(hive);
		}

		if (r)
		if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
		return r;
		}