Commit f5fe7edf authored by Mukul Joshi, committed by Alex Deucher
Browse files

drm/amdkfd: Update interrupt handling for GFX9.4.3



Update interrupt handling in CPX mode for GFX9.4.3 by using the
VMID space instead of SDMA client id to determine if an interrupt
should be processed by a KFD node. This is especially needed for
handling retry faults from MMHUB.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent cb30544e
Loading
Loading
Loading
Loading
+5 −2
Original line number Original line Diff line number Diff line
@@ -2434,6 +2434,9 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
 * amdgpu_vm_handle_fault - graceful handling of VM faults.
 * amdgpu_vm_handle_fault - graceful handling of VM faults.
 * @adev: amdgpu device pointer
 * @adev: amdgpu device pointer
 * @pasid: PASID of the VM
 * @pasid: PASID of the VM
 * @vmid: VMID, only used for GFX 9.4.3.
 * @node_id: Node_id received in IH cookie. Only applicable for
 *           GFX 9.4.3.
 * @addr: Address of the fault
 * @addr: Address of the fault
 * @write_fault: true is write fault, false is read fault
 * @write_fault: true is write fault, false is read fault
 *
 *
@@ -2441,7 +2444,7 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
 * shouldn't be reported any more.
 * shouldn't be reported any more.
 */
 */
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
			    u32 client_id, u32 node_id, uint64_t addr,
			    u32 vmid, u32 node_id, uint64_t addr,
			    bool write_fault)
			    bool write_fault)
{
{
	bool is_compute_context = false;
	bool is_compute_context = false;
@@ -2466,7 +2469,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,


	addr /= AMDGPU_GPU_PAGE_SIZE;
	addr /= AMDGPU_GPU_PAGE_SIZE;


	if (is_compute_context && !svm_range_restore_pages(adev, pasid, client_id,
	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
	    node_id, addr, write_fault)) {
	    node_id, addr, write_fault)) {
		amdgpu_bo_unref(&root);
		amdgpu_bo_unref(&root);
		return true;
		return true;
+1 −1
Original line number Original line Diff line number Diff line
@@ -455,7 +455,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
			     struct amdgpu_task_info *task_info);
			     struct amdgpu_task_info *task_info);
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
			    u32 client_id, u32 node_id, uint64_t addr,
			    u32 vmid, u32 node_id, uint64_t addr,
			    bool write_fault);
			    bool write_fault);


void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
+2 −2
Original line number Original line Diff line number Diff line
@@ -587,7 +587,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,


			cam_index = entry->src_data[2] & 0x3ff;
			cam_index = entry->src_data[2] & 0x3ff;


			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
						     addr, write_fault);
						     addr, write_fault);
			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
			if (ret)
			if (ret)
@@ -610,7 +610,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
			/* Try to handle the recoverable page faults by filling page
			/* Try to handle the recoverable page faults by filling page
			 * tables
			 * tables
			 */
			 */
			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
						   addr, write_fault))
						   addr, write_fault))
				return 1;
				return 1;
		}
		}
+6 −10
Original line number Original line Diff line number Diff line
@@ -1073,18 +1073,14 @@ struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
struct kfd_node *kfd_device_by_id(uint32_t gpu_id);
struct kfd_node *kfd_device_by_id(uint32_t gpu_id);
struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev);
struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev);
struct kfd_node *kfd_device_by_adev(const struct amdgpu_device *adev);
struct kfd_node *kfd_device_by_adev(const struct amdgpu_device *adev);
static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t client_id,
static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id,
				     uint32_t node_id)
					uint32_t vmid)
{
{
	if ((node->interrupt_bitmap & (0x1U << node_id)) ||
	return (node->interrupt_bitmap & (1 << node_id)) != 0 &&
	    ((node_id % 4) == 0 &&
	       (node->compute_vmid_bitmap & (1 << vmid)) != 0;
	    (node->interrupt_bitmap >> 16) & (0x1U << client_id)))
		return true;

	return false;
}
}
static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
					uint32_t client_id, uint32_t node_id) {
					uint32_t node_id, uint32_t vmid) {
	struct kfd_dev *dev = adev->kfd.dev;
	struct kfd_dev *dev = adev->kfd.dev;
	uint32_t i;
	uint32_t i;


@@ -1092,7 +1088,7 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
		return dev->nodes[0];
		return dev->nodes[0];


	for (i = 0; i < dev->num_nodes; i++)
	for (i = 0; i < dev->num_nodes; i++)
		if (kfd_irq_is_from_node(dev->nodes[i], client_id, node_id))
		if (kfd_irq_is_from_node(dev->nodes[i], node_id, vmid))
			return dev->nodes[i];
			return dev->nodes[i];


	return NULL;
	return NULL;
+4 −4
Original line number Original line Diff line number Diff line
@@ -2799,7 +2799,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)


int
int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
			uint32_t client_id, uint32_t node_id,
			uint32_t vmid, uint32_t node_id,
			uint64_t addr, bool write_fault)
			uint64_t addr, bool write_fault)
{
{
	struct mm_struct *mm = NULL;
	struct mm_struct *mm = NULL;
@@ -2851,10 +2851,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
		goto out;
		goto out;
	}
	}


	node = kfd_node_by_irq_ids(adev, node_id, client_id);
	node = kfd_node_by_irq_ids(adev, node_id, vmid);
	if (!node) {
	if (!node) {
		pr_debug("kfd node does not exist node_id: %d, client_id: %d\n", node_id,
		pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
			 client_id);
			 vmid);
		r = -EFAULT;
		r = -EFAULT;
		goto out;
		goto out;
	}
	}
Loading