Commit ab18b7b3 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'drm-next-2022-06-03-1' of git://anongit.freedesktop.org/drm/drm

Pull more drm updates from Dave Airlie:
 "This is mostly regular fixes, msm and amdgpu. There is a tegra patch
  that is a bit of prep work for a 5.20 feature to avoid some inter-tree
  syncs, and a couple of late addition amdgpu uAPI changes but best to
  get those in early, and the userspace pieces are ready.

  msm:
   - Limiting WB modes to max sspp linewidth
   - Fixing the supported rotations to add 180 back for IGT
   - Fix to handle pm_runtime_get_sync() errors to avoid unclocked
     access in the bind() path for dpu driver
   - Fix the irq_free() without request issue which was a big-time
     hitter in the CI-runs.

  amdgpu:
   - Update fdinfo to the common drm format
   - uapi:
       - Add VM_NOALLOC GPUVM attribute to prevent buffers from going
         into the MALL
       - Add AMDGPU_GEM_CREATE_DISCARDABLE flag to create buffers that
         can be discarded on eviction
       - Mesa code which uses these:
           https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16466
   - Link training fixes
   - DPIA fixes
   - Misc code cleanups
   - Aux fixes
   - Hotplug fixes
   - More FP clean up
   - Misc GFX9/10 fixes
   - Fix a possible memory leak in SMU shutdown
   - SMU 13 updates
   - RAS fixes
   - TMZ fixes
   - GC 11 updates
   - SMU 11 metrics fixes
   - Fix coverage blend mode for overlay plane
   - Note DDR vs LPDDR memory
   - Fuzz fix for CS IOCTL
   - Add new PCI DID

  amdkfd:
   - Clean up hive setup
   - Misc fixes

  tegra:
   - add some prelim 5.20 work to avoid inter-tree mess"

* tag 'drm-next-2022-06-03-1' of git://anongit.freedesktop.org/drm/drm: (57 commits)
  drm/msm/dpu: Move min BW request and full BW disable back to mdss
  drm/msm/dpu: Fix pointer dereferenced before checking
  drm/msm/dpu: Remove unused code
  drm/msm/disp/dpu1: remove superfluous init
  drm/msm/dp: Always clear mask bits to disable interrupts at dp_ctrl_reset_irq_ctrl()
  gpu: host1x: Add context bus
  drm/amdgpu: add drm-client-id to fdinfo v2
  drm/amdgpu: Convert to common fdinfo format v5
  drm/amdgpu: bump minor version number
  drm/amdgpu: add AMDGPU_VM_NOALLOC v2
  drm/amdgpu: add AMDGPU_GEM_CREATE_DISCARDABLE
  drm/amdgpu: add beige goby PCI ID
  drm/amd/pm: Return auto perf level, if unsupported
  drm/amdkfd: fix typo in comment
  drm/amdgpu/gfx: fix typos in comments
  drm/amdgpu/cs: make commands with 0 chunks illegal behaviour.
  drm/amdgpu: differentiate between LP and non-LP DDR memory
  drm/amdgpu: Resolve pcie_bif RAS recovery bug
  drm/amdgpu: clean up asd on the ta_firmware_header_v2_0
  drm/amdgpu/discovery: validate VCN and SDMA instances
  ...
parents 50fd82b3 40420434
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@
# drm/tegra depends on host1x, so if both drivers are built-in care must be
# taken to initialize them in the correct order. Link order is the only way
# to ensure this currently.
obj-$(CONFIG_TEGRA_HOST1X)	+= host1x/
obj-y			+= drm/ vga/
obj-y			+= host1x/ drm/ vga/
obj-$(CONFIG_IMX_IPUV3_CORE)	+= ipu-v3/
obj-$(CONFIG_TRACE_GPU_MEM)		+= trace/
+1 −1
Original line number Diff line number Diff line
@@ -1621,7 +1621,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(

	mutex_lock(&mem->lock);

	/* Unpin MMIO/DOORBELL BO's that were pinnned during allocation */
	/* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
	if (mem->alloc_flags &
	    (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
	     KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
+6 −2
Original line number Diff line number Diff line
@@ -188,13 +188,17 @@ static int convert_atom_mem_type_to_vram_type(struct amdgpu_device *adev,
			vram_type = AMDGPU_VRAM_TYPE_DDR3;
			break;
		case Ddr4MemType:
		case LpDdr4MemType:
			vram_type = AMDGPU_VRAM_TYPE_DDR4;
			break;
		case LpDdr4MemType:
			vram_type = AMDGPU_VRAM_TYPE_LPDDR4;
			break;
		case Ddr5MemType:
		case LpDdr5MemType:
			vram_type = AMDGPU_VRAM_TYPE_DDR5;
			break;
		case LpDdr5MemType:
			vram_type = AMDGPU_VRAM_TYPE_LPDDR5;
			break;
		default:
			vram_type = AMDGPU_VRAM_TYPE_UNKNOWN;
			break;
+2 −2
Original line number Diff line number Diff line
@@ -116,7 +116,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
	int ret;

	if (cs->in.num_chunks == 0)
		return 0;
		return -EINVAL;

	chunk_array = kvmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
	if (!chunk_array)
@@ -1252,7 +1252,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,

	p->fence = dma_fence_get(&job->base.s_fence->finished);

	amdgpu_ctx_add_fence(p->ctx, entity, p->fence, &seq);
	seq = amdgpu_ctx_add_fence(p->ctx, entity, p->fence);
	amdgpu_cs_post_dependencies(p);

	if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
+113 −110
Original line number Diff line number Diff line
@@ -135,9 +135,9 @@ static enum amdgpu_ring_priority_level amdgpu_ctx_sched_prio_to_ring_prio(int32_

static unsigned int amdgpu_ctx_get_hw_prio(struct amdgpu_ctx *ctx, u32 hw_ip)
{
	struct amdgpu_device *adev = ctx->adev;
	int32_t ctx_prio;
	struct amdgpu_device *adev = ctx->mgr->adev;
	unsigned int hw_prio;
	int32_t ctx_prio;

	ctx_prio = (ctx->override_priority == AMDGPU_CTX_PRIORITY_UNSET) ?
			ctx->init_priority : ctx->override_priority;
@@ -162,17 +162,50 @@ static unsigned int amdgpu_ctx_get_hw_prio(struct amdgpu_ctx *ctx, u32 hw_ip)
	return hw_prio;
}

/*
 * Compute how much time @fence has spent on the hardware.
 *
 * Returns zero for a NULL fence or when the scheduled fence carries no
 * timestamp yet. Otherwise the result is the span from the scheduled
 * timestamp to the finished timestamp, or to the current time when the
 * finished fence has no timestamp yet (job still running).
 */
static ktime_t amdgpu_ctx_fence_time(struct dma_fence *fence)
{
	struct drm_sched_fence *sf;
	ktime_t end;

	if (!fence)
		return ns_to_ktime(0);

	sf = to_drm_sched_fence(fence);

	/* Never scheduled, so no hardware time can have been consumed */
	if (!test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &sf->scheduled.flags))
		return ns_to_ktime(0);

	/* Still running: account up to now, otherwise up to completion */
	if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &sf->finished.flags))
		end = sf->finished.timestamp;
	else
		end = ktime_get();

	return ktime_sub(end, sf->scheduled.timestamp);
}

/*
 * Sum amdgpu_ctx_fence_time() over all amdgpu_sched_jobs fence slots of
 * @centity. The slots are read while holding @ctx->ring_lock.
 */
static ktime_t amdgpu_ctx_entity_time(struct amdgpu_ctx *ctx,
				      struct amdgpu_ctx_entity *centity)
{
	ktime_t total = ns_to_ktime(0);
	uint32_t slot = 0;

	spin_lock(&ctx->ring_lock);
	while (slot < amdgpu_sched_jobs) {
		ktime_t spent = amdgpu_ctx_fence_time(centity->fences[slot]);

		total = ktime_add(total, spent);
		slot++;
	}
	spin_unlock(&ctx->ring_lock);

	return total;
}

static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
				  const u32 ring)
{
	struct amdgpu_device *adev = ctx->adev;
	struct amdgpu_ctx_entity *entity;
	struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
	unsigned num_scheds = 0;
	int32_t ctx_prio;
	unsigned int hw_prio;
	struct amdgpu_device *adev = ctx->mgr->adev;
	struct amdgpu_ctx_entity *entity;
	enum drm_sched_priority drm_prio;
	unsigned int hw_prio, num_scheds;
	int32_t ctx_prio;
	int r;

	entity = kzalloc(struct_size(entity, fences, amdgpu_sched_jobs),
@@ -182,6 +215,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,

	ctx_prio = (ctx->override_priority == AMDGPU_CTX_PRIORITY_UNSET) ?
			ctx->init_priority : ctx->override_priority;
	entity->hw_ip = hw_ip;
	entity->sequence = 1;
	hw_prio = amdgpu_ctx_get_hw_prio(ctx, hw_ip);
	drm_prio = amdgpu_ctx_to_drm_sched_prio(ctx_prio);
@@ -220,10 +254,25 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
	return r;
}

static int amdgpu_ctx_init(struct amdgpu_device *adev,
			   int32_t priority,
			   struct drm_file *filp,
			   struct amdgpu_ctx *ctx)
static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_ctx_entity *entity)
{
	ktime_t res = ns_to_ktime(0);
	int i;

	if (!entity)
		return res;

	for (i = 0; i < amdgpu_sched_jobs; ++i) {
		res = ktime_add(res, amdgpu_ctx_fence_time(entity->fences[i]));
		dma_fence_put(entity->fences[i]);
	}

	kfree(entity);
	return res;
}

static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
			   struct drm_file *filp, struct amdgpu_ctx *ctx)
{
	int r;

@@ -233,15 +282,14 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,

	memset(ctx, 0, sizeof(*ctx));

	ctx->adev = adev;

	kref_init(&ctx->refcount);
	ctx->mgr = mgr;
	spin_lock_init(&ctx->ring_lock);
	mutex_init(&ctx->lock);

	ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
	ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
	ctx->reset_counter_query = ctx->reset_counter;
	ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
	ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter);
	ctx->init_priority = priority;
	ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
	ctx->stable_pstate = AMDGPU_CTX_STABLE_PSTATE_NONE;
@@ -249,24 +297,10 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
	return 0;
}

static void amdgpu_ctx_fini_entity(struct amdgpu_ctx_entity *entity)
{

	int i;

	if (!entity)
		return;

	for (i = 0; i < amdgpu_sched_jobs; ++i)
		dma_fence_put(entity->fences[i]);

	kfree(entity);
}

static int amdgpu_ctx_get_stable_pstate(struct amdgpu_ctx *ctx,
					u32 *stable_pstate)
{
	struct amdgpu_device *adev = ctx->adev;
	struct amdgpu_device *adev = ctx->mgr->adev;
	enum amd_dpm_forced_level current_level;

	current_level = amdgpu_dpm_get_performance_level(adev);
@@ -294,7 +328,7 @@ static int amdgpu_ctx_get_stable_pstate(struct amdgpu_ctx *ctx,
static int amdgpu_ctx_set_stable_pstate(struct amdgpu_ctx *ctx,
					u32 stable_pstate)
{
	struct amdgpu_device *adev = ctx->adev;
	struct amdgpu_device *adev = ctx->mgr->adev;
	enum amd_dpm_forced_level level;
	u32 current_stable_pstate;
	int r;
@@ -345,7 +379,8 @@ static int amdgpu_ctx_set_stable_pstate(struct amdgpu_ctx *ctx,
static void amdgpu_ctx_fini(struct kref *ref)
{
	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
	struct amdgpu_device *adev = ctx->adev;
	struct amdgpu_ctx_mgr *mgr = ctx->mgr;
	struct amdgpu_device *adev = mgr->adev;
	unsigned i, j, idx;

	if (!adev)
@@ -353,8 +388,10 @@ static void amdgpu_ctx_fini(struct kref *ref)

	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
		for (j = 0; j < AMDGPU_MAX_ENTITY_NUM; ++j) {
			amdgpu_ctx_fini_entity(ctx->entities[i][j]);
			ctx->entities[i][j] = NULL;
			ktime_t spend;

			spend = amdgpu_ctx_fini_entity(ctx->entities[i][j]);
			atomic64_add(ktime_to_ns(spend), &mgr->time_spend[i]);
		}
	}

@@ -421,7 +458,7 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
	}

	*id = (uint32_t)r;
	r = amdgpu_ctx_init(adev, priority, filp, ctx);
	r = amdgpu_ctx_init(mgr, priority, filp, ctx);
	if (r) {
		idr_remove(&mgr->ctx_handles, *id);
		*id = 0;
@@ -671,9 +708,9 @@ int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
	return 0;
}

void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
			      struct drm_sched_entity *entity,
			  struct dma_fence *fence, uint64_t *handle)
			      struct dma_fence *fence)
{
	struct amdgpu_ctx_entity *centity = to_amdgpu_ctx_entity(entity);
	uint64_t seq = centity->sequence;
@@ -682,8 +719,7 @@ void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,

	idx = seq & (amdgpu_sched_jobs - 1);
	other = centity->fences[idx];
	if (other)
		BUG_ON(!dma_fence_is_signaled(other));
	WARN_ON(other && !dma_fence_is_signaled(other));

	dma_fence_get(fence);

@@ -692,9 +728,11 @@ void amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx,
	centity->sequence++;
	spin_unlock(&ctx->ring_lock);

	atomic64_add(ktime_to_ns(amdgpu_ctx_fence_time(other)),
		     &ctx->mgr->time_spend[centity->hw_ip]);

	dma_fence_put(other);
	if (handle)
		*handle = seq;
	return seq;
}

struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
@@ -731,7 +769,7 @@ static void amdgpu_ctx_set_entity_priority(struct amdgpu_ctx *ctx,
					   int hw_ip,
					   int32_t priority)
{
	struct amdgpu_device *adev = ctx->adev;
	struct amdgpu_device *adev = ctx->mgr->adev;
	unsigned int hw_prio;
	struct drm_gpu_scheduler **scheds = NULL;
	unsigned num_scheds;
@@ -796,10 +834,17 @@ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,
	return r;
}

void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr,
			 struct amdgpu_device *adev)
{
	unsigned int i;

	mgr->adev = adev;
	mutex_init(&mgr->lock);
	idr_init(&mgr->ctx_handles);

	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
		atomic64_set(&mgr->time_spend[i], 0);
}

long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
@@ -875,80 +920,38 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
	mutex_destroy(&mgr->lock);
}

/*
 * Gather timing over the fence slots of @centity.
 *
 * @total: out, reset to 0 and then increased, per accounted fence, by the
 *         span from its scheduled timestamp to its finished timestamp, or to
 *         "now" when the finished fence is not signaled (or its timestamp is
 *         not before "now").
 * @max:   out, reset to 0 and then set to the largest scheduled-to-now span
 *         among the accounted fences.
 *
 * "now" is sampled once before the loop, so all slots are measured against
 * the same instant.
 */
static void amdgpu_ctx_fence_time(struct amdgpu_ctx *ctx,
		struct amdgpu_ctx_entity *centity, ktime_t *total, ktime_t *max)
{
	ktime_t now, t1;
	uint32_t i;

	*total = *max = 0;

	now = ktime_get();
	for (i = 0; i < amdgpu_sched_jobs; i++) {
		struct dma_fence *fence;
		struct drm_sched_fence *s_fence;

		/* Take our own reference to the slot's fence under ring_lock */
		spin_lock(&ctx->ring_lock);
		fence = dma_fence_get(centity->fences[i]);
		spin_unlock(&ctx->ring_lock);
		if (!fence)
			continue;
		s_fence = to_drm_sched_fence(fence);
		/* Skip fences whose scheduled fence has not signaled */
		if (!dma_fence_is_signaled(&s_fence->scheduled)) {
			dma_fence_put(fence);
			continue;
		}
		t1 = s_fence->scheduled.timestamp;
		/* Skip fences whose scheduled timestamp is not before "now" */
		if (!ktime_before(t1, now)) {
			dma_fence_put(fence);
			continue;
		}
		if (dma_fence_is_signaled(&s_fence->finished) &&
			s_fence->finished.timestamp < now)
			*total += ktime_sub(s_fence->finished.timestamp, t1);
		else
			*total += ktime_sub(now, t1);
		/* From here on t1 holds the scheduled-to-now span, used for *max */
		t1 = ktime_sub(now, t1);
		dma_fence_put(fence);
		*max = max(t1, *max);
	}
}

ktime_t amdgpu_ctx_mgr_fence_usage(struct amdgpu_ctx_mgr *mgr, uint32_t hwip,
		uint32_t idx, uint64_t *elapsed)
void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
			  ktime_t usage[AMDGPU_HW_IP_NUM])
{
	struct idr *idp;
	struct amdgpu_ctx *ctx;
	unsigned int hw_ip, i;
	uint32_t id;
	struct amdgpu_ctx_entity *centity;
	ktime_t total = 0, max = 0;

	if (idx >= AMDGPU_MAX_ENTITY_NUM)
		return 0;
	idp = &mgr->ctx_handles;
	/*
	 * This is a little bit racy because it can be that a ctx or a fence are
	 * destroyed just in the moment we try to account them. But that is ok
	 * since exactly that case is explicitly allowed by the interface.
	 */
	mutex_lock(&mgr->lock);
	idr_for_each_entry(idp, ctx, id) {
		ktime_t ttotal, tmax;
	for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
		uint64_t ns = atomic64_read(&mgr->time_spend[hw_ip]);

		if (!ctx->entities[hwip][idx])
			continue;
		usage[hw_ip] = ns_to_ktime(ns);
	}

		centity = ctx->entities[hwip][idx];
		amdgpu_ctx_fence_time(ctx, centity, &ttotal, &tmax);
	idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
		for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
			for (i = 0; i < amdgpu_ctx_num_entities[hw_ip]; ++i) {
				struct amdgpu_ctx_entity *centity;
				ktime_t spend;

		/* Harmonic mean approximation diverges for very small
		 * values. If ratio < 0.01% ignore
		 */
		if (AMDGPU_CTX_FENCE_USAGE_MIN_RATIO(tmax, ttotal))
				centity = ctx->entities[hw_ip][i];
				if (!centity)
					continue;

		total = ktime_add(total, ttotal);
		max = ktime_after(tmax, max) ? tmax : max;
				spend = amdgpu_ctx_entity_time(ctx, centity);
				usage[hw_ip] = ktime_add(usage[hw_ip], spend);
			}
		}
	}

	mutex_unlock(&mgr->lock);
	if (elapsed)
		*elapsed = max;

	return total;
}
Loading