Commit bd275681 authored by Peter Zijlstra's avatar Peter Zijlstra
Browse files

perf: Rewrite core context handling



There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).

Notably:
 - HW breakpoint PMU
 - ARM big.little PMU / Intel ADL PMU
 - Intel Branch Monitoring PMU
 - AMD IBS PMU
 - S390 cpum_cf PMU
 - PowerPC trace_imc PMU

*Current design:*

Currently we have a per task and per cpu perf_event_contexts:

  task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
       ^                                 |    ^     |           ^
       `---------------------------------'    |     `--> pmu ---'
                                              v           ^
                                         perf_event ------'

Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task related perf_event_context's have a
pointer back to that task.

Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.

The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.

Each perf_event is then associated with its PMU and one
perf_event_context.

*Proposed design:*

New design proposed by this patch reduce to a single task context and
a single CPU context but adds some intermediate data-structures:

  task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
       ^                           |   ^ ^
       `---------------------------'   | |
                                       | |    perf_cpu_pmu_context <--.
                                       | `----.    ^                  |
                                       |      |    |                  |
                                       |      v    v                  |
                                       | ,--> perf_event_pmu_context  |
                                       | |                            |
                                       | |                            |
                                       v v                            |
                                  perf_event ---> pmu ----------------'

With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:

  {cpu, pmu, cgroup, group_index}

Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.

Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.

Each perf_event is associated to it's pmu, perf_event_context and
perf_event_pmu_context.

Further optimizations to current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only single pmu
events.

Much thanks to Ravi for picking this up and pushing it towards
completion.

Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: default avatarRavi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: default avatarRavi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
parent 247f34f7
Loading
Loading
Loading
Loading
+11 −7
Original line number Diff line number Diff line
@@ -806,10 +806,14 @@ static void armv8pmu_disable_event(struct perf_event *event)

static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
	struct perf_event_context *task_ctx =
		this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
	struct perf_event_context *ctx;
	int nr_user = 0;

	if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
	ctx = perf_cpu_task_ctx();
	if (ctx)
		nr_user = ctx->nr_user;

	if (sysctl_perf_user_access && nr_user)
		armv8pmu_enable_user_access(cpu_pmu);
	else
		armv8pmu_disable_user_access();
@@ -1019,10 +1023,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
	return 0;
}

static int armv8pmu_filter_match(struct perf_event *event)
static bool armv8pmu_filter(struct pmu *pmu, int cpu)
{
	unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
	return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
	struct arm_pmu *armpmu = to_arm_pmu(pmu);
	return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
}

static void armv8pmu_reset(void *info)
@@ -1253,7 +1257,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
	cpu_pmu->stop			= armv8pmu_stop;
	cpu_pmu->reset			= armv8pmu_reset;
	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
	cpu_pmu->filter_match		= armv8pmu_filter_match;
	cpu_pmu->filter			= armv8pmu_filter;

	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;

+4 −4
Original line number Diff line number Diff line
@@ -132,7 +132,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)

static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
@@ -424,7 +424,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
		cpuhw->bhrb_context = event->ctx;
	}
	cpuhw->bhrb_users++;
	perf_sched_cb_inc(event->ctx->pmu);
	perf_sched_cb_inc(event->pmu);
}

static void power_pmu_bhrb_disable(struct perf_event *event)
@@ -436,7 +436,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)

	WARN_ON_ONCE(!cpuhw->bhrb_users);
	cpuhw->bhrb_users--;
	perf_sched_cb_dec(event->ctx->pmu);
	perf_sched_cb_dec(event->pmu);

	if (!cpuhw->disabled && !cpuhw->bhrb_users) {
		/* BHRB cannot be turned off when other
@@ -451,7 +451,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
 * mingle with the other process's entries during context switch.
 */
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	if (!ppmu->bhrb_nr)
		return;
+1 −1
Original line number Diff line number Diff line
@@ -379,7 +379,7 @@ static int paicrypt_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
 * but for sampling only event CRYPTO_ALL is allowed.
 */
static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	/* We started with a clean page on event installation. So read out
	 * results on schedule_out and if page was dirty, clear values.
+1 −1
Original line number Diff line number Diff line
@@ -471,7 +471,7 @@ static int paiext_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
 * but for sampling only event NNPA_ALL is allowed.
 */
static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	/* We started with a clean page on event installation. So read out
	 * results on schedule_out and if page was dirty, clear values.
+1 −1
Original line number Diff line number Diff line
@@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void)
 * On ctxswin, sched_in = true, called after the PMU has started
 * On ctxswout, sched_in = false, called before the PMU is stopped
 */
void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

Loading