Merge tag 'perf_urgent_for_v6.1_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (a7038524) · Commits · jan.koester / Linux

include/linux/perf_event.h

+15 −4

Original line number	Diff line number	Diff line
		@@ -756,11 +756,14 @@ struct perf_event {
		struct fasync_struct *fasync;

		/* delayed work for NMIs and such */
		int pending_wakeup;
		int pending_kill;
		int pending_disable;
		unsigned int pending_wakeup;
		unsigned int pending_kill;
		unsigned int pending_disable;
		unsigned int pending_sigtrap;
		unsigned long pending_addr; /* SIGTRAP */
		struct irq_work pending;
		struct irq_work pending_irq;
		struct callback_head pending_task;
		unsigned int pending_work;

		atomic_t event_limit;

		@@ -877,6 +880,14 @@ struct perf_event_context {
		#endif
		void *task_ctx_data; /* pmu specific data */
		struct rcu_head rcu_head;

		/*
		* Sum (event->pending_sigtrap + event->pending_work)
		*
		* The SIGTRAP is targeted at ctx->task, as such it won't do changing
		* that until the signal is delivered.
		*/
		local_t nr_pending;
		};

		/*

kernel/events/core.c

+113 −38

Original line number	Diff line number	Diff line
		@@ -54,6 +54,7 @@
		#include <linux/highmem.h>
		#include <linux/pgtable.h>
		#include <linux/buildid.h>
		#include <linux/task_work.h>

		#include "internal.h"

		@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
		event->pmu->del(event, 0);
		event->oncpu = -1;

		if (READ_ONCE(event->pending_disable) >= 0) {
		WRITE_ONCE(event->pending_disable, -1);
		if (event->pending_disable) {
		event->pending_disable = 0;
		perf_cgroup_event_disable(event, ctx);
		state = PERF_EVENT_STATE_OFF;
		}

		if (event->pending_sigtrap) {
		bool dec = true;

		event->pending_sigtrap = 0;
		if (state != PERF_EVENT_STATE_OFF &&
		!event->pending_work) {
		event->pending_work = 1;
		dec = false;
		task_work_add(current, &event->pending_task, TWA_RESUME);
		}
		if (dec)
		local_dec(&event->ctx->nr_pending);
		}

		perf_event_set_state(event, state);

		if (!is_software_event(event))
		@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
		* hold the top-level event's child_mutex, so any descendant that
		* goes to exit will block in perf_event_exit_event().
		*
		* When called from perf_pending_event it's OK because event->ctx
		* When called from perf_pending_irq it's OK because event->ctx
		* is the current context on this CPU and preemption is disabled,
		* hence we can't get into perf_event_task_sched_out for this context.
		*/
		@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);

		void perf_event_disable_inatomic(struct perf_event *event)
		{
		WRITE_ONCE(event->pending_disable, smp_processor_id());
		/* can fail, see perf_pending_event_disable() */
		irq_work_queue(&event->pending);
		event->pending_disable = 1;
		irq_work_queue(&event->pending_irq);
		}

		#define MAX_INTERRUPTS (~0ULL)
		@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {

		perf_pmu_disable(pmu);

		/* PMIs are disabled; ctx->nr_pending is stable. */
		if (local_read(&ctx->nr_pending) ||
		local_read(&next_ctx->nr_pending)) {
		/*
		* Must not swap out ctx when there's pending
		* events that rely on the ctx->task relation.
		*/
		raw_spin_unlock(&next_ctx->lock);
		rcu_read_unlock();
		goto inside_switch;
		}

		WRITE_ONCE(ctx->task, next);
		WRITE_ONCE(next_ctx->task, task);

		perf_pmu_disable(pmu);

		if (cpuctx->sched_cb_usage && pmu->sched_task)
		pmu->sched_task(ctx, false);

		@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
		raw_spin_lock(&ctx->lock);
		perf_pmu_disable(pmu);

		inside_switch:
		if (cpuctx->sched_cb_usage && pmu->sched_task)
		pmu->sched_task(ctx, false);
		task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
		@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,

		static void _free_event(struct perf_event *event)
		{
		irq_work_sync(&event->pending);
		irq_work_sync(&event->pending_irq);

		unaccount_event(event);

		@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
		return;

		/*
		* perf_pending_event() can race with the task exiting.
		* Both perf_pending_task() and perf_pending_irq() can race with the
		* task exiting.
		*/
		if (current->flags & PF_EXITING)
		return;
		@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
		event->attr.type, event->attr.sig_data);
		}

		static void perf_pending_event_disable(struct perf_event *event)
		/*
		* Deliver the pending work in-event-context or follow the context.
		*/
		static void __perf_pending_irq(struct perf_event *event)
		{
		int cpu = READ_ONCE(event->pending_disable);
		int cpu = READ_ONCE(event->oncpu);

		/*
		* If the event isn't running; we done. event_sched_out() will have
		* taken care of things.
		*/
		if (cpu < 0)
		return;

		/*
		* Yay, we hit home and are in the context of the event.
		*/
		if (cpu == smp_processor_id()) {
		WRITE_ONCE(event->pending_disable, -1);

		if (event->attr.sigtrap) {
		if (event->pending_sigtrap) {
		event->pending_sigtrap = 0;
		perf_sigtrap(event);
		atomic_set_release(&event->event_limit, 1); /* rearm event */
		return;
		local_dec(&event->ctx->nr_pending);
		}

		if (event->pending_disable) {
		event->pending_disable = 0;
		perf_event_disable_local(event);
		}
		return;
		}

		@@ -6484,33 +6523,60 @@ static void perf_pending_event_disable(struct perf_event *event)
		* irq_work_queue(); // FAILS
		*
		* irq_work_run()
		* perf_pending_event()
		* perf_pending_irq()
		*
		* But the event runs on CPU-B and wants disabling there.
		*/
		irq_work_queue_on(&event->pending, cpu);
		irq_work_queue_on(&event->pending_irq, cpu);
		}

		static void perf_pending_event(struct irq_work *entry)
		static void perf_pending_irq(struct irq_work *entry)
		{
		struct perf_event *event = container_of(entry, struct perf_event, pending);
		struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
		int rctx;

		rctx = perf_swevent_get_recursion_context();
		/*
		* If we 'fail' here, that's OK, it means recursion is already disabled
		* and we won't recurse 'further'.
		*/
		rctx = perf_swevent_get_recursion_context();

		perf_pending_event_disable(event);

		/*
		* The wakeup isn't bound to the context of the event -- it can happen
		* irrespective of where the event is.
		*/
		if (event->pending_wakeup) {
		event->pending_wakeup = 0;
		perf_event_wakeup(event);
		}

		__perf_pending_irq(event);

		if (rctx >= 0)
		perf_swevent_put_recursion_context(rctx);
		}

		/*
		 * Task-work callback for event->pending_task (registered with
		 * init_task_work() in perf_event_alloc()).
		 *
		 * event_sched_out() queues this with task_work_add(current, ..., TWA_RESUME)
		 * when an event is scheduled out while a SIGTRAP is still pending; in that
		 * case it sets event->pending_work and leaves ctx->nr_pending elevated.
		 * This callback delivers the deferred signal via perf_sigtrap() and then
		 * drops that ctx->nr_pending reference.
		 */
		static void perf_pending_task(struct callback_head *head)
		{
		struct perf_event *event = container_of(head, struct perf_event, pending_task);
		int rctx;

		/*
		* If we 'fail' here, that's OK, it means recursion is already disabled
		* and we won't recurse 'further'.
		*/
		preempt_disable_notrace();
		rctx = perf_swevent_get_recursion_context();

		/* Consume the pending_work flag set by event_sched_out(), if any. */
		if (event->pending_work) {
		event->pending_work = 0;
		perf_sigtrap(event);
		/* Pairs with the count event_sched_out() kept when it deferred. */
		local_dec(&event->ctx->nr_pending);
		}

		/* Only release the recursion context if we actually acquired one. */
		if (rctx >= 0)
		perf_swevent_put_recursion_context(rctx);
		preempt_enable_notrace();
		}

		#ifdef CONFIG_GUEST_PERF_EVENTS
		@@ -9236,16 +9302,28 @@ static int __perf_event_overflow(struct perf_event *event,
		if (events && atomic_dec_and_test(&event->event_limit)) {
		ret = 1;
		event->pending_kill = POLL_HUP;
		event->pending_addr = data->addr;

		perf_event_disable_inatomic(event);
		}

		if (event->attr.sigtrap) {
		/*
		* Should not be able to return to user space without processing
		* pending_sigtrap (kernel events can overflow multiple times).
		*/
		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
		if (!event->pending_sigtrap) {
		event->pending_sigtrap = 1;
		local_inc(&event->ctx->nr_pending);
		}
		event->pending_addr = data->addr;
		irq_work_queue(&event->pending_irq);
		}

		READ_ONCE(event->overflow_handler)(event, data, regs);

		if (*perf_event_fasync(event) && event->pending_kill) {
		event->pending_wakeup = 1;
		irq_work_queue(&event->pending);
		irq_work_queue(&event->pending_irq);
		}

		return ret;
		@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,


		init_waitqueue_head(&event->waitq);
		event->pending_disable = -1;
		init_irq_work(&event->pending, perf_pending_event);
		init_irq_work(&event->pending_irq, perf_pending_irq);
		init_task_work(&event->pending_task, perf_pending_task);

		mutex_init(&event->mmap_mutex);
		raw_spin_lock_init(&event->addr_filters.lock);
		@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
		if (parent_event)
		event->event_caps = parent_event->event_caps;

		if (event->attr.sigtrap)
		atomic_set(&event->event_limit, 1);

		if (task) {
		event->attach_state = PERF_ATTACH_TASK;
		/*

kernel/events/ring_buffer.c

+1 −1

Original line number	Diff line number	Diff line
		@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
		atomic_set(&handle->rb->poll, EPOLLIN);

		handle->event->pending_wakeup = 1;
		irq_work_queue(&handle->event->pending);
		irq_work_queue(&handle->event->pending_irq);
		}

		/*

kernel/trace/bpf_trace.c

+2 −0

Original line number	Diff line number	Diff line
		@@ -687,6 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,

		perf_sample_data_init(sd, 0, 0);
		sd->raw = &raw;
		sd->sample_flags |= PERF_SAMPLE_RAW;

		err = __bpf_perf_event_output(regs, map, flags, sd);

		@@ -745,6 +746,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
		perf_fetch_caller_regs(regs);
		perf_sample_data_init(sd, 0, 0);
		sd->raw = &raw;
		sd->sample_flags |= PERF_SAMPLE_RAW;

		ret = __bpf_perf_event_output(regs, map, flags, sd);
		out:

tools/testing/selftests/perf_events/sigtrap_threads.c

+32 −3

Original line number	Diff line number	Diff line
		@@ -62,6 +62,8 @@ static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr,
		.remove_on_exec = 1, /* Required by sigtrap. */
		.sigtrap = 1, /* Request synchronous SIGTRAP on event. */
		.sig_data = TEST_SIG_DATA(addr, id),
		.exclude_kernel = 1, /* To allow */
		.exclude_hv = 1, /* running as !root */
		};
		return attr;
		}
		@@ -93,10 +95,14 @@ static void *test_thread(void *arg)

		__atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
		iter = ctx.iterate_on; /* read */
		if (iter >= 0) {
		for (i = 0; i < iter - 1; i++) {
		__atomic_fetch_add(&ctx.tids_want_signal, tid, __ATOMIC_RELAXED);
		ctx.iterate_on = iter; /* idempotent write */
		}
		} else {
		while (ctx.iterate_on);
		}

		return NULL;
		}
		@@ -208,4 +214,27 @@ TEST_F(sigtrap_threads, signal_stress)
		EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0));
		}

		/*
		 * Stress test: keep toggling the event off and on from the main thread
		 * while the worker threads keep triggering it, until enough signals have
		 * been counted, then verify the contents of the first recorded siginfo.
		 */
		TEST_F(sigtrap_threads, signal_stress_with_disable)
		{
		const int target_count = NUM_THREADS * 3000;
		int i;

		/*
		 * A negative iterate_on makes test_thread() take its spinning branch
		 * (while (ctx.iterate_on);) instead of the bounded loop.
		 */
		ctx.iterate_on = -1;

		EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
		pthread_barrier_wait(&self->barrier);
		/* Toggle DISABLE/ENABLE until signal_count reaches target_count. */
		while (__atomic_load_n(&ctx.signal_count, __ATOMIC_RELAXED) < target_count) {
		EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0);
		EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_ENABLE, 0), 0);
		}
		/* Writing zero ends the workers' spin loop so they can be joined. */
		ctx.iterate_on = 0;
		for (i = 0; i < NUM_THREADS; i++)
		ASSERT_EQ(pthread_join(self->threads[i], NULL), 0);
		EXPECT_EQ(ioctl(self->fd, PERF_EVENT_IOC_DISABLE, 0), 0);

		/* The first captured siginfo must describe the event on iterate_on. */
		EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on);
		EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT);
		EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on, 0));
		}

		TEST_HARNESS_MAIN