// SPDX-License-Identifier: GPL-2.0
/*
 * KFENCE guarded object allocator and fault handling.
 *
 * Copyright (C) 2020, Google LLC.
 */

#define pr_fmt(fmt) "kfence: " fmt

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/sched/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

#include <asm/kfence.h>

#include "kfence.h"

/* Disables KFENCE on the first warning assuming an irrecoverable error. */
#define KFENCE_WARN_ON(cond)                                                   \
	({                                                                     \
		const bool __cond = WARN_ON(cond);                             \
		if (unlikely(__cond)) {                                        \
			WRITE_ONCE(kfence_enabled, false);                     \
			disabled_by_warn = true;                               \
		}                                                              \
		__cond;                                                        \
	})

/* === Data ================================================================= */

static bool kfence_enabled __read_mostly;
static bool disabled_by_warn __read_mostly;
unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */

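/*
 * Prefix all parameters below with "kfence." so they appear as e.g.
 * kfence.sample_interval on the kernel command line and under
 * /sys/module/kfence/parameters/, even when KFENCE is built in.
 */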
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "kfence."

static int kfence_enable_late(void);
static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
{
	unsigned long num;
	int ret = kstrtoul(val, 0, &num);

	if (ret < 0)
		return ret;

	if (!num) /* Using 0 to indicate KFENCE is disabled. */
		WRITE_ONCE(kfence_enabled, false);

	*((unsigned long *)kp->arg) = num;

	if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
		return disabled_by_warn ? -EINVAL : kfence_enable_late();
	return 0;
}

static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
{
	if (!READ_ONCE(kfence_enabled))
		return sprintf(buffer, "0\n");

	return param_get_ulong(buffer, kp);
}

static const struct kernel_param_ops sample_interval_param_ops = {
	.set = param_set_sample_interval,
	.get = param_get_sample_interval,
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
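
/*
 * Runtime usage example (the sysfs path assumes KFENCE is built in, as is
 * typical): the sample interval can be changed via
 *   echo 100 > /sys/module/kfence/parameters/sample_interval
 * Writing 0 disables KFENCE; re-enabling with a non-zero value fails with
 * -EINVAL if KFENCE was disabled by a prior warning (see
 * param_set_sample_interval() above).
 */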

/* Pool usage% threshold when currently covered allocations are skipped. */
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);

/* If true, use a deferrable timer. */
static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
module_param_named(deferrable, kfence_deferrable, bool, 0444);

/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __read_mostly;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */

/*
 * Per-object metadata, with one-to-one mapping of object metadata to
 * backing pages (in __kfence_pool).
 */
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
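
/*
 * Layout sketch (the authoritative pool-layout constants live in kfence.h,
 * not here): the pool provides two pages per object, alternating object
 * (data) pages with guard (fence) pages, and kfence_metadata[n] describes
 * the n-th data page in __kfence_pool.
 */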

/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */

/*
 * The static key used to set up a KFENCE allocation; if static keys are not
 * used to gate allocations, it still avoids a load and compare while KFENCE is
 * disabled.
 */
DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);

/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
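
/*
 * The sample timer (set up later in this file) re-opens the gate by resetting
 * it to 0 once per sample interval; the first allocation to pass the gate
 * closes it again, so at most one KFENCE allocation is attempted per interval.
 */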

/*
 * A Counting Bloom filter of allocation coverage: limits currently covered
 * allocations of the same source filling up the pool.
 *
 * Assuming a range of 15%-85% unique allocations in the pool at any point in
 * time, the parameters below provide a false-positive probability of
 * 0.02-0.33, respectively:
 *
 *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
 */
#define ALLOC_COVERED_HNUM	2
#define ALLOC_COVERED_ORDER	(const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
#define ALLOC_COVERED_SIZE	(1 << ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_HNEXT(h)	hash_32(h, ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
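
/*
 * Worked example, assuming the default CONFIG_KFENCE_NUM_OBJECTS=255 (a
 * Kconfig default, not fixed here): ALLOC_COVERED_ORDER = 7 + 2 = 9, so
 * ALLOC_COVERED_SIZE = 512. At 85% unique allocations (alloc_traces ~= 217),
 * P = (1 - e^(-2 * 217/512))^2 ~= 0.33; at 15% (alloc_traces ~= 38),
 * P ~= 0.02, matching the 0.02-0.33 range quoted above.
 */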

/* Stack depth used to determine uniqueness of an allocation. */
#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)

/*
 * Randomness for stack hashes, making the same collisions across reboots and
 * different machines less likely.
 */
static u32 stack_hash_seed __ro_after_init;

/* Statistics counters for debugfs. */
enum kfence_counter_id {
	KFENCE_COUNTER_ALLOCATED,
	KFENCE_COUNTER_ALLOCS,
	KFENCE_COUNTER_FREES,
	KFENCE_COUNTER_ZOMBIES,
	KFENCE_COUNTER_BUGS,
	KFENCE_COUNTER_SKIP_INCOMPAT,
	KFENCE_COUNTER_SKIP_CAPACITY,
	KFENCE_COUNTER_SKIP_COVERED,
	KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
static const char *const counter_names[] = {
	[KFENCE_COUNTER_ALLOCATED]	= "currently allocated",
	[KFENCE_COUNTER_ALLOCS]		= "total allocations",
	[KFENCE_COUNTER_FREES]		= "total frees",
	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
	[KFENCE_COUNTER_BUGS]		= "total bugs",
	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);

/* === Internals ============================================================ */

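/*
 * Example arithmetic, again assuming CONFIG_KFENCE_NUM_OBJECTS=255 and the
 * default skip_covered_thresh of 75: thresh = (255 * 75) / 100 = 191, so
 * currently covered allocations are skipped once more than 191 objects are
 * allocated out of the pool.
 */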
static inline bool should_skip_covered(void)
{
	unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;

	return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
}

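/*
 * Hash an allocation's stack trace to identify its source for the coverage
 * filter: at most UNIQUE_ALLOC_STACK_DEPTH entries are hashed, and
 * filter_irq_stacks() truncates the trace at the IRQ entry point so frames of
 * the interrupted task do not make otherwise identical traces appear unique.
 */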
static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
{
	num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
	num_entries = filter_irq_stacks(stack_entries, num_entries);
	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
}

/*
 * Adds (or subtracts) count @val for allocation stack trace hash
 * @alloc_stack_hash from Counting Bloom filter.
 */
static void alloc_covered_add(u32 alloc_stack_hash, int val)
{
	int i;

	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
		atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
	}
}