	cpu->last_sample_time = cpu->sample.time;
	cpu->sample.time = ktime_get();
	cpu->sample.aperf = aperf;
	cpu->sample.mperf = mperf;
	cpu->sample.tsc = tsc;
	cpu->sample.aperf -= cpu->prev_aperf;
	cpu->sample.mperf -= cpu->prev_mperf;
	cpu->sample.tsc -= cpu->prev_tsc;

	intel_pstate_calc_busy(cpu);

	cpu->prev_aperf = aperf;
	cpu->prev_mperf = mperf;
	cpu->prev_tsc = tsc;
}

static inline void intel_hwp_set_sample_time(struct cpudata *cpu)
{
	int delay;

	delay = msecs_to_jiffies(50);
	mod_timer_pinned(&cpu->timer, jiffies + delay);
}

static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
{
	int delay;

	delay = msecs_to_jiffies(pid_params.sample_rate_ms);
	mod_timer_pinned(&cpu->timer, jiffies + delay);
}

static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
{
	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
	s64 duration_us;
	u32 sample_time;

	/*
	 * core_busy is the ratio of actual performance to max
	 * max_pstate is the max non turbo pstate available
	 * current_pstate was the pstate that was requested during
	 * 	the last sample period.
	 *
	 * We normalize core_busy, which was our actual percent
	 * performance to what we requested during the last sample
	 * period. The result will be a percentage of busy at a
	 * specified pstate.
	 */
	core_busy = cpu->sample.core_pct_busy;
	max_pstate = int_tofp(cpu->pstate.max_pstate_physical);
	current_pstate = int_tofp(cpu->pstate.current_pstate);
	core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
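
	/*
	 * Illustrative example (assumed numbers, not from the source): with
	 * core_pct_busy = 50, max_pstate_physical = 20 and current_pstate = 10,
	 * the scaled value is 50 * (20 / 10) = 100, i.e. the core was fully
	 * busy at the P-state that was actually requested.
	 */
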
	/*
	 * Since we have a deferred timer, it will not fire unless
	 * we are in C0.  So, determine if the actual elapsed time
	 * is significantly greater (3x) than our sample interval.  If it
	 * is, then we were idle for a long enough period of time
	 * to adjust our busyness.
	 */
	sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC;
	duration_us = ktime_us_delta(cpu->sample.time,
				     cpu->last_sample_time);
	if (duration_us > sample_time * 3) {
		sample_ratio = div_fp(int_tofp(sample_time),
				      int_tofp(duration_us));
		core_busy = mul_fp(core_busy, sample_ratio);
	}
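
	/*
	 * Worked example (assumed numbers): with sample_rate_ms = 10,
	 * sample_time is 10000 us.  If 40000 us elapsed because the CPU sat
	 * idle, sample_ratio = 10000 / 40000 = 0.25 (in fixed point) and
	 * core_busy is scaled down to a quarter of its previous value.
	 */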

	return core_busy;
}

static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
{
	int32_t busy_scaled;
	struct _pid *pid;
	signed int ctl;
	int from;
	struct sample *sample;

	from = cpu->pstate.current_pstate;

	pid = &cpu->pid;
	busy_scaled = intel_pstate_get_scaled_busy(cpu);

	ctl = pid_calc(pid, busy_scaled);
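	/*
	 * pid_calc() applies a fixed-point PID law: the error is
	 * pid_params.setpoint minus busy_scaled, errors inside the deadband
	 * count as zero, and ctl combines the P, I and D gains applied to
	 * that error.  This is only a rough description of the control law;
	 * see pid_calc() for the exact arithmetic.
	 */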

	/* Negative values of ctl increase the pstate and vice versa */
	intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl, true);

	sample = &cpu->sample;
	trace_pstate_sample(fp_toint(sample->core_pct_busy),
		fp_toint(busy_scaled),
		from,
		cpu->pstate.current_pstate,
		sample->mperf,
		sample->aperf,
		sample->tsc,
		sample->freq);
}

static void intel_hwp_timer_func(unsigned long __data)
{
	struct cpudata *cpu = (struct cpudata *) __data;

	intel_pstate_sample(cpu);
	intel_hwp_set_sample_time(cpu);
}
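
/*
 * With HWP active the hardware selects P-states on its own, so the timer
 * callback above only refreshes the aperf/mperf sample (used for frequency
 * reporting via intel_pstate_get()) and re-arms at the slower 50 ms rate.
 */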

static void intel_pstate_timer_func(unsigned long __data)
{
	struct cpudata *cpu = (struct cpudata *) __data;

	intel_pstate_sample(cpu);
	intel_pstate_adjust_busy_pstate(cpu);
	intel_pstate_set_sample_time(cpu);
}

#define ICPU(model, policy) \
	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
			(unsigned long)&policy }
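
/*
 * The hex values below are Intel Family 6 model numbers: 0x2a/0x2d are
 * Sandy Bridge, 0x3c Haswell and 0x4e/0x5e Skylake, for example, while
 * 0x37 (Bay Trail) uses byt_params and 0x57 (Knights Landing) uses
 * knl_params.
 */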

static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
	ICPU(0x2a, core_params),
	ICPU(0x2d, core_params),
	ICPU(0x37, byt_params),
	ICPU(0x3a, core_params),
	ICPU(0x3c, core_params),
	ICPU(0x3d, core_params),
	ICPU(0x3e, core_params),
	ICPU(0x3f, core_params),
	ICPU(0x45, core_params),
	ICPU(0x46, core_params),
	ICPU(0x47, core_params),
	ICPU(0x4e, core_params),
	ICPU(0x4f, core_params),
	ICPU(0x5e, core_params),
	ICPU(0x56, core_params),
	ICPU(0x57, knl_params),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);

static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] = {
	ICPU(0x56, core_params),
	{}
};

static int intel_pstate_init_cpu(unsigned int cpunum)
{
	struct cpudata *cpu;

	if (!all_cpu_data[cpunum])
		all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata),
					       GFP_KERNEL);
	if (!all_cpu_data[cpunum])
		return -ENOMEM;

	cpu = all_cpu_data[cpunum];

	cpu->cpu = cpunum;

	if (hwp_active)
		intel_pstate_hwp_enable(cpu);

	intel_pstate_get_cpu_pstates(cpu);
	init_timer_deferrable(&cpu->timer);
	cpu->timer.data = (unsigned long)cpu;
	cpu->timer.expires = jiffies + HZ/100;

	if (!hwp_active)
		cpu->timer.function = intel_pstate_timer_func;
	else
		cpu->timer.function = intel_hwp_timer_func;

	intel_pstate_busy_pid_reset(cpu);
	intel_pstate_sample(cpu);

	add_timer_on(&cpu->timer, cpunum);

	pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);

	return 0;
}

static unsigned int intel_pstate_get(unsigned int cpu_num)
{
	struct sample *sample;
	struct cpudata *cpu;

	cpu = all_cpu_data[cpu_num];
	if (!cpu)
		return 0;
	sample = &cpu->sample;
	return sample->freq;
}

static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
#if IS_ENABLED(CONFIG_ACPI)
	struct cpudata *cpu;
	int i;
#endif
	pr_debug("intel_pstate: %s max %u policy->max %u\n", __func__,
		 policy->cpuinfo.max_freq, policy->max);
	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE &&
	    policy->max >= policy->cpuinfo.max_freq) {
		limits.min_perf_pct = 100;
		limits.min_perf = int_tofp(1);
		limits.max_perf_pct = 100;
		limits.max_perf = int_tofp(1);
		limits.max_perf_ctl = 0;
		limits.min_perf_ctl = 0;
		return 0;
	}

	limits.min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
	limits.min_policy_pct = clamp_t(int, limits.min_policy_pct, 0 , 100);
	limits.max_policy_pct = (policy->max * 100) / policy->cpuinfo.max_freq;
	limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0 , 100);

	/* Normalize user input to [min_policy_pct, max_policy_pct] */
	limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
	limits.min_perf_pct = min(limits.max_policy_pct, limits.min_perf_pct);
	limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
	limits.max_perf_pct = max(limits.min_policy_pct, limits.max_perf_pct);

	/* Make sure min_perf_pct <= max_perf_pct */
	limits.min_perf_pct = min(limits.max_perf_pct, limits.min_perf_pct);
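
	/*
	 * Example with assumed numbers: policy->min = 1600000, policy->max =
	 * 2400000 and cpuinfo.max_freq = 3200000 give min_policy_pct = 50 and
	 * max_policy_pct = 75; with the sysfs limits at their 0/100 defaults,
	 * min_perf_pct ends up as 50 and max_perf_pct as 75.
	 */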

	limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
	limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
#if IS_ENABLED(CONFIG_ACPI)
	cpu = all_cpu_data[policy->cpu];
	for (i = 0; i < cpu->acpi_perf_data.state_count; i++) {
		int control;

		control = convert_to_native_pstate_format(cpu, i);
		if (control * cpu->pstate.scaling == policy->max)
			limits.max_perf_ctl = control;
		if (control * cpu->pstate.scaling == policy->min)
			limits.min_perf_ctl = control;
	}

	pr_debug("intel_pstate: max %u policy_max %u perf_ctl [0x%x-0x%x]\n",
		 policy->cpuinfo.max_freq, policy->max, limits.min_perf_ctl,
		 limits.max_perf_ctl);
#endif

	if (hwp_active)
		intel_pstate_hwp_set();

	return 0;
}

static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
{
	cpufreq_verify_within_cpu_limits(policy);

	if (policy->policy != CPUFREQ_POLICY_POWERSAVE &&
	    policy->policy != CPUFREQ_POLICY_PERFORMANCE)
		return -EINVAL;

	return 0;
}

static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
{
	int cpu_num = policy->cpu;
	struct cpudata *cpu = all_cpu_data[cpu_num];

	pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
	del_timer_sync(&all_cpu_data[cpu_num]->timer);
	if (hwp_active)
		return;

	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false);
}

static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
{
	struct cpudata *cpu;
	int rc;

	rc = intel_pstate_init_cpu(policy->cpu);
	if (rc)
		return rc;

	cpu = all_cpu_data[policy->cpu];

	if (limits.min_perf_pct == 100 && limits.max_perf_pct == 100)
		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
	else
		policy->policy = CPUFREQ_POLICY_POWERSAVE;

	policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
	policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;

	/* cpuinfo and default policy values */
	policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
	policy->cpuinfo.max_freq =
		cpu->pstate.turbo_pstate * cpu->pstate.scaling;
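
	/*
	 * Illustrative numbers (assumed): with min_pstate = 8, turbo_pstate =
	 * 32 and a scaling factor of 100000 kHz per P-state step (as on core
	 * parts), the policy and cpuinfo range work out to 800 MHz to 3.2 GHz.
	 */
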
	if (!no_acpi_perf)
		intel_pstate_init_perf_limits(policy);
	/*
	 * If there is no ACPI perf data, or evaluating it fails, ignore it and
	 * fall back to the P-state limits calculated by this driver; this is
	 * not a fatal error.
	 */
	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
	cpumask_set_cpu(policy->cpu, policy->cpus);

	return 0;
}

static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
{
	return intel_pstate_exit_perf_limits(policy);
}

static struct cpufreq_driver intel_pstate_driver = {
	.flags		= CPUFREQ_CONST_LOOPS,
	.verify		= intel_pstate_verify_policy,
	.setpolicy	= intel_pstate_set_policy,
	.get		= intel_pstate_get,
	.init		= intel_pstate_cpu_init,
	.exit		= intel_pstate_cpu_exit,
	.stop_cpu	= intel_pstate_stop_cpu,
	.name		= "intel_pstate",
};

static int __initdata no_load;
static int __initdata no_hwp;
static int __initdata hwp_only;
static unsigned int force_load;

static int intel_pstate_msrs_not_valid(void)
{
	if (!pstate_funcs.get_max() ||
	    !pstate_funcs.get_min() ||
	    !pstate_funcs.get_turbo())
		return -ENODEV;

	return 0;
}

static void copy_pid_params(struct pstate_adjust_policy *policy)
{
	pid_params.sample_rate_ms = policy->sample_rate_ms;
	pid_params.p_gain_pct = policy->p_gain_pct;
	pid_params.i_gain_pct = policy->i_gain_pct;
	pid_params.d_gain_pct = policy->d_gain_pct;
	pid_params.deadband = policy->deadband;
	pid_params.setpoint = policy->setpoint;
}

static void copy_cpu_funcs(struct pstate_funcs *funcs)
{
	pstate_funcs.get_max   = funcs->get_max;
	pstate_funcs.get_max_physical = funcs->get_max_physical;
	pstate_funcs.get_min   = funcs->get_min;
	pstate_funcs.get_turbo = funcs->get_turbo;
	pstate_funcs.get_scaling = funcs->get_scaling;
	pstate_funcs.set       = funcs->set;
	pstate_funcs.get_vid   = funcs->get_vid;
}

#if IS_ENABLED(CONFIG_ACPI)

static bool intel_pstate_no_acpi_pss(void)
{
	int i;

	for_each_possible_cpu(i) {
		acpi_status status;
		union acpi_object *pss;
		struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
		struct acpi_processor *pr = per_cpu(processors, i);

		if (!pr)
			continue;

		status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
		if (ACPI_FAILURE(status))
			continue;

		pss = buffer.pointer;
		if (pss && pss->type == ACPI_TYPE_PACKAGE) {
			kfree(pss);
			return false;
		}

		kfree(pss);
	}

	return true;
}

static bool intel_pstate_has_acpi_ppc(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct acpi_processor *pr = per_cpu(processors, i);

		if (!pr)
			continue;
		if (acpi_has_method(pr->handle, "_PPC"))
			return true;
	}
	return false;
}

enum {
	PSS,
	PPC,
};

struct hw_vendor_info {
	u16  valid;
	char oem_id[ACPI_OEM_ID_SIZE];
	char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
	int  oem_pwr_table;
};

/* Hardware vendor-specific info that has its own power management modes */
static struct hw_vendor_info vendor_info[] = {
	{1, "HP    ", "ProLiant", PSS},
	{1, "ORACLE", "X4-2    ", PPC},
	{1, "ORACLE", "X4-2L   ", PPC},
	{1, "ORACLE", "X4-2B   ", PPC},
	{1, "ORACLE", "X3-2    ", PPC},
	{1, "ORACLE", "X3-2L   ", PPC},
	{1, "ORACLE", "X3-2B   ", PPC},
	{1, "ORACLE", "X4470M2 ", PPC},
	{1, "ORACLE", "X4270M3 ", PPC},
	{1, "ORACLE", "X4270M2 ", PPC},
	{1, "ORACLE", "X4170M2 ", PPC},
	{1, "ORACLE", "X4170 M3", PPC},
	{1, "ORACLE", "X4275 M3", PPC},
	{1, "ORACLE", "X6-2    ", PPC},
	{1, "ORACLE", "Sudbury ", PPC},
	{0, "", ""},
};

static bool intel_pstate_platform_pwr_mgmt_exists(void)
{
	struct acpi_table_header hdr;
	struct hw_vendor_info *v_info;
	const struct x86_cpu_id *id;
	u64 misc_pwr;

	id = x86_match_cpu(intel_pstate_cpu_oob_ids);
	if (id) {
		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
		if (misc_pwr & (1 << 8))
			return true;
	}
	if (acpi_disabled ||
	    ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr)))
		return false;

	for (v_info = vendor_info; v_info->valid; v_info++) {
		if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
			!strncmp(hdr.oem_table_id, v_info->oem_table_id,
						ACPI_OEM_TABLE_ID_SIZE))
			switch (v_info->oem_pwr_table) {
			case PSS:
				return intel_pstate_no_acpi_pss();
			case PPC:
				return intel_pstate_has_acpi_ppc() &&
					(!force_load);
			}
	}

	return false;
}
#else /* CONFIG_ACPI not enabled */
static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
#endif /* CONFIG_ACPI */

static int __init intel_pstate_init(void)
{
	int cpu, rc = 0;
	const struct x86_cpu_id *id;
	struct cpu_defaults *cpu_def;

	if (no_load)
		return -ENODEV;

	id = x86_match_cpu(intel_pstate_cpu_ids);
	if (!id)
		return -ENODEV;

	/*
	 * The Intel pstate driver will be ignored if the platform
	 * firmware has its own power management modes.
	 */
	if (intel_pstate_platform_pwr_mgmt_exists())
		return -ENODEV;

	cpu_def = (struct cpu_defaults *)id->driver_data;
	copy_pid_params(&cpu_def->pid_policy);
	copy_cpu_funcs(&cpu_def->funcs);
	if (intel_pstate_msrs_not_valid())
		return -ENODEV;

	pr_info("Intel P-state driver initializing.\n");

	all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
	if (!all_cpu_data)
		return -ENOMEM;

	if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp)
		hwp_active++;

	if (!hwp_active && hwp_only)
		goto out;

	rc = cpufreq_register_driver(&intel_pstate_driver);
	if (rc)
		goto out;

	intel_pstate_debug_expose_params();
	intel_pstate_sysfs_expose_params();

	return rc;
out:
	get_online_cpus();
	for_each_online_cpu(cpu) {
		if (all_cpu_data[cpu]) {
			del_timer_sync(&all_cpu_data[cpu]->timer);
			kfree(all_cpu_data[cpu]);
		}
	}

	put_online_cpus();
	vfree(all_cpu_data);
	return -ENODEV;
}
device_initcall(intel_pstate_init);

static int __init intel_pstate_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "disable"))
		no_load = 1;
	if (!strcmp(str, "no_hwp"))
		no_hwp = 1;
	if (!strcmp(str, "force"))
		force_load = 1;
	if (!strcmp(str, "hwp_only"))
		hwp_only = 1;
	if (!strcmp(str, "no_acpi"))
		no_acpi_perf = 1;

	return 0;
}
early_param("intel_pstate", intel_pstate_setup);
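
/*
 * Usage example: booting with "intel_pstate=disable" on the kernel command
 * line sets no_load, so intel_pstate_init() bails out before registering the
 * driver; "intel_pstate=no_hwp" loads the driver but leaves HWP disabled.
 */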

MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
MODULE_DESCRIPTION("'intel_pstate' - P state driver for Intel Core processors");
MODULE_LICENSE("GPL");