From 78ce6dbfa317c85cfb8a8dc0b5ee588645c795c1 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 16 Sep 2015 22:24:46 +0200 Subject: [PATCH 01/11] cpufreq: integrator: Fix module autoload for OF platform driver This platform driver has a OF device ID table but the OF module alias information is not created so module autoloading won't work. Signed-off-by: Luis de Bethencourt Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/integrator-cpufreq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c index 2faa4216bf2a..79e3ff2771a6 100644 --- a/drivers/cpufreq/integrator-cpufreq.c +++ b/drivers/cpufreq/integrator-cpufreq.c @@ -221,6 +221,8 @@ static const struct of_device_id integrator_cpufreq_match[] = { { }, }; +MODULE_DEVICE_TABLE(of, integrator_cpufreq_match); + static struct platform_driver integrator_cpufreq_driver = { .driver = { .name = "integrator-cpufreq", From 03d5eec000973e80b1a1ccdef16ed8206621c3e4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 8 Sep 2015 07:10:34 +0530 Subject: [PATCH 02/11] cpufreq: conservative: remove 'enable' field Conservative governor has its own 'enable' field to check if conservative governor is used for a CPU or not This can be checked by policy->governor with 'cpufreq_gov_conservative' and so this field can be dropped. Because its not guaranteed that dbs_info->cdbs.shared will a valid pointer for all CPUs (will be NULL for CPUs that don't use ondemand/conservative governors), we can't use it anymore. Lets get policy with cpufreq_cpu_get_raw() instead. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_conservative.c | 31 +++++++++++++++----------- drivers/cpufreq/cpufreq_governor.c | 12 +--------- drivers/cpufreq/cpufreq_governor.h | 1 - 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 84a1506950a7..1fa1deb6e91f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -23,6 +23,19 @@ static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info); +static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, + unsigned int event); + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE +static +#endif +struct cpufreq_governor cpufreq_gov_conservative = { + .name = "conservative", + .governor = cs_cpufreq_governor_dbs, + .max_transition_latency = TRANSITION_LATENCY_LIMIT, + .owner = THIS_MODULE, +}; + static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, struct cpufreq_policy *policy) { @@ -119,12 +132,14 @@ static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_freqs *freq = data; struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); - struct cpufreq_policy *policy; + struct cpufreq_policy *policy = cpufreq_cpu_get_raw(freq->cpu); - if (!dbs_info->enable) + if (!policy) return 0; - policy = dbs_info->cdbs.shared->policy; + /* policy isn't governed by conservative governor */ + if (policy->governor != &cpufreq_gov_conservative) + return 0; /* * we only care if our internally tracked freq moves outside the 'valid' @@ -367,16 +382,6 @@ static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, return cpufreq_governor_dbs(policy, &cs_dbs_cdata, event); } -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE -static -#endif -struct cpufreq_governor cpufreq_gov_conservative = { - .name = "conservative", - .governor = cs_cpufreq_governor_dbs, - .max_transition_latency = TRANSITION_LATENCY_LIMIT, - .owner = THIS_MODULE, -}; - static int __init cpufreq_gov_dbs_init(void) { return cpufreq_register_governor(&cpufreq_gov_conservative); diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 939197ffa4ac..750626d8fb03 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -463,7 +463,6 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, cdata->get_cpu_dbs_info_s(cpu); cs_dbs_info->down_skip = 0; - cs_dbs_info->enable = 1; cs_dbs_info->requested_freq = policy->cur; } else { struct od_ops *od_ops = cdata->gov_ops; @@ -482,9 +481,7 @@ static int cpufreq_governor_start(struct cpufreq_policy *policy, static int cpufreq_governor_stop(struct cpufreq_policy *policy, struct dbs_data *dbs_data) { - struct common_dbs_data *cdata = dbs_data->cdata; - unsigned int cpu = policy->cpu; - struct cpu_dbs_info *cdbs = cdata->get_cpu_cdbs(cpu); + struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(policy->cpu); struct cpu_common_dbs_info *shared = cdbs->shared; /* State should be equivalent to START */ @@ -493,13 +490,6 @@ static int cpufreq_governor_stop(struct cpufreq_policy *policy, gov_cancel_work(dbs_data, policy); - if (cdata->governor == GOV_CONSERVATIVE) { - struct cs_cpu_dbs_info_s *cs_dbs_info = - cdata->get_cpu_dbs_info_s(cpu); - - cs_dbs_info->enable = 0; - } - shared->policy = NULL; mutex_destroy(&shared->timer_mutex); return 0; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 50f171796632..5621bb03e874 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -170,7 +170,6 @@ struct cs_cpu_dbs_info_s { struct cpu_dbs_info cdbs; unsigned int down_skip; unsigned int requested_freq; - unsigned int enable:1; }; /* Per policy Governors sysfs tunables */ From febf63cf616083e9a4ee05e2c28163b074890f36 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 8 Sep 2015 19:13:23 +0200 Subject: [PATCH 03/11] cpufreq: tegra20: remove superfluous CONFIG_PM ifdefs CONFIG_PM ifdefs are superfluous and can be removed. Signed-off-by: Bartlomiej Zolnierkiewicz Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/tegra20-cpufreq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c index 8084c7f7e206..2bd62845e9d5 100644 --- a/drivers/cpufreq/tegra20-cpufreq.c +++ b/drivers/cpufreq/tegra20-cpufreq.c @@ -175,9 +175,7 @@ static struct cpufreq_driver tegra_cpufreq_driver = { .exit = tegra_cpu_exit, .name = "tegra", .attr = cpufreq_generic_attr, -#ifdef CONFIG_PM .suspend = cpufreq_generic_suspend, -#endif }; static int __init tegra_cpufreq_init(void) From a35fc5a33b62a6c658b8ffe7544c966c1776d128 Mon Sep 17 00:00:00 2001 From: Bai Ping Date: Fri, 11 Sep 2015 23:41:05 +0800 Subject: [PATCH 04/11] cpufreq: imx: update the clock switch flow to support imx6ul For i.MX6UL, the clock switch flow is slightly different from other i.MX6 SOCs. It has a 'secondary_sel' clk that will be used when the CPU freq is higher than 396MHz. So the clock switch flow in 'set_target' callback need to update to support i.MX6UL in the common i.MX6 SOC cpufreq driver. Signed-off-by: Bai Ping Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/imx6q-cpufreq.c | 50 +++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index 380a90d3c57e..9b4a7bd04dea 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -30,6 +30,10 @@ static struct clk *pll1_sw_clk; static struct clk *step_clk; static struct clk *pll2_pfd2_396m_clk; +/* clk used by i.MX6UL */ +static struct clk *pll2_bus_clk; +static struct clk *secondary_sel_clk; + static struct device *cpu_dev; static bool free_opp; static struct cpufreq_frequency_table *freq_table; @@ -91,16 +95,36 @@ static int imx6q_set_target(struct cpufreq_policy *policy, unsigned int index) * The setpoints are selected per PLL/PDF frequencies, so we need to * reprogram PLL for frequency scaling. The procedure of reprogramming * PLL1 is as below. - * + * For i.MX6UL, it has a secondary clk mux, the cpu frequency change + * flow is slightly different from other i.MX6 OSC. + * The cpu frequeny change flow for i.MX6(except i.MX6UL) is as below: * - Enable pll2_pfd2_396m_clk and reparent pll1_sw_clk to it * - Reprogram pll1_sys_clk and reparent pll1_sw_clk back to it * - Disable pll2_pfd2_396m_clk */ - clk_set_parent(step_clk, pll2_pfd2_396m_clk); - clk_set_parent(pll1_sw_clk, step_clk); - if (freq_hz > clk_get_rate(pll2_pfd2_396m_clk)) { - clk_set_rate(pll1_sys_clk, new_freq * 1000); + if (of_machine_is_compatible("fsl,imx6ul")) { + /* + * When changing pll1_sw_clk's parent to pll1_sys_clk, + * CPU may run at higher than 528MHz, this will lead to + * the system unstable if the voltage is lower than the + * voltage of 528MHz, so lower the CPU frequency to one + * half before changing CPU frequency. + */ + clk_set_rate(arm_clk, (old_freq >> 1) * 1000); clk_set_parent(pll1_sw_clk, pll1_sys_clk); + if (freq_hz > clk_get_rate(pll2_pfd2_396m_clk)) + clk_set_parent(secondary_sel_clk, pll2_bus_clk); + else + clk_set_parent(secondary_sel_clk, pll2_pfd2_396m_clk); + clk_set_parent(step_clk, secondary_sel_clk); + clk_set_parent(pll1_sw_clk, step_clk); + } else { + clk_set_parent(step_clk, pll2_pfd2_396m_clk); + clk_set_parent(pll1_sw_clk, step_clk); + if (freq_hz > clk_get_rate(pll2_pfd2_396m_clk)) { + clk_set_rate(pll1_sys_clk, new_freq * 1000); + clk_set_parent(pll1_sw_clk, pll1_sys_clk); + } } /* Ensure the arm clock divider is what we expect */ @@ -186,6 +210,16 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) goto put_clk; } + if (of_machine_is_compatible("fsl,imx6ul")) { + pll2_bus_clk = clk_get(cpu_dev, "pll2_bus"); + secondary_sel_clk = clk_get(cpu_dev, "secondary_sel"); + if (IS_ERR(pll2_bus_clk) || IS_ERR(secondary_sel_clk)) { + dev_err(cpu_dev, "failed to get clocks specific to imx6ul\n"); + ret = -ENOENT; + goto put_clk; + } + } + arm_reg = regulator_get(cpu_dev, "arm"); pu_reg = regulator_get_optional(cpu_dev, "pu"); soc_reg = regulator_get(cpu_dev, "soc"); @@ -331,6 +365,10 @@ put_clk: clk_put(step_clk); if (!IS_ERR(pll2_pfd2_396m_clk)) clk_put(pll2_pfd2_396m_clk); + if (!IS_ERR(pll2_bus_clk)) + clk_put(pll2_bus_clk); + if (!IS_ERR(secondary_sel_clk)) + clk_put(secondary_sel_clk); of_node_put(np); return ret; } @@ -350,6 +388,8 @@ static int imx6q_cpufreq_remove(struct platform_device *pdev) clk_put(pll1_sw_clk); clk_put(step_clk); clk_put(pll2_pfd2_396m_clk); + clk_put(pll2_bus_clk); + clk_put(secondary_sel_clk); return 0; } From d43b1b6f8e5b3219d4d02ea7fadcf67fecf78b1a Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Mon, 14 Sep 2015 14:01:47 +0530 Subject: [PATCH 05/11] cpufreq : powernv: Report Pmax throttling if capped below nominal frequency Log a 'critical' message if the max frequency is reduced below nominal frequency. We already log 'info' message if the max frequency is capped below turbo frequency. CPU should guarantee atleast nominal frequency, but not turbo frequency in all system configurations and environments. So report the pmax throttling with severity when Pmax is dipped below nominal frequency. Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/powernv-cpufreq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 64994e10638e..cb501386eb6e 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -327,8 +327,14 @@ static void powernv_cpufreq_throttle_check(void *data) if (chips[i].throttled) goto next; chips[i].throttled = true; - pr_info("CPU %d on Chip %u has Pmax reduced to %d\n", cpu, - chips[i].id, pmsr_pmax); + if (pmsr_pmax < powernv_pstate_info.nominal) + pr_crit("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", + cpu, chips[i].id, pmsr_pmax, + powernv_pstate_info.nominal); + else + pr_info("CPU %d on Chip %u has Pmax reduced below turbo frequency (%d < %d)\n", + cpu, chips[i].id, pmsr_pmax, + powernv_pstate_info.max); } else if (chips[i].throttled) { chips[i].throttled = false; pr_info("CPU %d on Chip %u has Pmax restored to %d\n", cpu, From e625742f9c6e5d01b5c46efcc2870893735badf3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 13 Oct 2015 10:57:13 +0530 Subject: [PATCH 06/11] cpufreq: Drop redundant check for inactive policies We just made sure policy->cpu is online and this check will always fail as the policy is active. Drop it. Signed-off-by: Viresh Kumar Acked-by: Saravana Kannan Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 25c4c15103a0..8701dc559850 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -843,18 +843,11 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, down_write(&policy->rwsem); - /* Updating inactive policies is invalid, so avoid doing that. */ - if (unlikely(policy_is_inactive(policy))) { - ret = -EBUSY; - goto unlock_policy_rwsem; - } - if (fattr->store) ret = fattr->store(policy, buf, count); else ret = -EIO; -unlock_policy_rwsem: up_write(&policy->rwsem); unlock: put_online_cpus(); From 6a35fc2d6c22bafe45117cdc5d8cee332244edbb Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:11:59 -0700 Subject: [PATCH 07/11] cpufreq: intel_pstate: get P1 from TAR when available After Ivybridge, the max non turbo ratio obtained from platform info msr is not always guaranteed P1 on client platforms. The max non turbo activation ratio (TAR), determines the max for the current level of TDP. The ratio in platform info is physical max. The TAR MSR can be locked, so updating this value is not possible on all platforms. This change gets this ratio from MSR TURBO_ACTIVATION_RATIO if available, but also do some sanity checking to make sure that this value is correct. The sanity check involves reading the TDP ratio for the current tdp control value when platform has configurable TDP present and matching TAC with this. Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 7 ++++++ drivers/cpufreq/intel_pstate.c | 39 ++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b8c14bb7fc8f..9f3905697f12 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -206,6 +206,13 @@ #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3af9dd7332e6..da92d0201d52 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -43,7 +43,6 @@ #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) #define fp_toint(X) ((X) >> FRAC_BITS) - static inline int32_t mul_fp(int32_t x, int32_t y) { return ((int64_t)x * (int64_t)y) >> FRAC_BITS; @@ -593,10 +592,42 @@ static int core_get_min_pstate(void) static int core_get_max_pstate(void) { - u64 value; + u64 tar; + u64 plat_info; + int max_pstate; + int err; - rdmsrl(MSR_PLATFORM_INFO, value); - return (value >> 8) & 0xFF; + rdmsrl(MSR_PLATFORM_INFO, plat_info); + max_pstate = (plat_info >> 8) & 0xFF; + + err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); + if (!err) { + /* Do some sanity checking for safety */ + if (plat_info & 0x600000000) { + u64 tdp_ctrl; + u64 tdp_ratio; + int tdp_msr; + + err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); + if (err) + goto skip_tar; + + tdp_msr = MSR_CONFIG_TDP_NOMINAL + tdp_ctrl; + err = rdmsrl_safe(tdp_msr, &tdp_ratio); + if (err) + goto skip_tar; + + if (tdp_ratio - 1 == tar) { + max_pstate = tar; + pr_debug("max_pstate=TAC %x\n", max_pstate); + } else { + goto skip_tar; + } + } + } + +skip_tar: + return max_pstate; } static int core_get_turbo_pstate(void) From 3bcc6fa971c06151d6bf90cb0dc80807f71b93f6 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:12:00 -0700 Subject: [PATCH 08/11] cpufreq: intel-pstate: Use separate max pstate for scaling Systems with configurable TDP have multiple max non turbo p state. Intel P state uses max non turbo P state for scaling. But using the real max non turbo p state causes underestimation of next P state. So using the physical max non turbo P state as before for scaling. Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index da92d0201d52..1369afdc1e19 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -77,6 +77,7 @@ struct pstate_data { int current_pstate; int min_pstate; int max_pstate; + int max_pstate_physical; int scaling; int turbo_pstate; }; @@ -126,6 +127,7 @@ struct pstate_adjust_policy { struct pstate_funcs { int (*get_max)(void); + int (*get_max_physical)(void); int (*get_min)(void); int (*get_turbo)(void); int (*get_scaling)(void); @@ -590,6 +592,14 @@ static int core_get_min_pstate(void) return (value >> 40) & 0xFF; } +static int core_get_max_pstate_physical(void) +{ + u64 value; + + rdmsrl(MSR_PLATFORM_INFO, value); + return (value >> 8) & 0xFF; +} + static int core_get_max_pstate(void) { u64 tar; @@ -683,6 +693,7 @@ static struct cpu_defaults core_params = { }, .funcs = { .get_max = core_get_max_pstate, + .get_max_physical = core_get_max_pstate_physical, .get_min = core_get_min_pstate, .get_turbo = core_get_turbo_pstate, .get_scaling = core_get_scaling, @@ -701,6 +712,7 @@ static struct cpu_defaults byt_params = { }, .funcs = { .get_max = byt_get_max_pstate, + .get_max_physical = byt_get_max_pstate, .get_min = byt_get_min_pstate, .get_turbo = byt_get_turbo_pstate, .set = byt_set_pstate, @@ -720,6 +732,7 @@ static struct cpu_defaults knl_params = { }, .funcs = { .get_max = core_get_max_pstate, + .get_max_physical = core_get_max_pstate_physical, .get_min = core_get_min_pstate, .get_turbo = knl_get_turbo_pstate, .get_scaling = core_get_scaling, @@ -774,6 +787,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) { cpu->pstate.min_pstate = pstate_funcs.get_min(); cpu->pstate.max_pstate = pstate_funcs.get_max(); + cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); cpu->pstate.scaling = pstate_funcs.get_scaling(); @@ -792,7 +806,8 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu) sample->freq = fp_toint( mul_fp(int_tofp( - cpu->pstate.max_pstate * cpu->pstate.scaling / 100), + cpu->pstate.max_pstate_physical * + cpu->pstate.scaling / 100), core_pct)); sample->core_pct_busy = (int32_t)core_pct; @@ -860,7 +875,7 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) * specified pstate. */ core_busy = cpu->sample.core_pct_busy; - max_pstate = int_tofp(cpu->pstate.max_pstate); + max_pstate = int_tofp(cpu->pstate.max_pstate_physical); current_pstate = int_tofp(cpu->pstate.current_pstate); core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); @@ -1144,6 +1159,7 @@ static void copy_pid_params(struct pstate_adjust_policy *policy) static void copy_cpu_funcs(struct pstate_funcs *funcs) { pstate_funcs.get_max = funcs->get_max; + pstate_funcs.get_max_physical = funcs->get_max_physical; pstate_funcs.get_min = funcs->get_min; pstate_funcs.get_turbo = funcs->get_turbo; pstate_funcs.get_scaling = funcs->get_scaling; From 37afb00032424d684a48d649fcfb8b5e4f17c409 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:12:01 -0700 Subject: [PATCH 09/11] cpufreq: intel_pstate: Use ACPI perf configuration Use ACPI _PSS to limit the Intel P State turbo, max and min ratios. This driver uses acpi processor perf lib calls to register performance. The following logic is used to adjust Intel P state driver limits: - If there is no turbo entry in _PSS, then disable Intel P state turbo and limit to non turbo max - If the non turbo max ratio is more than _PSS max non turbo value, then set the max non turbo ratio to _PSS non turbo max - If the min ratio is less than _PSS min then change the min ratio matching _PSS min - Scale the _PSS turbo frequency to max turbo frequency based on control value. This feature can be disabled by using kernel parameters: intel_pstate=no_acpi Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.x86 | 1 + drivers/cpufreq/intel_pstate.c | 171 ++++++++++++++++++++++++++++++++- 2 files changed, 171 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index c59bdcb83217..adbd1de1cea5 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -5,6 +5,7 @@ config X86_INTEL_PSTATE bool "Intel P state control" depends on X86 + select ACPI_PROCESSOR if ACPI help This driver provides a P state for Intel core processors. The driver implements an internal governor and will become diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1369afdc1e19..041cb4107991 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -34,6 +34,10 @@ #include #include +#if IS_ENABLED(CONFIG_ACPI) +#include +#endif + #define BYT_RATIOS 0x66a #define BYT_VIDS 0x66b #define BYT_TURBO_RATIOS 0x66c @@ -113,6 +117,9 @@ struct cpudata { u64 prev_mperf; u64 prev_tsc; struct sample sample; +#if IS_ENABLED(CONFIG_ACPI) + struct acpi_processor_performance acpi_perf_data; +#endif }; static struct cpudata **all_cpu_data; @@ -143,6 +150,7 @@ struct cpu_defaults { static struct pstate_adjust_policy pid_params; static struct pstate_funcs pstate_funcs; static int hwp_active; +static int no_acpi_perf; struct perf_limits { int no_turbo; @@ -170,6 +178,153 @@ static struct perf_limits limits = { .min_sysfs_pct = 0, }; +#if IS_ENABLED(CONFIG_ACPI) +/* + * The max target pstate ratio is a 8 bit value in both PLATFORM_INFO MSR and + * in TURBO_RATIO_LIMIT MSR, which pstate driver stores in max_pstate and + * max_turbo_pstate fields. The PERF_CTL MSR contains 16 bit value for P state + * ratio, out of it only high 8 bits are used. For example 0x1700 is setting + * target ratio 0x17. The _PSS control value stores in a format which can be + * directly written to PERF_CTL MSR. But in intel_pstate driver this shift + * occurs during write to PERF_CTL (E.g. for cores core_set_pstate()). + * This function converts the _PSS control value to intel pstate driver format + * for comparison and assignment. + */ +static int convert_to_native_pstate_format(struct cpudata *cpu, int index) +{ + return cpu->acpi_perf_data.states[index].control >> 8; +} + +static int intel_pstate_init_perf_limits(struct cpufreq_policy *policy) +{ + struct cpudata *cpu; + int ret; + bool turbo_absent = false; + int max_pstate_index; + int min_pss_ctl, max_pss_ctl, turbo_pss_ctl; + int i; + + cpu = all_cpu_data[policy->cpu]; + + pr_debug("intel_pstate: default limits 0x%x 0x%x 0x%x\n", + cpu->pstate.min_pstate, cpu->pstate.max_pstate, + cpu->pstate.turbo_pstate); + + if (!cpu->acpi_perf_data.shared_cpu_map && + zalloc_cpumask_var_node(&cpu->acpi_perf_data.shared_cpu_map, + GFP_KERNEL, cpu_to_node(policy->cpu))) { + return -ENOMEM; + } + + ret = acpi_processor_register_performance(&cpu->acpi_perf_data, + policy->cpu); + if (ret) + return ret; + + /* + * Check if the control value in _PSS is for PERF_CTL MSR, which should + * guarantee that the states returned by it map to the states in our + * list directly. + */ + if (cpu->acpi_perf_data.control_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) + return -EIO; + + pr_debug("intel_pstate: CPU%u - ACPI _PSS perf data\n", policy->cpu); + for (i = 0; i < cpu->acpi_perf_data.state_count; i++) + pr_debug(" %cP%d: %u MHz, %u mW, 0x%x\n", + (i == cpu->acpi_perf_data.state ? '*' : ' '), i, + (u32) cpu->acpi_perf_data.states[i].core_frequency, + (u32) cpu->acpi_perf_data.states[i].power, + (u32) cpu->acpi_perf_data.states[i].control); + + /* + * If there is only one entry _PSS, simply ignore _PSS and continue as + * usual without taking _PSS into account + */ + if (cpu->acpi_perf_data.state_count < 2) + return 0; + + turbo_pss_ctl = convert_to_native_pstate_format(cpu, 0); + min_pss_ctl = convert_to_native_pstate_format(cpu, + cpu->acpi_perf_data.state_count - 1); + /* Check if there is a turbo freq in _PSS */ + if (turbo_pss_ctl <= cpu->pstate.max_pstate && + turbo_pss_ctl > cpu->pstate.min_pstate) { + pr_debug("intel_pstate: no turbo range exists in _PSS\n"); + limits.no_turbo = limits.turbo_disabled = 1; + cpu->pstate.turbo_pstate = cpu->pstate.max_pstate; + turbo_absent = true; + } + + /* Check if the max non turbo p state < Intel P state max */ + max_pstate_index = turbo_absent ? 0 : 1; + max_pss_ctl = convert_to_native_pstate_format(cpu, max_pstate_index); + if (max_pss_ctl < cpu->pstate.max_pstate && + max_pss_ctl > cpu->pstate.min_pstate) + cpu->pstate.max_pstate = max_pss_ctl; + + /* check If min perf > Intel P State min */ + if (min_pss_ctl > cpu->pstate.min_pstate && + min_pss_ctl < cpu->pstate.max_pstate) { + cpu->pstate.min_pstate = min_pss_ctl; + policy->cpuinfo.min_freq = min_pss_ctl * cpu->pstate.scaling; + } + + if (turbo_absent) + policy->cpuinfo.max_freq = cpu->pstate.max_pstate * + cpu->pstate.scaling; + else { + policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate * + cpu->pstate.scaling; + /* + * The _PSS table doesn't contain whole turbo frequency range. + * This just contains +1 MHZ above the max non turbo frequency, + * with control value corresponding to max turbo ratio. But + * when cpufreq set policy is called, it will call with this + * max frequency, which will cause a reduced performance as + * this driver uses real max turbo frequency as the max + * frequeny. So correct this frequency in _PSS table to + * correct max turbo frequency based on the turbo ratio. + * Also need to convert to MHz as _PSS freq is in MHz. + */ + cpu->acpi_perf_data.states[0].core_frequency = + turbo_pss_ctl * 100; + } + + pr_debug("intel_pstate: Updated limits using _PSS 0x%x 0x%x 0x%x\n", + cpu->pstate.min_pstate, cpu->pstate.max_pstate, + cpu->pstate.turbo_pstate); + pr_debug("intel_pstate: policy max_freq=%d Khz min_freq = %d KHz\n", + policy->cpuinfo.max_freq, policy->cpuinfo.min_freq); + + return 0; +} + +static int intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) +{ + struct cpudata *cpu; + + if (!no_acpi_perf) + return 0; + + cpu = all_cpu_data[policy->cpu]; + acpi_processor_unregister_performance(policy->cpu); + return 0; +} + +#else +static int intel_pstate_init_perf_limits(struct cpufreq_policy *policy) +{ + return 0; +} + +static int intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) +{ + return 0; +} +#endif + static inline void pid_reset(struct _pid *pid, int setpoint, int busy, int deadband, int integral) { pid->setpoint = setpoint; @@ -1115,18 +1270,30 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + if (!no_acpi_perf) + intel_pstate_init_perf_limits(policy); + /* + * If there is no acpi perf data or error, we ignore and use Intel P + * state calculated limits, So this is not fatal error. + */ policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; cpumask_set_cpu(policy->cpu, policy->cpus); return 0; } +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +{ + return intel_pstate_exit_perf_limits(policy); +} + static struct cpufreq_driver intel_pstate_driver = { .flags = CPUFREQ_CONST_LOOPS, .verify = intel_pstate_verify_policy, .setpolicy = intel_pstate_set_policy, .get = intel_pstate_get, .init = intel_pstate_cpu_init, + .exit = intel_pstate_cpu_exit, .stop_cpu = intel_pstate_stop_cpu, .name = "intel_pstate", }; @@ -1168,7 +1335,6 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) } #if IS_ENABLED(CONFIG_ACPI) -#include static bool intel_pstate_no_acpi_pss(void) { @@ -1360,6 +1526,9 @@ static int __init intel_pstate_setup(char *str) force_load = 1; if (!strcmp(str, "hwp_only")) hwp_only = 1; + if (!strcmp(str, "no_acpi")) + no_acpi_perf = 1; + return 0; } early_param("intel_pstate", intel_pstate_setup); From 053f56def57bfaef14c97d268ef6bc4ebe952720 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:12:02 -0700 Subject: [PATCH 10/11] Documentation: kernel_parameters for Intel P state driver Added new option "no_acpi" for not using ACPI processor performance control objects in Intel P state driver. Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 22a4b687ea5b..9b75e2a760de 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1546,6 +1546,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. hwp_only Only load intel_pstate on systems which support hardware P state control (HWP) if available. + no_acpi + Don't use ACPI processor performance control objects + _PSS and _PPC specified limits. intremap= [X86-64, Intel-IOMMU] on enable Interrupt Remapping (default) From 4ef45148701917fbc08a7c05bc6a3bb0c0573047 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:12:03 -0700 Subject: [PATCH 11/11] cpufreq: intel_pstate: Avoid calculation for max/min When requested from cpufreq to set policy, look into _pss and get control values, instead of using max/min perf calculations. These calculation misses next control state in boundary conditions. Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 50 ++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 041cb4107991..c568226bbcdf 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -163,6 +163,8 @@ struct perf_limits { int max_sysfs_pct; int min_policy_pct; int min_sysfs_pct; + int max_perf_ctl; + int min_perf_ctl; }; static struct perf_limits limits = { @@ -176,6 +178,8 @@ static struct perf_limits limits = { .max_sysfs_pct = 100, .min_policy_pct = 0, .min_sysfs_pct = 0, + .max_perf_ctl = 0, + .min_perf_ctl = 0, }; #if IS_ENABLED(CONFIG_ACPI) @@ -909,12 +913,23 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) * policy, or by cpu specific default values determined through * experimentation. */ - max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); - *max = clamp_t(int, max_perf_adj, - cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); + if (limits.max_perf_ctl && limits.max_sysfs_pct >= + limits.max_policy_pct) { + *max = limits.max_perf_ctl; + } else { + max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), + limits.max_perf)); + *max = clamp_t(int, max_perf_adj, cpu->pstate.min_pstate, + cpu->pstate.turbo_pstate); + } - min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits.min_perf)); - *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); + if (limits.min_perf_ctl) { + *min = limits.min_perf_ctl; + } else { + min_perf = fp_toint(mul_fp(int_tofp(max_perf), + limits.min_perf)); + *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); + } } static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate, bool force) @@ -1184,6 +1199,12 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) static int intel_pstate_set_policy(struct cpufreq_policy *policy) { +#if IS_ENABLED(CONFIG_ACPI) + struct cpudata *cpu; + int i; +#endif + pr_debug("intel_pstate: %s max %u policy->max %u\n", __func__, + policy->cpuinfo.max_freq, policy->max); if (!policy->cpuinfo.max_freq) return -ENODEV; @@ -1196,6 +1217,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) limits.max_perf_pct = 100; limits.max_perf = int_tofp(1); limits.no_turbo = 0; + limits.max_perf_ctl = 0; + limits.min_perf_ctl = 0; return 0; } @@ -1216,6 +1239,23 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100)); limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100)); +#if IS_ENABLED(CONFIG_ACPI) + cpu = all_cpu_data[policy->cpu]; + for (i = 0; i < cpu->acpi_perf_data.state_count; i++) { + int control; + + control = convert_to_native_pstate_format(cpu, i); + if (control * cpu->pstate.scaling == policy->max) + limits.max_perf_ctl = control; + if (control * cpu->pstate.scaling == policy->min) + limits.min_perf_ctl = control; + } + + pr_debug("intel_pstate: max %u policy_max %u perf_ctl [0x%x-0x%x]\n", + policy->cpuinfo.max_freq, policy->max, limits.min_perf_ctl, + limits.max_perf_ctl); +#endif + if (hwp_active) intel_pstate_hwp_set();