From c998c07836f985b24361629dc98506ec7893e7a0 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Tue, 5 Apr 2016 14:05:38 -0500 Subject: [PATCH 1/7] cpuidle: Indicate when a device has been unregistered Currently the 'registered' member of the cpuidle_device struct is set to 1 during cpuidle_register_device. In this same function there are checks to see if the device is already registered to prevent duplicate calls to register the device, but this value is never set to 0 even on unregister of the device. Because of this, any attempt to call cpuidle_register_device after a call to cpuidle_unregister_device will fail which shouldn't be the case. To prevent this, set registered to 0 when the device is unregistered. Fixes: c878a52d3c7c (cpuidle: Check if device is already registered) Signed-off-by: Dave Gerlach Acked-by: Daniel Lezcano Cc: All applicable Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f996efc56605..c2dd99ab1648 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -433,6 +433,8 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) list_del(&dev->device_list); per_cpu(cpuidle_devices, dev->cpu) = NULL; module_put(drv->owner); + + dev->registered = 0; } static void __cpuidle_device_init(struct cpuidle_device *dev) From 5dcef694860100fd16885f052591b1268b764d21 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Apr 2016 17:00:47 -0400 Subject: [PATCH 2/7] intel_idle: add BXT support Broxton has all the HSW C-states, except C3. BXT C-state timing is slightly different. Here we trust the IRTL MSRs as authority on maximum C-state latency, and override the driver's tables with the values found in the associated IRTL MSRs. Further we set the target_residency to 1x maximum latency, trusting the hardware demotion logic. Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 8 ++ drivers/idle/intel_idle.c | 137 +++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402ef3b84..73d66fa48638 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -162,6 +162,14 @@ #define MSR_PKG_C9_RESIDENCY 0x00000631 #define MSR_PKG_C10_RESIDENCY 0x00000632 +/* Interrupt Response Limit */ +#define MSR_PKGC3_IRTL 0x0000060a +#define MSR_PKGC6_IRTL 0x0000060b +#define MSR_PKGC7_IRTL 0x0000060c +#define MSR_PKGC8_IRTL 0x00000633 +#define MSR_PKGC9_IRTL 0x00000634 +#define MSR_PKGC10_IRTL 0x00000635 + /* Run Time Average Power Limiting (RAPL) Interface */ #define MSR_RAPL_POWER_UNIT 0x00000606 diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c6935de425fa..c96649292b55 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -766,6 +766,67 @@ static struct cpuidle_state knl_cstates[] = { .enter = NULL } }; +static struct cpuidle_state bxt_cstates[] = { + { + .name = "C1-BXT", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-BXT", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-BXT", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 133, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7s-BXT", + .desc = "MWAIT 0x31", + .flags = MWAIT2flg(0x31) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 155, + .target_residency = 155, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C8-BXT", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1000, + .target_residency = 1000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-BXT", + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2000, + .target_residency = 2000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C10-BXT", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 10000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + /** * intel_idle * @dev: cpuidle_device @@ -950,6 +1011,11 @@ static const struct idle_cpu idle_cpu_knl = { .state_table = knl_cstates, }; +static const struct idle_cpu idle_cpu_bxt = { + .state_table = bxt_cstates, + .disable_promotion_to_c1e = true, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -985,6 +1051,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x9e, idle_cpu_skl), ICPU(0x55, idle_cpu_skx), ICPU(0x57, idle_cpu_knl), + ICPU(0x5c, idle_cpu_bxt), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); @@ -1075,6 +1142,73 @@ static void ivt_idle_state_table_update(void) /* else, 1 and 2 socket systems use default ivt_cstates */ } + +/* + * Translate IRTL (Interrupt Response Time Limit) MSR to usec + */ + +static unsigned int irtl_ns_units[] = { + 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; + +static unsigned long long irtl_2_usec(unsigned long long irtl) +{ + unsigned long long ns; + + ns = irtl_ns_units[(irtl >> 10) & 0x3]; + + return div64_u64((irtl & 0x3FF) * ns, 1000); +} +/* + * bxt_idle_state_table_update(void) + * + * On BXT, we trust the IRTL to show the definitive maximum latency + * We use the same value for target_residency. + */ +static void bxt_idle_state_table_update(void) +{ + unsigned long long msr; + + rdmsrl(MSR_PKGC6_IRTL, msr); + if (msr) { + unsigned int usec = irtl_2_usec(msr); + + bxt_cstates[2].exit_latency = usec; + bxt_cstates[2].target_residency = usec; + } + + rdmsrl(MSR_PKGC7_IRTL, msr); + if (msr) { + unsigned int usec = irtl_2_usec(msr); + + bxt_cstates[3].exit_latency = usec; + bxt_cstates[3].target_residency = usec; + } + + rdmsrl(MSR_PKGC8_IRTL, msr); + if (msr) { + unsigned int usec = irtl_2_usec(msr); + + bxt_cstates[4].exit_latency = usec; + bxt_cstates[4].target_residency = usec; + } + + rdmsrl(MSR_PKGC9_IRTL, msr); + if (msr) { + unsigned int usec = irtl_2_usec(msr); + + bxt_cstates[5].exit_latency = usec; + bxt_cstates[5].target_residency = usec; + } + + rdmsrl(MSR_PKGC10_IRTL, msr); + if (msr) { + unsigned int usec = irtl_2_usec(msr); + + bxt_cstates[6].exit_latency = usec; + bxt_cstates[6].target_residency = usec; + } + +} /* * sklh_idle_state_table_update(void) * @@ -1130,6 +1264,9 @@ static void intel_idle_state_table_update(void) case 0x3e: /* IVT */ ivt_idle_state_table_update(); break; + case 0x5c: /* BXT */ + bxt_idle_state_table_update(); + break; case 0x5e: /* SKL-H */ sklh_idle_state_table_update(); break; From 33475a8784f4b8f0bff4419569e0dc348b318c99 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 22 Mar 2016 22:42:40 +0800 Subject: [PATCH 3/7] ARM: cpuidle: add const qualifier to cpuidle_ops member in structures The core code does not modify smp_operations structures. To clarify it, this patch adds 'const' qualifier to the 'ops' member of struct of_cpuidle_method. This change allows each arm cpuidle code to add 'const' qualifier to its cpuidle_ops structure. Signed-off-by: Jisheng Zhang Signed-off-by: Daniel Lezcano --- arch/arm/include/asm/cpuidle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/cpuidle.h b/arch/arm/include/asm/cpuidle.h index 3848259bebf8..baefe1d51517 100644 --- a/arch/arm/include/asm/cpuidle.h +++ b/arch/arm/include/asm/cpuidle.h @@ -36,7 +36,7 @@ struct cpuidle_ops { struct of_cpuidle_method { const char *method; - struct cpuidle_ops *ops; + const struct cpuidle_ops *ops; }; #define CPUIDLE_METHOD_OF_DECLARE(name, _method, _ops) \ From 4cfd55202cee115a3686d5ad6a0f60d604aca802 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 22 Mar 2016 22:42:41 +0800 Subject: [PATCH 4/7] ARM: cpuidle: constify return value of arm_cpuidle_get_ops() arm_cpuidle_read_ops() just copies '*ops' to cpuidle_ops[cpu], so the structure '*ops' is not modified at all. The comment is also updated accordingly. Signed-off-by: Jisheng Zhang Signed-off-by: Daniel Lezcano --- arch/arm/kernel/cpuidle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/cpuidle.c b/arch/arm/kernel/cpuidle.c index 703926e7007b..a44b268e12e1 100644 --- a/arch/arm/kernel/cpuidle.c +++ b/arch/arm/kernel/cpuidle.c @@ -70,7 +70,7 @@ int arm_cpuidle_suspend(int index) * * Returns a struct cpuidle_ops pointer, NULL if not found. */ -static struct cpuidle_ops *__init arm_cpuidle_get_ops(const char *method) +static const struct cpuidle_ops *__init arm_cpuidle_get_ops(const char *method) { struct of_cpuidle_method *m = __cpuidle_method_of_table; @@ -88,7 +88,7 @@ static struct cpuidle_ops *__init arm_cpuidle_get_ops(const char *method) * * Get the method name defined in the 'enable-method' property, retrieve the * associated cpuidle_ops and do a struct copy. This copy is needed because all - * cpuidle_ops are tagged __initdata and will be unloaded after the init + * cpuidle_ops are tagged __initconst and will be unloaded after the init * process. * * Return 0 on sucess, -ENOENT if no 'enable-method' is defined, -EOPNOTSUPP if @@ -97,7 +97,7 @@ static struct cpuidle_ops *__init arm_cpuidle_get_ops(const char *method) static int __init arm_cpuidle_read_ops(struct device_node *dn, int cpu) { const char *enable_method; - struct cpuidle_ops *ops; + const struct cpuidle_ops *ops; enable_method = of_get_property(dn, "enable-method", NULL); if (!enable_method) From 1e712d9be873a44d7f5bc2a11d0ad6573029a67e Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 22 Mar 2016 22:42:42 +0800 Subject: [PATCH 5/7] soc: qcom: spm: Use const and __initconst for qcom_cpuidle_ops The qcom_cpuidle_ops structures is not over-written, so add "const" qualifier and replace __initdata with __initconst. Signed-off-by: Jisheng Zhang Signed-off-by: Daniel Lezcano Acked-by: Andy Gross --- drivers/soc/qcom/spm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/spm.c b/drivers/soc/qcom/spm.c index 5548a31e1a39..1fcbb22a4a1c 100644 --- a/drivers/soc/qcom/spm.c +++ b/drivers/soc/qcom/spm.c @@ -274,7 +274,7 @@ check_spm: return per_cpu(cpu_spm_drv, cpu) ? 0 : -ENXIO; } -static struct cpuidle_ops qcom_cpuidle_ops __initdata = { +static const struct cpuidle_ops qcom_cpuidle_ops __initconst = { .suspend = qcom_idle_enter, .init = qcom_cpuidle_init, }; From 5e7c17df795e462c70a43f1b3b670e08efefe8fd Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 22 Mar 2016 22:42:43 +0800 Subject: [PATCH 6/7] drivers: firmware: psci: use const and __initconst for psci_cpuidle_ops The psci_cpuidle_ops structures is not over-written, so add "const" qualifier and replace __initdata with __initconst. Acked-by: Lorenzo Pieralisi Signed-off-by: Jisheng Zhang Signed-off-by: Daniel Lezcano --- drivers/firmware/psci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c index 11bfee8b79a9..eb8134a148c0 100644 --- a/drivers/firmware/psci.c +++ b/drivers/firmware/psci.c @@ -355,7 +355,7 @@ int psci_cpu_suspend_enter(unsigned long index) /* ARM specific CPU idle operations */ #ifdef CONFIG_ARM -static struct cpuidle_ops psci_cpuidle_ops __initdata = { +static const struct cpuidle_ops psci_cpuidle_ops __initconst = { .suspend = psci_cpu_suspend_enter, .init = psci_dt_cpu_init_idle, }; From e93e59ce5b85e6c2b444f09fd1f707274ec066dc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 21 Apr 2016 10:56:30 +0200 Subject: [PATCH 7/7] cpuidle: Replace ktime_get() with local_clock() The ktime_get() can have a non negligeable overhead, use local_clock() instead. In order to test the difference between ktime_get() and local_clock(), a quick hack has been added to trigger, via debugfs, 10000 times a call to ktime_get() and local_clock() and measure the elapsed time. Then the average value, the min and max is computed for each call. From userspace, the test above was called 100 times every 2 seconds. So, ktime_get() and local_clock() have been called 1000000 times in total. The results are: ktime_get(): ============ * average: 101 ns (stddev: 27.4) * maximum: 38313 ns * minimum: 65 ns local_clock(): ============== * average: 60 ns (stddev: 9.8) * maximum: 13487 ns * minimum: 46 ns The local_clock() is faster and more stable. Even if it is a drop in the ocean, changing the ktime_get() by the local_clock() allows to save 80ns at idle time (entry + exit). And in some circumstances, especially when there are several CPUs racing for the clock access, we save tens of microseconds. The idle duration resulting from a diff is converted from nanosec to microsec. This could be done with integer division (div 1000) - which is an expensive operation or by 10 bits shifting (div 1024) - which is fast but unprecise. The following table gives some results at the limits. ------------------------------------------ | nsec | div(1000) | div(1024) | ------------------------------------------ | 1e3 | 1 usec | 976 nsec | ------------------------------------------ | 1e6 | 1000 usec | 976 usec | ------------------------------------------ | 1e9 | 1000000 usec | 976562 usec | ------------------------------------------ There is a linear deviation of 2.34%. This loss of precision is acceptable in the context of the resulting diff which is used for statistics. These ones are processed to guess estimate an approximation of the duration of the next idle period which ends up into an idle state selection. The selection criteria takes into account the next duration based on large intervals, represented by the idle state's target residency. The 2^10 division is enough because the approximation regarding the 1e3 division is lost in all the approximations done for the next idle duration computation. Signed-off-by: Daniel Lezcano Acked-by: Peter Zijlstra (Intel) [ rjw: Subject ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c2dd99ab1648..2b8e6ce62e81 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -173,7 +173,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, struct cpuidle_state *target_state = &drv->states[index]; bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP); - ktime_t time_start, time_end; + u64 time_start, time_end; s64 diff; /* @@ -195,13 +195,13 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, sched_idle_set_state(target_state); trace_cpu_idle_rcuidle(index, dev->cpu); - time_start = ktime_get(); + time_start = local_clock(); stop_critical_timings(); entered_state = target_state->enter(dev, drv, index); start_critical_timings(); - time_end = ktime_get(); + time_end = local_clock(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ @@ -217,7 +217,11 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, if (!cpuidle_state_is_coupled(drv, entered_state)) local_irq_enable(); - diff = ktime_to_us(ktime_sub(time_end, time_start)); + /* + * local_clock() returns the time in nanosecond, let's shift + * by 10 (divide by 1024) to have microsecond based time. + */ + diff = (time_end - time_start) >> 10; if (diff > INT_MAX) diff = INT_MAX;