diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 26453f250683..d1c9e9766984 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -581,6 +581,28 @@
 			loops can be debugged more effectively on production
 			systems.
 
+	clocksource.max_cswd_read_retries= [KNL]
+			Number of clocksource_watchdog() retries due to
+			external delays before the clock will be marked
+			unstable.  Defaults to three retries, that is,
+			four attempts to read the clock under test.
+
+	clocksource.verify_n_cpus= [KNL]
+			Limit the number of CPUs checked for clocksources
+			marked with CLOCK_SOURCE_VERIFY_PERCPU that
+			are marked unstable due to excessive skew.
+			A negative value says to check all CPUs, while
+			zero says not to check any.  Values larger than
+			nr_cpu_ids are silently truncated to nr_cpu_ids.
+			The actual CPUs are chosen randomly, with
+			no replacement if the same CPU is chosen twice.
+
+	clocksource-wdtest.holdoff= [KNL]
+			Set the time in seconds that the clocksource
+			watchdog test waits before commencing its tests.
+			Defaults to zero when built as a module and to
+			10 seconds when built into the kernel.
+
 	clearcpuid=BITNUM[,BITNUM...] [X86]
 			Disable CPUID feature X for the kernel. See
 			arch/x86/include/asm/cpufeatures.h for the valid bit
diff --git a/arch/arm/mach-zynq/Kconfig b/arch/arm/mach-zynq/Kconfig
index 43fb941dcd07..a56748d671c4 100644
--- a/arch/arm/mach-zynq/Kconfig
+++ b/arch/arm/mach-zynq/Kconfig
@@ -6,7 +6,7 @@ config ARCH_ZYNQ
 	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARM_AMBA
 	select ARM_GIC
-	select ARM_GLOBAL_TIMER if !CPU_FREQ
+	select ARM_GLOBAL_TIMER
 	select CADENCE_TTC_TIMER
 	select HAVE_ARM_SCU if SMP
 	select HAVE_ARM_TWD if SMP
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57ec01192180..2e076a459a0c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs)
 static struct clocksource clocksource_tsc_early = {
 	.name			= "tsc-early",
 	.rating			= 299,
+	.uncertainty_margin	= 32 * NSEC_PER_MSEC,
 	.read			= read_tsc,
 	.mask			= CLOCKSOURCE_MASK(64),
 	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
@@ -1152,7 +1153,8 @@ static struct clocksource clocksource_tsc = {
 	.mask			= CLOCKSOURCE_MASK(64),
 	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_VALID_FOR_HRES |
-				  CLOCK_SOURCE_MUST_VERIFY,
+				  CLOCK_SOURCE_MUST_VERIFY |
+				  CLOCK_SOURCE_VERIFY_PERCPU,
 	.vdso_clock_mode	= VDSO_CLOCKMODE_TSC,
 	.enable			= tsc_cs_enable,
 	.resume			= tsc_resume,
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 39aa21d01e05..9fa28237715a 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -358,6 +358,20 @@ config ARM_GLOBAL_TIMER
 	help
 	  This option enables support for the ARM global timer unit.
 
+config ARM_GT_INITIAL_PRESCALER_VAL
+	int "ARM global timer initial prescaler value"
+	default 2 if ARCH_ZYNQ
+	default 1
+	depends on ARM_GLOBAL_TIMER
+	help
+	  When the ARM global timer initializes, its current rate is declared
+	  to the kernel and maintained forever.  Should its parent clock
+	  change, the driver tries to adjust the timer's internal prescaler.
+	  On some machines (e.g. Zynq) the initial prescaler value therefore
+	  bounds how much the parent clock is allowed to decrease or increase
+	  with respect to its initial value.
+	  This affects the maximum CPU_FREQ delta from the initial frequency.
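The help text above states the constraint abstractly; the short userspace C sketch below is an editor's illustration, not part of the patch, of the divider arithmetic that gt_clk_rate_change_cb() further down in this patch performs when the parent clock rate changes. The 50 Hz error budget and the sixteen legal divider values mirror the driver's MAX_F_ERR and 4-bit prescaler field; the helper name gt_rate_change_ok() and the Zynq-like rates in main() are hypothetical.

```c
#include <stdio.h>
#include <stdlib.h>

#define MAX_F_ERR	50	/* Hz, same error budget as the driver */
#define GT_DIVIDER_MAX	16	/* 4-bit prescaler field stores divider - 1 */

/* Closest integer divider, like the kernel's DIV_ROUND_CLOSEST(). */
static long div_round_closest(long x, long divisor)
{
	return (x + divisor / 2) / divisor;
}

/*
 * Return 0 if the timer can keep ticking at target_rate after the parent
 * clock changes to new_rate, mirroring the PRE_RATE_CHANGE check in
 * gt_clk_rate_change_cb(); return -1 if the change must be rejected.
 */
static int gt_rate_change_ok(long target_rate, long new_rate)
{
	long div = div_round_closest(new_rate, target_rate);

	if (div < 1 || div > GT_DIVIDER_MAX)
		return -1;	/* divider out of range */
	if (labs(target_rate - new_rate / div) > MAX_F_ERR)
		return -1;	/* cannot hit the declared rate closely enough */
	return 0;
}

int main(void)
{
	/* Hypothetical Zynq-like setup: 333 MHz parent, initial prescaler 2. */
	long target = 333000000 / 2;
	long rates[] = { 333000000, 166500000, 666000000, 100000000 };

	for (unsigned int i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
		printf("new parent %ld Hz -> %s\n", rates[i],
		       gt_rate_change_ok(target, rates[i]) ? "rejected" : "ok");
	return 0;
}
```

The last case shows the bound in action: dropping the parent clock to 100 MHz leaves no integer divider that reproduces the declared 166.5 MHz rate, so the notifier would return NOTIFY_BAD.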
+ config ARM_TIMER_SP804 bool "Support for Dual Timer SP804 module" if COMPILE_TEST depends on GENERIC_SCHED_CLOCK && CLKDEV_LOOKUP diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index fe1a82627d57..be6d741d404c 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -64,7 +64,6 @@ struct arch_timer { #define to_arch_timer(e) container_of(e, struct arch_timer, evt) static u32 arch_timer_rate __ro_after_init; -u32 arch_timer_rate1 __ro_after_init; static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init; static const char *arch_timer_ppi_names[ARCH_TIMER_MAX_TIMER_PPI] = { @@ -365,7 +364,7 @@ static u64 notrace arm64_858921_read_cntvct_el0(void) do { \ _val = read_sysreg(reg); \ _retries--; \ - } while (((_val + 1) & GENMASK(9, 0)) <= 1 && _retries); \ + } while (((_val + 1) & GENMASK(8, 0)) <= 1 && _retries); \ \ WARN_ON_ONCE(!_retries); \ _val; \ diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 88b2d38a7a61..44a61dc6f932 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -31,6 +31,10 @@ #define GT_CONTROL_COMP_ENABLE BIT(1) /* banked */ #define GT_CONTROL_IRQ_ENABLE BIT(2) /* banked */ #define GT_CONTROL_AUTO_INC BIT(3) /* banked */ +#define GT_CONTROL_PRESCALER_SHIFT 8 +#define GT_CONTROL_PRESCALER_MAX 0xF +#define GT_CONTROL_PRESCALER_MASK (GT_CONTROL_PRESCALER_MAX << \ + GT_CONTROL_PRESCALER_SHIFT) #define GT_INT_STATUS 0x0c #define GT_INT_STATUS_EVENT_FLAG BIT(0) @@ -39,6 +43,7 @@ #define GT_COMP1 0x14 #define GT_AUTO_INC 0x18 +#define MAX_F_ERR 50 /* * We are expecting to be clocked by the ARM peripheral clock. * @@ -46,7 +51,8 @@ * the units for all operations. 
*/ static void __iomem *gt_base; -static unsigned long gt_clk_rate; +static struct notifier_block gt_clk_rate_change_nb; +static u32 gt_psv_new, gt_psv_bck, gt_target_rate; static int gt_ppi; static struct clock_event_device __percpu *gt_evt; @@ -96,7 +102,10 @@ static void gt_compare_set(unsigned long delta, int periodic) unsigned long ctrl; counter += delta; - ctrl = GT_CONTROL_TIMER_ENABLE; + ctrl = readl(gt_base + GT_CONTROL); + ctrl &= ~(GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE | + GT_CONTROL_AUTO_INC); + ctrl |= GT_CONTROL_TIMER_ENABLE; writel_relaxed(ctrl, gt_base + GT_CONTROL); writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); @@ -123,7 +132,7 @@ static int gt_clockevent_shutdown(struct clock_event_device *evt) static int gt_clockevent_set_periodic(struct clock_event_device *evt) { - gt_compare_set(DIV_ROUND_CLOSEST(gt_clk_rate, HZ), 1); + gt_compare_set(DIV_ROUND_CLOSEST(gt_target_rate, HZ), 1); return 0; } @@ -177,7 +186,7 @@ static int gt_starting_cpu(unsigned int cpu) clk->cpumask = cpumask_of(cpu); clk->rating = 300; clk->irq = gt_ppi; - clockevents_config_and_register(clk, gt_clk_rate, + clockevents_config_and_register(clk, gt_target_rate, 1, 0xffffffff); enable_percpu_irq(clk->irq, IRQ_TYPE_NONE); return 0; @@ -232,9 +241,28 @@ static struct delay_timer gt_delay_timer = { .read_current_timer = gt_read_long, }; +static void gt_write_presc(u32 psv) +{ + u32 reg; + + reg = readl(gt_base + GT_CONTROL); + reg &= ~GT_CONTROL_PRESCALER_MASK; + reg |= psv << GT_CONTROL_PRESCALER_SHIFT; + writel(reg, gt_base + GT_CONTROL); +} + +static u32 gt_read_presc(void) +{ + u32 reg; + + reg = readl(gt_base + GT_CONTROL); + reg &= GT_CONTROL_PRESCALER_MASK; + return reg >> GT_CONTROL_PRESCALER_SHIFT; +} + static void __init gt_delay_timer_init(void) { - gt_delay_timer.freq = gt_clk_rate; + gt_delay_timer.freq = gt_target_rate; register_current_timer_delay(>_delay_timer); } @@ -243,18 +271,81 @@ static int __init gt_clocksource_init(void) writel(0, gt_base + GT_CONTROL); writel(0, gt_base + GT_COUNTER0); writel(0, gt_base + GT_COUNTER1); - /* enables timer on all the cores */ - writel(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); + /* set prescaler and enable timer on all the cores */ + writel(((CONFIG_ARM_GT_INITIAL_PRESCALER_VAL - 1) << + GT_CONTROL_PRESCALER_SHIFT) + | GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK - sched_clock_register(gt_sched_clock_read, 64, gt_clk_rate); + sched_clock_register(gt_sched_clock_read, 64, gt_target_rate); #endif - return clocksource_register_hz(>_clocksource, gt_clk_rate); + return clocksource_register_hz(>_clocksource, gt_target_rate); +} + +static int gt_clk_rate_change_cb(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct clk_notifier_data *ndata = data; + + switch (event) { + case PRE_RATE_CHANGE: + { + int psv; + + psv = DIV_ROUND_CLOSEST(ndata->new_rate, + gt_target_rate); + + if (abs(gt_target_rate - (ndata->new_rate / psv)) > MAX_F_ERR) + return NOTIFY_BAD; + + psv--; + + /* prescaler within legal range? */ + if (psv < 0 || psv > GT_CONTROL_PRESCALER_MAX) + return NOTIFY_BAD; + + /* + * store timer clock ctrl register so we can restore it in case + * of an abort. 
+ */ + gt_psv_bck = gt_read_presc(); + gt_psv_new = psv; + /* scale down: adjust divider in post-change notification */ + if (ndata->new_rate < ndata->old_rate) + return NOTIFY_DONE; + + /* scale up: adjust divider now - before frequency change */ + gt_write_presc(psv); + break; + } + case POST_RATE_CHANGE: + /* scale up: pre-change notification did the adjustment */ + if (ndata->new_rate > ndata->old_rate) + return NOTIFY_OK; + + /* scale down: adjust divider now - after frequency change */ + gt_write_presc(gt_psv_new); + break; + + case ABORT_RATE_CHANGE: + /* we have to undo the adjustment in case we scale up */ + if (ndata->new_rate < ndata->old_rate) + return NOTIFY_OK; + + /* restore original register value */ + gt_write_presc(gt_psv_bck); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_DONE; } static int __init global_timer_of_register(struct device_node *np) { struct clk *gt_clk; + static unsigned long gt_clk_rate; int err = 0; /* @@ -292,11 +383,20 @@ static int __init global_timer_of_register(struct device_node *np) } gt_clk_rate = clk_get_rate(gt_clk); + gt_target_rate = gt_clk_rate / CONFIG_ARM_GT_INITIAL_PRESCALER_VAL; + gt_clk_rate_change_nb.notifier_call = + gt_clk_rate_change_cb; + err = clk_notifier_register(gt_clk, >_clk_rate_change_nb); + if (err) { + pr_warn("Unable to register clock notifier\n"); + goto out_clk; + } + gt_evt = alloc_percpu(struct clock_event_device); if (!gt_evt) { pr_warn("global-timer: can't allocate memory\n"); err = -ENOMEM; - goto out_clk; + goto out_clk_nb; } err = request_percpu_irq(gt_ppi, gt_clockevent_interrupt, @@ -326,6 +426,8 @@ out_irq: free_percpu_irq(gt_ppi, gt_evt); out_free: free_percpu(gt_evt); +out_clk_nb: + clk_notifier_unregister(gt_clk, >_clk_rate_change_nb); out_clk: clk_disable_unprepare(gt_clk); out_unmap: diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c index e77d58449005..a129840f14f9 100644 --- a/drivers/clocksource/ingenic-sysost.c +++ b/drivers/clocksource/ingenic-sysost.c @@ -186,7 +186,7 @@ static const struct clk_ops ingenic_ost_global_timer_ops = { static const char * const ingenic_ost_clk_parents[] = { "ext" }; -static const struct ingenic_ost_clk_info ingenic_ost_clk_info[] = { +static const struct ingenic_ost_clk_info x1000_ost_clk_info[] = { [OST_CLK_PERCPU_TIMER] = { .init_data = { .name = "percpu timer", @@ -414,14 +414,14 @@ static const struct ingenic_soc_info x1000_soc_info = { .num_channels = 2, }; -static const struct of_device_id __maybe_unused ingenic_ost_of_match[] __initconst = { - { .compatible = "ingenic,x1000-ost", .data = &x1000_soc_info, }, +static const struct of_device_id __maybe_unused ingenic_ost_of_matches[] __initconst = { + { .compatible = "ingenic,x1000-ost", .data = &x1000_soc_info }, { /* sentinel */ } }; static int __init ingenic_ost_probe(struct device_node *np) { - const struct of_device_id *id = of_match_node(ingenic_ost_of_match, np); + const struct of_device_id *id = of_match_node(ingenic_ost_of_matches, np); struct ingenic_ost *ost; unsigned int i; int ret; @@ -462,7 +462,7 @@ static int __init ingenic_ost_probe(struct device_node *np) ost->clocks->num = ost->soc_info->num_channels; for (i = 0; i < ost->clocks->num; i++) { - ret = ingenic_ost_register_clock(ost, i, &ingenic_ost_clk_info[i], ost->clocks); + ret = ingenic_ost_register_clock(ost, i, &x1000_ost_clk_info[i], ost->clocks); if (ret) { pr_crit("%s: Cannot register clock %d\n", __func__, i); goto err_unregister_ost_clocks; diff --git 
a/drivers/clocksource/samsung_pwm_timer.c b/drivers/clocksource/samsung_pwm_timer.c index f760229d0c7f..6e46781bc9ac 100644 --- a/drivers/clocksource/samsung_pwm_timer.c +++ b/drivers/clocksource/samsung_pwm_timer.c @@ -4,7 +4,7 @@ * http://www.samsung.com/ * * samsung - Common hr-timer support (s3c and s5p) -*/ + */ #include #include @@ -22,7 +22,6 @@ #include - /* * Clocksource driver */ @@ -38,8 +37,8 @@ #define TCFG0_PRESCALER_MASK 0xff #define TCFG0_PRESCALER1_SHIFT 8 -#define TCFG1_SHIFT(x) ((x) * 4) -#define TCFG1_MUX_MASK 0xf +#define TCFG1_SHIFT(x) ((x) * 4) +#define TCFG1_MUX_MASK 0xf /* * Each channel occupies 4 bits in TCON register, but there is a gap of 4 @@ -62,7 +61,7 @@ EXPORT_SYMBOL(samsung_pwm_lock); struct samsung_pwm_clocksource { void __iomem *base; - void __iomem *source_reg; + const void __iomem *source_reg; unsigned int irq[SAMSUNG_PWM_NUM]; struct samsung_pwm_variant variant; @@ -183,7 +182,7 @@ static void samsung_time_start(unsigned int channel, bool periodic) } static int samsung_set_next_event(unsigned long cycles, - struct clock_event_device *evt) + struct clock_event_device *evt) { /* * This check is needed to account for internal rounding @@ -225,6 +224,7 @@ static void samsung_clockevent_resume(struct clock_event_device *cev) if (pwm.variant.has_tint_cstat) { u32 mask = (1 << pwm.event_id); + writel(mask | (mask << 5), pwm.base + REG_TINT_CSTAT); } } @@ -248,6 +248,7 @@ static irqreturn_t samsung_clock_event_isr(int irq, void *dev_id) if (pwm.variant.has_tint_cstat) { u32 mask = (1 << pwm.event_id); + writel(mask | (mask << 5), pwm.base + REG_TINT_CSTAT); } @@ -272,7 +273,7 @@ static void __init samsung_clockevent_init(void) time_event_device.cpumask = cpumask_of(0); clockevents_config_and_register(&time_event_device, - clock_rate, 1, pwm.tcnt_max); + clock_rate, 1, pwm.tcnt_max); irq_number = pwm.irq[pwm.event_id]; if (request_irq(irq_number, samsung_clock_event_isr, @@ -282,6 +283,7 @@ static void __init samsung_clockevent_init(void) if (pwm.variant.has_tint_cstat) { u32 mask = (1 << pwm.event_id); + writel(mask | (mask << 5), pwm.base + REG_TINT_CSTAT); } } @@ -347,7 +349,7 @@ static int __init samsung_clocksource_init(void) pwm.source_reg = pwm.base + pwm.source_id * 0x0c + 0x14; sched_clock_register(samsung_read_sched_clock, - pwm.variant.bits, clock_rate); + pwm.variant.bits, clock_rate); samsung_clocksource.mask = CLOCKSOURCE_MASK(pwm.variant.bits); return clocksource_register_hz(&samsung_clocksource, clock_rate); @@ -398,7 +400,8 @@ static int __init _samsung_pwm_clocksource_init(void) } void __init samsung_pwm_clocksource_init(void __iomem *base, - unsigned int *irqs, struct samsung_pwm_variant *variant) + unsigned int *irqs, + const struct samsung_pwm_variant *variant) { pwm.base = base; memcpy(&pwm.variant, variant, sizeof(pwm.variant)); @@ -418,7 +421,7 @@ static int __init samsung_pwm_alloc(struct device_node *np, struct property *prop; const __be32 *cur; u32 val; - int i; + int i, ret; memcpy(&pwm.variant, variant, sizeof(pwm.variant)); for (i = 0; i < SAMSUNG_PWM_NUM; ++i) @@ -441,10 +444,24 @@ static int __init samsung_pwm_alloc(struct device_node *np, pwm.timerclk = of_clk_get_by_name(np, "timers"); if (IS_ERR(pwm.timerclk)) { pr_crit("failed to get timers clock for timer\n"); - return PTR_ERR(pwm.timerclk); + ret = PTR_ERR(pwm.timerclk); + goto err_clk; } - return _samsung_pwm_clocksource_init(); + ret = _samsung_pwm_clocksource_init(); + if (ret) + goto err_clocksource; + + return 0; + +err_clocksource: + clk_put(pwm.timerclk); + 
pwm.timerclk = NULL; +err_clk: + iounmap(pwm.base); + pwm.base = NULL; + + return ret; } static const struct samsung_pwm_variant s3c24xx_variant = { diff --git a/drivers/clocksource/timer-mediatek.c b/drivers/clocksource/timer-mediatek.c index 9318edcd8963..ab63b95e414f 100644 --- a/drivers/clocksource/timer-mediatek.c +++ b/drivers/clocksource/timer-mediatek.c @@ -241,6 +241,28 @@ static void mtk_gpt_enable_irq(struct timer_of *to, u8 timer) timer_of_base(to) + GPT_IRQ_EN_REG); } +static void mtk_gpt_resume(struct clock_event_device *clk) +{ + struct timer_of *to = to_timer_of(clk); + + mtk_gpt_enable_irq(to, TIMER_CLK_EVT); +} + +static void mtk_gpt_suspend(struct clock_event_device *clk) +{ + struct timer_of *to = to_timer_of(clk); + + /* Disable all interrupts */ + writel(0x0, timer_of_base(to) + GPT_IRQ_EN_REG); + + /* + * This is called with interrupts disabled, + * so we need to ack any interrupt that is pending + * or for example ATF will prevent a suspend from completing. + */ + writel(0x3f, timer_of_base(to) + GPT_IRQ_ACK_REG); +} + static struct timer_of to = { .flags = TIMER_OF_IRQ | TIMER_OF_BASE | TIMER_OF_CLOCK, @@ -286,6 +308,8 @@ static int __init mtk_gpt_init(struct device_node *node) to.clkevt.set_state_oneshot = mtk_gpt_clkevt_shutdown; to.clkevt.tick_resume = mtk_gpt_clkevt_shutdown; to.clkevt.set_next_event = mtk_gpt_clkevt_next_event; + to.clkevt.suspend = mtk_gpt_suspend; + to.clkevt.resume = mtk_gpt_resume; to.of_irq.handler = mtk_gpt_interrupt; ret = timer_of_init(node, &to); diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 33eeabf9c3d1..3e52c5226c4d 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -78,6 +78,9 @@ static void omap_dm_timer_write_reg(struct omap_dm_timer *timer, u32 reg, static void omap_timer_restore_context(struct omap_dm_timer *timer) { + __omap_dm_timer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, + timer->context.ocp_cfg, 0); + omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG, timer->context.twer); omap_dm_timer_write_reg(timer, OMAP_TIMER_COUNTER_REG, @@ -95,6 +98,9 @@ static void omap_timer_restore_context(struct omap_dm_timer *timer) static void omap_timer_save_context(struct omap_dm_timer *timer) { + timer->context.ocp_cfg = + __omap_dm_timer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET, 0); + timer->context.tclr = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); timer->context.twer = @@ -122,7 +128,8 @@ static int omap_timer_context_notifier(struct notifier_block *nb, break; omap_timer_save_context(timer); break; - case CPU_CLUSTER_PM_ENTER_FAILED: + case CPU_CLUSTER_PM_ENTER_FAILED: /* No need to restore context */ + break; case CPU_CLUSTER_PM_EXIT: if ((timer->capability & OMAP_TIMER_ALWON) || !atomic_read(&timer->enabled)) diff --git a/include/clocksource/samsung_pwm.h b/include/clocksource/samsung_pwm.h index c395238d0922..9b435caa95fe 100644 --- a/include/clocksource/samsung_pwm.h +++ b/include/clocksource/samsung_pwm.h @@ -27,6 +27,7 @@ struct samsung_pwm_variant { }; void samsung_pwm_clocksource_init(void __iomem *base, - unsigned int *irqs, struct samsung_pwm_variant *variant); + unsigned int *irqs, + const struct samsung_pwm_variant *variant); #endif /* __CLOCKSOURCE_SAMSUNG_PWM_H */ diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index 4c61dade8835..f6da8a132639 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -74,6 +74,7 @@ #define OMAP_TIMER_ERRATA_I103_I767 0x80000000 struct 
timer_regs {
+	u32 ocp_cfg;
 	u32 tidr;
 	u32 tier;
 	u32 twer;
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d6ab416ee2d2..1d42d4b17327 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -43,6 +43,8 @@ struct module;
 * @shift:		Cycle to nanosecond divisor (power of two)
 * @max_idle_ns:	Maximum idle time permitted by the clocksource (nsecs)
 * @maxadj:		Maximum adjustment value to mult (~11%)
+ * @uncertainty_margin:	Maximum uncertainty in nanoseconds per half second.
+ *			Zero says to use default WATCHDOG_THRESHOLD.
 * @archdata:		Optional arch-specific data
 * @max_cycles:		Maximum safe cycle value which won't overflow on
 *			multiplication
@@ -98,6 +100,7 @@ struct clocksource {
 	u32			shift;
 	u64			max_idle_ns;
 	u32			maxadj;
+	u32			uncertainty_margin;
 #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
 	struct arch_clocksource_data archdata;
 #endif
@@ -137,7 +140,7 @@ struct clocksource {
 #define CLOCK_SOURCE_UNSTABLE			0x40
 #define CLOCK_SOURCE_SUSPEND_NONSTOP		0x80
 #define CLOCK_SOURCE_RESELECT			0x100
-
+#define CLOCK_SOURCE_VERIFY_PERCPU		0x200
 /* simplify initialization of mask field */
 #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
 
@@ -288,4 +291,7 @@ static inline void timer_probe(void) {}
 #define TIMER_ACPI_DECLARE(name, table_id, fn)				\
 	ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn)
 
+extern ulong max_cswd_read_retries;
+void clocksource_verify_percpu(struct clocksource *cs);
+
 #endif /* _LINUX_CLOCKSOURCE_H */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 7df71ef0e1fd..04bfd62f5e5c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,6 +64,15 @@ config LEGACY_TIMER_TICK
 	  lack support for the generic clockevent framework.
 	  New platforms should use generic clockevents instead.
 
+config TIME_KUNIT_TEST
+	tristate "KUnit test for kernel/time functions" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Enable this option to test kernel/time functions.
+
+	  If unsure, say N.
+ if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1fb1c1ef6a19..7e875e63ff3b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -21,3 +21,5 @@ obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o +obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o +obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f5490222e134..003ccf338d20 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -347,8 +347,7 @@ static void clockevents_notify_released(void) while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); - list_del(&dev->list); - list_add(&dev->list, &clockevent_devices); + list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } @@ -576,8 +575,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); - list_del(&old->list); - list_add(&old->list, &clockevents_released); + list_move(&old->list, &clockevents_released); } if (new) { @@ -629,6 +627,7 @@ void tick_offline_cpu(unsigned int cpu) /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu + * @cpu: The dead CPU */ void tick_cleanup_dead_cpu(int cpu) { @@ -668,9 +667,9 @@ static struct bus_type clockevents_subsys = { static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); -static ssize_t sysfs_show_current_tick_dev(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t current_device_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct tick_device *td; ssize_t count = 0; @@ -682,12 +681,12 @@ static ssize_t sysfs_show_current_tick_dev(struct device *dev, raw_spin_unlock_irq(&clockevents_lock); return count; } -static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); +static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ -static ssize_t sysfs_unbind_tick_dev(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t unbind_device_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); @@ -714,7 +713,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, mutex_unlock(&clockevents_mutex); return ret ? ret : count; } -static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); +static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c new file mode 100644 index 000000000000..01df12395c0e --- /dev/null +++ b/kernel/time/clocksource-wdtest.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Unit test for the clocksource watchdog. + * + * Copyright (C) 2021 Facebook, Inc. + * + * Author: Paul E. 
McKenney + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include /* for spin_unlock_irq() using preempt_count() m68k */ +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; +module_param(holdoff, int, 0444); +MODULE_PARM_DESC(holdoff, "Time to wait to start test (s)."); + +/* Watchdog kthread's task_struct pointer for debug purposes. */ +static struct task_struct *wdtest_task; + +static u64 wdtest_jiffies_read(struct clocksource *cs) +{ + return (u64)jiffies; +} + +/* Assume HZ > 100. */ +#define JIFFIES_SHIFT 8 + +static struct clocksource clocksource_wdtest_jiffies = { + .name = "wdtest-jiffies", + .rating = 1, /* lowest valid rating*/ + .uncertainty_margin = TICK_NSEC, + .read = wdtest_jiffies_read, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_MUST_VERIFY, + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, +}; + +static int wdtest_ktime_read_ndelays; +static bool wdtest_ktime_read_fuzz; + +static u64 wdtest_ktime_read(struct clocksource *cs) +{ + int wkrn = READ_ONCE(wdtest_ktime_read_ndelays); + static int sign = 1; + u64 ret; + + if (wkrn) { + udelay(cs->uncertainty_margin / 250); + WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1); + } + ret = ktime_get_real_fast_ns(); + if (READ_ONCE(wdtest_ktime_read_fuzz)) { + sign = -sign; + ret = ret + sign * 100 * NSEC_PER_MSEC; + } + return ret; +} + +static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs) +{ + pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name); +} + +#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ + CLOCK_SOURCE_VALID_FOR_HRES | \ + CLOCK_SOURCE_MUST_VERIFY | \ + CLOCK_SOURCE_VERIFY_PERCPU) + +static struct clocksource clocksource_wdtest_ktime = { + .name = "wdtest-ktime", + .rating = 300, + .read = wdtest_ktime_read, + .mask = CLOCKSOURCE_MASK(64), + .flags = KTIME_FLAGS, + .mark_unstable = wdtest_ktime_cs_mark_unstable, + .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list), +}; + +/* Reset the clocksource if needed. */ +static void wdtest_ktime_clocksource_reset(void) +{ + if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) { + clocksource_unregister(&clocksource_wdtest_ktime); + clocksource_wdtest_ktime.flags = KTIME_FLAGS; + schedule_timeout_uninterruptible(HZ / 10); + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + } +} + +/* Run the specified series of watchdog tests. */ +static int wdtest_func(void *arg) +{ + unsigned long j1, j2; + char *s; + int i; + + schedule_timeout_uninterruptible(holdoff * HZ); + + /* + * Verify that jiffies-like clocksources get the manually + * specified uncertainty margin. + */ + pr_info("--- Verify jiffies-like uncertainty margin.\n"); + __clocksource_register(&clocksource_wdtest_jiffies); + WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC); + + j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); + schedule_timeout_uninterruptible(HZ); + j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); + WARN_ON_ONCE(j1 == j2); + + clocksource_unregister(&clocksource_wdtest_jiffies); + + /* + * Verify that tsc-like clocksources are assigned a reasonable + * uncertainty margin. 
+ */ + pr_info("--- Verify tsc-like uncertainty margin.\n"); + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC); + + j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); + udelay(1); + j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); + pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); + WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + + /* Verify tsc-like stability with various numbers of errors injected. */ + for (i = 0; i <= max_cswd_read_retries + 1; i++) { + if (i <= 1 && i < max_cswd_read_retries) + s = ""; + else if (i <= max_cswd_read_retries) + s = ", expect message"; + else + s = ", expect clock skew"; + pr_info("--- Watchdog with %dx error injection, %lu retries%s.\n", i, max_cswd_read_retries, s); + WRITE_ONCE(wdtest_ktime_read_ndelays, i); + schedule_timeout_uninterruptible(2 * HZ); + WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); + WARN_ON_ONCE((i <= max_cswd_read_retries) != + !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); + wdtest_ktime_clocksource_reset(); + } + + /* Verify tsc-like stability with clock-value-fuzz error injection. */ + pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n"); + WRITE_ONCE(wdtest_ktime_read_fuzz, true); + schedule_timeout_uninterruptible(2 * HZ); + WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); + clocksource_verify_percpu(&clocksource_wdtest_ktime); + WRITE_ONCE(wdtest_ktime_read_fuzz, false); + + clocksource_unregister(&clocksource_wdtest_ktime); + + pr_info("--- Done with test.\n"); + return 0; +} + +static void wdtest_print_module_parms(void) +{ + pr_alert("--- holdoff=%d\n", holdoff); +} + +/* Cleanup function. */ +static void clocksource_wdtest_cleanup(void) +{ +} + +static int __init clocksource_wdtest_init(void) +{ + int ret = 0; + + wdtest_print_module_parms(); + + /* Create watchdog-test task. */ + wdtest_task = kthread_run(wdtest_func, NULL, "wdtest"); + if (IS_ERR(wdtest_task)) { + ret = PTR_ERR(wdtest_task); + pr_warn("%s: Failed to create wdtest kthread.\n", __func__); + wdtest_task = NULL; + return ret; + } + + return 0; +} + +module_init(clocksource_wdtest_init); +module_exit(clocksource_wdtest_cleanup); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2cd902592fc1..b89c76e1c02c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -14,6 +14,8 @@ #include /* for spin_unlock_irq() using preempt_count() m68k */ #include #include +#include +#include #include "tick-internal.h" #include "timekeeping_internal.h" @@ -93,6 +95,20 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; +/* + * Threshold: 0.0312s, when doubled: 0.0625s. + * Also a default for cs->uncertainty_margin when registering clocks. + */ +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) + +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as + * a lower bound for cs->uncertainty_margin values when registering clocks. 
+ */ +#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC) + #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); static void clocksource_select(void); @@ -119,10 +135,9 @@ static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); /* - * Interval: 0.5sec Threshold: 0.0625s + * Interval: 0.5sec. */ #define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) static void clocksource_watchdog_work(struct work_struct *work) { @@ -184,12 +199,164 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } +ulong max_cswd_read_retries = 3; +module_param(max_cswd_read_retries, ulong, 0644); +EXPORT_SYMBOL_GPL(max_cswd_read_retries); +static int verify_n_cpus = 8; +module_param(verify_n_cpus, int, 0644); + +static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +{ + unsigned int nretries; + u64 wd_end, wd_delta; + int64_t wd_delay; + + for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { + local_irq_disable(); + *wdnow = watchdog->read(watchdog); + *csnow = cs->read(cs); + wd_end = watchdog->read(watchdog); + local_irq_enable(); + + wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); + wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, + watchdog->shift); + if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (nretries > 1 || nretries >= max_cswd_read_retries) { + pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", + smp_processor_id(), watchdog->name, nretries); + } + return true; + } + } + + pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", + smp_processor_id(), watchdog->name, wd_delay, nretries); + return false; +} + +static u64 csnow_mid; +static cpumask_t cpus_ahead; +static cpumask_t cpus_behind; +static cpumask_t cpus_chosen; + +static void clocksource_verify_choose_cpus(void) +{ + int cpu, i, n = verify_n_cpus; + + if (n < 0) { + /* Check all of the CPUs. */ + cpumask_copy(&cpus_chosen, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + return; + } + + /* If no checking desired, or no other CPU to check, leave. */ + cpumask_clear(&cpus_chosen); + if (n == 0 || num_online_cpus() <= 1) + return; + + /* Make sure to select at least one CPU other than the current CPU. */ + cpu = cpumask_next(-1, cpu_online_mask); + if (cpu == smp_processor_id()) + cpu = cpumask_next(cpu, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return; + cpumask_set_cpu(cpu, &cpus_chosen); + + /* Force a sane value for the boot parameter. */ + if (n > nr_cpu_ids) + n = nr_cpu_ids; + + /* + * Randomly select the specified number of CPUs. If the same + * CPU is selected multiple times, that CPU is checked only once, + * and no replacement CPU is selected. This gracefully handles + * situations where verify_n_cpus is greater than the number of + * CPUs that are currently online. + */ + for (i = 1; i < n; i++) { + cpu = prandom_u32() % nr_cpu_ids; + cpu = cpumask_next(cpu - 1, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(-1, cpu_online_mask); + if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) + cpumask_set_cpu(cpu, &cpus_chosen); + } + + /* Don't verify ourselves. 
*/ + cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); +} + +static void clocksource_verify_one_cpu(void *csin) +{ + struct clocksource *cs = (struct clocksource *)csin; + + csnow_mid = cs->read(cs); +} + +void clocksource_verify_percpu(struct clocksource *cs) +{ + int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; + u64 csnow_begin, csnow_end; + int cpu, testcpu; + s64 delta; + + if (verify_n_cpus == 0) + return; + cpumask_clear(&cpus_ahead); + cpumask_clear(&cpus_behind); + get_online_cpus(); + preempt_disable(); + clocksource_verify_choose_cpus(); + if (cpumask_weight(&cpus_chosen) == 0) { + preempt_enable(); + put_online_cpus(); + pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + return; + } + testcpu = smp_processor_id(); + pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + for_each_cpu(cpu, &cpus_chosen) { + if (cpu == testcpu) + continue; + csnow_begin = cs->read(cs); + smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); + csnow_end = cs->read(cs); + delta = (s64)((csnow_mid - csnow_begin) & cs->mask); + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_behind); + delta = (csnow_end - csnow_mid) & cs->mask; + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_ahead); + delta = clocksource_delta(csnow_end, csnow_begin, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + if (cs_nsec > cs_nsec_max) + cs_nsec_max = cs_nsec; + if (cs_nsec < cs_nsec_min) + cs_nsec_min = cs_nsec; + } + preempt_enable(); + put_online_cpus(); + if (!cpumask_empty(&cpus_ahead)) + pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_ahead), testcpu, cs->name); + if (!cpumask_empty(&cpus_behind)) + pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_behind), testcpu, cs->name); + if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) + pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); +} +EXPORT_SYMBOL_GPL(clocksource_verify_percpu); + static void clocksource_watchdog(struct timer_list *unused) { - struct clocksource *cs; u64 csnow, wdnow, cslast, wdlast, delta; - int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; + int64_t wd_nsec, cs_nsec; + struct clocksource *cs; + u32 md; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -206,10 +373,11 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - local_irq_disable(); - csnow = cs->read(cs); - wdnow = watchdog->read(watchdog); - local_irq_enable(); + if (!cs_watchdog_read(cs, &csnow, &wdnow)) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); + continue; + } /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || @@ -235,13 +403,20 @@ static void clocksource_watchdog(struct timer_list *unused) continue; /* Check the deviation from the watchdog clocksource. 
*/ - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + md = cs->uncertainty_margin + watchdog->uncertainty_margin; + if (abs(cs_nsec - wd_nsec) > md) { pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", smp_processor_id(), cs->name); - pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", - watchdog->name, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", - cs->name, csnow, cslast, cs->mask); + pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, cs_nsec, csnow, cslast, cs->mask); + if (curr_clocksource == cs) + pr_warn(" '%s' is current clocksource.\n", cs->name); + else if (curr_clocksource) + pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); + else + pr_warn(" No current clocksource.\n"); __clocksource_unstable(cs); continue; } @@ -407,6 +582,12 @@ static int __clocksource_watchdog_kthread(void) unsigned long flags; int select = 0; + /* Do any required per-CPU skew verification. */ + if (curr_clocksource && + curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && + curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) + clocksource_verify_percpu(curr_clocksource); + spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { @@ -876,6 +1057,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); } + + /* + * If the uncertainty margin is not specified, calculate it. + * If both scale and freq are non-zero, calculate the clock + * period, but bound below at 2*WATCHDOG_MAX_SKEW. However, + * if either of scale or freq is zero, be very conservative and + * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the + * uncertainty margin. Allow stupidly small uncertainty margins + * to be specified by the caller for testing purposes, but warn + * to discourage production use of this capability. + */ + if (scale && freq && !cs->uncertainty_margin) { + cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); + if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) + cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; + } else if (!cs->uncertainty_margin) { + cs->uncertainty_margin = WATCHDOG_THRESHOLD; + } + WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); + /* * Ensure clocksources that have large 'mult' values don't overflow * when adjusted. diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a492e4da69ba..01935aafdb46 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -49,13 +49,14 @@ static u64 jiffies_read(struct clocksource *cs) * for "tick-less" systems. 
*/ static struct clocksource clocksource_jiffies = { - .name = "jiffies", - .rating = 1, /* lowest valid rating*/ - .read = jiffies_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, - .max_cycles = 10, + .name = "jiffies", + .rating = 1, /* lowest valid rating*/ + .uncertainty_margin = 32 * NSEC_PER_MSEC, + .read = jiffies_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, }; __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index a44055228796..f7fe6fe36173 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -33,6 +33,8 @@ static int tick_broadcast_forced; static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT +static DEFINE_PER_CPU(struct clock_event_device *, tick_oneshot_wakeup_device); + static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -61,6 +63,13 @@ struct cpumask *tick_get_broadcast_mask(void) return tick_broadcast_mask; } +static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu); + +const struct clock_event_device *tick_get_wakeup_device(int cpu) +{ + return tick_get_oneshot_wakeup_device(cpu); +} + /* * Start the device in periodic mode */ @@ -88,13 +97,75 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev, return !curdev || newdev->rating > curdev->rating; } +#ifdef CONFIG_TICK_ONESHOT +static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) +{ + return per_cpu(tick_oneshot_wakeup_device, cpu); +} + +static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) +{ + /* + * If we woke up early and the tick was reprogrammed in the + * meantime then this may be spurious but harmless. 
+ */ + tick_receive_broadcast(); +} + +static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, + int cpu) +{ + struct clock_event_device *curdev = tick_get_oneshot_wakeup_device(cpu); + + if (!newdev) + goto set_device; + + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (!(newdev->features & CLOCK_EVT_FEAT_PERCPU) || + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return false; + + if (curdev && newdev->rating <= curdev->rating) + return false; + + if (!try_module_get(newdev->owner)) + return false; + + newdev->event_handler = tick_oneshot_wakeup_handler; +set_device: + clockevents_exchange_device(curdev, newdev); + per_cpu(tick_oneshot_wakeup_device, cpu) = newdev; + return true; +} +#else +static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) +{ + return NULL; +} + +static bool tick_set_oneshot_wakeup_device(struct clock_event_device *newdev, + int cpu) +{ + return false; +} +#endif + /* * Conditionally install/replace broadcast device */ -void tick_install_broadcast_device(struct clock_event_device *dev) +void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { struct clock_event_device *cur = tick_broadcast_device.evtdev; + if (tick_set_oneshot_wakeup_device(dev, cpu)) + return; + if (!tick_check_broadcast_device(cur, dev)) return; @@ -253,7 +324,6 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) return ret; } -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST int tick_receive_broadcast(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); @@ -268,7 +338,6 @@ int tick_receive_broadcast(void) evt->event_handler(evt); return 0; } -#endif /* * Broadcast the event to the cpus, which are set in the mask (mangled). @@ -719,24 +788,16 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +static int ___tick_broadcast_oneshot_control(enum tick_broadcast_state state, + struct tick_device *td, + int cpu) { - struct clock_event_device *bc, *dev; - int cpu, ret = 0; + struct clock_event_device *bc, *dev = td->evtdev; + int ret = 0; ktime_t now; - /* - * If there is no broadcast device, tell the caller not to go - * into deep idle. 
- */ - if (!tick_broadcast_device.evtdev) - return -EBUSY; - - dev = this_cpu_ptr(&tick_cpu_device)->evtdev; - raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; - cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { /* @@ -865,6 +926,53 @@ out: return ret; } +static int tick_oneshot_wakeup_control(enum tick_broadcast_state state, + struct tick_device *td, + int cpu) +{ + struct clock_event_device *dev, *wd; + + dev = td->evtdev; + if (td->mode != TICKDEV_MODE_ONESHOT) + return -EINVAL; + + wd = tick_get_oneshot_wakeup_device(cpu); + if (!wd) + return -ENODEV; + + switch (state) { + case TICK_BROADCAST_ENTER: + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + clockevents_switch_state(wd, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(wd, dev->next_event, 1); + break; + case TICK_BROADCAST_EXIT: + /* We may have transitioned to oneshot mode while idle */ + if (clockevent_get_state(wd) != CLOCK_EVT_STATE_ONESHOT) + return -ENODEV; + } + + return 0; +} + +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + int cpu = smp_processor_id(); + + if (!tick_oneshot_wakeup_control(state, td, cpu)) + return 0; + + if (tick_broadcast_device.evtdev) + return ___tick_broadcast_oneshot_control(state, td, cpu); + + /* + * If there is no broadcast or wakeup device, tell the caller not + * to go into deep idle. + */ + return -EBUSY; +} + /* * Reset the one shot broadcast for a cpu * @@ -991,6 +1099,9 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) */ static void tick_broadcast_oneshot_offline(unsigned int cpu) { + if (tick_get_oneshot_wakeup_device(cpu)) + tick_set_oneshot_wakeup_device(NULL, cpu); + /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index e15bc0ef1912..d663249652ef 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -373,7 +373,7 @@ out_bc: /* * Can the new device be used as a broadcast device ? 
*/ - tick_install_broadcast_device(newdev); + tick_install_broadcast_device(newdev, cpu); } /** diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7a981c9e87a4..6a742a29e545 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -61,7 +61,7 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* Broadcasting support */ # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern void tick_install_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev, int cpu); extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); @@ -71,8 +71,9 @@ extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadc extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); extern struct tick_device *tick_get_broadcast_device(void); extern struct cpumask *tick_get_broadcast_mask(void); +extern const struct clock_event_device *tick_get_wakeup_device(int cpu); # else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ -static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } +static inline void tick_install_broadcast_device(struct clock_event_device *dev, int cpu) { } static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c new file mode 100644 index 000000000000..831e8e779ace --- /dev/null +++ b/kernel/time/time_test.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: LGPL-2.1+ + +#include +#include + +/* + * Traditional implementation of leap year evaluation. + */ +static bool is_leap(long year) +{ + return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0); +} + +/* + * Gets the last day of a month. + */ +static int last_day_of_month(long year, int month) +{ + if (month == 2) + return 28 + is_leap(year); + if (month == 4 || month == 6 || month == 9 || month == 11) + return 30; + return 31; +} + +/* + * Advances a date by one day. + */ +static void advance_date(long *year, int *month, int *mday, int *yday) +{ + if (*mday != last_day_of_month(*year, *month)) { + ++*mday; + ++*yday; + return; + } + + *mday = 1; + if (*month != 12) { + ++*month; + ++*yday; + return; + } + + *month = 1; + *yday = 0; + ++*year; +} + +/* + * Checks every day in a 160000 years interval centered at 1970-01-01 + * against the expected result. 
+ */ +static void time64_to_tm_test_date_range(struct kunit *test) +{ + /* + * 80000 years = (80000 / 400) * 400 years + * = (80000 / 400) * 146097 days + * = (80000 / 400) * 146097 * 86400 seconds + */ + time64_t total_secs = ((time64_t) 80000) / 400 * 146097 * 86400; + long year = 1970 - 80000; + int month = 1; + int mdday = 1; + int yday = 0; + + struct tm result; + time64_t secs; + s64 days; + + for (secs = -total_secs; secs <= total_secs; secs += 86400) { + + time64_to_tm(secs, 0, &result); + + days = div_s64(secs, 86400); + + #define FAIL_MSG "%05ld/%02d/%02d (%2d) : %ld", \ + year, month, mdday, yday, days + + KUNIT_ASSERT_EQ_MSG(test, year - 1900, result.tm_year, FAIL_MSG); + KUNIT_ASSERT_EQ_MSG(test, month - 1, result.tm_mon, FAIL_MSG); + KUNIT_ASSERT_EQ_MSG(test, mdday, result.tm_mday, FAIL_MSG); + KUNIT_ASSERT_EQ_MSG(test, yday, result.tm_yday, FAIL_MSG); + + advance_date(&year, &month, &mdday, &yday); + } +} + +static struct kunit_case time_test_cases[] = { + KUNIT_CASE(time64_to_tm_test_date_range), + {} +}; + +static struct kunit_suite time_test_suite = { + .name = "time_test_cases", + .test_cases = time_test_cases, +}; + +kunit_test_suite(time_test_suite); +MODULE_LICENSE("GPL"); diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 62e3b46717a6..59b922c826e7 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -22,47 +22,16 @@ /* * Converts the calendar time to broken-down time representation - * Based on code from glibc-2.6 * * 2009-7-14: * Moved from glibc-2.6 to kernel by Zhaolei + * 2021-06-02: + * Reimplemented by Cassio Neri */ #include #include - -/* - * Nonzero if YEAR is a leap year (every 4 years, - * except every 100th isn't, and every 400th is). - */ -static int __isleap(long year) -{ - return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); -} - -/* do a mathdiv for long type */ -static long math_div(long a, long b) -{ - return a / b - (a % b < 0); -} - -/* How many leap years between y1 and y2, y1 must less or equal to y2 */ -static long leaps_between(long y1, long y2) -{ - long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) - + math_div(y1 - 1, 400); - long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) - + math_div(y2 - 1, 400); - return leaps2 - leaps1; -} - -/* How many days come before each month (0-12). */ -static const unsigned short __mon_yday[2][13] = { - /* Normal years. */ - {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, - /* Leap years. */ - {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} -}; +#include #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) @@ -77,9 +46,11 @@ static const unsigned short __mon_yday[2][13] = { */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { - long days, rem, y; + u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day; + u64 u64tmp, udays, century, year; + bool is_Jan_or_Feb, is_leap_year; + long days, rem; int remainder; - const unsigned short *ip; days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); rem = remainder; @@ -103,27 +74,68 @@ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) if (result->tm_wday < 0) result->tm_wday += 7; - y = 1970; + /* + * The following algorithm is, basically, Proposition 6.3 of Neri + * and Schneider [1]. In a few words: it works on the computational + * (fictitious) calendar where the year starts in March, month = 2 + * (*), and finishes in February, month = 13. 
This calendar is + * mathematically convenient because the day of the year does not + * depend on whether the year is leap or not. For instance: + * + * March 1st 0-th day of the year; + * ... + * April 1st 31-st day of the year; + * ... + * January 1st 306-th day of the year; (Important!) + * ... + * February 28th 364-th day of the year; + * February 29th 365-th day of the year (if it exists). + * + * After having worked out the date in the computational calendar + * (using just arithmetics) it's easy to convert it to the + * corresponding date in the Gregorian calendar. + * + * [1] "Euclidean Affine Functions and Applications to Calendar + * Algorithms". https://arxiv.org/abs/2102.06959 + * + * (*) The numbering of months follows tm more closely and thus, + * is slightly different from [1]. + */ - while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { - /* Guess a corrected year, assuming 365 days per year. */ - long yg = y + math_div(days, 365); + udays = ((u64) days) + 2305843009213814918ULL; - /* Adjust DAYS and Y to match the guessed year. */ - days -= (yg - y) * 365 + leaps_between(y, yg); - y = yg; - } + u64tmp = 4 * udays + 3; + century = div64_u64_rem(u64tmp, 146097, &u64tmp); + day_of_century = (u32) (u64tmp / 4); - result->tm_year = y - 1900; + u32tmp = 4 * day_of_century + 3; + u64tmp = 2939745ULL * u32tmp; + year_of_century = upper_32_bits(u64tmp); + day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; - result->tm_yday = days; + year = 100 * century + year_of_century; + is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4); - ip = __mon_yday[__isleap(y)]; - for (y = 11; days < ip[y]; y--) - continue; - days -= ip[y]; + u32tmp = 2141 * day_of_year + 132377; + month = u32tmp >> 16; + day = ((u16) u32tmp) / 2141; - result->tm_mon = y; - result->tm_mday = days + 1; + /* + * Recall that January 1st is the 306-th day of the year in the + * computational (not Gregorian) calendar. + */ + is_Jan_or_Feb = day_of_year >= 306; + + /* Convert to the Gregorian calendar and adjust to Unix time. */ + year = year + is_Jan_or_Feb - 6313183731940000ULL; + month = is_Jan_or_Feb ? month - 12 : month; + day = day + 1; + day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year; + + /* Convert to tm's format. */ + result->tm_year = (long) (year - 1900); + result->tm_mon = (int) month; + result->tm_mday = (int) day; + result->tm_yday = (int) day_of_year; } EXPORT_SYMBOL(time64_to_tm); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 6939140ab7c5..ed7d6ad694fb 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,6 +228,14 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + if (cpu >= 0) { + const struct clock_event_device *wd = tick_get_wakeup_device(cpu); + + SEQ_printf(m, "Wakeup Device: %s\n", wd ? 
wd->name : ""); + } +#endif SEQ_printf(m, "\n"); } @@ -248,7 +256,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.8\n"); + SEQ_printf(m, "Timer List Version: v0.9\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a0e1910d0a16..b9e223b08586 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2573,6 +2573,18 @@ config TEST_FPU If unsure, say N. +config TEST_CLOCKSOURCE_WATCHDOG + tristate "Test clocksource watchdog in kernel space" + depends on CLOCKSOURCE_WATCHDOG + help + Enable this option to create a kernel module that will trigger + a test of the clocksource watchdog. This module may be loaded + via modprobe or insmod in which case it will run upon being + loaded, or it may be built in, in which case it will run + shortly after boot. + + If unsure, say N. + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST