s390/idle: remove arch_cpu_idle_time() and corresponding code

arch_cpu_idle_time() returns the idle time of any given cpu if it is in
idle, or zero if not. All if this is racy and partially incorrect. Time
stamps taken with store clock extended and store clock fast from different
cpus are compared, while the architecture states that this is nothing which
can be relied on (see Principles of Operation; Chapter 4, "Setting and
Inspecting the Clock").

A more fundamental problem is that the timestamp when a cpu is leaving idle
is taken early in the assembler part of the interrupt handler, and this
value is only transferred many cycles later to the cpu's per-cpu idle data
structure.

This per cpu data structure is read by arch_cpu_idle() to tell for which
period of time a remote cpu is idle: if only an idle_enter value is
present, the assumed idle time of the cpu is calculated by taking a local
timestamp and returning the difference of the local timestamp and the
idle_enter value. This is potentially incorrect, since the remote cpu may
have already left idle, but the taken timestamp may not have been
transferred to the per-cpu data structure. This in turn means that too much
idle time may be reported for a cpu, and a subsequent calculation of system
idle time may result in a smaller value.

Instead of coming up with even more complex code trying to fix this, just
remove this code, and only account idle time of a cpu, after idle state is
left.

Another minor bug is that it is assumed that timestamps are non-zero, which
is not necessarily the case for timestamps taken with store clock
fast. This however is just a very minor problem, since this can only happen
when the epoch increases.

Reviewed-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
This commit is contained in:
Heiko Carstens 2023-02-07 17:39:42 +01:00
parent a02d584e72
commit be76ea6144
3 changed files with 9 additions and 72 deletions

View File

@ -16,10 +16,6 @@
*/
#define cputime_to_nsecs(cputime) tod_to_ns(cputime)
u64 arch_cpu_idle_time(int cpu);
#define arch_idle_time(cpu) arch_cpu_idle_time(cpu)
void account_idle_time_irq(void);
#endif /* _S390_CPUTIME_H */

View File

@ -10,16 +10,12 @@
#include <linux/types.h>
#include <linux/device.h>
#include <linux/seqlock.h>
struct s390_idle_data {
seqcount_t seqcount;
unsigned long idle_count;
unsigned long idle_time;
unsigned long clock_idle_enter;
unsigned long clock_idle_exit;
unsigned long timer_idle_enter;
unsigned long timer_idle_exit;
unsigned long mt_cycles_enter[8];
};

View File

@ -35,24 +35,18 @@ void account_idle_time_irq(void)
this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
}
idle->clock_idle_exit = S390_lowcore.int_clock;
idle->timer_idle_exit = S390_lowcore.sys_enter_timer;
idle_time = S390_lowcore.int_clock - idle->clock_idle_enter;
S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock;
S390_lowcore.last_update_clock = idle->clock_idle_exit;
S390_lowcore.last_update_clock = S390_lowcore.int_clock;
S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter;
S390_lowcore.last_update_timer = idle->timer_idle_exit;
S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer;
/* Account time spent with enabled wait psw loaded as idle time. */
raw_write_seqcount_begin(&idle->seqcount);
idle_time = idle->clock_idle_exit - idle->clock_idle_enter;
idle->clock_idle_enter = 0;
idle->clock_idle_exit = 0;
idle->idle_time += idle_time;
idle->idle_count++;
WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
account_idle_time(cputime_to_nsecs(idle_time));
raw_write_seqcount_end(&idle->seqcount);
}
void noinstr arch_cpu_idle(void)
@ -71,72 +65,23 @@ void noinstr arch_cpu_idle(void)
}
static ssize_t show_idle_count(struct device *dev,
struct device_attribute *attr, char *buf)
struct device_attribute *attr, char *buf)
{
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
unsigned long idle_count;
unsigned int seq;
do {
seq = read_seqcount_begin(&idle->seqcount);
idle_count = READ_ONCE(idle->idle_count);
if (READ_ONCE(idle->clock_idle_enter))
idle_count++;
} while (read_seqcount_retry(&idle->seqcount, seq));
return sprintf(buf, "%lu\n", idle_count);
return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count));
}
DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL);
static ssize_t show_idle_time(struct device *dev,
struct device_attribute *attr, char *buf)
struct device_attribute *attr, char *buf)
{
unsigned long now, idle_time, idle_enter, idle_exit, in_idle;
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
unsigned int seq;
do {
seq = read_seqcount_begin(&idle->seqcount);
idle_time = READ_ONCE(idle->idle_time);
idle_enter = READ_ONCE(idle->clock_idle_enter);
idle_exit = READ_ONCE(idle->clock_idle_exit);
} while (read_seqcount_retry(&idle->seqcount, seq));
in_idle = 0;
now = get_tod_clock();
if (idle_enter) {
if (idle_exit) {
in_idle = idle_exit - idle_enter;
} else if (now > idle_enter) {
in_idle = now - idle_enter;
}
}
idle_time += in_idle;
return sprintf(buf, "%lu\n", idle_time >> 12);
return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12);
}
DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
u64 arch_cpu_idle_time(int cpu)
{
struct s390_idle_data *idle = &per_cpu(s390_idle, cpu);
unsigned long now, idle_enter, idle_exit, in_idle;
unsigned int seq;
do {
seq = read_seqcount_begin(&idle->seqcount);
idle_enter = READ_ONCE(idle->clock_idle_enter);
idle_exit = READ_ONCE(idle->clock_idle_exit);
} while (read_seqcount_retry(&idle->seqcount, seq));
in_idle = 0;
now = get_tod_clock();
if (idle_enter) {
if (idle_exit) {
in_idle = idle_exit - idle_enter;
} else if (now > idle_enter) {
in_idle = now - idle_enter;
}
}
return cputime_to_nsecs(in_idle);
}
void arch_cpu_idle_enter(void)
{
}