linux/arch/powerpc/kernel/rtasd.c
Nathan Fontenot cd24e457fd powerpc/pseries: Remove prrn_work workqueue
When a PRRN event is received we are already running in a worker
thread. Instead of spawning off another worker thread on the prrn_work
workqueue to handle the PRRN event we can just call the PRRN handler
routine directly.

With this update we can also pass the scope variable for the PRRN
event directly to the handler instead of it being a global variable.

This patch fixes the following oops message we are seeing in PRRN testing:

  Oops: Bad kernel stack pointer, sig: 6 [#1]
  SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc fscache binfmt_misc reiserfs vfat fat rpadlpar_io(X) rpaphp(X) tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag af_packet xfs libcrc32c dm_service_time ibmveth(X) ses enclosure scsi_transport_sas rtc_generic btrfs xor raid6_pq sd_mod ibmvscsi(X) scsi_transport_srp ipr(X) libata sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4
  Supported: Yes, External                                                     54
  CPU: 7 PID: 18967 Comm: kworker/u96:0 Tainted: G                 X 4.4.126-94.22-default #1
  Workqueue: pseries hotplug workque pseries_hp_work_fn
  task: c000000775367790 ti: c00000001ebd4000 task.ti: c00000070d140000
  NIP: 0000000000000000 LR: 000000001fb3d050 CTR: 0000000000000000
  REGS: c00000001ebd7d40 TRAP: 0700   Tainted: G                 X  (4.4.126-94.22-default)
  MSR: 8000000102081000 <41,VEC,ME5  CR: 28000002  XER: 20040018   4
  CFAR: 000000001fb3d084 40 419   1                                3
  GPR00: 000000000000000040000000000010007 000000001ffff400 000000041fffe200
  GPR04: 000000000000008050000000000000000 000000001fb15fa8 0000000500000500
  GPR08: 000000000001f40040000000000000001 0000000000000000 000005:5200040002
  GPR12: 00000000000000005c000000007a05400 c0000000000e89f8 000000001ed9f668
  GPR16: 000000001fbeff944000000001fbeff94 000000001fb545e4 0000006000000060
  GPR20: ffffffffffffffff4ffffffffffffffff 0000000000000000 0000000000000000
  GPR24: 00000000000000005400000001fb3c000 0000000000000000 000000001fb1b040
  GPR28: 000000001fb240004000000001fb440d8 0000000000000008 0000000000000000
  NIP [0000000000000000] 5         (null)
  LR [000000001fb3d050] 031fb3d050
  Call Trace:            4
  Instruction dump:      4                                       5:47 12    2
  XXXXXXXX XXXXXXXX XXXXX4XX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
  XXXXXXXX XXXXXXXX XXXXX5XX XXXXXXXX 60000000 60000000 60000000 60000000
  ---[ end trace aa5627b04a7d9d6b ]---                                       3NMI watchdog: BUG: soft lockup - CPU#27 stuck for 23s! [kworker/27:0:13903]
  Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc fscache binfmt_misc reiserfs vfat fat rpadlpar_io(X) rpaphp(X) tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag af_packet xfs libcrc32c dm_service_time ibmveth(X) ses enclosure scsi_transport_sas rtc_generic btrfs xor raid6_pq sd_mod ibmvscsi(X) scsi_transport_srp ipr(X) libata sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4
  Supported: Yes, External
  CPU: 27 PID: 13903 Comm: kworker/27:0 Tainted: G      D          X 4.4.126-94.22-default #1
  Workqueue: events prrn_work_fn
  task: c000000747cfa390 ti: c00000074712c000 task.ti: c00000074712c000
  NIP: c0000000008002a8 LR: c000000000090770 CTR: 000000000032e088
  REGS: c00000074712f7b0 TRAP: 0901   Tainted: G      D          X  (4.4.126-94.22-default)
  MSR: 8000000100009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 22482044  XER: 20040000
  CFAR: c0000000008002c4 SOFTE: 1
  GPR00: c000000000090770 c00000074712fa30 c000000000f09800 c000000000fa1928 6:02
  GPR04: c000000775f5e000 fffffffffffffffe 0000000000000001 c000000000f42db8
  GPR08: 0000000000000001 0000000080000007 0000000000000000 0000000000000000
  GPR12: 8006210083180000 c000000007a14400
  NIP [c0000000008002a8] _raw_spin_lock+0x68/0xd0
  LR [c000000000090770] mobility_rtas_call+0x50/0x100
  Call Trace:            59                                        5
  [c00000074712fa60] [c000000000090770] mobility_rtas_call+0x50/0x100
  [c00000074712faf0] [c000000000090b08] pseries_devicetree_update+0xf8/0x530
  [c00000074712fc20] [c000000000031ba4] prrn_work_fn+0x34/0x50
  [c00000074712fc40] [c0000000000e0390] process_one_work+0x1a0/0x4e0
  [c00000074712fcd0] [c0000000000e0870] worker_thread+0x1a0/0x6105:57       2
  [c00000074712fd80] [c0000000000e8b18] kthread+0x128/0x150
  [c00000074712fe30] [c0000000000096f8] ret_from_kernel_thread+0x5c/0x64
  Instruction dump:
  2c090000 40c20010 7d40192d 40c2fff0 7c2004ac 2fa90000 40de0018 5:540030   3
  e8010010 ebe1fff8 7c0803a6 4e800020 <7c210b78> e92d0000 89290009 792affe3

Signed-off-by: John Allen <jallen@linux.ibm.com>
Signed-off-by: Haren Myneni <haren@us.ibm.com>
Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-19 22:08:12 +10:00

605 lines
15 KiB
C

/*
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Communication to userspace based on kernel/printk.c
*/
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/spinlock.h>
#include <linux/cpu.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/topology.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/rtas.h>
#include <asm/prom.h>
#include <asm/nvram.h>
#include <linux/atomic.h>
#include <asm/machdep.h>
#include <asm/topology.h>
/*
 * Taken by pSeries_log_error() and rtas_log_read() around every update of
 * the event ring (rtas_log_buf, rtas_log_start, rtas_log_size).
 */
static DEFINE_SPINLOCK(rtasd_log_lock);
/* rtas_log_read() sleeps here; pSeries_log_error() wakes it after queueing. */
static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
/* Event ring: LOG_NUMBER slots of rtas_error_log_buffer_max bytes each. */
static char *rtas_log_buf;
/* Slot number of the oldest queued event (masked with LOG_NUMBER_MASK on use). */
static unsigned long rtas_log_start;
/* Number of events currently queued in the ring. */
static unsigned long rtas_log_size;
/* Value of the "surveillance=" boot option; -1 means it was not given. */
static int surveillance_timeout = -1;
/* Largest RTAS error log, as returned by rtas_get_error_log_max(). */
static unsigned int rtas_error_log_max;
/* Size of one ring slot: rtas_error_log_max plus an int sequence number. */
static unsigned int rtas_error_log_buffer_max;
/* RTAS service tokens */
static unsigned int event_scan;
static unsigned int rtas_event_scan_rate;
/* Set by the "rtasmsgs=" boot option: hex-dump whole events when true. */
static bool full_rtas_msgs;
/* Stop logging to nvram after first fatal error */
static int logging_enabled; /* Until we initialize everything,
* make sure we don't try logging
* anything */
/* Sequence number stored in front of each queued event. */
static int error_log_cnt;
/*
* Since we use 32 bit RTAS, the physical address of this must be below
* 4G or else bad things happen. Allocate this in the kernel data and
* make it big enough.
*/
static unsigned char logdata[RTAS_ERROR_LOG_MAX];
/*
 * Printable names for the basic RTAS error types, indexed by type code.
 * Entry 0 is also what rtas_event_type() hands back for any code it does
 * not recognise.
 */
static char *rtas_type[] = {
	[0]  = "Unknown",
	[1]  = "Retry",
	[2]  = "TCE Error",
	[3]  = "Internal Device Failure",
	[4]  = "Timeout",
	[5]  = "Data Parity",
	[6]  = "Address Parity",
	[7]  = "Cache Parity",
	[8]  = "Address Invalid",
	[9]  = "ECC Uncorrected",
	[10] = "ECC Corrupted",
};
/*
 * Map an RTAS event type code to a human readable name.
 *
 * Codes 1..10 are looked up directly in rtas_type[]; the RTAS_TYPE_*
 * event classes get their own fixed strings.  Anything else is reported
 * as rtas_type[0] ("Unknown").
 */
static char *rtas_event_type(int type)
{
	if (type >= 1 && type <= 10)
		return rtas_type[type];

	switch (type) {
	case RTAS_TYPE_EPOW:
		return "EPOW";
	case RTAS_TYPE_PLATFORM:
		return "Platform Error";
	case RTAS_TYPE_IO:
		return "I/O Event";
	case RTAS_TYPE_INFO:
		return "Platform Information Event";
	case RTAS_TYPE_DEALLOC:
		return "Resource Deallocation Event";
	case RTAS_TYPE_DUMP:
		return "Dump Notification Event";
	case RTAS_TYPE_PRRN:
		return "Platform Resource Reassignment Event";
	default:
		return rtas_type[0];
	}
}
/*
 * Emit one RTAS event to the kernel log.
 *
 * To see this info, grep RTAS /var/log/messages; each entry is bracketed
 * by obvious begin/end lines carrying a unique identifier (error_log_cnt)
 * that persists across reboots.
 *
 * Format of error logs returned from RTAS:
 *   bytes     (size)             : contents
 *   --------------------------------------------------------
 *   0-7       (8)                : rtas_error_log
 *   8-47      (40)               : extended info
 *   48-51     (4)                : vendor id
 *   52-1023   (vendor specific)  : location code and debug data
 *
 * When full_rtas_msgs is clear only a one-line summary (sequence number,
 * type name, severity) is printed; otherwise the first @len bytes of @buf
 * are hex-dumped.
 */
static void printk_log_rtas(char *buf, int len)
{
	const int bytes_per_line = 16;
	char *label = "RTAS event";
	char line[64];
	int pos, col, used = 0;

	if (!full_rtas_msgs) {
		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;

		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
		       error_log_cnt, rtas_event_type(rtas_error_type(errlog)),
		       rtas_error_severity(errlog));
		return;
	}

	printk(RTAS_DEBUG "%d -------- %s begin --------\n",
	       error_log_cnt, label);

	/*
	 * Each output line holds bytes_per_line bytes and begins with
	 * "RTAS <line number>:" so that syslogd does not collapse lines
	 * that would otherwise be identical.  A blank separates every
	 * group of four bytes.
	 */
	for (pos = 0; pos < len; pos++) {
		col = pos % bytes_per_line;

		if (col == 0) {
			memset(line, 0, sizeof(line));
			used = sprintf(line, "RTAS %d:", pos / bytes_per_line);
		}

		if (pos % 4 == 0)
			used += sprintf(line + used, " ");
		used += sprintf(line + used, "%02x", (unsigned char)buf[pos]);

		if (col == bytes_per_line - 1)
			printk(KERN_DEBUG "%s\n", line);
	}

	/* Flush a trailing, partially filled line. */
	if (pos % bytes_per_line != 0)
		printk(KERN_DEBUG "%s\n", line);

	printk(RTAS_DEBUG "%d -------- %s end ----------\n",
	       error_log_cnt, label);
}
/*
 * Work out how many bytes of @buf make up the RTAS error log.
 *
 * The result is the 8 byte fixed header plus, for extended logs, the
 * extended log length from the header, clamped to rtas_error_log_max
 * (which is fetched lazily if it has not been set yet).
 */
static int log_rtas_len(char * buf)
{
	struct rtas_error_log *hdr = (struct rtas_error_log *)buf;
	uint32_t ext_len = rtas_error_extended_log_length(hdr);
	int total = 8;		/* rtas fixed header */

	/* extended header */
	if (rtas_error_extended(hdr) && ext_len)
		total += ext_len;

	if (rtas_error_log_max == 0)
		rtas_error_log_max = rtas_get_error_log_max();

	if (total > rtas_error_log_max)
		total = rtas_error_log_max;

	return total;
}
/*
* First write to nvram, if fatal error, that is the only
* place we log the info. The error will be picked up
* on the next reboot by rtasd. If not fatal, run the
* method for the type of error. Currently, only RTAS
* errors have methods implemented, but in the future
* there might be a need to store data in nvram before a
* call to panic().
*
* XXX We write to nvram periodically, to indicate error has
* been written and sync'd, but there is a possibility
* that if we don't shutdown correctly, a duplicate error
* record will be created on next reboot.
*/
void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
{
unsigned long offset;
unsigned long s;
int len = 0;
pr_debug("rtasd: logging event\n");
if (buf == NULL)
return;
spin_lock_irqsave(&rtasd_log_lock, s);
/* get length and increase count */
switch (err_type & ERR_TYPE_MASK) {
case ERR_TYPE_RTAS_LOG:
len = log_rtas_len(buf);
if (!(err_type & ERR_FLAG_BOOT))
error_log_cnt++;
break;
case ERR_TYPE_KERNEL_PANIC:
default:
WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
#ifdef CONFIG_PPC64
/* Write error to NVRAM */
if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
nvram_write_error_log(buf, len, err_type, error_log_cnt);
#endif /* CONFIG_PPC64 */
/*
* rtas errors can occur during boot, and we do want to capture
* those somewhere, even if nvram isn't ready (why not?), and even
* if rtasd isn't ready. Put them into the boot log, at least.
*/
if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
printk_log_rtas(buf, len);
/* Check to see if we need to or have stopped logging */
if (fatal || !logging_enabled) {
logging_enabled = 0;
WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
/* call type specific method for error */
switch (err_type & ERR_TYPE_MASK) {
case ERR_TYPE_RTAS_LOG:
offset = rtas_error_log_buffer_max *
((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
/* First copy over sequence number */
memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
/* Second copy over error log data */
offset += sizeof(int);
memcpy(&rtas_log_buf[offset], buf, len);
if (rtas_log_size < LOG_NUMBER)
rtas_log_size += 1;
else
rtas_log_start += 1;
WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
wake_up_interruptible(&rtas_log_wait);
break;
case ERR_TYPE_KERNEL_PANIC:
default:
WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
}
#ifdef CONFIG_PPC_PSERIES
/*
 * React to a PRRN event: refresh the device tree for the given scope and
 * then update the CPU topology.
 */
static void handle_prrn_event(s32 scope)
{
	/*
	 * For PRRN, we must pass the negative of the scope value in
	 * the RTAS event.
	 */
	s32 update_scope = -scope;

	pseries_devicetree_update(update_scope);
	numa_update_cpu_topology(false);
}
/*
 * Act on an RTAS event after it has been scanned.  Only PRRN events are
 * handled, and only while PRRN is enabled.
 */
static void handle_rtas_event(const struct rtas_error_log *log)
{
	bool is_prrn = rtas_error_type(log) == RTAS_TYPE_PRRN;

	/*
	 * For PRRN Events the extended log length is used to denote
	 * the scope for calling rtas update-nodes.
	 */
	if (is_prrn && prrn_is_enabled())
		handle_prrn_event(rtas_error_extended_log_length(log));
}
#else
/* Built without CONFIG_PPC_PSERIES: no RTAS event needs special handling. */
static void handle_rtas_event(const struct rtas_error_log *log)
{
}
#endif
/* open() handler for the error_log proc file: nothing to set up, always succeeds. */
static int rtas_log_open(struct inode *inode, struct file *file)
{
	int status = 0;

	return status;
}
/* release() handler for the error_log proc file: nothing to tear down, always succeeds. */
static int rtas_log_release(struct inode *inode, struct file *file)
{
	int status = 0;

	return status;
}
/*
 * read() handler for the error_log proc file.
 *
 * Hands exactly one ring slot (rtas_error_log_buffer_max bytes: sequence
 * number followed by the event) to userspace per call.
 *
 * While the ring is empty we know every event has been consumed, so the
 * copy kept in NVRAM can safely be cleared; after that we sit and wait
 * for something else to be logged.
 *
 * Returns the number of bytes copied, or
 *   -EINVAL   @buf is NULL or @count is smaller than one slot,
 *   -EFAULT   the user buffer is not writable,
 *   -ENOMEM   no memory for the bounce buffer,
 *   -EAGAIN   ring empty and the file is O_NONBLOCK,
 *   -ENODATA  ring empty and logging has been disabled,
 * or whatever wait_event_interruptible() returned when the sleep was
 * interrupted.
 */
static ssize_t rtas_log_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	unsigned long flags;
	unsigned long slot_off;
	char *kbuf;
	int ret;

	if (!buf || count < rtas_error_log_buffer_max)
		return -EINVAL;

	/* Always transfer exactly one slot. */
	count = rtas_error_log_buffer_max;

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;

	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	spin_lock_irqsave(&rtasd_log_lock, flags);

	/* if it's 0, then we know we got the last one (the one in NVRAM) */
	while (rtas_log_size == 0) {
		ret = 0;
		if (file->f_flags & O_NONBLOCK)
			ret = -EAGAIN;
		else if (!logging_enabled)
			ret = -ENODATA;

		if (ret) {
			spin_unlock_irqrestore(&rtasd_log_lock, flags);
			goto out;
		}

#ifdef CONFIG_PPC64
		nvram_clear_error_log();
#endif /* CONFIG_PPC64 */

		/* Sleep without the lock; retake it before re-checking. */
		spin_unlock_irqrestore(&rtasd_log_lock, flags);
		ret = wait_event_interruptible(rtas_log_wait, rtas_log_size);
		if (ret)
			goto out;
		spin_lock_irqsave(&rtasd_log_lock, flags);
	}

	/* Pull the oldest slot out of the ring into the bounce buffer. */
	slot_off = rtas_error_log_buffer_max *
		   (rtas_log_start & LOG_NUMBER_MASK);
	memcpy(kbuf, &rtas_log_buf[slot_off], count);

	rtas_log_start += 1;
	rtas_log_size -= 1;
	spin_unlock_irqrestore(&rtasd_log_lock, flags);

	if (copy_to_user(buf, kbuf, count))
		ret = -EFAULT;
	else
		ret = count;
out:
	kfree(kbuf);
	return ret;
}
/*
 * poll() handler for the error_log proc file: registers on rtas_log_wait
 * and reports the file readable whenever the ring holds an event.
 */
static __poll_t rtas_log_poll(struct file *file, poll_table *wait)
{
	__poll_t mask = 0;

	poll_wait(file, &rtas_log_wait, wait);

	if (rtas_log_size)
		mask = EPOLLIN | EPOLLRDNORM;

	return mask;
}
/* File operations for the error_log proc file registered by rtas_init(). */
static const struct file_operations proc_rtas_log_operations = {
	.open		= rtas_log_open,
	.release	= rtas_log_release,
	.read		= rtas_log_read,
	.poll		= rtas_log_poll,
	.llseek		= noop_llseek,
};
/*
 * Program the surveillance indicator with @timeout.
 *
 * Returns 0 on success and also when rtas_set_indicator() reports -EINVAL
 * (treated as "surveillance not supported"); returns -1 on any other
 * failure.
 */
static int enable_surveillance(int timeout)
{
	int rc = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);

	switch (rc) {
	case 0:
		return 0;
	case -EINVAL:
		printk(KERN_DEBUG "rtasd: surveillance not supported\n");
		return 0;
	default:
		printk(KERN_ERR "rtasd: could not update surveillance\n");
		return -1;
	}
}
static void do_event_scan(void)
{
int error;
do {
memset(logdata, 0, rtas_error_log_max);
error = rtas_call(event_scan, 4, 1, NULL,
RTAS_EVENT_SCAN_ALL_EVENTS, 0,
__pa(logdata), rtas_error_log_max);
if (error == -1) {
printk(KERN_ERR "event-scan failed\n");
break;
}
if (error == 0) {
if (rtas_error_type((struct rtas_error_log *)logdata) !=
RTAS_TYPE_PRRN)
pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG,
0);
handle_rtas_event((struct rtas_error_log *)logdata);
}
} while(error == 0);
}
/* Forward declaration: the delayed work below needs its handler. */
static void rtas_event_scan(struct work_struct *w);
/* Delayed work that runs rtas_event_scan(); the handler re-queues it itself. */
static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
/*
* Delay should be at least one second since some machines have problems if
* we call event-scan too quickly.
*
* This is the initial delay only: rtas_event_scan() replaces it with
* 30*HZ/rtas_event_scan_rate once the first pass is finished.
*/
static unsigned long event_scan_delay = 1*HZ;
/* Non-zero until rtas_event_scan() first wraps back to the first online CPU. */
static int first_pass = 1;
/*
 * Delayed-work handler: scan for events on the current CPU, then re-queue
 * the work on the next online CPU.
 *
 * When the walk wraps around to the first online CPU for the first time,
 * the scan interval is switched from the initial delay to
 * 30*HZ/rtas_event_scan_rate and, if a surveillance timeout was
 * configured, surveillance is enabled.
 */
static void rtas_event_scan(struct work_struct *w)
{
	unsigned int next_cpu;

	do_event_scan();

	get_online_cpus();

	/* raw_ OK because just using CPU as starting point. */
	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
	if (next_cpu >= nr_cpu_ids) {
		/* Ran off the end of the mask: wrap around. */
		next_cpu = cpumask_first(cpu_online_mask);

		if (first_pass) {
			first_pass = 0;
			event_scan_delay = 30*HZ/rtas_event_scan_rate;

			if (surveillance_timeout != -1) {
				pr_debug("rtasd: enabling surveillance\n");
				enable_surveillance(surveillance_timeout);
				pr_debug("rtasd: surveillance enabled\n");
			}
		}
	}

	schedule_delayed_work_on(next_cpu, &event_scan_work,
		__round_jiffies_relative(event_scan_delay, next_cpu));

	put_online_cpus();
}
#ifdef CONFIG_PPC64
/*
 * Fetch an error log left in NVRAM, switch logging on, and log the
 * recovered event (flagged ERR_FLAG_BOOT) unless the read failed or the
 * entry is marked as already logged.
 */
static void retrieve_nvram_error_log(void)
{
	unsigned int stored_type;
	int err;

	/* See if we have any error stored in NVRAM */
	memset(logdata, 0, rtas_error_log_max);
	err = nvram_read_error_log(logdata, rtas_error_log_max,
				   &stored_type, &error_log_cnt);

	/* We can use rtas_log_buf now */
	logging_enabled = 1;

	if (err || stored_type == ERR_FLAG_ALREADY_LOGGED)
		return;

	pSeries_log_error(logdata, stored_type | ERR_FLAG_BOOT, 0);
}
#else /* CONFIG_PPC64 */
/* Built without CONFIG_PPC64: no NVRAM error log is read back, so this is a no-op. */
static void retrieve_nvram_error_log(void)
{
}
#endif /* CONFIG_PPC64 */
/*
 * Kick off RTAS event scanning: replay any error log saved in NVRAM and
 * queue the first scan on the first online CPU after event_scan_delay.
 */
static void start_event_scan(void)
{
	printk(KERN_DEBUG "RTAS daemon started\n");
	/* rtas_event_scan_rate is unsigned, so the quotient is printed with %u. */
	pr_debug("rtasd: will sleep for %u milliseconds\n",
		 (30000 / rtas_event_scan_rate));

	/* Retrieve errors from nvram if any */
	retrieve_nvram_error_log();

	schedule_delayed_work_on(cpumask_first(cpu_online_mask),
				 &event_scan_work, event_scan_delay);
}
/*
 * Stop the periodic RTAS event scan.  Uses the _sync cancel variant on
 * event_scan_work.
 */
void rtas_cancel_event_scan(void)
{
	cancel_delayed_work_sync(&event_scan_work);
}
EXPORT_SYMBOL_GPL(rtas_cancel_event_scan);
/*
 * Set up RTAS event scanning (pseries and chrp machines only).
 *
 * Looks up the event-scan token and the scan rate, allocates the event
 * ring and starts scanning.  Returns 0 on success, on other machine types
 * and when the scan rate is zero; -ENODEV when either RTAS lookup fails;
 * -ENOMEM when the ring cannot be allocated.
 */
static int __init rtas_event_scan_init(void)
{
	if (!(machine_is(pseries) || machine_is(chrp)))
		return 0;

	/* No RTAS */
	event_scan = rtas_token("event-scan");
	if (event_scan == RTAS_UNKNOWN_SERVICE) {
		printk(KERN_INFO "rtasd: No event-scan on system\n");
		return -ENODEV;
	}

	rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
	if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
		printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
		return -ENODEV;
	}

	if (rtas_event_scan_rate == 0) {
		/* Broken firmware: take a rate of zero to mean don't scan */
		printk(KERN_DEBUG "rtasd: scan rate is 0, not scanning\n");
		return 0;
	}

	/* Make room for the sequence number */
	rtas_error_log_max = rtas_get_error_log_max();
	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);

	rtas_log_buf = vmalloc(array_size(LOG_NUMBER,
					  rtas_error_log_buffer_max));
	if (rtas_log_buf == NULL) {
		printk(KERN_ERR "rtasd: no memory\n");
		return -ENOMEM;
	}

	start_event_scan();

	return 0;
}
arch_initcall(rtas_event_scan_init);
/*
 * Create the "powerpc/rtas/error_log" proc file (pseries and chrp only).
 *
 * Returns -ENODEV when the event ring was never allocated.  Failure to
 * create the proc entry is logged but not treated as an error.
 */
static int __init rtas_init(void)
{
	struct proc_dir_entry *proc_entry;

	if (!(machine_is(pseries) || machine_is(chrp)))
		return 0;

	if (rtas_log_buf == NULL)
		return -ENODEV;

	proc_entry = proc_create("powerpc/rtas/error_log", 0400, NULL,
				 &proc_rtas_log_operations);
	if (proc_entry == NULL)
		printk(KERN_ERR "Failed to create error_log proc entry\n");

	return 0;
}
__initcall(rtas_init);
/*
 * Parse the "surveillance=" boot option.  A value in the range 0..255 is
 * stored in surveillance_timeout; anything else leaves it untouched.
 * Returns 0 on non-pseries machines, 1 otherwise.
 */
static int __init surveillance_setup(char *str)
{
	int timeout;

	/* We only do surveillance on pseries */
	if (!machine_is(pseries))
		return 0;

	if (get_option(&str, &timeout) && timeout >= 0 && timeout <= 255)
		surveillance_timeout = timeout;

	return 1;
}
__setup("surveillance=", surveillance_setup);
/*
 * Parse the "rtasmsgs=" boot option into full_rtas_msgs.  Returns 1 when
 * kstrtobool() accepted the string, 0 otherwise.
 */
static int __init rtasmsgs_setup(char *str)
{
	int err = kstrtobool(str, &full_rtas_msgs);

	return !err;
}
__setup("rtasmsgs=", rtasmsgs_setup);