IB/ipath: Log "active" time and some errors to EEPROM

We currently track various errors; now we enhance that capability by
logging some of them to EEPROM.  We also now log a cumulative "active"
time defined by traffic through the InfiniPath HCA beyond the normal SM
traffic.

Signed-off-by: Michael Albaugh <michael.albaugh@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
This commit is contained in:
Michael Albaugh 2007-05-17 07:26:28 -07:00 committed by Roland Dreier
parent 8e9ab3f1c9
commit aecd3b5ab1
9 changed files with 370 additions and 8 deletions

View File

@ -2005,6 +2005,9 @@ void ipath_shutdown_device(struct ipath_devdata *dd)
~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
ipath_update_eeprom_log(dd);
}
/**

View File

@ -367,8 +367,8 @@ bail:
* @len: number of bytes to receive
*/
int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
void *buffer, int len)
static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
u8 eeprom_offset, void *buffer, int len)
{
/* compiler complains unless initialized */
u8 single_byte = 0;
@ -418,6 +418,7 @@ bail:
return ret;
}
/**
* ipath_eeprom_write - writes data to the eeprom via I2C
* @dd: the infinipath device
@ -425,8 +426,8 @@ bail:
* @buffer: data to write
* @len: number of bytes to write
*/
int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
const void *buffer, int len)
int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
const void *buffer, int len)
{
u8 single_byte;
int sub_len;
@ -500,6 +501,38 @@ bail:
return ret;
}
/*
* The public entry-points ipath_eeprom_read() and ipath_eeprom_write()
* are now just wrappers around the internal functions.
*/
/*
 * ipath_eeprom_read - read from the EEPROM while holding ipath_eep_sem
 *
 * Takes the EEPROM semaphore (interruptibly), performs the read through
 * ipath_eeprom_internal_read(), and releases the semaphore again.
 * Returns the non-zero result of down_interruptible() if the semaphore
 * could not be taken, otherwise whatever the internal read returned.
 */
int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
		      void *buff, int len)
{
	int status = down_interruptible(&dd->ipath_eep_sem);

	/* never touch the EEPROM if we did not get the semaphore */
	if (status)
		return status;

	status = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
	up(&dd->ipath_eep_sem);

	return status;
}
/*
 * ipath_eeprom_write - write to the EEPROM while holding ipath_eep_sem
 *
 * Takes the EEPROM semaphore (interruptibly), performs the write through
 * ipath_eeprom_internal_write(), and releases the semaphore again.
 * Returns the non-zero result of down_interruptible() if the semaphore
 * could not be taken, otherwise whatever the internal write returned.
 */
int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
		       const void *buff, int len)
{
	int status = down_interruptible(&dd->ipath_eep_sem);

	/* never touch the EEPROM if we did not get the semaphore */
	if (status)
		return status;

	status = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
	up(&dd->ipath_eep_sem);

	return status;
}
static u8 flash_csum(struct ipath_flash *ifp, int adjust)
{
u8 *ip = (u8 *) ifp;
@ -527,7 +560,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
void *buf;
struct ipath_flash *ifp;
__be64 guid;
int len;
int len, eep_stat;
u8 csum, *bguid;
int t = dd->ipath_unit;
struct ipath_devdata *dd0 = ipath_lookup(0);
@ -571,7 +604,11 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
goto bail;
}
if (ipath_eeprom_read(dd, 0, buf, len)) {
down(&dd->ipath_eep_sem);
eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
up(&dd->ipath_eep_sem);
if (eep_stat) {
ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
goto done;
}
@ -646,8 +683,192 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
(unsigned long long) be64_to_cpu(dd->ipath_guid));
memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
/*
* Power-on (actually "active") hours are kept as little-endian value
* in EEPROM, but as seconds in a (possibly as small as 24-bit)
* atomic_t while running.
*/
atomic_set(&dd->ipath_active_time, 0);
dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
done:
vfree(buf);
bail:;
}
/**
 * ipath_update_eeprom_log - copy active-time and error counters to eeprom
 * @dd: the infinipath device
 *
 * Although the time is kept as seconds in the ipath_devdata struct, it is
 * rounded down to whole hours for re-write, as we have only 16 bits in
 * EEPROM.  The remainder stays in dd->ipath_active_time so it counts
 * toward the next hour.
 *
 * First-cut code reads whole (expected) struct ipath_flash, modifies,
 * re-writes. Future direction: read/write only what we need, assuming
 * that the EEPROM had to have been "good enough" for driver init, and
 * if not, we aren't making it worse.
 *
 * Takes dd->ipath_eep_sem with down_interruptible(), so it must only be
 * called from a context that is allowed to sleep.
 *
 * Returns 0 if there was nothing to log or the update succeeded.
 * Returns 1 if the buffer could not be allocated or the EEPROM checksum
 * did not match; otherwise returns the non-zero result of
 * down_interruptible(), ipath_eeprom_internal_read() or
 * ipath_eeprom_internal_write(), whichever failed.
 */
int ipath_update_eeprom_log(struct ipath_devdata *dd)
{
	void *buf;
	struct ipath_flash *ifp;
	int len, hi_water;
	uint32_t new_time, new_hrs;
	u8 csum;
	int ret, idx;
	unsigned long flags;

	/* first, check if we actually need to do anything. */
	ret = 0;
	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
		if (dd->ipath_eep_st_new_errs[idx]) {
			ret = 1;
			break;
		}
	}
	new_time = atomic_read(&dd->ipath_active_time);

	/* no new errors and less than a full hour accumulated: nothing to do */
	if (ret == 0 && new_time < 3600)
		return 0;

	/*
	 * The quick-check above determined that there is something worthy
	 * of logging, so get current contents and do a more detailed check.
	 */
	len = offsetof(struct ipath_flash, if_future);
	buf = vmalloc(len);
	ret = 1;
	if (!buf) {
		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
			      "bytes from eeprom for logging\n", len);
		goto bail;
	}

	/*
	 * Grab semaphore and read current EEPROM. If we get an
	 * error, let go, but if not, keep it until we finish write.
	 */
	ret = down_interruptible(&dd->ipath_eep_sem);
	if (ret) {
		ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
		goto free_bail;
	}
	ret = ipath_eeprom_internal_read(dd, 0, buf, len);
	if (ret) {
		up(&dd->ipath_eep_sem);
		ipath_dev_err(dd, "Unable read EEPROM for logging\n");
		goto free_bail;
	}
	ifp = (struct ipath_flash *)buf;

	/* refuse to modify an image whose checksum is already bad */
	csum = flash_csum(ifp, 0);
	if (csum != ifp->if_csum) {
		up(&dd->ipath_eep_sem);
		ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
			      csum, ifp->if_csum);
		ret = 1;
		goto free_bail;
	}

	/*
	 * hi_water tracks the offset of the last byte we changed in the
	 * image; zero means "nothing changed, skip the write".
	 */
	hi_water = 0;
	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
		int new_val = dd->ipath_eep_st_new_errs[idx];
		if (new_val) {
			/*
			 * If we have seen any errors, add to EEPROM values
			 * We need to saturate at 0xFF (255) and we also
			 * would need to adjust the checksum if we were
			 * trying to minimize EEPROM traffic
			 * Note that we add to actual current count in EEPROM,
			 * in case it was altered while we were running.
			 */
			new_val += ifp->if_errcntp[idx];
			if (new_val > 0xFF)
				new_val = 0xFF;
			if (ifp->if_errcntp[idx] != new_val) {
				ifp->if_errcntp[idx] = new_val;
				hi_water = offsetof(struct ipath_flash,
						    if_errcntp) + idx;
			}
			/*
			 * update our shadow (used to minimize EEPROM
			 * traffic), to match what we are about to write.
			 */
			dd->ipath_eep_st_errs[idx] = new_val;
			dd->ipath_eep_st_new_errs[idx] = 0;
		}
	}

	/*
	 * now update active-time. We would like to round to the nearest hour
	 * but unless atomic_t are sure to be proper signed ints we cannot,
	 * because we need to account for what we "transfer" to EEPROM and
	 * if we log an hour at 31 minutes, then we would need to set
	 * active_time to -29 to accurately count the _next_ hour.
	 *
	 * The test is ">=" so that it agrees with the quick-check above:
	 * exactly one full hour (3600 s) is logged rather than causing an
	 * EEPROM read that then records nothing.
	 */
	if (new_time >= 3600) {
		new_hrs = new_time / 3600;
		/* keep the sub-hour remainder for the next update */
		atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
		new_hrs += dd->ipath_eep_hrs;
		if (new_hrs > 0xFFFF)
			new_hrs = 0xFFFF;
		dd->ipath_eep_hrs = new_hrs;
		/* hours are stored little-endian: low byte first */
		if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
			ifp->if_powerhour[0] = new_hrs & 0xFF;
			hi_water = offsetof(struct ipath_flash, if_powerhour);
		}
		if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
			ifp->if_powerhour[1] = new_hrs >> 8;
			hi_water = offsetof(struct ipath_flash, if_powerhour)
				+ 1;
		}
	}

	/*
	 * There is a tiny possibility that we could somehow fail to write
	 * the EEPROM after updating our shadows, but problems from holding
	 * the spinlock too long are a much bigger issue.
	 */
	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
	if (hi_water) {
		/*
		 * we made some change to the data, update cksum and write.
		 * NOTE(review): flash_csum(ifp, 1) is presumably what stores
		 * the adjusted checksum into the image; its body is not
		 * visible here -- confirm.
		 */
		csum = flash_csum(ifp, 1);
		ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
	}
	up(&dd->ipath_eep_sem);
	if (ret)
		ipath_dev_err(dd, "Failed updating EEPROM\n");

free_bail:
	vfree(buf);
bail:
	return ret;
}
/**
 * ipath_inc_eeprom_err - increment one of the four error counters
 * that are logged to EEPROM.
 * @dd: the infinipath device
 * @eidx: 0..3, the counter to increment
 * @incr: how much to add
 *
 * Each counter is 8-bits, and saturates at 255 (0xFF). They
 * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
 * is called, but it can only be called in a context that allows sleep.
 * This function can be called even at interrupt level.
 *
 * An out-of-range @eidx is ignored rather than being used as an index.
 */
void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
{
	uint new_val;
	unsigned long flags;

	/* never write outside the counter array */
	if (eidx >= sizeof(dd->ipath_eep_st_new_errs) /
		    sizeof(dd->ipath_eep_st_new_errs[0]))
		return;

	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
	new_val = dd->ipath_eep_st_new_errs[eidx];
	/*
	 * Compare against the remaining headroom before adding, so that a
	 * very large incr cannot wrap the 32-bit sum back below 255 and
	 * defeat the saturation.
	 */
	if (incr > 255 - new_val)
		new_val = 255;
	else
		new_val += incr;
	dd->ipath_eep_st_new_errs[eidx] = new_val;
	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
}

View File

@ -440,6 +440,7 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
u32 bits, ctrl;
int isfatal = 0;
char bitsmsg[64];
int log_idx;
hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
@ -468,6 +469,11 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
hwerrs &= dd->ipath_hwerrmask;
/* We log some errors to EEPROM, check if we have any of those. */
for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
ipath_inc_eeprom_err(dd, log_idx, 1);
/*
* make sure we get this much out, unless told to be quiet,
* it's a parity error we may recover from,
@ -1171,6 +1177,22 @@ static void ipath_init_ht_variables(struct ipath_devdata *dd)
dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
/*
* EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
* 2 is Some Misc, 3 is reserved for future.
*/
dd->ipath_eep_st_masks[0].hwerrs_to_log =
INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
dd->ipath_eep_st_masks[1].hwerrs_to_log =
INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
dd->ipath_eep_st_masks[2].errs_to_log =
INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET;
}
/**

View File

@ -340,6 +340,7 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
u32 bits, ctrl;
int isfatal = 0;
char bitsmsg[64];
int log_idx;
hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
if (!hwerrs) {
@ -367,6 +368,11 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
hwerrs &= dd->ipath_hwerrmask;
/* We log some errors to EEPROM, check if we have any of those. */
for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
ipath_inc_eeprom_err(dd, log_idx, 1);
/*
* make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds
@ -950,6 +956,27 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
/*
* EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
* 2 is Some Misc, 3 is reserved for future.
*/
dd->ipath_eep_st_masks[0].hwerrs_to_log =
INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
/* Ignore errors in PIO/PBC on systems with unordered write-combining */
if (ipath_unordered_wc())
dd->ipath_eep_st_masks[0].hwerrs_to_log &= ~TXE_PIO_PARITY;
dd->ipath_eep_st_masks[1].hwerrs_to_log =
INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
dd->ipath_eep_st_masks[2].errs_to_log =
INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET;
}
/* setup the MSI stuff again after a reset. I'd like to just call

View File

@ -341,6 +341,8 @@ static int init_chip_first(struct ipath_devdata *dd,
spin_lock_init(&dd->ipath_tid_lock);
spin_lock_init(&dd->ipath_gpio_lock);
spin_lock_init(&dd->ipath_eep_st_lock);
sema_init(&dd->ipath_eep_sem, 1);
done:
*pdp = pd;

View File

@ -505,6 +505,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
int i, iserr = 0;
int chkerrpkts = 0, noprint = 0;
unsigned supp_msgs;
int log_idx;
supp_msgs = handle_frequent_errors(dd, errs, msg, &noprint);
@ -518,6 +519,13 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
if (errs & INFINIPATH_E_HARDWARE) {
/* reuse same msg buf */
dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
} else {
u64 mask;
for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
if (errs & mask)
ipath_inc_eeprom_err(dd, log_idx, 1);
}
}
if (!noprint && (errs & ~dd->ipath_e_bitsextant))

View File

@ -57,6 +57,24 @@
extern struct infinipath_stats ipath_stats;
#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
/*
* First-cut criterion for "device is active" is
* two thousand dwords combined Tx, Rx traffic per
* 5-second interval. SMA packets are 64 dwords,
* and occur "a few per second", presumably each way.
*/
#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
/*
* Struct used to indicate which errors are logged in each of the
* error-counters that are logged to EEPROM. A counter is incremented
* _once_ (saturating at 255) for each event with any bits set in
* the error or hwerror register masks below.
*/
#define IPATH_EEP_LOG_CNT (4)
/*
 * One entry per EEPROM-logged error counter: the bit masks that decide
 * whether an event bumps that counter.
 */
struct ipath_eep_log_mask {
/* ANDed with the error bits in handle_errors(); any match counts once */
u64 errs_to_log;
/* ANDed with the hwerrs bits in the chip hwerror handlers; any match counts once */
u64 hwerrs_to_log;
};
struct ipath_portdata {
void **port_rcvegrbuf;
@ -588,6 +606,24 @@ struct ipath_devdata {
/* Used to flash LEDs in override mode */
struct timer_list ipath_led_override_timer;
/* Support (including locks) for EEPROM logging of errors and time */
/* control access to actual counters, timer */
spinlock_t ipath_eep_st_lock;
/* control high-level access to EEPROM */
struct semaphore ipath_eep_sem;
/* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
uint64_t ipath_traffic_wds;
/* active time is kept in seconds, but logged in hours */
atomic_t ipath_active_time;
/* Below are nominal shadow of EEPROM, new since last EEPROM update */
uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
uint16_t ipath_eep_hrs;
/*
* masks for which bits of errs, hwerrs that cause
* each of the counters to increment.
*/
struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
};
/* Private data for file operations */
@ -726,6 +762,8 @@ u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32 *);
void ipath_init_iba6120_funcs(struct ipath_devdata *);
void ipath_init_iba6110_funcs(struct ipath_devdata *);
void ipath_get_eeprom_info(struct ipath_devdata *);
int ipath_update_eeprom_log(struct ipath_devdata *dd);
void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
void ipath_disarm_senderrbufs(struct ipath_devdata *, int);

View File

@ -55,6 +55,7 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
u64 val64;
unsigned long t0, t1;
u64 ret;
unsigned long flags;
t0 = jiffies;
/* If fast increment counters are only 32 bits, snapshot them,
@ -91,12 +92,18 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
if (creg == dd->ipath_cregs->cr_wordsendcnt) {
if (val != dd->ipath_lastsword) {
dd->ipath_sword += val - dd->ipath_lastsword;
spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
dd->ipath_traffic_wds += val - dd->ipath_lastsword;
spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
dd->ipath_lastsword = val;
}
val64 = dd->ipath_sword;
} else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
if (val != dd->ipath_lastrword) {
dd->ipath_rword += val - dd->ipath_lastrword;
spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
dd->ipath_traffic_wds += val - dd->ipath_lastrword;
spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
dd->ipath_lastrword = val;
}
val64 = dd->ipath_rword;
@ -200,6 +207,7 @@ void ipath_get_faststats(unsigned long opaque)
struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
u32 val;
static unsigned cnt;
unsigned long flags;
/*
* don't access the chip while running diags, or memory diags can
@ -210,9 +218,20 @@ void ipath_get_faststats(unsigned long opaque)
/* but re-arm the timer, for diags case; won't hurt other */
goto done;
/*
* We now try to maintain an "active timer", based on traffic
* exceeding a threshold, so we need to check the word-counts
* even if they are 64-bit.
*/
ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
if (dd->ipath_traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
atomic_add(5, &dd->ipath_active_time); /* S/B #define */
dd->ipath_traffic_wds = 0;
spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
}

View File

@ -613,6 +613,26 @@ static ssize_t store_led_override(struct device *dev,
return ret;
}
/*
 * sysfs "logged_errors" show routine.
 *
 * Flushes any pending counts through ipath_update_eeprom_log() so the
 * shadow values agree with the EEPROM, then prints the shadow error
 * counters as space-separated decimals terminated by a newline.
 * Returns the number of bytes placed in @buf, or -ENXIO if the EEPROM
 * log update reported a failure.
 */
static ssize_t show_logged_errs(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct ipath_devdata *dd = dev_get_drvdata(dev);
	int used = 0;
	int i;

	/* force consistency with actual EEPROM */
	if (ipath_update_eeprom_log(dd))
		return -ENXIO;

	for (i = 0; i < IPATH_EEP_LOG_CNT; i++) {
		/* last value ends the line, the others are space-separated */
		char sep = (i + 1 < IPATH_EEP_LOG_CNT) ? ' ' : '\n';

		used += scnprintf(buf + used, PAGE_SIZE - used, "%d%c",
				  dd->ipath_eep_st_errs[i], sep);
	}

	return used;
}
static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
@ -643,6 +663,7 @@ static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
static struct attribute *dev_attributes[] = {
&dev_attr_guid.attr,
@ -660,6 +681,7 @@ static struct attribute *dev_attributes[] = {
&dev_attr_enabled.attr,
&dev_attr_rx_pol_inv.attr,
&dev_attr_led_override.attr,
&dev_attr_logged_errors.attr,
NULL
};