mirror of
https://github.com/torvalds/linux.git
synced 2024-12-15 15:41:58 +00:00
IB/hfi1: Fix softlockup issue
Soft lockups can occur because the mad processing on different CPUs acquire the spin lock dc8051_lock: [534552.835870] [<ffffffffa026f993>] ? read_dev_port_cntr.isra.37+0x23/0x160 [hfi1] [534552.835880] [<ffffffffa02775af>] read_dev_cntr+0x4f/0x60 [hfi1] [534552.835893] [<ffffffffa028d7cd>] pma_get_opa_portstatus+0x64d/0x8c0 [hfi1] [534552.835904] [<ffffffffa0290e7d>] hfi1_process_mad+0x48d/0x18c0 [hfi1] [534552.835908] [<ffffffff811dc1f1>] ? __slab_free+0x81/0x2f0 [534552.835936] [<ffffffffa024c34e>] ? ib_mad_recv_done+0x21e/0xa30 [ib_core] [534552.835939] [<ffffffff811dd153>] ? __kmalloc+0x1f3/0x240 [534552.835947] [<ffffffffa024c3fb>] ib_mad_recv_done+0x2cb/0xa30 [ib_core] [534552.835955] [<ffffffffa0237c85>] __ib_process_cq+0x55/0xd0 [ib_core] [534552.835962] [<ffffffffa0237d70>] ib_cq_poll_work+0x20/0x60 [ib_core] [534552.835964] [<ffffffff810a7f3b>] process_one_work+0x17b/0x470 [534552.835966] [<ffffffff810a8d76>] worker_thread+0x126/0x410 [534552.835969] [<ffffffff810a8c50>] ? rescuer_thread+0x460/0x460 [534552.835971] [<ffffffff810b052f>] kthread+0xcf/0xe0 [534552.835974] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140 [534552.835977] [<ffffffff81696418>] ret_from_fork+0x58/0x90 [534552.835980] [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140 This issue is made worse when the 8051 is busy and the reads take longer. Fix by using a non-spinning lock procure. Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com> Reviewed-by: Mike Marciszyn <mike.marciniszyn@intel.com> Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
This commit is contained in:
parent
b6eac931b9
commit
22546b741a
@ -6410,18 +6410,17 @@ static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
|
||||
*
|
||||
* The expectation is that the caller of this routine would have taken
|
||||
* care of properly transitioning the link into the correct state.
|
||||
* NOTE: the caller needs to acquire the dd->dc8051_lock lock
|
||||
* before calling this function.
|
||||
*/
|
||||
static void dc_shutdown(struct hfi1_devdata *dd)
|
||||
static void _dc_shutdown(struct hfi1_devdata *dd)
|
||||
{
|
||||
unsigned long flags;
|
||||
lockdep_assert_held(&dd->dc8051_lock);
|
||||
|
||||
spin_lock_irqsave(&dd->dc8051_lock, flags);
|
||||
if (dd->dc_shutdown) {
|
||||
spin_unlock_irqrestore(&dd->dc8051_lock, flags);
|
||||
if (dd->dc_shutdown)
|
||||
return;
|
||||
}
|
||||
|
||||
dd->dc_shutdown = 1;
|
||||
spin_unlock_irqrestore(&dd->dc8051_lock, flags);
|
||||
/* Shutdown the LCB */
|
||||
lcb_shutdown(dd, 1);
|
||||
/*
|
||||
@ -6432,35 +6431,45 @@ static void dc_shutdown(struct hfi1_devdata *dd)
|
||||
write_csr(dd, DC_DC8051_CFG_RST, 0x1);
|
||||
}
|
||||
|
||||
static void dc_shutdown(struct hfi1_devdata *dd)
|
||||
{
|
||||
mutex_lock(&dd->dc8051_lock);
|
||||
_dc_shutdown(dd);
|
||||
mutex_unlock(&dd->dc8051_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Calling this after the DC has been brought out of reset should not
|
||||
* do any damage.
|
||||
* NOTE: the caller needs to acquire the dd->dc8051_lock lock
|
||||
* before calling this function.
|
||||
*/
|
||||
static void dc_start(struct hfi1_devdata *dd)
|
||||
static void _dc_start(struct hfi1_devdata *dd)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
lockdep_assert_held(&dd->dc8051_lock);
|
||||
|
||||
spin_lock_irqsave(&dd->dc8051_lock, flags);
|
||||
if (!dd->dc_shutdown)
|
||||
goto done;
|
||||
spin_unlock_irqrestore(&dd->dc8051_lock, flags);
|
||||
return;
|
||||
|
||||
/* Take the 8051 out of reset */
|
||||
write_csr(dd, DC_DC8051_CFG_RST, 0ull);
|
||||
/* Wait until 8051 is ready */
|
||||
ret = wait_fm_ready(dd, TIMEOUT_8051_START);
|
||||
if (ret) {
|
||||
if (wait_fm_ready(dd, TIMEOUT_8051_START))
|
||||
dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
|
||||
__func__);
|
||||
}
|
||||
|
||||
/* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
|
||||
write_csr(dd, DCC_CFG_RESET, 0x10);
|
||||
/* lcb_shutdown() with abort=1 does not restore these */
|
||||
write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
|
||||
spin_lock_irqsave(&dd->dc8051_lock, flags);
|
||||
dd->dc_shutdown = 0;
|
||||
done:
|
||||
spin_unlock_irqrestore(&dd->dc8051_lock, flags);
|
||||
}
|
||||
|
||||
static void dc_start(struct hfi1_devdata *dd)
|
||||
{
|
||||
mutex_lock(&dd->dc8051_lock);
|
||||
_dc_start(dd);
|
||||
mutex_unlock(&dd->dc8051_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -8513,16 +8522,11 @@ static int do_8051_command(
|
||||
{
|
||||
u64 reg, completed;
|
||||
int return_code;
|
||||
unsigned long flags;
|
||||
unsigned long timeout;
|
||||
|
||||
hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
|
||||
|
||||
/*
|
||||
* Alternative to holding the lock for a long time:
|
||||
* - keep busy wait - have other users bounce off
|
||||
*/
|
||||
spin_lock_irqsave(&dd->dc8051_lock, flags);
|
||||
mutex_lock(&dd->dc8051_lock);
|
||||
|
||||
/* We can't send any commands to the 8051 if it's in reset */
|
||||
if (dd->dc_shutdown) {
|
||||
@ -8548,10 +8552,8 @@ static int do_8051_command(
|
||||
return_code = -ENXIO;
|
||||
goto fail;
|
||||
}
|
||||
spin_unlock_irqrestore(&dd->dc8051_lock, flags);
|
||||
dc_shutdown(dd);
|
||||
dc_start(dd);
|
||||
spin_lock_irqsave(&dd->dc8051_lock, flags);
|
||||
_dc_shutdown(dd);
|
||||
_dc_start(dd);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -8632,8 +8634,7 @@ static int do_8051_command(
|
||||
write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
|
||||
|
||||
fail:
|
||||
spin_unlock_irqrestore(&dd->dc8051_lock, flags);
|
||||
|
||||
mutex_unlock(&dd->dc8051_lock);
|
||||
return return_code;
|
||||
}
|
||||
|
||||
@ -12007,6 +12008,10 @@ static void free_cntrs(struct hfi1_devdata *dd)
|
||||
dd->scntrs = NULL;
|
||||
kfree(dd->cntrnames);
|
||||
dd->cntrnames = NULL;
|
||||
if (dd->update_cntr_wq) {
|
||||
destroy_workqueue(dd->update_cntr_wq);
|
||||
dd->update_cntr_wq = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
|
||||
@ -12162,7 +12167,7 @@ u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
|
||||
return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
|
||||
}
|
||||
|
||||
static void update_synth_timer(unsigned long opaque)
|
||||
static void do_update_synth_timer(struct work_struct *work)
|
||||
{
|
||||
u64 cur_tx;
|
||||
u64 cur_rx;
|
||||
@ -12171,8 +12176,8 @@ static void update_synth_timer(unsigned long opaque)
|
||||
int i, j, vl;
|
||||
struct hfi1_pportdata *ppd;
|
||||
struct cntr_entry *entry;
|
||||
|
||||
struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
|
||||
struct hfi1_devdata *dd = container_of(work, struct hfi1_devdata,
|
||||
update_cntr_work);
|
||||
|
||||
/*
|
||||
* Rather than keep beating on the CSRs pick a minimal set that we can
|
||||
@ -12255,7 +12260,13 @@ static void update_synth_timer(unsigned long opaque)
|
||||
} else {
|
||||
hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
|
||||
}
|
||||
}
|
||||
|
||||
static void update_synth_timer(unsigned long opaque)
|
||||
{
|
||||
struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
|
||||
|
||||
queue_work(dd->update_cntr_wq, &dd->update_cntr_work);
|
||||
mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
|
||||
}
|
||||
|
||||
@ -12491,6 +12502,13 @@ static int init_cntrs(struct hfi1_devdata *dd)
|
||||
if (init_cpu_counters(dd))
|
||||
goto bail;
|
||||
|
||||
dd->update_cntr_wq = alloc_ordered_workqueue("hfi1_update_cntr_%d",
|
||||
WQ_MEM_RECLAIM, dd->unit);
|
||||
if (!dd->update_cntr_wq)
|
||||
goto bail;
|
||||
|
||||
INIT_WORK(&dd->update_cntr_work, do_update_synth_timer);
|
||||
|
||||
mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
|
||||
return 0;
|
||||
bail:
|
||||
|
@ -484,7 +484,7 @@ struct rvt_sge_state;
|
||||
#define HFI1_PART_ENFORCE_OUT 0x2
|
||||
|
||||
/* how often we check for synthetic counter wrap around */
|
||||
#define SYNTH_CNT_TIME 2
|
||||
#define SYNTH_CNT_TIME 3
|
||||
|
||||
/* Counter flags */
|
||||
#define CNTR_NORMAL 0x0 /* Normal counters, just read register */
|
||||
@ -962,8 +962,9 @@ struct hfi1_devdata {
|
||||
spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
|
||||
/* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
|
||||
spinlock_t uctxt_lock; /* rcd and user context changes */
|
||||
/* exclusive access to 8051 */
|
||||
spinlock_t dc8051_lock;
|
||||
struct mutex dc8051_lock; /* exclusive access to 8051 */
|
||||
struct workqueue_struct *update_cntr_wq;
|
||||
struct work_struct update_cntr_work;
|
||||
/* exclusive access to 8051 memory */
|
||||
spinlock_t dc8051_memlock;
|
||||
int dc8051_timed_out; /* remember if the 8051 timed out */
|
||||
|
@ -1081,11 +1081,11 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
|
||||
spin_lock_init(&dd->uctxt_lock);
|
||||
spin_lock_init(&dd->hfi1_diag_trans_lock);
|
||||
spin_lock_init(&dd->sc_init_lock);
|
||||
spin_lock_init(&dd->dc8051_lock);
|
||||
spin_lock_init(&dd->dc8051_memlock);
|
||||
seqlock_init(&dd->sc2vl_lock);
|
||||
spin_lock_init(&dd->sde_map_lock);
|
||||
spin_lock_init(&dd->pio_map_lock);
|
||||
mutex_init(&dd->dc8051_lock);
|
||||
init_waitqueue_head(&dd->event_queue);
|
||||
|
||||
dd->int_counter = alloc_percpu(u64);
|
||||
|
Loading…
Reference in New Issue
Block a user