staging/rdma/hfi: fix CQ completion order issue

The current implementation of the sdma_wait variable
has a timing hole that can cause a completion Q entry
to be returned from a pio send prior to an older
sdma packets completion queue entry.

The sdma_wait variable used to be decremented prior to
calling the packet complete routine.  The hole is between decrement
and the verbs completion where send engine using pio could return
a out of order completion in that window.

This patch closes the hole by allowing an API option to
specify an sdma_drained callback.   The atomic dec
is positioned after the complete callback to avoid the
window as long as the pio path doesn't execute when
there is a non-zero sdma count.

Reviewed-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Dean Luick <dean.luick@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
This commit is contained in:
Mike Marciniszyn 2016-02-14 12:45:53 -08:00 committed by Doug Ledford
parent 91702b4a39
commit a545f5308b
7 changed files with 65 additions and 92 deletions

View File

@ -69,7 +69,8 @@ struct sdma_engine;
* @list: used to add/insert into QP/PQ wait lists
* @tx_head: overflow list of sdma_txreq's
* @sleep: no space callback
* @wakeup: space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
* @wait_pio: wait for pio_busy == 0
@ -104,6 +105,7 @@ struct iowait {
struct sdma_txreq *tx,
unsigned seq);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
struct work_struct iowork;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
@ -122,7 +124,7 @@ struct iowait {
* @tx_limit: limit for overflow queuing
* @func: restart function for workqueue
* @sleep: sleep function for no space
* @wakeup: wakeup function for no space
* @resume: wakeup function for no space
*
* This function initializes the iowait
* structure embedded in the QP or PQ.
@ -138,7 +140,8 @@ static inline void iowait_init(
struct iowait *wait,
struct sdma_txreq *tx,
unsigned seq),
void (*wakeup)(struct iowait *wait, int reason))
void (*wakeup)(struct iowait *wait, int reason),
void (*sdma_drained)(struct iowait *wait))
{
wait->count = 0;
INIT_LIST_HEAD(&wait->list);
@ -151,6 +154,7 @@ static inline void iowait_init(
wait->tx_limit = tx_limit;
wait->sleep = sleep;
wait->wakeup = wakeup;
wait->sdma_drained = sdma_drained;
}
/**
@ -273,6 +277,8 @@ static inline void iowait_drain_wakeup(struct iowait *wait)
{
wake_up(&wait->wait_dma);
wake_up(&wait->wait_pio);
if (wait->sdma_drained)
wait->sdma_drained(wait);
}
/**

View File

@ -73,6 +73,7 @@ static int iowait_sleep(
struct sdma_txreq *stx,
unsigned seq);
static void iowait_wakeup(struct iowait *wait, int reason);
static void iowait_sdma_drained(struct iowait *wait);
static void qp_pio_drain(struct rvt_qp *qp);
static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
@ -509,6 +510,22 @@ static void iowait_wakeup(struct iowait *wait, int reason)
hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC);
}
static void iowait_sdma_drained(struct iowait *wait)
{
struct rvt_qp *qp = iowait_to_qp(wait);
/*
* This happens when the send engine notes
* a QP in the error state and cannot
* do the flush work until that QP's
* sdma work has finished.
*/
if (qp->s_flags & RVT_S_WAIT_DMA) {
qp->s_flags &= ~RVT_S_WAIT_DMA;
hfi1_schedule_send(qp);
}
}
/**
*
* qp_to_sdma_engine - map a qp to a send engine
@ -773,7 +790,8 @@ void notify_qp_reset(struct rvt_qp *qp)
1,
_hfi1_do_send,
iowait_sleep,
iowait_wakeup);
iowait_wakeup,
iowait_sdma_drained);
priv->r_adefered = 0;
clear_ahg(qp);
}

View File

@ -361,6 +361,28 @@ static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
write_sde_csr(sde, SD(DESC_CNT), reg);
}
static inline void complete_tx(struct sdma_engine *sde,
struct sdma_txreq *tx,
int res)
{
/* protect against complete modifying */
struct iowait *wait = tx->wait;
callback_t complete = tx->complete;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
trace_hfi1_sdma_out_sn(sde, txp->sn);
if (WARN_ON_ONCE(sde->head_sn != txp->sn))
dd_dev_err(sde->dd, "expected %llu got %llu\n",
sde->head_sn, txp->sn);
sde->head_sn++;
#endif
sdma_txclean(sde->dd, tx);
if (complete)
(*complete)(tx, res);
if (iowait_sdma_dec(wait) && wait)
iowait_drain_wakeup(wait);
}
/*
* Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
*
@ -395,27 +417,8 @@ static void sdma_flush(struct sdma_engine *sde)
}
spin_unlock_irqrestore(&sde->flushlist_lock, flags);
/* flush from flush list */
list_for_each_entry_safe(txp, txp_next, &flushlist, list) {
int drained = 0;
/* protect against complete modifying */
struct iowait *wait = txp->wait;
list_del_init(&txp->list);
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
trace_hfi1_sdma_out_sn(sde, txp->sn);
if (WARN_ON_ONCE(sde->head_sn != txp->sn))
dd_dev_err(sde->dd, "expected %llu got %llu\n",
sde->head_sn, txp->sn);
sde->head_sn++;
#endif
sdma_txclean(sde->dd, txp);
if (wait)
drained = iowait_sdma_dec(wait);
if (txp->complete)
(*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
if (wait && drained)
iowait_drain_wakeup(wait);
}
list_for_each_entry_safe(txp, txp_next, &flushlist, list)
complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
}
/*
@ -577,31 +580,10 @@ static void sdma_flush_descq(struct sdma_engine *sde)
head = ++sde->descq_head & sde->sdma_mask;
/* if now past this txp's descs, do the callback */
if (txp && txp->next_descq_idx == head) {
int drained = 0;
/* protect against complete modifying */
struct iowait *wait = txp->wait;
/* remove from list */
sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
if (wait)
drained = iowait_sdma_dec(wait);
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
trace_hfi1_sdma_out_sn(sde, txp->sn);
if (WARN_ON_ONCE(sde->head_sn != txp->sn))
dd_dev_err(sde->dd, "expected %llu got %llu\n",
sde->head_sn, txp->sn);
sde->head_sn++;
#endif
sdma_txclean(sde->dd, txp);
complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
trace_hfi1_sdma_progress(sde, head, tail, txp);
if (txp->complete)
(*txp->complete)(
txp,
SDMA_TXREQ_S_ABORTED,
drained);
if (wait && drained)
iowait_drain_wakeup(wait);
/* see if there is another txp */
txp = get_txhead(sde);
}
progress++;
@ -1470,7 +1452,7 @@ static void sdma_make_progress(struct sdma_engine *sde, u64 status)
{
struct sdma_txreq *txp = NULL;
int progress = 0;
u16 hwhead, swhead, swtail;
u16 hwhead, swhead;
int idle_check_done = 0;
hwhead = sdma_gethead(sde);
@ -1491,29 +1473,9 @@ retry:
/* if now past this txp's descs, do the callback */
if (txp && txp->next_descq_idx == swhead) {
int drained = 0;
/* protect against complete modifying */
struct iowait *wait = txp->wait;
/* remove from list */
sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
if (wait)
drained = iowait_sdma_dec(wait);
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
trace_hfi1_sdma_out_sn(sde, txp->sn);
if (WARN_ON_ONCE(sde->head_sn != txp->sn))
dd_dev_err(sde->dd, "expected %llu got %llu\n",
sde->head_sn, txp->sn);
sde->head_sn++;
#endif
sdma_txclean(sde->dd, txp);
if (txp->complete)
(*txp->complete)(
txp,
SDMA_TXREQ_S_OK,
drained);
if (wait && drained)
iowait_drain_wakeup(wait);
complete_tx(sde, txp, SDMA_TXREQ_S_OK);
/* see if there is another txp */
txp = get_txhead(sde);
}
@ -1531,6 +1493,8 @@ retry:
* of sdma_make_progress(..) which is ensured by idle_check_done flag
*/
if ((status & sde->idle_mask) && !idle_check_done) {
u16 swtail;
swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
if (swtail != hwhead) {
hwhead = (u16)read_sde_csr(sde, SD(HEAD));

View File

@ -555,7 +555,7 @@ static inline int sdma_txinit_ahg(
u8 num_ahg,
u32 *ahg,
u8 ahg_hlen,
void (*cb)(struct sdma_txreq *, int, int))
void (*cb)(struct sdma_txreq *, int))
{
if (tlen == 0)
return -ENODATA;
@ -618,7 +618,7 @@ static inline int sdma_txinit(
struct sdma_txreq *tx,
u16 flags,
u16 tlen,
void (*cb)(struct sdma_txreq *, int, int))
void (*cb)(struct sdma_txreq *, int))
{
return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
}

View File

@ -93,7 +93,7 @@ struct sdma_desc {
#define SDMA_TXREQ_F_USE_AHG 0x0004
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int, int);
typedef void (*callback_t)(struct sdma_txreq *, int);
struct iowait;
struct sdma_txreq {

View File

@ -273,7 +273,7 @@ struct user_sdma_txreq {
static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
static int num_user_pages(const struct iovec *);
static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
static void user_sdma_txreq_cb(struct sdma_txreq *, int);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
static void user_sdma_free_request(struct user_sdma_request *, bool);
static int pin_vector_pages(struct user_sdma_request *,
@ -388,7 +388,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
init_waitqueue_head(&pq->wait);
iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
activate_packet_queue);
activate_packet_queue, NULL);
pq->reqidx = 0;
snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
fd->subctxt);
@ -1341,8 +1341,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
* tx request have been processed by the DMA engine. Called in
* interrupt context.
*/
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
int drain)
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
struct user_sdma_txreq *tx =
container_of(txreq, struct user_sdma_txreq, txreq);

View File

@ -130,8 +130,7 @@ MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
static void verbs_sdma_complete(
struct sdma_txreq *cookie,
int status,
int drained);
int status);
static int pio_wait(struct rvt_qp *qp,
struct send_context *sc,
@ -523,8 +522,7 @@ void update_sge(struct rvt_sge_state *ss, u32 length)
/* New API */
static void verbs_sdma_complete(
struct sdma_txreq *cookie,
int status,
int drained)
int status)
{
struct verbs_txreq *tx =
container_of(cookie, struct verbs_txreq, txreq);
@ -539,18 +537,6 @@ static void verbs_sdma_complete(
hdr = &tx->phdr.hdr;
hfi1_rc_send_complete(qp, hdr);
}
if (drained) {
/*
* This happens when the send engine notes
* a QP in the error state and cannot
* do the flush work until that QP's
* sdma work has finished.
*/
if (qp->s_flags & RVT_S_WAIT_DMA) {
qp->s_flags &= ~RVT_S_WAIT_DMA;
hfi1_schedule_send(qp);
}
}
spin_unlock(&qp->s_lock);
hfi1_put_txreq(tx);