RDMA/cma: Multiple path records support with netlink channel

Support receiving inbound and outbound IB path records (along with GMP
PathRecord) from user-space service through the RDMA netlink channel.
The LIDs in these 3 PRs can be used in this way:
1. GMP PR: used as the standard local/remote LIDs;
2. DLID of outbound PR: Used as the "dlid" field for outbound traffic;
3. DLID of inbound PR: Used as the "dlid" field for outbound traffic in
   responder side.

This is aimed at supporting adaptive routing. With the current IB routing
solution, an outgoing packet is assigned a fixed DLID per target, meaning
that a fixed router will be used.
The LIDs in the inbound/outbound path records can be used to identify a
group of routers that allow communication with another subnet's entity.
With them, packets from an inter-subnet connection may travel through any
router in the set to reach the target.

As confirmed with Jason, when sending a netlink request, kernel uses
LS_RESOLVE_PATH_USE_ALL so that the service knows kernel supports
multiple PRs.

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://lore.kernel.org/r/2fa2b6c93c4c16c8915bac3cfc4f27be1d60519d.1662631201.git.leonro@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
Mark Zhang 2022-09-08 13:09:01 +03:00 committed by Leon Romanovsky
parent bf9a992851
commit 5a37494933
6 changed files with 232 additions and 88 deletions

View File

@ -2026,6 +2026,8 @@ static void _destroy_id(struct rdma_id_private *id_priv,
cma_id_put(id_priv->id.context);
kfree(id_priv->id.route.path_rec);
kfree(id_priv->id.route.path_rec_inbound);
kfree(id_priv->id.route.path_rec_outbound);
put_net(id_priv->id.route.addr.dev_addr.net);
kfree(id_priv);
@ -2817,26 +2819,72 @@ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
}
EXPORT_SYMBOL(rdma_set_min_rnr_timer);
static void route_set_path_rec_inbound(struct cma_work *work,
struct sa_path_rec *path_rec)
{
struct rdma_route *route = &work->id->id.route;
if (!route->path_rec_inbound) {
route->path_rec_inbound =
kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
if (!route->path_rec_inbound)
return;
}
*route->path_rec_inbound = *path_rec;
}
static void route_set_path_rec_outbound(struct cma_work *work,
struct sa_path_rec *path_rec)
{
struct rdma_route *route = &work->id->id.route;
if (!route->path_rec_outbound) {
route->path_rec_outbound =
kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL);
if (!route->path_rec_outbound)
return;
}
*route->path_rec_outbound = *path_rec;
}
static void cma_query_handler(int status, struct sa_path_rec *path_rec,
void *context)
int num_prs, void *context)
{
struct cma_work *work = context;
struct rdma_route *route;
int i;
route = &work->id->id.route;
if (!status) {
if (status)
goto fail;
for (i = 0; i < num_prs; i++) {
if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
*route->path_rec = path_rec[i];
else if (path_rec[i].flags & IB_PATH_INBOUND)
route_set_path_rec_inbound(work, &path_rec[i]);
else if (path_rec[i].flags & IB_PATH_OUTBOUND)
route_set_path_rec_outbound(work, &path_rec[i]);
}
if (!route->path_rec) {
status = -EINVAL;
goto fail;
}
route->num_pri_alt_paths = 1;
*route->path_rec = *path_rec;
} else {
queue_work(cma_wq, &work->work);
return;
fail:
work->old_state = RDMA_CM_ROUTE_QUERY;
work->new_state = RDMA_CM_ADDR_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
work->event.status = status;
pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
status);
}
queue_work(cma_wq, &work->work);
}

View File

@ -50,6 +50,7 @@
#include <rdma/ib_marshall.h>
#include <rdma/ib_addr.h>
#include <rdma/opa_addr.h>
#include <rdma/rdma_cm.h>
#include "sa.h"
#include "core_priv.h"
@ -104,7 +105,8 @@ struct ib_sa_device {
};
struct ib_sa_query {
void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
void (*callback)(struct ib_sa_query *sa_query, int status,
int num_prs, struct ib_sa_mad *mad);
void (*release)(struct ib_sa_query *);
struct ib_sa_client *client;
struct ib_sa_port *port;
@ -116,6 +118,12 @@ struct ib_sa_query {
u32 seq; /* Local svc request sequence number */
unsigned long timeout; /* Local svc timeout */
u8 path_use; /* How will the pathrecord be used */
/* A separate buffer to save pathrecords of a response, as in cases
* like IB/netlink, multiple pathrecords are supported, so that
* mad->data is not large enough to hold them
*/
void *resp_pr_data;
};
#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
@ -123,7 +131,8 @@ struct ib_sa_query {
#define IB_SA_QUERY_OPA 0x00000004
struct ib_sa_path_query {
void (*callback)(int, struct sa_path_rec *, void *);
void (*callback)(int status, struct sa_path_rec *rec,
int num_paths, void *context);
void *context;
struct ib_sa_query sa_query;
struct sa_path_rec *conv_pr;
@ -712,7 +721,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
sa_rec->reversible != 0)
query->path_use = LS_RESOLVE_PATH_USE_GMP;
query->path_use = LS_RESOLVE_PATH_USE_ALL;
else
query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
header->path_use = query->path_use;
@ -865,15 +874,31 @@ static void send_handler(struct ib_mad_agent *agent,
static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
const struct nlmsghdr *nlh)
{
struct ib_path_rec_data *srec, *drec;
struct ib_sa_path_query *path_query;
struct ib_mad_send_wc mad_send_wc;
struct ib_sa_mad *mad = NULL;
const struct nlattr *head, *curr;
struct ib_path_rec_data *rec;
int len, rem;
struct ib_sa_mad *mad = NULL;
int len, rem, num_prs = 0;
u32 mask = 0;
int status = -EIO;
if (query->callback) {
if (!query->callback)
goto out;
path_query = container_of(query, struct ib_sa_path_query, sa_query);
mad = query->mad_buf->mad;
if (!path_query->conv_pr &&
(be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) {
/* Need a larger buffer for possible multiple PRs */
query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM,
sizeof(*drec), GFP_KERNEL);
if (!query->resp_pr_data) {
query->callback(query, -ENOMEM, 0, NULL);
return;
}
}
head = (const struct nlattr *) nlmsg_data(nlh);
len = nlmsg_len(nlh);
switch (query->path_use) {
@ -882,33 +907,48 @@ static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
break;
case LS_RESOLVE_PATH_USE_ALL:
mask = IB_PATH_PRIMARY;
break;
case LS_RESOLVE_PATH_USE_GMP:
default:
mask = IB_PATH_PRIMARY | IB_PATH_GMP |
IB_PATH_BIDIRECTIONAL;
break;
}
drec = (struct ib_path_rec_data *)query->resp_pr_data;
nla_for_each_attr(curr, head, len, rem) {
if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
rec = nla_data(curr);
/*
* Get the first one. In the future, we may
* need to get up to 6 pathrecords.
*/
if ((rec->flags & mask) == mask) {
mad = query->mad_buf->mad;
mad->mad_hdr.method |=
IB_MGMT_METHOD_RESP;
memcpy(mad->data, rec->path_rec,
sizeof(rec->path_rec));
if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD)
continue;
srec = nla_data(curr);
if ((srec->flags & mask) != mask)
continue;
status = 0;
if (!drec) {
memcpy(mad->data, srec->path_rec,
sizeof(srec->path_rec));
num_prs = 1;
break;
}
}
}
query->callback(query, status, mad);
memcpy(drec, srec, sizeof(*drec));
drec++;
num_prs++;
if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM)
break;
}
if (!status)
mad->mad_hdr.method |= IB_MGMT_METHOD_RESP;
query->callback(query, status, num_prs, mad);
kvfree(query->resp_pr_data);
query->resp_pr_data = NULL;
out:
mad_send_wc.send_buf = query->mad_buf;
mad_send_wc.status = IB_WC_SUCCESS;
send_handler(query->mad_buf->mad_agent, &mad_send_wc);
@ -1411,25 +1451,12 @@ static int opa_pr_query_possible(struct ib_sa_client *client,
return PR_IB_SUPPORTED;
}
static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
int status,
struct ib_sa_mad *mad)
static void ib_sa_pr_callback_single(struct ib_sa_path_query *query,
int status, struct ib_sa_mad *mad)
{
struct ib_sa_path_query *query =
container_of(sa_query, struct ib_sa_path_query, sa_query);
struct sa_path_rec rec = {};
if (mad) {
struct sa_path_rec rec;
if (sa_query->flags & IB_SA_QUERY_OPA) {
ib_unpack(opa_path_rec_table,
ARRAY_SIZE(opa_path_rec_table),
mad->data, &rec);
rec.rec_type = SA_PATH_REC_TYPE_OPA;
query->callback(status, &rec, query->context);
} else {
ib_unpack(path_rec_table,
ARRAY_SIZE(path_rec_table),
ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
mad->data, &rec);
rec.rec_type = SA_PATH_REC_TYPE_IB;
sa_path_set_dmac_zero(&rec);
@ -1439,13 +1466,75 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
memset(&opa, 0, sizeof(struct sa_path_rec));
sa_convert_path_ib_to_opa(&opa, &rec);
query->callback(status, &opa, query->context);
query->callback(status, &opa, 1, query->context);
} else {
query->callback(status, &rec, query->context);
query->callback(status, &rec, 1, query->context);
}
}
} else
query->callback(status, NULL, query->context);
/**
 * ib_sa_pr_callback_multiple() - Parse path records then do callback.
 * @query: path query whose callback and context are invoked
 * @status: status value handed to the callback once parsing succeeded
 * @num_prs: number of entries in @rec_data
 * @rec_data: path records in "ib_path_rec_data" structure format
 *
 * In a multiple-PR case the PRs are saved in "query->resp_pr_data"
 * (instead of "mad->data") and with "ib_path_rec_data" structure format,
 * so that rec->flags can be set to indicate the type of PR.
 * This is valid only in IB fabric.
 *
 * The unpacked records live in a temporary array that is freed right
 * after the callback returns. If that array cannot be allocated, the
 * callback is invoked with -ENOMEM and no records.
 */
static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
				       int status, int num_prs,
				       struct ib_path_rec_data *rec_data)
{
	struct ib_path_rec_data *src;
	struct sa_path_rec *recs, *dst;
	int idx;

	recs = kvcalloc(num_prs, sizeof(*recs), GFP_KERNEL);
	if (!recs) {
		query->callback(-ENOMEM, NULL, 0, query->context);
		return;
	}

	for (idx = 0; idx < num_prs; idx++) {
		src = &rec_data[idx];
		dst = &recs[idx];

		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
			  src->path_rec, dst);
		dst->rec_type = SA_PATH_REC_TYPE_IB;
		sa_path_set_dmac_zero(dst);
		/* Carry over the per-record type flags from the response. */
		dst->flags = src->flags;
	}

	query->callback(status, recs, num_prs, query->context);
	kvfree(recs);
}
static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
int status, int num_prs,
struct ib_sa_mad *mad)
{
struct ib_sa_path_query *query =
container_of(sa_query, struct ib_sa_path_query, sa_query);
struct sa_path_rec rec;
if (!mad || !num_prs) {
query->callback(status, NULL, 0, query->context);
return;
}
if (sa_query->flags & IB_SA_QUERY_OPA) {
if (num_prs != 1) {
query->callback(-EINVAL, NULL, 0, query->context);
return;
}
ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table),
mad->data, &rec);
rec.rec_type = SA_PATH_REC_TYPE_OPA;
query->callback(status, &rec, num_prs, query->context);
} else {
if (!sa_query->resp_pr_data)
ib_sa_pr_callback_single(query, status, mad);
else
ib_sa_pr_callback_multiple(query, status, num_prs,
sa_query->resp_pr_data);
}
}
static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
@ -1489,7 +1578,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
unsigned long timeout_ms, gfp_t gfp_mask,
void (*callback)(int status,
struct sa_path_rec *resp,
void *context),
int num_paths, void *context),
void *context,
struct ib_sa_query **sa_query)
{
@ -1588,7 +1677,7 @@ err1:
EXPORT_SYMBOL(ib_sa_path_rec_get);
static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
int status,
int status, int num_prs,
struct ib_sa_mad *mad)
{
struct ib_sa_mcmember_query *query =
@ -1680,7 +1769,7 @@ err1:
/* Support GuidInfoRecord */
static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
int status,
int status, int num_paths,
struct ib_sa_mad *mad)
{
struct ib_sa_guidinfo_query *query =
@ -1790,7 +1879,7 @@ static void ib_classportinfo_cb(void *context)
}
static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
int status,
int status, int num_prs,
struct ib_sa_mad *mad)
{
unsigned long flags;
@ -1966,13 +2055,13 @@ static void send_handler(struct ib_mad_agent *agent,
/* No callback -- already got recv */
break;
case IB_WC_RESP_TIMEOUT_ERR:
query->callback(query, -ETIMEDOUT, NULL);
query->callback(query, -ETIMEDOUT, 0, NULL);
break;
case IB_WC_WR_FLUSH_ERR:
query->callback(query, -EINTR, NULL);
query->callback(query, -EINTR, 0, NULL);
break;
default:
query->callback(query, -EIO, NULL);
query->callback(query, -EIO, 0, NULL);
break;
}
@ -2000,10 +2089,10 @@ static void recv_handler(struct ib_mad_agent *mad_agent,
if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
query->callback(query,
mad_recv_wc->recv_buf.mad->mad_hdr.status ?
-EINVAL : 0,
-EINVAL : 0, 1,
(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
else
query->callback(query, -EIO, NULL);
query->callback(query, -EIO, 0, NULL);
}
ib_free_recv_mad(mad_recv_wc);

View File

@ -742,7 +742,7 @@ void ipoib_flush_paths(struct net_device *dev)
static void path_rec_completion(int status,
struct sa_path_rec *pathrec,
void *path_ptr)
int num_prs, void *path_ptr)
{
struct ipoib_path *path = path_ptr;
struct net_device *dev = path->dev;

View File

@ -699,7 +699,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
static void srp_path_rec_completion(int status,
struct sa_path_rec *pathrec,
void *ch_ptr)
int num_paths, void *ch_ptr)
{
struct srp_rdma_ch *ch = ch_ptr;
struct srp_target_port *target = ch->target;

View File

@ -186,6 +186,7 @@ struct sa_path_rec {
struct sa_path_rec_opa opa;
};
enum sa_path_rec_type rec_type;
u32 flags;
};
static inline enum ib_gid_type
@ -413,7 +414,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device,
ib_sa_comp_mask comp_mask, unsigned long timeout_ms,
gfp_t gfp_mask,
void (*callback)(int status, struct sa_path_rec *resp,
void *context),
int num_prs, void *context),
void *context, struct ib_sa_query **query);
struct ib_sa_multicast {

View File

@ -49,9 +49,15 @@ struct rdma_addr {
struct rdma_dev_addr dev_addr;
};
#define RDMA_PRIMARY_PATH_MAX_REC_NUM 3
struct rdma_route {
struct rdma_addr addr;
struct sa_path_rec *path_rec;
/* Optional path records of primary path */
struct sa_path_rec *path_rec_inbound;
struct sa_path_rec *path_rec_outbound;
/*
* 0 - No primary nor alternate path is available
* 1 - Only primary path is available