From 8d5dee21f6f01f4632c10b750709a1383eefc7aa Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:23 +0100 Subject: [PATCH 1/9] tipc: a couple of cleanups - We remove the 'reclaiming' member list in struct tipc_group, since it doesn't serve any purpose. - We simplify the GRP_REMIT_MSG branch of tipc_group_protocol_rcv(). Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index 3e8268d966fa..e5daeb093879 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -85,7 +85,6 @@ struct tipc_group { struct list_head small_win; struct list_head pending; struct list_head active; - struct list_head reclaiming; struct tipc_nlist dests; struct net *net; int subid; @@ -172,7 +171,6 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); - INIT_LIST_HEAD(&grp->reclaiming); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; @@ -575,7 +573,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, if (!list_empty(active) && active_cnt >= reclaim_limit) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; - list_move_tail(&rm->list, &grp->reclaiming); + list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); } /* If max active, become pending and wait for reclaimed space */ @@ -600,12 +598,12 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; + grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } - grp->active_cnt--; - list_del_init(&m->list); + if (list_empty(&grp->pending)) return; @@ -761,18 +759,14 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, m->advertised = ADV_IDLE + in_flight; return; } - /* All messages preceding the REMIT have been read */ - if (m->advertised <= remitted) { - m->state = MBR_JOINED; - in_flight = 0; - } - /* ..and the REMIT overtaken by more messages => re-advertise */ + /* This should never happen */ if (m->advertised < remitted) - tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + pr_warn_ratelimited("Unexpected REMIT msg\n"); - m->advertised = ADV_IDLE + in_flight; + /* All messages preceding the REMIT have been read */ + m->state = MBR_JOINED; grp->active_cnt--; - list_del_init(&m->list); + m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) From 4ea5dab541717fc55cad609360b100857af770b0 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:24 +0100 Subject: [PATCH 2/9] tipc: let group member stay in JOINED mode if unable to reclaim We handle a corner case in the function tipc_group_update_rcv_win(). During extreme pessure it might happen that a message receiver has all its active senders in RECLAIMING or REMITTED mode, meaning that there is nobody to reclaim advertisements from if an additional sender tries to go active. Currently we just set the new sender to ACTIVE anyway, hence at least theoretically opening up for a receiver queue overflow by exceeding the MAX_ACTIVE limit. The correct solution to this is to instead add the member to the pending queue, while letting the oldest member in that queue revert to JOINED state. In this commit we refactor the code for handling message arrival from a JOINED member, both to make it more comprehensible and to cover the case described above. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index e5daeb093879..652fa66a87f6 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -569,24 +569,34 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, switch (m->state) { case MBR_JOINED: - /* Reclaim advertised space from least active member */ - if (!list_empty(active) && active_cnt >= reclaim_limit) { + /* First, decide if member can go active */ + if (active_cnt <= max_active) { + m->state = MBR_ACTIVE; + list_add_tail(&m->list, active); + grp->active_cnt++; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + } else { + m->state = MBR_PENDING; + list_add_tail(&m->list, &grp->pending); + } + + if (active_cnt < reclaim_limit) + break; + + /* Reclaim from oldest active member, if possible */ + if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); - } - /* If max active, become pending and wait for reclaimed space */ - if (active_cnt >= max_active) { - m->state = MBR_PENDING; - list_add_tail(&m->list, &grp->pending); break; } - /* Otherwise become active */ - m->state = MBR_ACTIVE; - list_add_tail(&m->list, &grp->active); - grp->active_cnt++; - /* Fall through */ + /* Nobody to reclaim from; - revert oldest pending to JOINED */ + pm = list_first_entry(&grp->pending, struct tipc_member, list); + list_del_init(&pm->list); + pm->state = MBR_JOINED; + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); + break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); From 0233493a5fad227645f7f02539cb42db72e76030 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:25 +0100 Subject: [PATCH 3/9] tipc: adjustment to group member FSM Analysis reveals that the member state MBR_QURANTINED in reality is unnecessary, and can be replaced by the state MBR_JOINING at all occurrencs. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index 652fa66a87f6..a352e098f0e7 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -49,7 +49,6 @@ #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { - MBR_QUARANTINED, MBR_DISCOVERED, MBR_JOINING, MBR_PUBLISHED, @@ -138,7 +137,7 @@ u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) static bool tipc_group_is_receiver(struct tipc_member *m) { - return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING; + return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) @@ -690,7 +689,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, - MBR_QUARANTINED); + MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); From 7ad32bcb7855ae8a60a8cf98e1b9da77cfdba4d0 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:26 +0100 Subject: [PATCH 4/9] tipc: create group member event messages when they are needed In the current implementation, a group socket receiving topology events about other members just converts the topology event message into a group event message and stores it until it reaches the right state to issue it to the user. This complicates the code unnecessarily, and becomes impractical when we in the coming commits will need to create and issue membership events independently. In this commit, we change this so that we just notice the type and origin of the incoming topology event, and then drop the buffer. Only when it is time to actually send a group event to the user do we explicitly create a new message and send it upwards. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 95 +++++++++++++++++++++++++++-------------------- net/tipc/group.h | 2 +- net/tipc/socket.c | 3 +- 3 files changed, 56 insertions(+), 44 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index a352e098f0e7..e08b7acc7b2d 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -64,7 +64,6 @@ struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; - struct sk_buff *event_msg; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; @@ -632,6 +631,40 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, } } +static void tipc_group_create_event(struct tipc_group *grp, + struct tipc_member *m, + u32 event, u16 seqno, + struct sk_buff_head *inputq) +{ u32 dnode = tipc_own_addr(grp->net); + struct tipc_event evt; + struct sk_buff *skb; + struct tipc_msg *hdr; + + evt.event = event; + evt.found_lower = m->instance; + evt.found_upper = m->instance; + evt.port.ref = m->port; + evt.port.node = m->node; + evt.s.seq.type = grp->type; + evt.s.seq.lower = m->instance; + evt.s.seq.upper = m->instance; + + skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, + GROUP_H_SIZE, sizeof(evt), dnode, m->node, + grp->portid, m->port, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + msg_set_nametype(hdr, grp->type); + msg_set_grp_evt(hdr, event); + msg_set_dest_droppable(hdr, true); + msg_set_grp_bc_seqno(hdr, seqno); + memcpy(msg_data(hdr), &evt, sizeof(evt)); + TIPC_SKB_CB(skb)->orig_member = m->instance; + __skb_queue_tail(inputq, skb); +} + static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { @@ -677,7 +710,6 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; - struct tipc_msg *ehdr; u16 remitted, in_flight; if (!grp) @@ -704,9 +736,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, *usr_wakeup = true; m->usr_pending = false; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); - ehdr = buf_msg(m->event_msg); - msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); - __skb_queue_tail(inputq, m->event_msg); + tipc_group_create_event(grp, m, TIPC_PUBLISHED, + m->bc_syncpt, inputq); } list_del_init(&m->small_win); tipc_group_update_member(m, 0); @@ -725,10 +756,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, m->state = MBR_LEAVING; return; } - /* Otherwise deliver already received WITHDRAW event */ - ehdr = buf_msg(m->event_msg); - msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); - __skb_queue_tail(inputq, m->event_msg); + /* Otherwise deliver member WITHDRAW event */ + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) @@ -797,11 +827,10 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, - struct sk_buff *skb, + struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { - struct tipc_msg *hdr = buf_msg(skb); struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; @@ -813,21 +842,12 @@ void tipc_group_member_evt(struct tipc_group *grp, u32 self; if (!grp) - goto drop; + return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) - goto drop; - - /* Convert message before delivery to user */ - msg_set_hdr_sz(hdr, GROUP_H_SIZE); - msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); - msg_set_type(hdr, TIPC_GRP_MEMBER_EVT); - msg_set_origport(hdr, port); - msg_set_orignode(hdr, node); - msg_set_nametype(hdr, grp->type); - msg_set_grp_evt(hdr, event); + return; m = tipc_group_find_member(grp, node, port); @@ -836,59 +856,52 @@ void tipc_group_member_evt(struct tipc_group *grp, m = tipc_group_create_member(grp, node, port, MBR_DISCOVERED); if (!m) - goto drop; + return; + + m->instance = instance; /* Hold back event if JOIN message not yet received */ if (m->state == MBR_DISCOVERED) { - m->event_msg = skb; m->state = MBR_PUBLISHED; } else { - msg_set_grp_bc_seqno(hdr, m->bc_syncpt); - __skb_queue_tail(inputq, skb); + tipc_group_create_event(grp, m, TIPC_PUBLISHED, + m->bc_syncpt, inputq); m->state = MBR_JOINED; *usr_wakeup = true; m->usr_pending = false; } - m->instance = instance; - TIPC_SKB_CB(skb)->orig_member = m->instance; tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_update_member(m, 0); } else if (event == TIPC_WITHDRAWN) { if (!m) - goto drop; - - TIPC_SKB_CB(skb)->orig_member = m->instance; + return; *usr_wakeup = true; m->usr_pending = false; node_up = tipc_node_is_up(net, node); - m->event_msg = NULL; if (node_up) { /* Hold back event if a LEAVE msg should be expected */ if (m->state != MBR_LEAVING) { - m->event_msg = skb; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; } else { - msg_set_grp_bc_seqno(hdr, m->bc_syncpt); - __skb_queue_tail(inputq, skb); + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_syncpt, inputq); } } else { if (m->state != MBR_LEAVING) { tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; - msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_rcv_nxt, inputq); } else { - msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_syncpt, inputq); } - __skb_queue_tail(inputq, skb); } list_del_init(&m->list); list_del_init(&m->small_win); } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); - return; -drop: - kfree_skb(skb); } diff --git a/net/tipc/group.h b/net/tipc/group.h index d525e1cd7de5..5ffffd0121a2 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -54,7 +54,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_member_evt(struct tipc_group *grp, bool *wakeup, - int *sk_rcvbuf, struct sk_buff *skb, + int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq); void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b51d5cba5094..36744ebef74f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1933,8 +1933,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, break; case TOP_SRV: tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, - skb, inputq, xmitq); - skb = NULL; + hdr, inputq, xmitq); break; default: break; From c2b22bcf2e18a279afd80a8c57e936014acf3348 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:27 +0100 Subject: [PATCH 5/9] tipc: simplify group LEAVE sequence After the changes in the previous commit the group LEAVE sequence can be simplified. We now let the arrival of a LEAVE message unconditionally issue a group DOWN event to the user. When a topology WITHDRAW event is received, the member, if it still there, is set to state LEAVING, but we only issue a group DOWN event when the link to the peer node is gone, so that no LEAVE message is to be expected. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index e08b7acc7b2d..bdc54be9c07e 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -749,14 +749,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, list_del_init(&m->list); list_del_init(&m->small_win); *usr_wakeup = true; - - /* Wait until WITHDRAW event is received */ - if (m->state != MBR_LEAVING) { - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - return; - } - /* Otherwise deliver member WITHDRAW event */ + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; @@ -838,7 +832,6 @@ void tipc_group_member_evt(struct tipc_group *grp, int event = evt->event; struct tipc_member *m; struct net *net; - bool node_up; u32 self; if (!grp) @@ -878,30 +871,15 @@ void tipc_group_member_evt(struct tipc_group *grp, *usr_wakeup = true; m->usr_pending = false; - node_up = tipc_node_is_up(net, node); - - if (node_up) { - /* Hold back event if a LEAVE msg should be expected */ - if (m->state != MBR_LEAVING) { - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - } else { - tipc_group_create_event(grp, m, TIPC_WITHDRAWN, - m->bc_syncpt, inputq); - } - } else { - if (m->state != MBR_LEAVING) { - tipc_group_decr_active(grp, m); - m->state = MBR_LEAVING; - tipc_group_create_event(grp, m, TIPC_WITHDRAWN, - m->bc_rcv_nxt, inputq); - } else { - tipc_group_create_event(grp, m, TIPC_WITHDRAWN, - m->bc_syncpt, inputq); - } - } + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; list_del_init(&m->list); list_del_init(&m->small_win); + + /* Only send event if no LEAVE message can be expected */ + if (!tipc_node_is_up(net, node)) + tipc_group_create_event(grp, m, TIPC_WITHDRAWN, + m->bc_rcv_nxt, inputq); } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } From d12d2e12cec2d66eab6cd58f592dad9fd386b97d Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:28 +0100 Subject: [PATCH 6/9] tipc: send out join messages as soon as new member is discovered When a socket is joining a group, we look up in the binding table to find if there are already other members of the group present. This is used for being able to return EAGAIN instead of EHOSTUNREACH if the user proceeds directly to a send attempt. However, the information in the binding table can be used to directly set the created member in state MBR_PUBLISHED and send a JOIN message to the peer, instead of waiting for a topology PUBLISH event to do this. When there are many members in a group, the propagation time for such events can be significant, and we can save time during the join operation if we use the initial lookup result fully. In this commit, we eliminate the member state MBR_DISCOVERED which has been the result of the initial lookup, and do instead go directly to MBR_PUBLISHED, which initiates the setup. After this change, the tipc_member FSM looks as follows: +-----------+ ---->| PUBLISHED |-----------------------------------------------+ PUB- +-----------+ LEAVE/WITHRAW | LISH |JOIN | | +-------------------------------------------+ | | | LEAVE/WITHDRAW | | | | +------------+ | | | | +----------->| PENDING |---------+ | | | | |msg/maxactv +-+---+------+ LEAVE/ | | | | | | | | WITHDRAW | | | | | | +----------+ | | | | | | | |revert/maxactv| | | | | | | V V V V V | +----------+ msg +------------+ +-----------+ +-->| JOINED |------>| ACTIVE |------>| LEAVING |---> | +----------+ +--- -+------+ LEAVE/+-----------+DOWN | A A | WITHDRAW A A A EVT | | | |RECLAIM | | | | | |REMIT V | | | | | |== adv +------------+ | | | | | +---------| RECLAIMING |--------+ | | | | +-----+------+ LEAVE/ | | | | |REMIT WITHDRAW | | | | |< adv | | | |msg/ V LEAVE/ | | | |adv==ADV_IDLE+------------+ WITHDRAW | | | +-------------| REMITTED |------------+ | | +------------+ | |PUBLISH | JOIN +-----------+ LEAVE/WITHDRAW | ---->| JOINING |-----------------------------------------------+ +-----------+ Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 102 ++++++++++++++++++++++++++---------------- net/tipc/group.h | 4 +- net/tipc/name_table.c | 2 +- net/tipc/socket.c | 4 +- 4 files changed, 69 insertions(+), 43 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index bdc54be9c07e..6ca07f0da60c 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -49,7 +49,6 @@ #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { - MBR_DISCOVERED, MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, @@ -141,7 +140,7 @@ static bool tipc_group_is_receiver(struct tipc_member *m) static bool tipc_group_is_sender(struct tipc_member *m) { - return m && m->state >= MBR_JOINED; + return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) @@ -184,6 +183,21 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, return NULL; } +void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) +{ + struct rb_root *tree = &grp->members; + struct tipc_member *m, *tmp; + struct sk_buff_head xmitq; + + skb_queue_head_init(&xmitq); + rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); + tipc_group_update_member(m, 0); + } + tipc_node_distr_xmit(net, &xmitq); + *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); +} + void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; @@ -274,7 +288,7 @@ static void tipc_group_add_to_tree(struct tipc_group *grp, static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, - int state) + u32 instance, int state) { struct tipc_member *m; @@ -287,6 +301,7 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, m->group = grp; m->node = node; m->port = port; + m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; grp->member_cnt++; tipc_group_add_to_tree(grp, m); @@ -295,9 +310,10 @@ static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, return m; } -void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port) +void tipc_group_add_member(struct tipc_group *grp, u32 node, + u32 port, u32 instance) { - tipc_group_create_member(grp, node, port, MBR_DISCOVERED); + tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, @@ -623,7 +639,6 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: - case MBR_DISCOVERED: case MBR_JOINING: case MBR_LEAVING: default: @@ -721,26 +736,26 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, - MBR_JOINING); + 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); - /* Wait until PUBLISH event is received */ - if (m->state == MBR_DISCOVERED) { - m->state = MBR_JOINING; - } else if (m->state == MBR_PUBLISHED) { - m->state = MBR_JOINED; - *usr_wakeup = true; - m->usr_pending = false; - tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); - tipc_group_create_event(grp, m, TIPC_PUBLISHED, - m->bc_syncpt, inputq); - } + /* Wait until PUBLISH event is received if necessary */ + if (m->state != MBR_PUBLISHED) + return; + + /* Member can be taken into service */ + m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; list_del_init(&m->small_win); tipc_group_update_member(m, 0); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + tipc_group_create_event(grp, m, TIPC_PUBLISHED, + m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) @@ -844,30 +859,36 @@ void tipc_group_member_evt(struct tipc_group *grp, m = tipc_group_find_member(grp, node, port); - if (event == TIPC_PUBLISHED) { - if (!m) - m = tipc_group_create_member(grp, node, port, - MBR_DISCOVERED); - if (!m) - return; - - m->instance = instance; - - /* Hold back event if JOIN message not yet received */ - if (m->state == MBR_DISCOVERED) { - m->state = MBR_PUBLISHED; - } else { - tipc_group_create_event(grp, m, TIPC_PUBLISHED, - m->bc_syncpt, inputq); - m->state = MBR_JOINED; - *usr_wakeup = true; - m->usr_pending = false; + switch (event) { + case TIPC_PUBLISHED: + /* Send and wait for arrival of JOIN message if necessary */ + if (!m) { + m = tipc_group_create_member(grp, node, port, instance, + MBR_PUBLISHED); + if (!m) + break; + tipc_group_update_member(m, 0); + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + break; } - tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + + if (m->state != MBR_JOINING) + break; + + /* Member can be taken into service */ + m->instance = instance; + m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; + list_del_init(&m->small_win); tipc_group_update_member(m, 0); - } else if (event == TIPC_WITHDRAWN) { + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + tipc_group_create_event(grp, m, TIPC_PUBLISHED, + m->bc_syncpt, inputq); + break; + case TIPC_WITHDRAWN: if (!m) - return; + break; *usr_wakeup = true; m->usr_pending = false; @@ -880,6 +901,9 @@ void tipc_group_member_evt(struct tipc_group *grp, if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); + break; + default: + break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } diff --git a/net/tipc/group.h b/net/tipc/group.h index 5ffffd0121a2..dee79477d499 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -44,8 +44,10 @@ struct tipc_msg; struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq); +void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcv_buf); void tipc_group_delete(struct net *net, struct tipc_group *grp); -void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); +void tipc_group_add_member(struct tipc_group *grp, u32 node, + u32 port, u32 instance); struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, int *scope); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index b3829bcf63c7..e04ab72f313c 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -732,7 +732,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, list_for_each_entry(p, &info->zone_list, zone_list) { if (!tipc_in_scope(domain, p->node)) continue; - tipc_group_add_member(grp, p->node, p->ref); + tipc_group_add_member(grp, p->node, p->ref, p->lower); } } spin_unlock_bh(&seq->lock); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 36744ebef74f..e3a02f1fcab5 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2757,10 +2757,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) tipc_group_delete(net, grp); tsk->group = NULL; } - - /* Eliminate any risk that a broadcast overtakes the sent JOIN */ + /* Eliminate any risk that a broadcast overtakes sent JOINs */ tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; + tipc_group_join(net, grp, &tsk->sk.sk_rcvbuf); return rc; } From 8348500f80d5660af29c475e1f15d412d83564c9 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:29 +0100 Subject: [PATCH 7/9] tipc: add option to suppress PUBLISH events for pre-existing publications Currently, when a user is subscribing for binding table publications, he will receive a PUBLISH event for all already existing matching items in the binding table. However, a group socket making a subscriptions doesn't need this initial status update from the binding table, because it has already scanned it during the join operation. Worse, the multiplicatory effect of issuing mutual events for dozens or hundreds group members within a short time frame put a heavy load on the topology server, with the end result that scale out operations on a big group tend to take much longer than needed. We now add a new filter option, TIPC_SUB_NO_STATUS, for topology server subscriptions, so that this initial avalanche of events is suppressed. This change, along with the previous commit, significantly improves the range and speed of group scale out operations. We keep the new option internal for the tipc driver, at least for now. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 4 +++- net/tipc/name_table.c | 13 +++++++------ net/tipc/name_table.h | 2 +- net/tipc/server.c | 4 ++-- net/tipc/server.h | 3 ++- net/tipc/subscr.c | 10 ++++++---- 6 files changed, 21 insertions(+), 15 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index 6ca07f0da60c..cf996bd6ec98 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -177,7 +177,9 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; - if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) + if (tipc_topsrv_kern_subscr(net, portid, type, + TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS, + 0, ~0, &grp->subid)) return grp; kfree(grp); return NULL; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e04ab72f313c..60af9885f160 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -405,12 +405,13 @@ found: } /** - * tipc_nameseq_subscribe - attach a subscription, and issue - * the prescribed number of events if there is any sub- + * tipc_nameseq_subscribe - attach a subscription, and optionally + * issue the prescribed number of events if there is any sub- * sequence overlapping with the requested sequence */ static void tipc_nameseq_subscribe(struct name_seq *nseq, - struct tipc_subscription *s) + struct tipc_subscription *s, + bool status) { struct sub_seq *sseq = nseq->sseqs; struct tipc_name_seq ns; @@ -420,7 +421,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, tipc_subscrp_get(s); list_add(&s->nameseq_list, &nseq->subscriptions); - if (!sseq) + if (!status || !sseq) return; while (sseq != &nseq->sseqs[nseq->first_free]) { @@ -811,7 +812,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, /** * tipc_nametbl_subscribe - add a subscription object to the name table */ -void tipc_nametbl_subscribe(struct tipc_subscription *s) +void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) { struct tipc_net *tn = net_generic(s->net, tipc_net_id); u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); @@ -825,7 +826,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s) seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); if (seq) { spin_lock_bh(&seq->lock); - tipc_nameseq_subscribe(seq, s); + tipc_nameseq_subscribe(seq, s, status); spin_unlock_bh(&seq->lock); } else { tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 71926e429446..73a148c85c15 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -121,7 +121,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, u32 lower, u32 node, u32 ref, u32 key); -void tipc_nametbl_subscribe(struct tipc_subscription *s); +void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status); void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); diff --git a/net/tipc/server.c b/net/tipc/server.c index d60c30342327..950c54cbcf3a 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -490,7 +490,7 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 lower, u32 upper, int *conid) + u32 filter, u32 lower, u32 upper, int *conid) { struct tipc_subscriber *scbr; struct tipc_subscr sub; @@ -501,7 +501,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, sub.seq.lower = lower; sub.seq.upper = upper; sub.timeout = TIPC_WAIT_FOREVER; - sub.filter = TIPC_SUB_PORTS; + sub.filter = filter; *(u32 *)&sub.usr_handle = port; con = tipc_alloc_conn(tipc_topsrv(net)); diff --git a/net/tipc/server.h b/net/tipc/server.h index 2113c9192633..ea1effbff23e 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -41,6 +41,7 @@ #include <net/net_namespace.h> #define TIPC_SERVER_NAME_LEN 32 +#define TIPC_SUB_NO_STATUS 0x80 /** * struct tipc_server - TIPC server structure @@ -84,7 +85,7 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 lower, u32 upper, int *conid); + u32 filter, u32 lower, u32 upper, int *conid); void tipc_topsrv_kern_unsubscr(struct net *net, int conid); /** diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 251065dfd8df..1052341a0ea9 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -286,7 +286,8 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, } static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, int swap) + struct tipc_subscriber *subscriber, int swap, + bool status) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub = NULL; @@ -299,7 +300,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); sub->subscriber = subscriber; - tipc_nametbl_subscribe(sub); + tipc_nametbl_subscribe(sub, status); tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); @@ -323,6 +324,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, { struct tipc_subscriber *subscriber = usr_data; struct tipc_subscr *s = (struct tipc_subscr *)buf; + bool status; int swap; /* Determine subscriber's endianness */ @@ -334,8 +336,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); return tipc_subscrp_cancel(s, subscriber); } - - tipc_subscrp_subscribe(net, s, subscriber, swap); + status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); + tipc_subscrp_subscribe(net, s, subscriber, swap, status); } /* Handle one request to establish a new subscriber */ From 232d07b74a33b9f5d48516dc1d8ce41723ada593 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:30 +0100 Subject: [PATCH 8/9] tipc: improve groupcast scope handling When a member joins a group, it also indicates a binding scope. This makes it possible to create both node local groups, invisible to other nodes, as well as cluster global groups, visible everywhere. In order to avoid that different members end up having permanently differing views of group size and memberhip, we must inhibit locally and globally bound members from joining the same group. We do this by using the binding scope as an additional separator between groups. I.e., a member must ignore all membership events from sockets using a different scope than itself, and all lookups for message destinations must require an exact match between the message's lookup scope and the potential target's binding scope. Apart from making it possible to create local groups using the same identity on different nodes, a side effect of this is that it now also becomes possible to create a cluster global group with the same identity across the same nodes, without interfering with the local groups. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/uapi/linux/tipc.h | 7 ++-- net/tipc/group.c | 13 +++--- net/tipc/name_table.c | 40 +++++++++--------- net/tipc/name_table.h | 4 +- net/tipc/server.c | 4 +- net/tipc/server.h | 6 ++- net/tipc/socket.c | 86 +++++++++++++++++++++++---------------- net/tipc/subscr.c | 10 +++-- net/tipc/subscr.h | 2 +- 9 files changed, 98 insertions(+), 74 deletions(-) diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 35f79d1f8c3a..14bacc7e6cef 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -117,10 +117,9 @@ static inline unsigned int tipc_node(__u32 addr) /* * Publication scopes when binding port names and port name sequences */ - -#define TIPC_ZONE_SCOPE 1 -#define TIPC_CLUSTER_SCOPE 2 -#define TIPC_NODE_SCOPE 3 +#define TIPC_ZONE_SCOPE 1 +#define TIPC_CLUSTER_SCOPE 2 +#define TIPC_NODE_SCOPE 3 /* * Limiting values for messages diff --git a/net/tipc/group.c b/net/tipc/group.c index cf996bd6ec98..1908773c9fca 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -87,7 +87,6 @@ struct tipc_group { int subid; u32 type; u32 instance; - u32 domain; u32 scope; u32 portid; u16 member_cnt; @@ -158,6 +157,8 @@ int tipc_group_size(struct tipc_group *grp) struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq) { + u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; + bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; @@ -171,15 +172,14 @@ struct tipc_group *tipc_group_create(struct net *net, u32 portid, grp->members = RB_ROOT; grp->net = net; grp->portid = portid; - grp->domain = addr_domain(net, mreq->scope); grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; - if (tipc_topsrv_kern_subscr(net, portid, type, - TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS, - 0, ~0, &grp->subid)) + filter |= global ? TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, + filter, &grp->subid)) return grp; kfree(grp); return NULL; @@ -732,6 +732,9 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!grp) return; + if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) + return; + m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 60af9885f160..64cdd3c302b0 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -328,7 +328,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscrp_report_overlap(s, publ->lower, publ->upper, TIPC_PUBLISHED, publ->ref, - publ->node, created_subseq); + publ->node, publ->scope, + created_subseq); } return publ; } @@ -398,7 +399,8 @@ found: list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_subscrp_report_overlap(s, publ->lower, publ->upper, TIPC_WITHDRAWN, publ->ref, - publ->node, removed_subseq); + publ->node, publ->scope, + removed_subseq); } return publ; @@ -435,6 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, sseq->upper, TIPC_PUBLISHED, crs->ref, crs->node, + crs->scope, must_report); must_report = 0; } @@ -598,7 +601,7 @@ not_found: return ref; } -bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, struct list_head *dsts, int *dstcnt, u32 exclude, bool all) { @@ -608,9 +611,6 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, struct name_seq *seq; struct sub_seq *sseq; - if (!tipc_in_scope(domain, self)) - return false; - *dstcnt = 0; rcu_read_lock(); seq = nametbl_find_seq(net, type); @@ -621,7 +621,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, if (likely(sseq)) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { - if (!tipc_in_scope(domain, publ->node)) + if (publ->scope != scope) continue; if (publ->ref == exclude && publ->node == self) continue; @@ -639,13 +639,14 @@ exit: return !list_empty(dsts); } -int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct list_head *dports) +int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports) { - struct name_seq *seq; - struct sub_seq *sseq; struct sub_seq *sseq_stop; struct name_info *info; + struct publication *p; + struct name_seq *seq; + struct sub_seq *sseq; int res = 0; rcu_read_lock(); @@ -657,15 +658,12 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); sseq_stop = seq->sseqs + seq->first_free; for (; sseq != sseq_stop; sseq++) { - struct publication *publ; - if (sseq->lower > upper) break; - info = sseq->info; - list_for_each_entry(publ, &info->node_list, node_list) { - if (publ->scope <= limit) - tipc_dest_push(dports, 0, publ->ref); + list_for_each_entry(p, &info->node_list, node_list) { + if (p->scope == scope || (!exact && p->scope < scope)) + tipc_dest_push(dports, 0, p->ref); } if (info->cluster_list_size != info->node_list_size) @@ -682,7 +680,7 @@ exit: * - Determines if any node local ports overlap */ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, - u32 upper, u32 domain, + u32 upper, u32 scope, struct tipc_nlist *nodes) { struct sub_seq *sseq, *stop; @@ -701,7 +699,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, for (; sseq != stop && sseq->lower <= upper; sseq++) { info = sseq->info; list_for_each_entry(publ, &info->zone_list, zone_list) { - if (tipc_in_scope(domain, publ->node)) + if (publ->scope == scope) tipc_nlist_add(nodes, publ->node); } } @@ -713,7 +711,7 @@ exit: /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, - u32 type, u32 domain) + u32 type, u32 scope) { struct sub_seq *sseq, *stop; struct name_info *info; @@ -731,7 +729,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, for (; sseq != stop; sseq++) { info = sseq->info; list_for_each_entry(p, &info->zone_list, zone_list) { - if (!tipc_in_scope(domain, p->node)) + if (p->scope != scope) continue; tipc_group_add_member(grp, p->node, p->ref, p->lower); } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 73a148c85c15..b595d8aa00f0 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -100,8 +100,8 @@ struct name_table { int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); -int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct list_head *dports); +int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports); void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, diff --git a/net/tipc/server.c b/net/tipc/server.c index 950c54cbcf3a..8ee5e86b7870 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -489,8 +489,8 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } } -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 filter, u32 lower, u32 upper, int *conid) +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid) { struct tipc_subscriber *scbr; struct tipc_subscr sub; diff --git a/net/tipc/server.h b/net/tipc/server.h index ea1effbff23e..17f49ee44cfd 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -41,6 +41,8 @@ #include <net/net_namespace.h> #define TIPC_SERVER_NAME_LEN 32 +#define TIPC_SUB_CLUSTER_SCOPE 0x20 +#define TIPC_SUB_NODE_SCOPE 0x40 #define TIPC_SUB_NO_STATUS 0x80 /** @@ -84,8 +86,8 @@ struct tipc_server { int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, - u32 filter, u32 lower, u32 upper, int *conid); +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid); void tipc_topsrv_kern_unsubscr(struct net *net, int conid); /** diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e3a02f1fcab5..b24dab3996c9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -928,21 +928,22 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, struct list_head *cong_links = &tsk->cong_links; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; struct tipc_member *first = NULL; struct tipc_member *mbr = NULL; struct net *net = sock_net(sk); u32 node, port, exclude; - u32 type, inst, domain; struct list_head dsts; + u32 type, inst, scope; int lookups = 0; int dstcnt, rc; bool cong; INIT_LIST_HEAD(&dsts); - type = dest->addr.name.name.type; + type = msg_nametype(hdr); inst = dest->addr.name.name.instance; - domain = addr_domain(net, dest->scope); + scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); while (++lookups < 4) { @@ -950,7 +951,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, /* Look for a non-congested destination member, if any */ while (1) { - if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, &dstcnt, exclude, false)) return -EHOSTUNREACH; tipc_dest_pop(&dsts, &node, &port); @@ -1079,22 +1080,23 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - struct tipc_name_seq *seq = &dest->addr.nameseq; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - u32 domain, exclude, dstcnt; + u32 type, inst, scope, exclude; struct list_head dsts; + u32 dstcnt; INIT_LIST_HEAD(&dsts); - if (seq->lower != seq->upper) - return -ENOTSUPP; - - domain = addr_domain(net, dest->scope); + type = msg_nametype(hdr); + inst = dest->addr.name.name.instance; + scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); - if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain, - &dsts, &dstcnt, exclude, true)) + + if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, + &dstcnt, exclude, true)) return -EHOSTUNREACH; if (dstcnt == 1) { @@ -1116,24 +1118,29 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - u32 scope = TIPC_CLUSTER_SCOPE; u32 self = tipc_own_addr(net); + u32 type, lower, upper, scope; struct sk_buff *skb, *_skb; - u32 lower = 0, upper = ~0; - struct sk_buff_head tmpq; u32 portid, oport, onode; + struct sk_buff_head tmpq; struct list_head dports; - struct tipc_msg *msg; - int user, mtyp, hsz; + struct tipc_msg *hdr; + int user, mtyp, hlen; + bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { - msg = buf_msg(skb); - user = msg_user(msg); - mtyp = msg_type(msg); + hdr = buf_msg(skb); + user = msg_user(hdr); + mtyp = msg_type(hdr); + hlen = skb_headroom(skb) + msg_hdr_sz(hdr); + oport = msg_origport(hdr); + onode = msg_orignode(hdr); + type = msg_nametype(hdr); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { @@ -1144,21 +1151,31 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, spin_unlock_bh(&inputq->lock); continue; } - hsz = skb_headroom(skb) + msg_hdr_sz(msg); - oport = msg_origport(msg); - onode = msg_orignode(msg); - if (onode == self) - scope = TIPC_NODE_SCOPE; - /* Create destination port list and message clones: */ - if (!msg_in_group(msg)) { - lower = msg_namelower(msg); - upper = msg_nameupper(msg); + /* Group messages require exact scope match */ + if (msg_in_group(hdr)) { + lower = 0; + upper = ~0; + scope = msg_lookup_scope(hdr); + exact = true; + } else { + /* TIPC_NODE_SCOPE means "any scope" in this context */ + if (onode == self) + scope = TIPC_NODE_SCOPE; + else + scope = TIPC_CLUSTER_SCOPE; + exact = false; + lower = msg_namelower(hdr); + upper = msg_nameupper(hdr); } - tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, - scope, &dports); + + /* Create destination port list: */ + tipc_nametbl_mc_lookup(net, type, lower, upper, + scope, exact, &dports); + + /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { - _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); + _skb = __pskb_copy(skb, hlen, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); __skb_queue_tail(&tmpq, _skb); @@ -2731,7 +2748,6 @@ void tipc_sk_rht_destroy(struct net *net) static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) { struct net *net = sock_net(&tsk->sk); - u32 domain = addr_domain(net, mreq->scope); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq seq; @@ -2739,6 +2755,8 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) if (mreq->type < TIPC_RESERVED_TYPES) return -EACCES; + if (mreq->scope > TIPC_NODE_SCOPE) + return -EINVAL; if (grp) return -EACCES; grp = tipc_group_create(net, tsk->portid, mreq); @@ -2751,7 +2769,7 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) seq.type = mreq->type; seq.lower = mreq->instance; seq.upper = seq.lower; - tipc_nametbl_build_group(net, grp, mreq->type, domain); + tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope); rc = tipc_sk_publish(tsk, mreq->scope, &seq); if (rc) { tipc_group_delete(net, grp); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 1052341a0ea9..44df528ed6ab 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -118,15 +118,19 @@ void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port_ref, - u32 node, int must) + u32 node, u32 scope, int must) { + u32 filter = htohl(sub->evt.s.filter, sub->swap); struct tipc_name_seq seq; tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; - if (!must && - !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS)) + if (!must && !(filter & TIPC_SUB_PORTS)) + return; + if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE) + return; + if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) return; tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index ee52957dc952..f3edca775d9f 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -71,7 +71,7 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, - u32 port_ref, u32 node, int must); + u32 port_ref, u32 node, u32 scope, int must); void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, struct tipc_name_seq *out); u32 tipc_subscrp_convert_seq_type(u32 type, int swap); From eb929a91b213d2a72c5a8b4af9a1acf63bfb8287 Mon Sep 17 00:00:00 2001 From: Jon Maloy <jon.maloy@ericsson.com> Date: Mon, 8 Jan 2018 21:03:31 +0100 Subject: [PATCH 9/9] tipc: improve poll() for group member socket The current criteria for returning POLLOUT from a group member socket is too simplistic. It basically returns POLLOUT as soon as the group has external destinations, something obviously leading to a lot of spinning during destination congestion situations. At the same time, the internal congestion handling is unnecessarily complex. We now change this as follows. - We introduce an 'open' flag in struct tipc_group. This flag is used only to help poll() get the setting of POLLOUT right, and *not* for congeston handling as such. This means that a user can choose to ignore an EAGAIN for a destination and go on sending messages to other destinations in the group if he wants to. - The flag is set to false every time we return EAGAIN on a send call. - The flag is set to true every time any member, i.e., not necessarily the member that caused EAGAIN, is removed from the small_win list. - We remove the group member 'usr_pending' flag. The size of the send window and presence in the 'small_win' list is sufficient criteria for recognizing congestion. This solution seems to be a reasonable compromise between 'anycast', which is normally not waiting for POLLOUT for a specific destination, and the other three send modes, which are. Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/group.c | 64 ++++++++++++++++++++++++++--------------------- net/tipc/group.h | 2 +- net/tipc/socket.c | 8 +++--- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index 1908773c9fca..497ee34bfab9 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -74,7 +74,6 @@ struct tipc_member { u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; - bool usr_pending; }; struct tipc_group { @@ -96,11 +95,27 @@ struct tipc_group { u16 bc_ackers; bool loopback; bool events; + bool open; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); +bool tipc_group_is_open(struct tipc_group *grp) +{ + return grp->open; +} + +static void tipc_group_open(struct tipc_member *m, bool *wakeup) +{ + *wakeup = false; + if (list_empty(&m->small_win)) + return; + list_del_init(&m->small_win); + m->group->open = true; + *wakeup = true; +} + static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { @@ -406,20 +421,20 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int adv, state; m = tipc_group_find_dest(grp, dnode, dport); - *mbr = m; - if (!m) + if (!tipc_group_is_receiver(m)) { + *mbr = NULL; return false; - if (m->usr_pending) - return true; + } + *mbr = m; + if (m->window >= len) return false; - m->usr_pending = true; + + grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; - if (state < MBR_JOINED) - return true; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) @@ -437,9 +452,10 @@ bool tipc_group_bc_cong(struct tipc_group *grp, int len) struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ - if (grp->bc_ackers) + if (grp->bc_ackers) { + grp->open = false; return true; - + } if (list_empty(&grp->small_win)) return false; @@ -754,9 +770,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, /* Member can be taken into service */ m->state = MBR_JOINED; - *usr_wakeup = true; - m->usr_pending = false; - list_del_init(&m->small_win); + tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, @@ -767,8 +781,7 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); - list_del_init(&m->small_win); - *usr_wakeup = true; + tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, @@ -778,26 +791,25 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!m) return; m->window += msg_adv_win(hdr); - *usr_wakeup = m->usr_pending; - m->usr_pending = false; - list_del_init(&m->small_win); + tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) - break; + return; + list_del_init(&m->small_win); + m->group->open = true; *usr_wakeup = true; - m->usr_pending = false; + tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; - *usr_wakeup = m->usr_pending; - m->usr_pending = false; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; + tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) @@ -883,9 +895,7 @@ void tipc_group_member_evt(struct tipc_group *grp, /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; - *usr_wakeup = true; - m->usr_pending = false; - list_del_init(&m->small_win); + tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, @@ -895,12 +905,10 @@ void tipc_group_member_evt(struct tipc_group *grp, if (!m) break; - *usr_wakeup = true; - m->usr_pending = false; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); - list_del_init(&m->small_win); + tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) diff --git a/net/tipc/group.h b/net/tipc/group.h index dee79477d499..f4a596ed9848 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -67,9 +67,9 @@ void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack); bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **m); bool tipc_group_bc_cong(struct tipc_group *grp, int len); +bool tipc_group_is_open(struct tipc_group *grp); void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); -int tipc_group_size(struct tipc_group *grp); #endif diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b24dab3996c9..1f236271766c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -715,7 +715,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_group *grp = tsk->group; + struct tipc_group *grp; u32 revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); @@ -736,9 +736,9 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: - if (!grp || tipc_group_size(grp)) - if (!tsk->cong_link_cnt) - revents |= POLLOUT; + grp = tsk->group; + if ((!grp || tipc_group_is_open(grp)) && !tsk->cong_link_cnt) + revents |= POLLOUT; if (!tipc_sk_type_connectionless(sk)) break; if (skb_queue_empty(&sk->sk_receive_queue))