forked from Minki/linux
drbd: Avoid NetworkFailure state during disconnect
Disconnecting is a cluster-wide state change. If the peer node agrees to the state transition, it sends back its agreement on the meta-data connection and closes both sockets. If the node that initiated the state change sees the data socket being closed before it receives the P_STATE_CHG_REPLY packet, it used to go into one of the network failure states. At least with the fencing option set to something other than "dont-care", the unclean shutdown of the connection causes a short IO freeze or a fence operation. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
This commit is contained in:
parent
39a1aa7f49
commit
b66623e33e
@ -816,6 +816,7 @@ enum {
|
||||
* so shrink_page_list() would not recurse into,
|
||||
* and potentially deadlock on, this drbd worker.
|
||||
*/
|
||||
DISCONNECT_SENT,
|
||||
};
|
||||
|
||||
struct drbd_tconn { /* is a resource from the config file */
|
||||
|
@ -522,7 +522,6 @@ static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
|
||||
conn_err(tconn, "sock_recvmsg returned %d\n", rv);
|
||||
break;
|
||||
} else if (rv == 0) {
|
||||
conn_info(tconn, "sock was shut down by peer\n");
|
||||
break;
|
||||
} else {
|
||||
/* signal came in, or peer/link went down,
|
||||
@ -535,9 +534,25 @@ static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
|
||||
|
||||
set_fs(oldfs);
|
||||
|
||||
if (rv == 0) {
|
||||
if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
|
||||
long t;
|
||||
rcu_read_lock();
|
||||
t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
|
||||
rcu_read_unlock();
|
||||
|
||||
t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t);
|
||||
|
||||
if (t)
|
||||
goto out;
|
||||
}
|
||||
conn_info(tconn, "sock was shut down by peer\n");
|
||||
}
|
||||
|
||||
if (rv != size)
|
||||
conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
|
||||
|
||||
out:
|
||||
return rv;
|
||||
}
|
||||
|
||||
@ -894,6 +909,7 @@ static int conn_connect(struct drbd_tconn *tconn)
|
||||
.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
|
||||
};
|
||||
|
||||
clear_bit(DISCONNECT_SENT, &tconn->flags);
|
||||
if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
|
||||
return -2;
|
||||
|
||||
@ -5316,6 +5332,18 @@ int drbd_asender(struct drbd_thread *thi)
|
||||
received += rv;
|
||||
buf += rv;
|
||||
} else if (rv == 0) {
|
||||
if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
|
||||
long t;
|
||||
rcu_read_lock();
|
||||
t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
|
||||
rcu_read_unlock();
|
||||
|
||||
t = wait_event_timeout(tconn->ping_wait,
|
||||
tconn->cstate < C_WF_REPORT_PARAMS,
|
||||
t);
|
||||
if (t)
|
||||
break;
|
||||
}
|
||||
conn_err(tconn, "meta connection shut down by peer.\n");
|
||||
goto reconnect;
|
||||
} else if (rv == -EAGAIN) {
|
||||
|
@ -1742,6 +1742,9 @@ conn_cl_wide(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state v
|
||||
goto abort;
|
||||
}
|
||||
|
||||
if (val.conn == C_DISCONNECTING)
|
||||
set_bit(DISCONNECT_SENT, &tconn->flags);
|
||||
|
||||
wait_event(tconn->ping_wait, (rv = _conn_rq_cond(tconn, mask, val)));
|
||||
clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user