From 61d9102b62129e13a2258c1e0566962f9a1732f0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 12 Sep 2017 08:55:04 +0000 Subject: [PATCH 01/20] DLM: Eliminate CF_CONNECT_PENDING flag Before this patch, there was a flag in the con structure that was used to determine whether or not a connect was needed. The bit was set here and there, and cleared here and there, so it left some race conditions: the bit was set, work was queued, then the worker cleared the bit, allowing someone else to set it while the worker ran. For the most part, this worked okay, but we got into trouble if connections were lost and it needed to reconnect. This patch eliminates the flag in favor of simply checking if we actually have a sock pointer while protected by the mutex. Signed-off-by: Bob Peterson Reviewed-by: Tadashi Miyauchi Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4813d0e0cd9b..6a7a49b93374 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -107,7 +107,6 @@ struct connection { unsigned long flags; #define CF_READ_PENDING 1 #define CF_WRITE_PENDING 2 -#define CF_CONNECT_PENDING 3 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 @@ -435,8 +434,8 @@ static inline void lowcomms_connect_sock(struct connection *con) { if (test_bit(CF_CLOSE, &con->flags)) return; - if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); + queue_work(send_workqueue, &con->swork); + cond_resched(); } static void lowcomms_state_change(struct sock *sk) @@ -579,7 +578,6 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { - clear_bit(CF_CONNECT_PENDING, &con->flags); clear_bit(CF_WRITE_PENDING, &con->flags); if (tx && cancel_work_sync(&con->swork)) log_print("canceled swork for node %d", con->nodeid); @@ -1098,7 +1096,6 @@ socket_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); - clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } @@ -1194,7 +1191,6 @@ out_err: con->retries, result); mutex_unlock(&con->sock_mutex); msleep(1000); - clear_bit(CF_CONNECT_PENDING, &con->flags); lowcomms_connect_sock(con); return; } @@ -1593,7 +1589,7 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) + if (con->sock == NULL) /* not mutex protected so check it inside too */ con->connect_action(con); if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) send_to_sock(con); From 01da24d3fbed92dc6faf60b753e6bd50cdafb646 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 12 Sep 2017 08:55:14 +0000 Subject: [PATCH 02/20] DLM: Eliminate CF_WRITE_PENDING flag Signed-off-by: Bob Peterson Reviewed-by: Tadashi Miyauchi Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 6a7a49b93374..ed707d4323f4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -106,7 +106,6 @@ struct connection { struct mutex sock_mutex; unsigned long flags; #define CF_READ_PENDING 1 -#define CF_WRITE_PENDING 2 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 @@ -426,8 +425,7 @@ static void lowcomms_write_space(struct sock *sk) clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags); } - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) - queue_work(send_workqueue, &con->swork); + queue_work(send_workqueue, &con->swork); } static inline void lowcomms_connect_sock(struct connection *con) @@ -578,7 +576,6 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { - clear_bit(CF_WRITE_PENDING, &con->flags); if (tx && cancel_work_sync(&con->swork)) log_print("canceled swork for node %d", con->nodeid); if (rx && cancel_work_sync(&con->rwork)) @@ -1077,7 +1074,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result == 0) goto out; - bind_err: con->sock = NULL; sock_release(sock); @@ -1102,7 +1098,6 @@ socket_err: out: mutex_unlock(&con->sock_mutex); - set_bit(CF_WRITE_PENDING, &con->flags); } /* Connect a new socket to its peer */ @@ -1196,7 +1191,6 @@ out_err: } out: mutex_unlock(&con->sock_mutex); - set_bit(CF_WRITE_PENDING, &con->flags); return; } @@ -1452,9 +1446,7 @@ void dlm_lowcomms_commit_buffer(void *mh) e->len = e->end - e->offset; spin_unlock(&con->writequeue_lock); - if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { - queue_work(send_workqueue, &con->swork); - } + queue_work(send_workqueue, &con->swork); return; out: @@ -1524,12 +1516,15 @@ out: send_error: mutex_unlock(&con->sock_mutex); close_connection(con, false, false, true); - lowcomms_connect_sock(con); + /* Requeue the send work. When the work daemon runs again, it will try + a new connection, then call this function again. */ + queue_work(send_workqueue, &con->swork); return; out_connect: mutex_unlock(&con->sock_mutex); - lowcomms_connect_sock(con); + queue_work(send_workqueue, &con->swork); + cond_resched(); } static void clean_one_writequeue(struct connection *con) @@ -1591,7 +1586,7 @@ static void process_send_sockets(struct work_struct *work) if (con->sock == NULL) /* not mutex protected so check it inside too */ con->connect_action(con); - if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + if (!list_empty(&con->writequeue)) send_to_sock(con); } From cc661fc934a004526a714a7b804ee3f119d27093 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 12 Sep 2017 08:55:23 +0000 Subject: [PATCH 03/20] DLM: Fix saving of NULL callbacks In a previous patch I noted that accept() often copies the struct sock (sk) which overwrites the sock callbacks. However, in testing we discovered that the dlm connection structures (con) are sometimes deleted and recreated as connections come and go, and since they're zeroed out by kmem_cache_zalloc, the saved callback pointers are also initialized to zero. But with today's DLM code, the callbacks are only saved when a socket is added. During recovery testing, we discovered a common situation in which the new con is initialized to zero, then a socket is added after accept(). In this case, the sock's saved values are all NULL, but the saved values are wiped out, due to accept(). Therefore, we don't have a known good copy of the callbacks from which we can restore. Since the struct sock callbacks are always good after listen(), this patch saves the known good values after listen(). These good values are then used for subsequent restores. Signed-off-by: Bob Peterson Reviewed-by: Tadashi Miyauchi Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index ed707d4323f4..32b534f4a9b6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -122,10 +122,6 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ - void (*orig_error_report)(struct sock *); - void (*orig_data_ready)(struct sock *); - void (*orig_state_change)(struct sock *); - void (*orig_write_space)(struct sock *); }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -148,6 +144,13 @@ struct dlm_node_addr { struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; +static struct listen_sock_callbacks { + void (*sk_error_report)(struct sock *); + void (*sk_data_ready)(struct sock *); + void (*sk_state_change)(struct sock *); + void (*sk_write_space)(struct sock *); +} listen_sock; + static LIST_HEAD(dlm_node_addrs); static DEFINE_SPINLOCK(dlm_node_addrs_spin); @@ -477,7 +480,7 @@ static void lowcomms_error_report(struct sock *sk) if (con == NULL) goto out; - orig_report = con->orig_error_report; + orig_report = listen_sock.sk_error_report; if (con->sock == NULL || kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " @@ -514,22 +517,26 @@ out: } /* Note: sk_callback_lock must be locked before calling this function. */ -static void save_callbacks(struct connection *con, struct sock *sk) +static void save_listen_callbacks(struct socket *sock) { - con->orig_data_ready = sk->sk_data_ready; - con->orig_state_change = sk->sk_state_change; - con->orig_write_space = sk->sk_write_space; - con->orig_error_report = sk->sk_error_report; + struct sock *sk = sock->sk; + + listen_sock.sk_data_ready = sk->sk_data_ready; + listen_sock.sk_state_change = sk->sk_state_change; + listen_sock.sk_write_space = sk->sk_write_space; + listen_sock.sk_error_report = sk->sk_error_report; } -static void restore_callbacks(struct connection *con, struct sock *sk) +static void restore_callbacks(struct socket *sock) { + struct sock *sk = sock->sk; + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; - sk->sk_data_ready = con->orig_data_ready; - sk->sk_state_change = con->orig_state_change; - sk->sk_write_space = con->orig_write_space; - sk->sk_error_report = con->orig_error_report; + sk->sk_data_ready = listen_sock.sk_data_ready; + sk->sk_state_change = listen_sock.sk_state_change; + sk->sk_write_space = listen_sock.sk_write_space; + sk->sk_error_report = listen_sock.sk_error_report; write_unlock_bh(&sk->sk_callback_lock); } @@ -542,8 +549,6 @@ static void add_sock(struct socket *sock, struct connection *con, bool save_cb) con->sock = sock; sk->sk_user_data = con; - if (save_cb) - save_callbacks(con, sk); /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; sk->sk_write_space = lowcomms_write_space; @@ -583,8 +588,7 @@ static void close_connection(struct connection *con, bool and_other, mutex_lock(&con->sock_mutex); if (con->sock) { - if (!test_bit(CF_IS_OTHERCON, &con->flags)) - restore_callbacks(con, con->sock->sk); + restore_callbacks(con->sock); sock_release(con->sock); con->sock = NULL; } @@ -1226,7 +1230,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, log_print("Failed to set SO_REUSEADDR on socket: %d", result); } sock->sk->sk_user_data = con; - + save_listen_callbacks(sock); con->rx_action = tcp_accept_from_sock; con->connect_action = tcp_connect_to_sock; @@ -1310,6 +1314,7 @@ static int sctp_listen_for_all(void) write_lock_bh(&sock->sk->sk_callback_lock); /* Init con struct */ sock->sk->sk_user_data = con; + save_listen_callbacks(sock); con->sock = sock; con->sock->sk->sk_data_ready = lowcomms_data_ready; con->rx_action = sctp_accept_from_sock; From 988419a9deab68035364d8163bc27adb694ab28e Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:55:32 +0000 Subject: [PATCH 04/20] DLM: fix remove save_cb argument from add_sock() save_cb argument is not used. We remove them. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 32b534f4a9b6..72247cb4bc5e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -541,7 +541,7 @@ static void restore_callbacks(struct socket *sock) } /* Make a socket active */ -static void add_sock(struct socket *sock, struct connection *con, bool save_cb) +static void add_sock(struct socket *sock, struct connection *con) { struct sock *sk = sock->sk; @@ -801,7 +801,7 @@ static int tcp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon, false); + add_sock(newsock, othercon); addcon = othercon; } else { @@ -817,7 +817,7 @@ static int tcp_accept_from_sock(struct connection *con) /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. */ - add_sock(newsock, newcon, false); + add_sock(newsock, newcon); addcon = newcon; } @@ -918,7 +918,7 @@ static int sctp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon, false); + add_sock(newsock, othercon); addcon = othercon; } else { printk("Extra connection from node %d attempted\n", nodeid); @@ -929,7 +929,7 @@ static int sctp_accept_from_sock(struct connection *con) } else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon, false); + add_sock(newsock, newcon); addcon = newcon; } @@ -1057,7 +1057,7 @@ static void sctp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; - add_sock(sock, con, true); + add_sock(sock, con); /* Bind to all addresses. */ if (sctp_bind_addrs(con, 0)) @@ -1142,7 +1142,7 @@ static void tcp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; - add_sock(sock, con, true); + add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid routing problems */ @@ -1361,7 +1361,7 @@ static int tcp_listen_for_all(void) sock = tcp_create_listen_sock(con, dlm_local_addr[0]); if (sock) { - add_sock(sock, con, true); + add_sock(sock, con); result = 0; } else { From f0fb83cb9201a9f272f8ac771eed6b1e5745375c Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:55:40 +0000 Subject: [PATCH 05/20] DLM: fix double list_del() dlm_lowcomms_stop() was not functioning properly. Correctly, we have to wait until all processing is finished with send_workqueue and recv_workqueue. This problem causes the following issue. Senario is 1. dlm_send thread: send_to_sock refers con->writequeue 2. main thread: dlm_lowcomms_stop calls list_del 3. dlm_send thread: send_to_sock calls list_del in writequeue_entry_complete [ 1925.770305] dlm: canceled swork for node 4 [ 1925.772374] general protection fault: 0000 [#1] SMP [ 1925.777930] Modules linked in: ocfs2_stack_user ocfs2 ocfs2_nodemanager ocfs2_stackglue dlm fmxnet(O) fmx_api(O) fmx_cu(O) igb(O) kvm_intel kvm irqbypass autofs4 [ 1925.794131] CPU: 3 PID: 6994 Comm: kworker/u8:0 Tainted: G O 4.4.39 #1 [ 1925.802684] Hardware name: TOSHIBA OX/OX, BIOS OX-P0015 12/03/2015 [ 1925.809595] Workqueue: dlm_send process_send_sockets [dlm] [ 1925.815714] task: ffff8804398d3c00 ti: ffff88046910c000 task.ti: ffff88046910c000 [ 1925.824072] RIP: 0010:[] [] process_send_sockets+0xf8/0x280 [dlm] [ 1925.834480] RSP: 0018:ffff88046910fde0 EFLAGS: 00010246 [ 1925.840411] RAX: dead000000000200 RBX: 0000000000000001 RCX: 000000000000000a [ 1925.848372] RDX: ffff88046bd980c0 RSI: 0000000000000000 RDI: ffff8804673c5670 [ 1925.856341] RBP: ffff88046910fe20 R08: 00000000000000c9 R09: 0000000000000010 [ 1925.864311] R10: ffffffff81e22fc0 R11: 0000000000000000 R12: ffff8804673c56d8 [ 1925.872281] R13: ffff8804673c5660 R14: ffff88046bd98440 R15: 0000000000000058 [ 1925.880251] FS: 0000000000000000(0000) GS:ffff88047fd80000(0000) knlGS:0000000000000000 [ 1925.889280] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1925.895694] CR2: 00007fff09eadf58 CR3: 00000004690f5000 CR4: 00000000001006e0 [ 1925.903663] Stack: [ 1925.905903] ffff8804673c5630 ffff8804673c5620 ffff8804673c5670 ffff88007d219b40 [ 1925.914181] ffff88046f095800 0000000000000100 ffff8800717a1400 ffff8804673c56d8 [ 1925.922459] ffff88046910fe60 ffffffff81073db2 00ff880400000000 ffff88007d219b40 [ 1925.930736] Call Trace: [ 1925.933468] [] process_one_work+0x162/0x450 [ 1925.939983] [] worker_thread+0x69/0x4a0 [ 1925.946109] [] ? rescuer_thread+0x350/0x350 [ 1925.952622] [] kthread+0xef/0x110 [ 1925.958165] [] ? kthread_park+0x60/0x60 [ 1925.964283] [] ret_from_fork+0x3f/0x70 [ 1925.970312] [] ? kthread_park+0x60/0x60 [ 1925.976436] Code: 01 00 00 48 8b 7d d0 e8 07 d3 3a e1 45 01 7e 18 45 29 7e 1c 75 ab 41 8b 46 24 85 c0 75 a3 49 8b 16 49 8b 46 08 31 f6 48 89 42 08 <48> 89 10 48 b8 00 01 00 00 00 00 ad de 49 8b 7e 10 49 89 06 66 [ 1925.997791] RIP [] process_send_sockets+0xf8/0x280 [dlm] [ 1926.005577] RSP Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 72247cb4bc5e..980c58befd53 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1628,11 +1628,20 @@ static int work_start(void) return 0; } -static void stop_conn(struct connection *con) +static void _stop_conn(struct connection *con, bool and_other) { - con->flags |= 0x0F; + mutex_lock(&con->sock_mutex); + set_bit(CF_READ_PENDING, &con->flags); if (con->sock && con->sock->sk) con->sock->sk->sk_user_data = NULL; + if (con->othercon && and_other) + _stop_conn(con->othercon, false); + mutex_unlock(&con->sock_mutex); +} + +static void stop_conn(struct connection *con) +{ + _stop_conn(con, true); } static void free_conn(struct connection *con) @@ -1644,6 +1653,32 @@ static void free_conn(struct connection *con) kmem_cache_free(con_cache, con); } +static void work_flush(void) +{ + int ok; + int i; + struct hlist_node *n; + struct connection *con; + + flush_workqueue(recv_workqueue); + flush_workqueue(send_workqueue); + do { + ok = 1; + foreach_conn(stop_conn); + flush_workqueue(recv_workqueue); + flush_workqueue(send_workqueue); + for (i = 0; i < CONN_HASH_SIZE && ok; i++) { + hlist_for_each_entry_safe(con, n, + &connection_hash[i], list) { + ok &= test_bit(CF_READ_PENDING, &con->flags); + if (con->othercon) + ok &= test_bit(CF_READ_PENDING, + &con->othercon->flags); + } + } + } while (!ok); +} + void dlm_lowcomms_stop(void) { /* Set all the flags to prevent any @@ -1651,11 +1686,10 @@ void dlm_lowcomms_stop(void) */ mutex_lock(&connections_lock); dlm_allow_conn = 0; - foreach_conn(stop_conn); + mutex_unlock(&connections_lock); + work_flush(); clean_writequeues(); foreach_conn(free_conn); - mutex_unlock(&connections_lock); - work_stop(); kmem_cache_destroy(con_cache); From b2a6662932c52304eee11323701f8a01aa110e37 Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:55:50 +0000 Subject: [PATCH 06/20] DLM: fix race condition between dlm_send and dlm_recv When kernel_sendpage(in send_to_sock) and kernel_recvmsg (in receive_from_sock) return error, close_connection may works at the same time. At that time, they may wait for each other by cancel_work_sync. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 980c58befd53..420946dcb7ca 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -110,6 +110,7 @@ struct connection { #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 #define CF_APP_LIMITED 7 +#define CF_CLOSING 8 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -581,9 +582,11 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { - if (tx && cancel_work_sync(&con->swork)) + bool closing = test_and_set_bit(CF_CLOSING, &con->flags); + + if (tx && !closing && cancel_work_sync(&con->swork)) log_print("canceled swork for node %d", con->nodeid); - if (rx && cancel_work_sync(&con->rwork)) + if (rx && !closing && cancel_work_sync(&con->rwork)) log_print("canceled rwork for node %d", con->nodeid); mutex_lock(&con->sock_mutex); @@ -603,6 +606,7 @@ static void close_connection(struct connection *con, bool and_other, con->retries = 0; mutex_unlock(&con->sock_mutex); + clear_bit(CF_CLOSING, &con->flags); } /* Data received from remote end */ From c7355827b27c550824bbcc1f0586cf993bf83d94 Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:56:00 +0000 Subject: [PATCH 07/20] DLM: fix to use sock_mutex correctly in xxx_accept_from_sock In the current implementation, we think that exclusion control for othercon in tcp_accept_from_sock() and sctp_accept_from_sock() was not enough. We fix them. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 420946dcb7ca..b275813c9901 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -801,16 +801,19 @@ static int tcp_accept_from_sock(struct connection *con) INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); } + mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; add_sock(newsock, othercon); addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { printk("Extra connection from node %d attempted\n", nodeid); result = -EAGAIN; + mutex_unlock(&othercon->sock_mutex); mutex_unlock(&newcon->sock_mutex); goto accept_err; } @@ -918,15 +921,18 @@ static int sctp_accept_from_sock(struct connection *con) INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); } + mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; add_sock(newsock, othercon); addcon = othercon; + mutex_unlock(&othercon->sock_mutex); } else { printk("Extra connection from node %d attempted\n", nodeid); ret = -EAGAIN; + mutex_unlock(&othercon->sock_mutex); mutex_unlock(&newcon->sock_mutex); goto accept_err; } From 5966121241b10a32396d770a0b39a41441511a8c Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:56:08 +0000 Subject: [PATCH 08/20] DLM: retry rcom when dlm_wait_function is timed out. If a node sends a DLM_RCOM_STATUS command and an error occurs on the receiving side, the DLM_RCOM_STATUS_REPLY response may not be returned. We retransmitted the DLM_RCOM_STATUS command so that we do not wait for an infinite response. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/rcom.c | 6 ++++++ fs/dlm/recover.c | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f3f5e72a29ba..4ff061de927e 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -155,6 +155,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) goto out; } +retry: error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, sizeof(struct rcom_status), &rc, &mh); if (error) @@ -169,6 +170,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; if (error) goto out; @@ -276,6 +279,7 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) ls->ls_recover_nodeid = nodeid; +retry: error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); if (error) goto out; @@ -288,6 +292,8 @@ int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); + if (error == -ETIMEDOUT) + goto retry; out: return error; } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index eaea789bf97d..ce2aa54ca2e2 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -52,6 +52,10 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)) dlm_config.ci_recover_timer * HZ); if (rv) break; + if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) { + log_debug(ls, "dlm_wait_function timed out"); + return -ETIMEDOUT; + } } if (dlm_recovery_stopped(ls)) { From c553e173b022347dcb34721688f55167c654466d Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:56:15 +0000 Subject: [PATCH 09/20] DLM: close othercon at send/receive error If an error occurs in the sending / receiving process, if othercon exists, sending / receiving processing using othercon may also result in an error. We fix to pre-close othercon as well. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b275813c9901..b48640709bbd 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -703,7 +703,7 @@ out_resched: out_close: mutex_unlock(&con->sock_mutex); if (ret != -EAGAIN) { - close_connection(con, false, true, false); + close_connection(con, true, true, false); /* Reconnect when there is something to send */ } /* Don't return success if we really got EOF */ @@ -1530,7 +1530,7 @@ out: send_error: mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); + close_connection(con, true, false, true); /* Requeue the send work. When the work daemon runs again, it will try a new connection, then call this function again. */ queue_work(send_workqueue, &con->swork); From e412f9201de255a503ae3d67f46aae580c79006c Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 08:56:30 +0000 Subject: [PATCH 10/20] DLM: fix race condition between dlm_recoverd_stop and dlm_recoverd When dlm_recoverd_stop() is called between kthread_should_stop() and set_task_state(TASK_INTERRUPTIBLE), dlm_recoverd will not wake up. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/recoverd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 6859b4bf971e..d2ad1cab0f05 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -287,8 +287,17 @@ static int dlm_recoverd(void *arg) set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags); wake_up(&ls->ls_recover_lock_wait); - while (!kthread_should_stop()) { + while (1) { + /* + * We call kthread_should_stop() after set_current_state(). + * This is because it works correctly if kthread_stop() is + * called just before set_current_state(). + */ set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) schedule(); From 8a4abb0819769a556f9023845d3821a06f81452d Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:01:16 +0000 Subject: [PATCH 11/20] DLM: Reanimate CF_WRITE_PENDING flag CF_WRITE_PENDING flag has been reanimated to make dlm_send stop properly when running dlm_lowcomms_stop. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b48640709bbd..306b5fe0866f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -106,6 +106,7 @@ struct connection { struct mutex sock_mutex; unsigned long flags; #define CF_READ_PENDING 1 +#define CF_WRITE_PENDING 2 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 #define CF_CLOSE 6 @@ -1599,6 +1600,7 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + clear_bit(CF_WRITE_PENDING, &con->flags); if (con->sock == NULL) /* not mutex protected so check it inside too */ con->connect_action(con); if (!list_empty(&con->writequeue)) @@ -1642,6 +1644,7 @@ static void _stop_conn(struct connection *con, bool and_other) { mutex_lock(&con->sock_mutex); set_bit(CF_READ_PENDING, &con->flags); + set_bit(CF_WRITE_PENDING, &con->flags); if (con->sock && con->sock->sk) con->sock->sk->sk_user_data = NULL; if (con->othercon && and_other) @@ -1681,9 +1684,13 @@ static void work_flush(void) hlist_for_each_entry_safe(con, n, &connection_hash[i], list) { ok &= test_bit(CF_READ_PENDING, &con->flags); - if (con->othercon) + ok &= test_bit(CF_WRITE_PENDING, &con->flags); + if (con->othercon) { ok &= test_bit(CF_READ_PENDING, &con->othercon->flags); + ok &= test_bit(CF_WRITE_PENDING, + &con->othercon->flags); + } } } } while (!ok); From 173a31fe2b23b3ccc45d0b70edb225b1d836c31d Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:01:24 +0000 Subject: [PATCH 12/20] DLM: use CF_CLOSE flag to stop dlm_send correctly If reconnection fails while executing dlm_lowcomms_stop, dlm_send will not stop. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 306b5fe0866f..215515198edb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1643,6 +1643,7 @@ static int work_start(void) static void _stop_conn(struct connection *con, bool and_other) { mutex_lock(&con->sock_mutex); + set_bit(CF_CLOSE, &con->flags); set_bit(CF_READ_PENDING, &con->flags); set_bit(CF_WRITE_PENDING, &con->flags); if (con->sock && con->sock->sk) From 294e7e458763dc1d229cbbe7147a6034bfc6e39a Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Fri, 15 Sep 2017 14:17:23 -0500 Subject: [PATCH 13/20] DLM: fix conversion deadlock when DLM_LKF_NODLCKWT flag is set When the DLM_LKF_NODLCKWT flag was set, even if conversion deadlock was detected, the caller of can_be_granted() was unknown. We change the behavior of can_be_granted() and change it to detect conversion deadlock regardless of whether the DLM_LKF_NODLCKWT flag is set or not. And depending on whether the DLM_LKF_NODLCKWT flag is set or not, we change the behavior at the caller of can_be_granted(). This fix has no effect except when using DLM_LKF_NODLCKWT flag. Currently, ocfs2 uses the DLM_LKF_NODLCKWT flag and does not expect a cancel operation from conversion deadlock when calling dlm_lock(). ocfs2 is implemented to perform a cancel operation by requesting BASTs (callback). Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lock.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d4aaddec1b16..f145a2a9d6cb 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2465,14 +2465,12 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { lkb->lkb_grmode = DLM_LOCK_NL; lkb->lkb_sbflags |= DLM_SBF_DEMOTED; - } else if (!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { - if (err) - *err = -EDEADLK; - else { - log_print("can_be_granted deadlock %x now %d", - lkb->lkb_id, now); - dlm_dump_rsb(r); - } + } else if (err) { + *err = -EDEADLK; + } else { + log_print("can_be_granted deadlock %x now %d", + lkb->lkb_id, now); + dlm_dump_rsb(r); } goto out; } @@ -2501,13 +2499,6 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, return rv; } -/* FIXME: I don't think that can_be_granted() can/will demote or find deadlock - for locks pending on the convert list. Once verified (watch for these - log_prints), we should be able to just call _can_be_granted() and not - bother with the demote/deadlk cases here (and there's no easy way to deal - with a deadlk here, we'd have to generate something like grant_lock with - the deadlk error.) */ - /* Returns the highest requested mode of all blocked conversions; sets cw if there's a blocked conversion to DLM_LOCK_CW. */ @@ -2545,9 +2536,22 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, } if (deadlk) { - log_print("WARN: pending deadlock %x node %d %s", - lkb->lkb_id, lkb->lkb_nodeid, r->res_name); - dlm_dump_rsb(r); + /* + * If DLM_LKB_NODLKWT flag is set and conversion + * deadlock is detected, we request blocking AST and + * down (or cancel) conversion. + */ + if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) { + if (lkb->lkb_highbast < lkb->lkb_rqmode) { + queue_bast(r, lkb, lkb->lkb_rqmode); + lkb->lkb_highbast = lkb->lkb_rqmode; + } + } else { + log_print("WARN: pending deadlock %x node %d %s", + lkb->lkb_id, lkb->lkb_nodeid, + r->res_name); + dlm_dump_rsb(r); + } continue; } @@ -3123,7 +3127,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) deadlock, so we leave it on the granted queue and return EDEADLK in the ast for the convert. */ - if (deadlk) { + if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { /* it's left on the granted queue */ revert_lock(r, lkb); queue_cast(r, lkb, -EDEADLK); From 3421fb15be01ff4714fed5a5d6db64849b59a1fd Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:01:38 +0000 Subject: [PATCH 14/20] DLM: fix memory leak in tcp_accept_from_sock() The sk member of the socket generated by sock_create_kern() is overwritten by ops->accept(). So the previous sk will not be released. We use kernel_accept() instead of sock_create_kern() and ops->accept(). Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 215515198edb..a464a8c446ab 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -732,22 +732,14 @@ static int tcp_accept_from_sock(struct connection *con) } mutex_unlock(&connections_lock); - memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_lite(dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &newsock); - if (result < 0) - return -ENOMEM; - mutex_lock_nested(&con->sock_mutex, 0); - result = -ENOTCONN; - if (con->sock == NULL) - goto accept_err; + if (!con->sock) { + mutex_unlock(&con->sock_mutex); + return -ENOTCONN; + } - newsock->type = con->sock->type; - newsock->ops = con->sock->ops; - - result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true); + result = kernel_accept(con->sock, &newsock, O_NONBLOCK); if (result < 0) goto accept_err; @@ -844,7 +836,8 @@ static int tcp_accept_from_sock(struct connection *con) accept_err: mutex_unlock(&con->sock_mutex); - sock_release(newsock); + if (newsock) + sock_release(newsock); if (result != -EAGAIN) log_print("error accepting connection from node: %d", result); From ccbbea04326e061651e0b38eda0792c758ce4f91 Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:01:46 +0000 Subject: [PATCH 15/20] DLM: fix overflow dlm_cb_seq dlm_cb_seq is 64 bits. If dlm_cb_seq overflows and returns to 0, dlm_rem_lkb_callback() will not work properly. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/ast.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 07fed838d8fd..562fa8c3edff 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -181,6 +181,8 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, spin_lock(&dlm_cb_seq_spin); new_seq = ++dlm_cb_seq; + if (!dlm_cb_seq) + new_seq = ++dlm_cb_seq; spin_unlock(&dlm_cb_seq_spin); if (lkb->lkb_flags & DLM_IFL_USER) { From 93eaadebe9e1c28528f01204cefccc7ba050c913 Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:01:55 +0000 Subject: [PATCH 16/20] DLM: fix to use sk_callback_lock correctly In the current implementation, we think that exclusion control between processing to set the callback function to the connection structure and processing to refer to the connection structure from the callback function was not enough. We fix them. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a464a8c446ab..5d0de91adc36 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -411,17 +411,23 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) /* Data available on socket or listen socket received a connect */ static void lowcomms_data_ready(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) queue_work(recv_workqueue, &con->rwork); + read_unlock_bh(&sk->sk_callback_lock); } static void lowcomms_write_space(struct sock *sk) { - struct connection *con = sock2con(sk); + struct connection *con; + read_lock_bh(&sk->sk_callback_lock); + con = sock2con(sk); if (!con) - return; + goto out; clear_bit(SOCK_NOSPACE, &con->sock->flags); @@ -431,6 +437,8 @@ static void lowcomms_write_space(struct sock *sk) } queue_work(send_workqueue, &con->swork); +out: + read_unlock_bh(&sk->sk_callback_lock); } static inline void lowcomms_connect_sock(struct connection *con) @@ -797,8 +805,6 @@ static int tcp_accept_from_sock(struct connection *con) mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); @@ -812,7 +818,6 @@ static int tcp_accept_from_sock(struct connection *con) } } else { - newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will @@ -918,8 +923,6 @@ static int sctp_accept_from_sock(struct connection *con) mutex_lock_nested(&othercon->sock_mutex, 2); if (!othercon->sock) { newcon->othercon = othercon; - othercon->sock = newsock; - newsock->sk->sk_user_data = othercon; add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); @@ -931,7 +934,6 @@ static int sctp_accept_from_sock(struct connection *con) goto accept_err; } } else { - newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; add_sock(newsock, newcon); addcon = newcon; @@ -1058,7 +1060,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; add_sock(sock, con); @@ -1143,7 +1144,6 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; } - sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; add_sock(sock, con); @@ -1233,10 +1233,12 @@ static struct socket *tcp_create_listen_sock(struct connection *con, if (result < 0) { log_print("Failed to set SO_REUSEADDR on socket: %d", result); } + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); con->rx_action = tcp_accept_from_sock; con->connect_action = tcp_connect_to_sock; + write_unlock_bh(&sock->sk->sk_callback_lock); /* Bind to our port */ make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); @@ -1639,8 +1641,11 @@ static void _stop_conn(struct connection *con, bool and_other) set_bit(CF_CLOSE, &con->flags); set_bit(CF_READ_PENDING, &con->flags); set_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock && con->sock->sk) + if (con->sock && con->sock->sk) { + write_lock_bh(&con->sock->sk->sk_callback_lock); con->sock->sk->sk_user_data = NULL; + write_unlock_bh(&con->sock->sk->sk_callback_lock); + } if (con->othercon && and_other) _stop_conn(con->othercon, false); mutex_unlock(&con->sock_mutex); From 0aa18464c812e0154e8bafc9f60ca8002b3a8e7c Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:02:02 +0000 Subject: [PATCH 17/20] DLM: fix to reschedule rwork When an error occurs in kernel_recvmsg or kernel_sendpage and close_connection is called and receive work is already scheduled, receive work is canceled. In that case, the receive work will not be scheduled forever after reconnection, because CF_READ_PENDING flag is established. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5d0de91adc36..c64e39f76ce8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -593,10 +593,14 @@ static void close_connection(struct connection *con, bool and_other, { bool closing = test_and_set_bit(CF_CLOSING, &con->flags); - if (tx && !closing && cancel_work_sync(&con->swork)) + if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); - if (rx && !closing && cancel_work_sync(&con->rwork)) + clear_bit(CF_WRITE_PENDING, &con->flags); + } + if (rx && !closing && cancel_work_sync(&con->rwork)) { log_print("canceled rwork for node %d", con->nodeid); + clear_bit(CF_READ_PENDING, &con->flags); + } mutex_lock(&con->sock_mutex); if (con->sock) { From 26b41099e7e97d6d44769fd159e822a98c98afa2 Mon Sep 17 00:00:00 2001 From: "tsutomu.owa@toshiba.co.jp" Date: Tue, 12 Sep 2017 09:02:10 +0000 Subject: [PATCH 18/20] DLM: fix NULL pointer dereference in send_to_sock() The writequeue and writequeue_lock member of othercon was not initialized. If lowcomms_state_change() is called from network layer, othercon->swork may be scheduled. In this case, send_to_sock() will generate a NULL pointer reference. We avoid this problem by correctly initializing writequeue and writequeue_lock member of othercon. Signed-off-by: Tadashi Miyauchi Signed-off-by: Tsutomu Owa Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c64e39f76ce8..05707850f93a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -802,6 +802,8 @@ static int tcp_accept_from_sock(struct connection *con) othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); + INIT_LIST_HEAD(&othercon->writequeue); + spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); @@ -920,6 +922,8 @@ static int sctp_accept_from_sock(struct connection *con) othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); + INIT_LIST_HEAD(&othercon->writequeue); + spin_lock_init(&othercon->writequeue_lock); INIT_WORK(&othercon->swork, process_send_sockets); INIT_WORK(&othercon->rwork, process_recv_sockets); set_bit(CF_IS_OTHERCON, &othercon->flags); From 9e1b0211c5dd4acbd21a8ec1b86fc38a497a4656 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 25 Sep 2017 15:47:50 +0800 Subject: [PATCH 19/20] dlm: recheck kthread_should_stop() before schedule() Call schedule() here could make the thread miss wake up from kthread_stop(), so it is better to recheck kthread_should_stop() before call schedule(), a symptom happened when I run indefinite test (which mostly created clustered raid1, assemble it in other nodes, then stop them) of clustered raid. $ ps aux|grep md|grep D root 4211 0.0 0.0 19760 2220 ? Ds 02:58 0:00 mdadm -Ssq $ cat /proc/4211/stack kthread_stop+0x4d/0x150 dlm_recoverd_stop+0x15/0x20 [dlm] dlm_release_lockspace+0x2ab/0x460 [dlm] leave+0xbf/0x150 [md_cluster] md_cluster_stop+0x18/0x30 [md_mod] bitmap_free+0x12e/0x140 [md_mod] bitmap_destroy+0x7f/0x90 [md_mod] __md_stop+0x21/0xa0 [md_mod] do_md_stop+0x15f/0x5c0 [md_mod] md_ioctl+0xa65/0x18a0 [md_mod] blkdev_ioctl+0x49e/0x8d0 block_ioctl+0x41/0x50 do_vfs_ioctl+0x96/0x5b0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x1e/0xad This maybe not resolve the issue completely since the KTHREAD_SHOULD_STOP flag could be set between "break" and "schedule", but at least the chance for the symptom happen could be reduce a lot (The indefinite test runs more than 20 hours without problem and it happens easily without the change). Signed-off-by: Guoqing Jiang Signed-off-by: David Teigland --- fs/dlm/recoverd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index d2ad1cab0f05..6f4e1d42d733 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -299,8 +299,11 @@ static int dlm_recoverd(void *arg) break; } if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) && - !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) + !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { + if (kthread_should_stop()) + break; schedule(); + } set_current_state(TASK_RUNNING); if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) { From 9250e523592a8ced3ecd14abe29fbb1e036bd7eb Mon Sep 17 00:00:00 2001 From: David Teigland Date: Mon, 9 Oct 2017 09:29:31 -0500 Subject: [PATCH 20/20] dlm: remove dlm_send_rcom_lookup_dump This function was only for debugging. It would be called in a condition that should not happen, and should probably have been removed from the final version of the original commit. Remove it because it does mutex lock under spin lock. Signed-off-by: David Teigland --- fs/dlm/lock.c | 1 - fs/dlm/rcom.c | 20 +------------------- fs/dlm/rcom.h | 1 - 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f145a2a9d6cb..cc91963683de 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1003,7 +1003,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, if (r->res_master_nodeid == our_nodeid) { log_error(ls, "from_master %d our_master", from_nodeid); dlm_dump_rsb(r); - dlm_send_rcom_lookup_dump(r, from_nodeid); goto out_found; } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 4ff061de927e..70c625999d36 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -338,25 +338,6 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) return error; } -int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid) -{ - struct dlm_rcom *rc; - struct dlm_mhandle *mh; - struct dlm_ls *ls = r->res_ls; - int error; - - error = create_rcom(ls, to_nodeid, DLM_RCOM_LOOKUP, r->res_length, - &rc, &mh); - if (error) - goto out; - memcpy(rc->rc_buf, r->res_name, r->res_length); - rc->rc_id = 0xFFFFFFFF; - - send_rcom(ls, mh, rc); - out: - return error; -} - static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; @@ -368,6 +349,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) if (error) return; + /* Old code would send this special id to trigger a debug dump. */ if (rc_in->rc_id == 0xFFFFFFFF) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); dlm_dump_rsb_name(ls, rc_in->rc_buf, len); diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index f8e243463c15..206723ab744d 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -17,7 +17,6 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); -int dlm_send_rcom_lookup_dump(struct dlm_rsb *r, int to_nodeid); int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);