diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 88d95d96e36c..42eee2783756 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -20,6 +20,7 @@ #include #include "config.h" +#include "midcomms.h" #include "lowcomms.h" /* @@ -79,6 +80,9 @@ struct dlm_cluster { unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; + + struct dlm_spaces *sps; + struct dlm_comms *cms; }; static struct dlm_cluster *config_item_to_cluster(struct config_item *i) @@ -204,7 +208,7 @@ static int dlm_check_zero(unsigned int x) static int dlm_check_buffer_size(unsigned int x) { - if (x < DEFAULT_BUFFER_SIZE) + if (x < DLM_MAX_SOCKET_BUFSIZE) return -EINVAL; return 0; @@ -409,6 +413,9 @@ static struct config_group *make_cluster(struct config_group *g, if (!cl || !sps || !cms) goto fail; + cl->sps = sps; + cl->cms = cms; + config_group_init_type_name(&cl->group, name, &cluster_type); config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); config_group_init_type_name(&cms->cs_group, "comms", &comms_type); @@ -458,6 +465,9 @@ static void drop_cluster(struct config_group *g, struct config_item *i) static void release_cluster(struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); + + kfree(cl->sps); + kfree(cl->cms); kfree(cl); } @@ -532,7 +542,7 @@ static void drop_comm(struct config_group *g, struct config_item *i) struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; - dlm_lowcomms_close(cm->nodeid); + dlm_midcomms_close(cm->nodeid); while (cm->addr_count--) kfree(cm->addr[cm->addr_count]); config_item_put(i); @@ -942,7 +952,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_LOG_INFO 1 -#define DEFAULT_PROTOCOL 0 +#define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 @@ -952,7 +962,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, - .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index d2cd4bd20313..df92b0a07fc6 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,7 +12,7 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ -#define DEFAULT_BUFFER_SIZE 4096 +#define DLM_MAX_SOCKET_BUFSIZE 4096 struct dlm_config_node { int nodeid; @@ -23,6 +23,9 @@ struct dlm_config_node { #define DLM_MAX_ADDR_COUNT 3 +#define DLM_PROTO_TCP 0 +#define DLM_PROTO_SCTP 1 + struct dlm_config_info { int ci_tcp_port; int ci_buffer_size; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index d5bd990bcab8..47e9d57e4cae 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -16,6 +16,7 @@ #include #include "dlm_internal.h" +#include "midcomms.h" #include "lock.h" #define DLM_DEBUG_BUF_LEN 4096 @@ -23,6 +24,7 @@ static char debug_buf[DLM_DEBUG_BUF_LEN]; static struct mutex debug_buf_lock; static struct dentry *dlm_root; +static struct dentry *dlm_comms; static char *print_lockmode(int mode) { @@ -738,6 +740,57 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_toss_dentry); } +static int dlm_state_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%s\n", dlm_midcomms_state(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_state); + +static int dlm_flags_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%lu\n", dlm_midcomms_flags(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_flags); + +static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt); + +static int dlm_version_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "0x%08x\n", dlm_midcomms_version(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_version); + +void *dlm_create_debug_comms_file(int nodeid, void *data) +{ + struct dentry *d_node; + char name[256]; + + memset(name, 0, sizeof(name)); + snprintf(name, 256, "%d", nodeid); + + d_node = debugfs_create_dir(name, dlm_comms); + debugfs_create_file("state", 0444, d_node, data, &dlm_state_fops); + debugfs_create_file("flags", 0444, d_node, data, &dlm_flags_fops); + debugfs_create_file("send_queue_count", 0444, d_node, data, + &dlm_send_queue_cnt_fops); + debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops); + + return d_node; +} + +void dlm_delete_debug_comms_file(void *ctx) +{ + debugfs_remove(ctx); +} + void dlm_create_debug_file(struct dlm_ls *ls) { char name[DLM_LOCKSPACE_LEN + 8]; @@ -797,6 +850,7 @@ void __init dlm_register_debugfs(void) { mutex_init(&debug_buf_lock); dlm_root = debugfs_create_dir("dlm", NULL); + dlm_comms = debugfs_create_dir("comms", dlm_root); } void dlm_unregister_debugfs(void) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 04fe9f525ac7..91d1ca3a121a 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -57,9 +57,12 @@ struct dlm_header; struct dlm_message; struct dlm_rcom; struct dlm_mhandle; +struct dlm_msg; #define log_print(fmt, args...) \ printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_print_ratelimited(fmt, args...) \ + printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args) #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) @@ -368,23 +371,33 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ #define DLM_HEADER_MAJOR 0x00030000 -#define DLM_HEADER_MINOR 0x00000001 +#define DLM_HEADER_MINOR 0x00000002 + +#define DLM_VERSION_3_1 0x00030001 +#define DLM_VERSION_3_2 0x00030002 #define DLM_HEADER_SLOTS 0x00000001 #define DLM_MSG 1 #define DLM_RCOM 2 +#define DLM_OPTS 3 +#define DLM_ACK 4 +#define DLM_FIN 5 struct dlm_header { uint32_t h_version; - uint32_t h_lockspace; + union { + /* for DLM_MSG and DLM_RCOM */ + uint32_t h_lockspace; + /* for DLM_ACK and DLM_OPTS */ + uint32_t h_seq; + } u; uint32_t h_nodeid; /* nodeid of sender */ uint16_t h_length; uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ uint8_t h_pad; }; - #define DLM_MSG_REQUEST 1 #define DLM_MSG_CONVERT 2 #define DLM_MSG_UNLOCK 3 @@ -452,10 +465,29 @@ struct dlm_rcom { char rc_buf[]; }; +struct dlm_opt_header { + uint16_t t_type; + uint16_t t_length; + uint32_t o_pad; + /* need to be 8 byte aligned */ + char t_value[]; +}; + +/* encapsulation header */ +struct dlm_opts { + struct dlm_header o_header; + uint8_t o_nextcmd; + uint8_t o_pad; + uint16_t o_optlen; + uint32_t o_pad2; + char o_opts[]; +}; + union dlm_packet { struct dlm_header header; /* common to other two */ struct dlm_message message; struct dlm_rcom rcom; + struct dlm_opts opts; }; #define DLM_RSF_NEED_SLOTS 0x00000001 @@ -722,11 +754,15 @@ void dlm_register_debugfs(void); void dlm_unregister_debugfs(void); void dlm_create_debug_file(struct dlm_ls *ls); void dlm_delete_debug_file(struct dlm_ls *ls); +void *dlm_create_debug_comms_file(int nodeid, void *data); +void dlm_delete_debug_comms_file(void *ctx); #else static inline void dlm_register_debugfs(void) { } static inline void dlm_unregister_debugfs(void) { } static inline void dlm_create_debug_file(struct dlm_ls *ls) { } static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; } +static inline void dlm_delete_debug_comms_file(void *ctx) { } #endif #endif /* __DLM_INTERNAL_DOT_H__ */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b93df39d0915..c502c065d007 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -59,7 +59,7 @@ #include "dlm_internal.h" #include #include "memory.h" -#include "lowcomms.h" +#include "midcomms.h" #include "requestqueue.h" #include "util.h" #include "dir.h" @@ -3534,17 +3534,17 @@ static int _create_message(struct dlm_ls *ls, int mb_len, char *mb; /* get_buffer gives us a message handle (mh) that we need to - pass into lowcomms_commit and a message buffer (mb) that we + pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; ms = (struct dlm_message *) mb; ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - ms->m_header.h_lockspace = ls->ls_global_id; + ms->m_header.u.h_lockspace = ls->ls_global_id; ms->m_header.h_nodeid = dlm_our_nodeid(); ms->m_header.h_length = mb_len; ms->m_header.h_cmd = DLM_MSG; @@ -3589,7 +3589,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) { dlm_message_out(ms); - dlm_lowcomms_commit_buffer(mh); + dlm_midcomms_commit_mhandle(mh); return 0; } @@ -5038,16 +5038,16 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) if (hd->h_nodeid != nodeid) { log_print("invalid h_nodeid %d from %d lockspace %x", - hd->h_nodeid, nodeid, hd->h_lockspace); + hd->h_nodeid, nodeid, hd->u.h_lockspace); return; } - ls = dlm_find_lockspace_global(hd->h_lockspace); + ls = dlm_find_lockspace_global(hd->u.h_lockspace); if (!ls) { if (dlm_config.ci_log_debug) { printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " "%u from %d cmd %d type %d\n", - hd->h_lockspace, nodeid, hd->h_cmd, type); + hd->u.h_lockspace, nodeid, hd->h_cmd, type); } if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index c14cf2b7faab..d71aba8c3e64 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -16,6 +16,7 @@ #include "member.h" #include "recoverd.h" #include "dir.h" +#include "midcomms.h" #include "lowcomms.h" #include "config.h" #include "memory.h" @@ -390,7 +391,7 @@ static int threads_start(void) } /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_lowcomms_start(); + error = dlm_midcomms_start(); if (error) { log_print("cannot start dlm lowcomms %d", error); goto scand_fail; @@ -566,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS); + /* Due backwards compatibility with 3.1 we need to use maximum + * possible dlm message size to be sure the message will fit and + * not having out of bounds issues. However on sending side 3.2 + * might send less. + */ + ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS); if (!ls->ls_recover_buf) goto out_lkbidr; @@ -698,7 +704,7 @@ int dlm_new_lockspace(const char *name, const char *cluster, error = 0; if (!ls_count) { dlm_scand_stop(); - dlm_lowcomms_shutdown(); + dlm_midcomms_shutdown(); dlm_lowcomms_stop(); } out: @@ -787,7 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) if (ls_count == 1) { dlm_scand_stop(); - dlm_lowcomms_shutdown(); + dlm_midcomms_shutdown(); } dlm_callback_stop(ls); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 166e36fcf3e4..0ea9ae35da0b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -59,7 +59,6 @@ #include "config.h" #define NEEDED_RMEM (4*1024*1024) -#define CONN_HASH_SIZE 32 /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 @@ -79,14 +78,20 @@ struct connection { #define CF_CLOSING 8 #define CF_SHUTDOWN 9 #define CF_CONNECTED 10 +#define CF_RECONNECT 11 +#define CF_DELAY_CONNECT 12 +#define CF_EOF 13 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; + atomic_t writequeue_cnt; void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ + bool (*eof_condition)(struct connection *con); /* What to do to eof check */ int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; struct connection *othercon; + struct connection *sendcon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ @@ -113,7 +118,22 @@ struct writequeue_entry { int len; int end; int users; + bool dirty; struct connection *con; + struct list_head msgs; + struct kref ref; +}; + +struct dlm_msg { + struct writequeue_entry *entry; + struct dlm_msg *orig_msg; + bool retransmit; + void *ppc; + int len; + int idx; /* new()/commit() idx exchange */ + + struct list_head list; + struct kref ref; }; struct dlm_node_addr { @@ -155,33 +175,23 @@ static void sctp_connect_to_sock(struct connection *con); static void tcp_connect_to_sock(struct connection *con); static void dlm_tcp_shutdown(struct connection *con); -/* This is deliberately very simple because most clusters have simple - sequential nodeids, so we should be able to go straight to a connection - struct in the array */ -static inline int nodeid_hash(int nodeid) +static struct connection *__find_con(int nodeid, int r) { - return nodeid & (CONN_HASH_SIZE-1); -} - -static struct connection *__find_con(int nodeid) -{ - int r, idx; struct connection *con; - r = nodeid_hash(nodeid); - - idx = srcu_read_lock(&connections_srcu); hlist_for_each_entry_rcu(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) { - srcu_read_unlock(&connections_srcu, idx); + if (con->nodeid == nodeid) return con; - } } - srcu_read_unlock(&connections_srcu, idx); return NULL; } +static bool tcp_eof_condition(struct connection *con) +{ + return atomic_read(&con->writequeue_cnt); +} + static int dlm_con_init(struct connection *con, int nodeid) { con->rx_buflen = dlm_config.ci_buffer_size; @@ -193,15 +203,23 @@ static int dlm_con_init(struct connection *con, int nodeid) mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); + atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); init_waitqueue_head(&con->shutdown_wait); - if (dlm_config.ci_protocol == 0) { + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: con->connect_action = tcp_connect_to_sock; con->shutdown_action = dlm_tcp_shutdown; - } else { + con->eof_condition = tcp_eof_condition; + break; + case DLM_PROTO_SCTP: con->connect_action = sctp_connect_to_sock; + break; + default: + kfree(con->rx_buf); + return -EINVAL; } return 0; @@ -216,7 +234,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) struct connection *con, *tmp; int r, ret; - con = __find_con(nodeid); + r = nodeid_hash(nodeid); + con = __find_con(nodeid, r); if (con || !alloc) return con; @@ -230,8 +249,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } - r = nodeid_hash(nodeid); - spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() @@ -239,7 +256,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) * under protection of connections_lock. If this is the case we * abort our connection creation and return the existing connection. */ - tmp = __find_con(nodeid); + tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); kfree(con->rx_buf); @@ -256,15 +273,13 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i, idx; + int i; struct connection *con; - idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } - srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -462,6 +477,9 @@ static void lowcomms_data_ready(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { + if (!dlm_allow_conn) + return; + queue_work(recv_workqueue, &listen_con.rwork); } @@ -518,14 +536,21 @@ static void lowcomms_state_change(struct sock *sk) int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + int idx; if (nodeid == dlm_our_nodeid()) return 0; + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, GFP_NOFS); - if (!con) + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; + } + lowcomms_connect_sock(con); + srcu_read_unlock(&connections_srcu, idx); + return 0; } @@ -587,6 +612,22 @@ static void lowcomms_error_report(struct sock *sk) dlm_config.ci_tcp_port, sk->sk_err, sk->sk_err_soft); } + + /* below sendcon only handling */ + if (test_bit(CF_IS_OTHERCON, &con->flags)) + con = con->sendcon; + + switch (sk->sk_err) { + case ECONNREFUSED: + set_bit(CF_DELAY_CONNECT, &con->flags); + break; + default: + break; + } + + if (!test_and_set_bit(CF_RECONNECT, &con->flags)) + queue_work(send_workqueue, &con->swork); + out: read_unlock_bh(&sk->sk_callback_lock); if (orig_report) @@ -669,6 +710,42 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); } +static void dlm_page_release(struct kref *kref) +{ + struct writequeue_entry *e = container_of(kref, struct writequeue_entry, + ref); + + __free_page(e->page); + kfree(e); +} + +static void dlm_msg_release(struct kref *kref) +{ + struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); + + kref_put(&msg->entry->ref, dlm_page_release); + kfree(msg); +} + +static void free_entry(struct writequeue_entry *e) +{ + struct dlm_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &e->msgs, list) { + if (msg->orig_msg) { + msg->orig_msg->retransmit = false; + kref_put(&msg->orig_msg->ref, dlm_msg_release); + } + + list_del(&msg->list); + kref_put(&msg->ref, dlm_msg_release); + } + + list_del(&e->list); + atomic_dec(&e->con->writequeue_cnt); + kref_put(&e->ref, dlm_page_release); +} + static void dlm_close_sock(struct socket **sock) { if (*sock) { @@ -683,6 +760,7 @@ static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { bool closing = test_and_set_bit(CF_CLOSING, &con->flags); + struct writequeue_entry *e; if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); @@ -698,12 +776,35 @@ static void close_connection(struct connection *con, bool and_other, if (con->othercon && and_other) { /* Will only re-enter once. */ - close_connection(con->othercon, false, true, true); + close_connection(con->othercon, false, tx, rx); } + /* if we send a writequeue entry only a half way, we drop the + * whole entry because reconnection and that we not start of the + * middle of a msg which will confuse the other end. + * + * we can always drop messages because retransmits, but what we + * cannot allow is to transmit half messages which may be processed + * at the other side. + * + * our policy is to start on a clean state when disconnects, we don't + * know what's send/received on transport layer in this case. + */ + spin_lock(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_first_entry(&con->writequeue, struct writequeue_entry, + list); + if (e->dirty) + free_entry(e); + } + spin_unlock(&con->writequeue_lock); + con->rx_leftover = 0; con->retries = 0; clear_bit(CF_CONNECTED, &con->flags); + clear_bit(CF_DELAY_CONNECT, &con->flags); + clear_bit(CF_RECONNECT, &con->flags); + clear_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } @@ -841,19 +942,26 @@ out_resched: return -EAGAIN; out_close: - mutex_unlock(&con->sock_mutex); - if (ret != -EAGAIN) { - /* Reconnect when there is something to send */ - close_connection(con, false, true, false); - if (ret == 0) { - log_print("connection %p got EOF from %d", - con, con->nodeid); + if (ret == 0) { + log_print("connection %p got EOF from %d", + con, con->nodeid); + + if (con->eof_condition && con->eof_condition(con)) { + set_bit(CF_EOF, &con->flags); + mutex_unlock(&con->sock_mutex); + } else { + mutex_unlock(&con->sock_mutex); + close_connection(con, false, true, false); + /* handling for tcp shutdown */ clear_bit(CF_SHUTDOWN, &con->flags); wake_up(&con->shutdown_wait); - /* signal to breaking receive worker */ - ret = -1; } + + /* signal to breaking receive worker */ + ret = -1; + } else { + mutex_unlock(&con->sock_mutex); } return ret; } @@ -864,16 +972,12 @@ static int accept_from_sock(struct listen_connection *con) int result; struct sockaddr_storage peeraddr; struct socket *newsock; - int len; + int len, idx; int nodeid; struct connection *newcon; struct connection *addcon; unsigned int mark; - if (!dlm_allow_conn) { - return -1; - } - if (!con->sock) return -ENOTCONN; @@ -907,8 +1011,10 @@ static int accept_from_sock(struct listen_connection *con) * the same time and the connections cross on the wire. * In this case we store the incoming one in "othercon" */ + idx = srcu_read_lock(&connections_srcu); newcon = nodeid2con(nodeid, GFP_NOFS); if (!newcon) { + srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } @@ -924,6 +1030,7 @@ static int accept_from_sock(struct listen_connection *con) if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); + srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } @@ -932,11 +1039,14 @@ static int accept_from_sock(struct listen_connection *con) if (result < 0) { kfree(othercon); mutex_unlock(&newcon->sock_mutex); + srcu_read_unlock(&connections_srcu, idx); goto accept_err; } lockdep_set_subclass(&othercon->sock_mutex, 1); + set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; + othercon->sendcon = newcon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); @@ -966,6 +1076,8 @@ static int accept_from_sock(struct listen_connection *con) if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) queue_work(recv_workqueue, &addcon->rwork); + srcu_read_unlock(&connections_srcu, idx); + return 0; accept_err: @@ -977,12 +1089,6 @@ accept_err: return result; } -static void free_entry(struct writequeue_entry *e) -{ - __free_page(e->page); - kfree(e); -} - /* * writequeue_entry_complete - try to delete and free write queue entry * @e: write queue entry to try to delete @@ -994,11 +1100,11 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) { e->offset += completed; e->len -= completed; + /* signal that page was half way transmitted */ + e->dirty = true; - if (e->len == 0 && e->users == 0) { - list_del(&e->list); + if (e->len == 0 && e->users == 0) free_entry(e); - } } /* @@ -1075,7 +1181,7 @@ static void sctp_connect_to_sock(struct connection *con) make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - log_print("connecting to %d", con->nodeid); + log_print_ratelimited("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ sctp_sock_set_nodelay(sock->sk); @@ -1171,7 +1277,7 @@ static void tcp_connect_to_sock(struct connection *con) make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); - log_print("connecting to %d", con->nodeid); + log_print_ratelimited("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ tcp_sock_set_nodelay(sock->sk); @@ -1364,12 +1470,16 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, entry->con = con; entry->users = 1; + kref_init(&entry->ref); + INIT_LIST_HEAD(&entry->msgs); return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, - gfp_t allocation, char **ppc) + gfp_t allocation, char **ppc, + void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) { struct writequeue_entry *e; @@ -1377,7 +1487,12 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { + kref_get(&e->ref); + *ppc = page_address(e->page) + e->end; + if (cb) + cb(mh); + e->end += len; e->users++; spin_unlock(&con->writequeue_lock); @@ -1391,42 +1506,92 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, if (!e) return NULL; + kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; + atomic_inc(&con->writequeue_cnt); spin_lock(&con->writequeue_lock); + if (cb) + cb(mh); + list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); return e; }; -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, + gfp_t allocation, char **ppc, + void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) +{ + struct writequeue_entry *e; + struct dlm_msg *msg; + + msg = kzalloc(sizeof(*msg), allocation); + if (!msg) + return NULL; + + kref_init(&msg->ref); + + e = new_wq_entry(con, len, allocation, ppc, cb, mh); + if (!e) { + kfree(msg); + return NULL; + } + + msg->ppc = *ppc; + msg->len = len; + msg->entry = e; + + return msg; +} + +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) { struct connection *con; + struct dlm_msg *msg; + int idx; - if (len > DEFAULT_BUFFER_SIZE || + if (len > DLM_MAX_SOCKET_BUFSIZE || len < sizeof(struct dlm_header)) { - BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE); + BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); WARN_ON(1); return NULL; } + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, allocation); - if (!con) + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return NULL; + } - return new_wq_entry(con, len, allocation, ppc); + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh); + if (!msg) { + srcu_read_unlock(&connections_srcu, idx); + return NULL; + } + + /* we assume if successful commit must called */ + msg->idx = idx; + return msg; } -void dlm_lowcomms_commit_buffer(void *mh) +static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) { - struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct writequeue_entry *e = msg->entry; struct connection *con = e->con; int users; spin_lock(&con->writequeue_lock); + kref_get(&msg->ref); + list_add(&msg->list, &e->msgs); + users = --e->users; if (users) goto out; @@ -1442,6 +1607,42 @@ out: return; } +void dlm_lowcomms_commit_msg(struct dlm_msg *msg) +{ + _dlm_lowcomms_commit_msg(msg); + srcu_read_unlock(&connections_srcu, msg->idx); +} + +void dlm_lowcomms_put_msg(struct dlm_msg *msg) +{ + kref_put(&msg->ref, dlm_msg_release); +} + +/* does not held connections_srcu, usage workqueue only */ +int dlm_lowcomms_resend_msg(struct dlm_msg *msg) +{ + struct dlm_msg *msg_resend; + char *ppc; + + if (msg->retransmit) + return 1; + + msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len, + GFP_ATOMIC, &ppc, NULL, NULL); + if (!msg_resend) + return -ENOMEM; + + msg->retransmit = true; + kref_get(&msg->ref); + msg_resend->orig_msg = msg; + + memcpy(ppc, msg->ppc, msg->len); + _dlm_lowcomms_commit_msg(msg_resend); + dlm_lowcomms_put_msg(msg_resend); + + return 0; +} + /* Send a message */ static void send_to_sock(struct connection *con) { @@ -1483,7 +1684,7 @@ static void send_to_sock(struct connection *con) cond_resched(); goto out; } else if (ret < 0) - goto send_error; + goto out; } /* Don't starve people filling buffers */ @@ -1496,16 +1697,23 @@ static void send_to_sock(struct connection *con) writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); -out: - mutex_unlock(&con->sock_mutex); + + /* close if we got EOF */ + if (test_and_clear_bit(CF_EOF, &con->flags)) { + mutex_unlock(&con->sock_mutex); + close_connection(con, false, false, true); + + /* handling for tcp shutdown */ + clear_bit(CF_SHUTDOWN, &con->flags); + wake_up(&con->shutdown_wait); + } else { + mutex_unlock(&con->sock_mutex); + } + return; -send_error: +out: mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - /* Requeue the send work. When the work daemon runs again, it will try - a new connection, then call this function again. */ - queue_work(send_workqueue, &con->swork); return; out_connect: @@ -1520,7 +1728,6 @@ static void clean_one_writequeue(struct connection *con) spin_lock(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { - list_del(&e->list); free_entry(e); } spin_unlock(&con->writequeue_lock); @@ -1532,8 +1739,10 @@ int dlm_lowcomms_close(int nodeid) { struct connection *con; struct dlm_node_addr *na; + int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (con) { set_bit(CF_CLOSE, &con->flags); @@ -1542,6 +1751,7 @@ int dlm_lowcomms_close(int nodeid) if (con->othercon) clean_one_writequeue(con->othercon); } + srcu_read_unlock(&connections_srcu, idx); spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); @@ -1578,35 +1788,50 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags)); + clear_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock == NULL) /* not mutex protected so check it inside too */ + + if (test_and_clear_bit(CF_RECONNECT, &con->flags)) { + close_connection(con, false, false, true); + dlm_midcomms_unack_msg_resend(con->nodeid); + } + + if (con->sock == NULL) { /* not mutex protected so check it inside too */ + if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) + msleep(1000); con->connect_action(con); + } if (!list_empty(&con->writequeue)) send_to_sock(con); } static void work_stop(void) { - if (recv_workqueue) + if (recv_workqueue) { destroy_workqueue(recv_workqueue); - if (send_workqueue) + recv_workqueue = NULL; + } + + if (send_workqueue) { destroy_workqueue(send_workqueue); + send_workqueue = NULL; + } } static int work_start(void) { - recv_workqueue = alloc_workqueue("dlm_recv", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM); if (!recv_workqueue) { log_print("can't start dlm_recv"); return -ENOMEM; } - send_workqueue = alloc_workqueue("dlm_send", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM); if (!send_workqueue) { log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); + recv_workqueue = NULL; return -ENOMEM; } @@ -1621,6 +1846,8 @@ static void shutdown_conn(struct connection *con) void dlm_lowcomms_shutdown(void) { + int idx; + /* Set all the flags to prevent any * socket activity. */ @@ -1633,7 +1860,9 @@ void dlm_lowcomms_shutdown(void) dlm_close_sock(&listen_con.sock); + idx = srcu_read_lock(&connections_srcu); foreach_conn(shutdown_conn); + srcu_read_unlock(&connections_srcu, idx); } static void _stop_conn(struct connection *con, bool and_other) @@ -1682,7 +1911,7 @@ static void free_conn(struct connection *con) static void work_flush(void) { - int ok, idx; + int ok; int i; struct connection *con; @@ -1693,7 +1922,6 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); - idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { @@ -1707,14 +1935,17 @@ static void work_flush(void) } } } - srcu_read_unlock(&connections_srcu, idx); } while (!ok); } void dlm_lowcomms_stop(void) { + int idx; + + idx = srcu_read_lock(&connections_srcu); work_flush(); foreach_conn(free_conn); + srcu_read_unlock(&connections_srcu, idx); work_stop(); deinit_local(); } @@ -1738,15 +1969,24 @@ int dlm_lowcomms_start(void) error = work_start(); if (error) - goto fail; + goto fail_local; dlm_allow_conn = 1; /* Start listening */ - if (dlm_config.ci_protocol == 0) + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: error = tcp_listen_for_all(); - else + break; + case DLM_PROTO_SCTP: error = sctp_listen_for_all(&listen_con); + break; + default: + log_print("Invalid protocol identifier %d set", + dlm_config.ci_protocol); + error = -EINVAL; + break; + } if (error) goto fail_unlisten; @@ -1755,6 +1995,9 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; dlm_close_sock(&listen_con.sock); + work_stop(); +fail_local: + deinit_local(); fail: return error; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 48bbc4e18761..aaae7115c00d 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -12,7 +12,22 @@ #ifndef __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__ -#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 +#include "dlm_internal.h" + +#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts) +#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \ + DLM_MIDCOMMS_OPT_LEN) + +#define CONN_HASH_SIZE 32 + +/* This is deliberately very simple because most clusters have simple + * sequential nodeids, so we should be able to go straight to a connection + * struct in the array + */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} /* switch to check if dlm is running */ extern int dlm_allow_conn; @@ -22,8 +37,12 @@ void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_lowcomms_commit_buffer(void *mh); +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh); +void dlm_lowcomms_commit_msg(struct dlm_msg *msg); +void dlm_lowcomms_put_msg(struct dlm_msg *msg); +int dlm_lowcomms_resend_msg(struct dlm_msg *msg); int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index ceef3f2074ff..d9e1e4170eb1 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -15,6 +15,7 @@ #include "recover.h" #include "rcom.h" #include "config.h" +#include "midcomms.h" #include "lowcomms.h" int dlm_slots_version(struct dlm_header *h) @@ -270,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, log_slots(ls, gen, num, NULL, array, array_size); - max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) - + max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) - sizeof(struct rcom_config)) / sizeof(struct rcom_slot); if (num > max_slots) { @@ -329,6 +330,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) memb->nodeid = node->nodeid; memb->weight = node->weight; memb->comm_seq = node->comm_seq; + dlm_midcomms_add_member(node->nodeid); add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; @@ -359,26 +361,34 @@ int dlm_is_removed(struct dlm_ls *ls, int nodeid) return 0; } -static void clear_memb_list(struct list_head *head) +static void clear_memb_list(struct list_head *head, + void (*after_del)(int nodeid)) { struct dlm_member *memb; while (!list_empty(head)) { memb = list_entry(head->next, struct dlm_member, list); list_del(&memb->list); + if (after_del) + after_del(memb->nodeid); kfree(memb); } } +static void clear_members_cb(int nodeid) +{ + dlm_midcomms_remove_member(nodeid); +} + void dlm_clear_members(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes); + clear_memb_list(&ls->ls_nodes, clear_members_cb); ls->ls_num_nodes = 0; } void dlm_clear_members_gone(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes_gone); + clear_memb_list(&ls->ls_nodes_gone, NULL); } static void make_member_array(struct dlm_ls *ls) @@ -552,6 +562,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) neg++; list_move(&memb->list, &ls->ls_nodes_gone); + dlm_midcomms_remove_member(memb->nodeid); ls->ls_num_nodes--; dlm_lsop_recover_slot(ls, memb); } @@ -576,12 +587,18 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - if (!error || error == -EPROTO) { - /* new_lockspace() may be waiting to know if the config - is good or bad */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - } + /* error -EINTR means that a new recovery action is triggered. + * We ignore this recovery action and let run the new one which might + * have new member configuration. + */ + if (error == -EINTR) + error = 0; + + /* new_lockspace() may be waiting to know if the config + * is good or bad + */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 1c6654a21ec4..e3de268898ed 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -3,7 +3,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* @@ -12,22 +12,866 @@ /* * midcomms.c * - * This is the appallingly named "mid-level" comms layer. + * This is the appallingly named "mid-level" comms layer. It takes care about + * deliver an on application layer "reliable" communication above the used + * lowcomms transport layer. * - * Its purpose is to take packets from the "real" comms layer, - * split them up into packets and pass them to the interested - * part of the locking mechanism. + * How it works: * - * It also takes messages from the locking layer, formats them - * into packets and sends them to the comms layer. + * Each nodes keeps track of all send DLM messages in send_queue with a sequence + * number. The receive will send an DLM_ACK message back for every DLM message + * received at the other side. If a reconnect happens in lowcomms we will send + * all unacknowledged dlm messages again. The receiving side might drop any already + * received message by comparing sequence numbers. + * + * How version detection works: + * + * Due the fact that dlm has pre-configured node addresses on every side + * it is in it's nature that every side connects at starts to transmit + * dlm messages which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS + * and their replies are the first messages which are exchanges. Due backwards + * compatibility these messages are not covered by the midcomms re-transmission + * layer. These messages have their own re-transmission handling in the dlm + * application layer. The version field of every node will be set on these RCOM + * messages as soon as they arrived and the node isn't yet part of the nodes + * hash. There exists also logic to detect version mismatched if something weird + * going on or the first messages isn't an expected one. + * + * Termination: + * + * The midcomms layer does a 4 way handshake for termination on DLM protocol + * like TCP supports it with half-closed socket support. SCTP doesn't support + * half-closed socket, so we do it on DLM layer. Also socket shutdown() can be + * interrupted by .e.g. tcp reset itself. Additional there exists the othercon + * paradigm in lowcomms which cannot be easily without breaking backwards + * compatibility. A node cannot send anything to another node when a DLM_FIN + * message was send. There exists additional logic to print a warning if + * DLM wants to do it. There exists a state handling like RFC 793 but reduced + * to termination only. The event "member removal event" describes the cluster + * manager removed the node from internal lists, at this point DLM does not + * send any message to the other node. There exists two cases: + * + * 1. The cluster member was removed and we received a FIN + * OR + * 2. We received a FIN but the member was not removed yet + * + * One of these cases will do the CLOSE_WAIT to LAST_ACK change. + * + * + * +---------+ + * | CLOSED | + * +---------+ + * | add member/receive RCOM version + * | detection msg + * V + * +---------+ + * | ESTAB | + * +---------+ + * CLOSE | | rcv FIN + * ------- | | ------- + * +---------+ snd FIN / \ snd ACK +---------+ + * | FIN |<----------------- ------------------>| CLOSE | + * | WAIT-1 |------------------ | WAIT | + * +---------+ rcv FIN \ +---------+ + * | rcv ACK of FIN ------- | CLOSE | member + * | -------------- snd ACK | ------- | removal + * V x V snd FIN V event + * +---------+ +---------+ +---------+ + * |FINWAIT-2| | CLOSING | | LAST-ACK| + * +---------+ +---------+ +---------+ + * | rcv ACK of FIN | rcv ACK of FIN | + * | rcv FIN -------------- | -------------- | + * | ------- x V x V + * \ snd ACK +---------+ +---------+ + * ------------------------>| CLOSED | | CLOSED | + * +---------+ +---------+ + * + * NOTE: any state can interrupted by midcomms_close() and state will be + * switched to CLOSED in case of fencing. There exists also some timeout + * handling when we receive the version detection RCOM messages which is + * made by observation. + * + * Future improvements: + * + * There exists some known issues/improvements of the dlm handling. Some + * of them should be done in a next major dlm version bump which makes + * it incompatible with previous versions. + * + * Unaligned memory access: + * + * There exists cases when the dlm message buffer length is not aligned + * to 8 byte. However seems nobody detected any problem with it. This + * can be fixed in the next major version bump of dlm. + * + * Version detection: + * + * The version detection and how it's done is related to backwards + * compatibility. There exists better ways to make a better handling. + * However this should be changed in the next major version bump of dlm. + * + * Ack handling: + * + * Currently we send an ack message for every dlm message. However we + * can ack multiple dlm messages with one ack by just delaying the ack + * message. Will reduce some traffic but makes the drop detection slower. + * + * Tail Size checking: + * + * There exists a message tail payload in e.g. DLM_MSG however we don't + * check it against the message length yet regarding to the receive buffer + * length. That need to be validated. + * + * Fencing bad nodes: + * + * At timeout places or weird sequence number behaviours we should send + * a fencing request to the cluster manager. */ +/* Debug switch to enable a 5 seconds sleep waiting of a termination. + * This can be useful to test fencing while termination is running. + * This requires a setup with only gfs2 as dlm user, so that the + * last umount will terminate the connection. + * + * However it became useful to test, while the 5 seconds block in umount + * just press the reset button. In a lot of dropping the termination + * process can could take several seconds. + */ +#define DLM_DEBUG_FENCE_TERMINATION 0 + +#include + #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" +#include "util.h" #include "midcomms.h" +/* init value for sequence numbers for testing purpose only e.g. overflows */ +#define DLM_SEQ_INIT 0 +/* 3 minutes wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +#define DLM_VERSION_NOT_SET 0 + +struct midcomms_node { + int nodeid; + uint32_t version; + uint32_t seq_send; + uint32_t seq_next; + /* These queues are unbound because we cannot drop any message in dlm. + * We could send a fence signal for a specific node to the cluster + * manager if queues hits some maximum value, however this handling + * not supported yet. + */ + struct list_head send_queue; + spinlock_t send_queue_lock; + atomic_t send_queue_cnt; +#define DLM_NODE_FLAG_CLOSE 1 +#define DLM_NODE_FLAG_STOP_TX 2 +#define DLM_NODE_FLAG_STOP_RX 3 + unsigned long flags; + wait_queue_head_t shutdown_wait; + + /* dlm tcp termination state */ +#define DLM_CLOSED 1 +#define DLM_ESTABLISHED 2 +#define DLM_FIN_WAIT1 3 +#define DLM_FIN_WAIT2 4 +#define DLM_CLOSE_WAIT 5 +#define DLM_LAST_ACK 6 +#define DLM_CLOSING 7 + int state; + spinlock_t state_lock; + + /* counts how many lockspaces are using this node + * this refcount is necessary to determine if the + * node wants to disconnect. + */ + int users; + + /* not protected by srcu, node_hash lifetime */ + void *debugfs; + + struct hlist_node hlist; + struct rcu_head rcu; +}; + +struct dlm_mhandle { + const struct dlm_header *inner_hd; + struct midcomms_node *node; + struct dlm_opts *opts; + struct dlm_msg *msg; + bool committed; + uint32_t seq; + + void (*ack_rcv)(struct midcomms_node *node); + + /* get_mhandle/commit srcu idx exchange */ + int idx; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct hlist_head node_hash[CONN_HASH_SIZE]; +static DEFINE_SPINLOCK(nodes_lock); +DEFINE_STATIC_SRCU(nodes_srcu); + +/* This mutex prevents that midcomms_close() is running while + * stop() or remove(). As I experienced invalid memory access + * behaviours when DLM_DEBUG_FENCE_TERMINATION is enabled and + * resetting machines. I will end in some double deletion in nodes + * datastructure. + */ +static DEFINE_MUTEX(close_lock); + +static inline const char *dlm_state_str(int state) +{ + switch (state) { + case DLM_CLOSED: + return "CLOSED"; + case DLM_ESTABLISHED: + return "ESTABLISHED"; + case DLM_FIN_WAIT1: + return "FIN_WAIT1"; + case DLM_FIN_WAIT2: + return "FIN_WAIT2"; + case DLM_CLOSE_WAIT: + return "CLOSE_WAIT"; + case DLM_LAST_ACK: + return "LAST_ACK"; + case DLM_CLOSING: + return "CLOSING"; + default: + return "UNKNOWN"; + } +} + +const char *dlm_midcomms_state(struct midcomms_node *node) +{ + return dlm_state_str(node->state); +} + +unsigned long dlm_midcomms_flags(struct midcomms_node *node) +{ + return node->flags; +} + +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node) +{ + return atomic_read(&node->send_queue_cnt); +} + +uint32_t dlm_midcomms_version(struct midcomms_node *node) +{ + return node->version; +} + +static struct midcomms_node *__find_node(int nodeid, int r) +{ + struct midcomms_node *node; + + hlist_for_each_entry_rcu(node, &node_hash[r], hlist) { + if (node->nodeid == nodeid) + return node; + } + + return NULL; +} + +static void dlm_mhandle_release(struct rcu_head *rcu) +{ + struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); + + dlm_lowcomms_put_msg(mh->msg); + kfree(mh); +} + +static void dlm_mhandle_delete(struct midcomms_node *node, + struct dlm_mhandle *mh) +{ + list_del_rcu(&mh->list); + atomic_dec(&node->send_queue_cnt); + call_rcu(&mh->rcu, dlm_mhandle_release); +} + +static void dlm_send_queue_flush(struct midcomms_node *node) +{ + struct dlm_mhandle *mh; + + pr_debug("flush midcomms send queue of node %d\n", node->nodeid); + + rcu_read_lock(); + spin_lock(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + dlm_mhandle_delete(node, mh); + } + spin_unlock(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void midcomms_node_reset(struct midcomms_node *node) +{ + pr_debug("reset node %d\n", node->nodeid); + + node->seq_next = DLM_SEQ_INIT; + node->seq_send = DLM_SEQ_INIT; + node->version = DLM_VERSION_NOT_SET; + node->flags = 0; + + dlm_send_queue_flush(node); + node->state = DLM_CLOSED; + wake_up(&node->shutdown_wait); +} + +static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) +{ + struct midcomms_node *node, *tmp; + int r = nodeid_hash(nodeid); + + node = __find_node(nodeid, r); + if (node || !alloc) + return node; + + node = kmalloc(sizeof(*node), alloc); + if (!node) + return NULL; + + node->nodeid = nodeid; + spin_lock_init(&node->state_lock); + spin_lock_init(&node->send_queue_lock); + atomic_set(&node->send_queue_cnt, 0); + INIT_LIST_HEAD(&node->send_queue); + init_waitqueue_head(&node->shutdown_wait); + node->users = 0; + midcomms_node_reset(node); + + spin_lock(&nodes_lock); + /* check again if there was somebody else + * earlier here to add the node + */ + tmp = __find_node(nodeid, r); + if (tmp) { + spin_unlock(&nodes_lock); + kfree(node); + return tmp; + } + + hlist_add_head_rcu(&node->hlist, &node_hash[r]); + spin_unlock(&nodes_lock); + + node->debugfs = dlm_create_debug_comms_file(nodeid, node); + return node; +} + +static int dlm_send_ack(int nodeid, uint32_t seq) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_msg *msg; + char *ppc; + + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + NULL, NULL); + if (!msg) + return -ENOMEM; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = dlm_our_nodeid(); + m_header->h_length = mb_len; + m_header->h_cmd = DLM_ACK; + m_header->u.h_seq = seq; + + header_out(m_header); + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); + + return 0; +} + +static int dlm_send_fin(struct midcomms_node *node, + void (*ack_rcv)(struct midcomms_node *node)) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_mhandle *mh; + char *ppc; + + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + if (!mh) + return -ENOMEM; + + mh->ack_rcv = ack_rcv; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = dlm_our_nodeid(); + m_header->h_length = mb_len; + m_header->h_cmd = DLM_FIN; + + header_out(m_header); + + pr_debug("sending fin msg to node %d\n", node->nodeid); + dlm_midcomms_commit_mhandle(mh); + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); + + return 0; +} + +static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) +{ + struct dlm_mhandle *mh; + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + if (mh->ack_rcv) + mh->ack_rcv(node); + } else { + /* send queue should be ordered */ + break; + } + } + + spin_lock(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + dlm_mhandle_delete(node, mh); + } else { + /* send queue should be ordered */ + break; + } + } + spin_unlock(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive passive fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_LAST_ACK: + /* DLM_CLOSED */ + midcomms_node_reset(node); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); +} + +static void dlm_midcomms_receive_buffer(union dlm_packet *p, + struct midcomms_node *node, + uint32_t seq) +{ + if (seq == node->seq_next) { + node->seq_next++; + /* send ack before fin */ + dlm_send_ack(node->nodeid, node->seq_next); + + switch (p->header.h_cmd) { + case DLM_FIN: + spin_lock(&node->state_lock); + pr_debug("receive fin msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + /* passive shutdown DLM_LAST_ACK case 1 + * additional we check if the node is used by + * cluster manager events at all. + */ + if (node->users == 0) { + node->state = DLM_LAST_ACK; + pr_debug("switch node %d to state %s case 1\n", + node->nodeid, dlm_state_str(node->state)); + spin_unlock(&node->state_lock); + goto send_fin; + } + break; + case DLM_FIN_WAIT1: + node->state = DLM_CLOSING; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_FIN_WAIT2: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + wake_up(&node->shutdown_wait); + break; + case DLM_LAST_ACK: + /* probably remove_member caught it, do nothing */ + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); + + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + break; + default: + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, node->nodeid); + break; + } + } else { + /* retry to ack message which we already have by sending back + * current node->seq_next number as ack. + */ + if (seq < node->seq_next) + dlm_send_ack(node->nodeid, node->seq_next); + + log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", + seq, node->seq_next, node->nodeid); + } + + return; + +send_fin: + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); +} + +static struct midcomms_node * +dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, + uint16_t msglen, int (*cb)(struct midcomms_node *node)) +{ + struct midcomms_node *node = NULL; + gfp_t allocation = 0; + int ret; + + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return NULL; + } + + switch (le32_to_cpu(p->rcom.rc_type)) { + case DLM_RCOM_NAMES: + fallthrough; + case DLM_RCOM_NAMES_REPLY: + fallthrough; + case DLM_RCOM_STATUS: + fallthrough; + case DLM_RCOM_STATUS_REPLY: + node = nodeid2node(nodeid, 0); + if (node) { + spin_lock(&node->state_lock); + if (node->state != DLM_ESTABLISHED) + pr_debug("receive begin RCOM msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_ESTABLISHED: + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. + */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + spin_unlock(&node->state_lock); + } + + allocation = GFP_NOFS; + break; + default: + break; + } + + break; + default: + break; + } + + node = nodeid2node(nodeid, allocation); + if (!node) { + switch (p->header.h_cmd) { + case DLM_OPTS: + if (msglen < sizeof(struct dlm_opts)) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return NULL; + } + + log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence", + p->opts.o_nextcmd, nodeid); + break; + default: + log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence", + p->header.h_cmd, nodeid); + break; + } + + return NULL; + } + + ret = cb(node); + if (ret < 0) + return NULL; + + return node; +} + +static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) +{ + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); + return -1; + } + + return 0; +} + +static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid) +{ + int len = msglen; + + /* we only trust outer header msglen because + * it's checked against receive buffer length. + */ + if (len < sizeof(struct dlm_opts)) + return -1; + len -= sizeof(struct dlm_opts); + + if (len < le16_to_cpu(p->opts.o_optlen)) + return -1; + len -= le16_to_cpu(p->opts.o_optlen); + + switch (p->opts.o_nextcmd) { + case DLM_FIN: + if (len < sizeof(struct dlm_header)) { + log_print("fin too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + case DLM_MSG: + if (len < sizeof(struct dlm_message)) { + log_print("msg too small: %d, will skip this message from node %d", + msglen, nodeid); + return -1; + } + + break; + case DLM_RCOM: + if (len < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + default: + log_print("unsupported o_nextcmd received: %u, will skip this message from node %d", + p->opts.o_nextcmd, nodeid); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + uint32_t seq; + int ret, idx; + + idx = srcu_read_lock(&nodes_srcu); + node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, + dlm_midcomms_version_check_3_2); + if (!node) + goto out; + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* these rcom message we use to determine version. + * they have their own retransmission handling and + * are the first messages of dlm. + * + * length already checked. + */ + switch (le32_to_cpu(p->rcom.rc_type)) { + case DLM_RCOM_NAMES: + fallthrough; + case DLM_RCOM_NAMES_REPLY: + fallthrough; + case DLM_RCOM_STATUS: + fallthrough; + case DLM_RCOM_STATUS_REPLY: + break; + default: + log_print("unsupported rcom type received: %u, will skip this message from node %d", + le32_to_cpu(p->rcom.rc_type), nodeid); + goto out; + } + + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, nodeid); + break; + case DLM_OPTS: + seq = le32_to_cpu(p->header.u.h_seq); + + ret = dlm_opts_check_msglen(p, msglen, nodeid); + if (ret < 0) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + p = (union dlm_packet *)((unsigned char *)p->opts.o_opts + + le16_to_cpu(p->opts.o_optlen)); + + /* recheck inner msglen just if it's not garbage */ + msglen = le16_to_cpu(p->header.h_length); + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("inner rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("inner msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_FIN: + if (msglen < sizeof(struct dlm_header)) { + log_print("inner fin too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + default: + log_print("unsupported inner h_cmd received: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + dlm_midcomms_receive_buffer(p, node, seq); + break; + case DLM_ACK: + seq = le32_to_cpu(p->header.u.h_seq); + dlm_receive_ack(node, seq); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + break; + } + +out: + srcu_read_unlock(&nodes_srcu, idx); +} + +static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) +{ + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_1; + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, + node->nodeid); + break; + case DLM_VERSION_3_1: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_1, node->nodeid, node->version); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, + dlm_midcomms_version_check_3_1); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + srcu_read_unlock(&nodes_srcu, idx); + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* length already checked */ + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return; + } + + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + return; + } + + dlm_receive_buffer(p, nodeid); +} + /* * Called from the low-level comms layer to process a buffer of * commands. @@ -43,7 +887,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; - /* no message should be more than DEFAULT_BUFFER_SIZE or + /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or * less than dlm_header size. * * Some messages does not have a 8 byte length boundary yet @@ -55,7 +899,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) * the next major version bump. */ msglen = le16_to_cpu(hd->h_length); - if (msglen > DEFAULT_BUFFER_SIZE || + if (msglen > DLM_MAX_SOCKET_BUFSIZE || msglen < sizeof(struct dlm_header)) { log_print("received invalid length header: %u from node %d, will abort message parsing", msglen, nodeid); @@ -68,32 +912,19 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; - switch (hd->h_cmd) { - case DLM_MSG: - if (msglen < sizeof(struct dlm_message)) { - log_print("dlm msg too small: %u, will skip this message", - msglen); - goto skip; - } - + switch (le32_to_cpu(hd->h_version)) { + case DLM_VERSION_3_1: + dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); break; - case DLM_RCOM: - if (msglen < sizeof(struct dlm_rcom)) { - log_print("dlm rcom msg too small: %u, will skip this message", - msglen); - goto skip; - } - + case DLM_VERSION_3_2: + dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); break; default: - log_print("unsupported h_cmd received: %u, will skip this message", - hd->h_cmd); - goto skip; + log_print("received invalid version header: %u from node %d, will skip this message", + le32_to_cpu(hd->h_version), nodeid); + break; } - dlm_receive_buffer((union dlm_packet *)ptr, nodeid); - -skip: ret += msglen; len -= msglen; ptr += msglen; @@ -102,3 +933,455 @@ skip: return ret; } +void dlm_midcomms_unack_msg_resend(int nodeid) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + int idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* old protocol, we don't support to retransmit on failure */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (!mh->committed) + continue; + + ret = dlm_lowcomms_resend_msg(mh->msg); + if (!ret) + log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d", + mh->seq, node->nodeid); + } + rcu_read_unlock(); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, + uint32_t seq) +{ + opts->o_header.h_cmd = DLM_OPTS; + opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + opts->o_header.h_nodeid = dlm_our_nodeid(); + opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len; + opts->o_header.u.h_seq = seq; + header_out(&opts->o_header); +} + +static void midcomms_new_msg_cb(struct dlm_mhandle *mh) +{ + atomic_inc(&mh->node->send_queue_cnt); + + spin_lock(&mh->node->send_queue_lock); + list_add_tail_rcu(&mh->list, &mh->node->send_queue); + spin_unlock(&mh->node->send_queue_lock); + + mh->seq = mh->node->seq_send++; +} + +static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, + int len, gfp_t allocation, char **ppc) +{ + struct dlm_opts *opts; + struct dlm_msg *msg; + + msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN, + allocation, ppc, midcomms_new_msg_cb, mh); + if (!msg) + return NULL; + + opts = (struct dlm_opts *)*ppc; + mh->opts = opts; + + /* add possible options here */ + dlm_fill_opts_header(opts, len, mh->seq); + + *ppc += sizeof(*opts); + mh->inner_hd = (const struct dlm_header *)*ppc; + return msg; +} + +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + struct dlm_msg *msg; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + WARN_ON_ONCE(1); + goto err; + } + + /* this is a bug, however we going on and hope it will be resolved */ + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + + mh = kzalloc(sizeof(*mh), GFP_NOFS); + if (!mh) + goto err; + + mh->idx = idx; + mh->node = node; + + switch (node->version) { + case DLM_VERSION_3_1: + msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, + NULL, NULL); + if (!msg) { + kfree(mh); + goto err; + } + + break; + case DLM_VERSION_3_2: + msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, + ppc); + if (!msg) { + kfree(mh); + goto err; + } + + break; + default: + kfree(mh); + WARN_ON(1); + goto err; + } + + mh->msg = msg; + + /* keep in mind that is a must to call + * dlm_midcomms_commit_msg() which releases + * nodes_srcu using mh->idx which is assumed + * here that the application will call it. + */ + return mh; + +err: + srcu_read_unlock(&nodes_srcu, idx); + return NULL; +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +{ + /* nexthdr chain for fast lookup */ + mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->committed = true; + dlm_lowcomms_commit_msg(mh->msg); +} + +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +{ + switch (mh->node->version) { + case DLM_VERSION_3_1: + srcu_read_unlock(&nodes_srcu, mh->idx); + + dlm_lowcomms_commit_msg(mh->msg); + dlm_lowcomms_put_msg(mh->msg); + /* mh is not part of rcu list in this case */ + kfree(mh); + break; + case DLM_VERSION_3_2: + dlm_midcomms_commit_msg_3_2(mh); + srcu_read_unlock(&nodes_srcu, mh->idx); + break; + default: + srcu_read_unlock(&nodes_srcu, mh->idx); + WARN_ON(1); + break; + } +} + +int dlm_midcomms_start(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&node_hash[i]); + + return dlm_lowcomms_start(); +} + +static void dlm_act_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive active fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_FIN_WAIT1: + node->state = DLM_FIN_WAIT2; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSING: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + wake_up(&node->shutdown_wait); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); +} + +void dlm_midcomms_add_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + if (nodeid == dlm_our_nodeid()) + return; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, GFP_NOFS); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + if (!node->users) { + pr_debug("receive add member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. + */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + } + + node->users++; + pr_debug("users inc count %d\n", node->users); + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); +} + +void dlm_midcomms_remove_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + if (nodeid == dlm_our_nodeid()) + return; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + node->users--; + pr_debug("users dec count %d\n", node->users); + + /* hitting users count to zero means the + * other side is running dlm_midcomms_stop() + * we meet us to have a clean disconnect. + */ + if (node->users == 0) { + pr_debug("receive remove member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSE_WAIT: + /* passive shutdown DLM_LAST_ACK case 2 */ + node->state = DLM_LAST_ACK; + spin_unlock(&node->state_lock); + + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + goto send_fin; + case DLM_LAST_ACK: + /* probably receive fin caught it, do nothing */ + break; + case DLM_CLOSED: + /* already gone, do nothing */ + break; + default: + log_print("%s: unexpected state: %d\n", + __func__, node->state); + break; + } + } + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); + return; + +send_fin: + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON(atomic_read(&node->send_queue_cnt)); + kfree(node); +} + +static void midcomms_shutdown(struct midcomms_node *node) +{ + int ret; + + /* old protocol, we don't wait for pending operations */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + return; + } + + spin_lock(&node->state_lock); + pr_debug("receive active shutdown for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_FIN_WAIT1; + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSED: + /* we have what we want */ + spin_unlock(&node->state_lock); + return; + default: + /* busy to enter DLM_FIN_WAIT1, wait until passive + * done in shutdown_wait to enter DLM_CLOSED. + */ + break; + } + spin_unlock(&node->state_lock); + + if (node->state == DLM_FIN_WAIT1) { + dlm_send_fin(node, dlm_act_fin_ack_rcv); + + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); + } + + /* wait for other side dlm + fin */ + ret = wait_event_timeout(node->shutdown_wait, + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + pr_debug("active shutdown timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + midcomms_node_reset(node); + return; + } + + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); +} + +void dlm_midcomms_shutdown(void) +{ + struct midcomms_node *node; + int i, idx; + + mutex_lock(&close_lock); + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + midcomms_shutdown(node); + + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); +} + +int dlm_midcomms_close(int nodeid) +{ + struct midcomms_node *node; + int idx, ret; + + if (nodeid == dlm_our_nodeid()) + return 0; + + idx = srcu_read_lock(&nodes_srcu); + /* Abort pending close/remove operation */ + node = nodeid2node(nodeid, 0); + if (node) { + /* let shutdown waiters leave */ + set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); + wake_up(&node->shutdown_wait); + } + srcu_read_unlock(&nodes_srcu, idx); + + synchronize_srcu(&nodes_srcu); + + idx = srcu_read_lock(&nodes_srcu); + mutex_lock(&close_lock); + node = nodeid2node(nodeid, 0); + if (!node) { + mutex_unlock(&close_lock); + srcu_read_unlock(&nodes_srcu, idx); + return dlm_lowcomms_close(nodeid); + } + + ret = dlm_lowcomms_close(nodeid); + spin_lock(&node->state_lock); + midcomms_node_reset(node); + spin_unlock(&node->state_lock); + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + + return ret; +} diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 61e90a921849..579abc6929be 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,7 +12,22 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ +struct midcomms_node; + int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +int dlm_midcomms_close(int nodeid); +int dlm_midcomms_start(void); +void dlm_midcomms_shutdown(void); +void dlm_midcomms_add_member(int nodeid); +void dlm_midcomms_remove_member(int nodeid); +void dlm_midcomms_unack_msg_resend(int nodeid); +const char *dlm_midcomms_state(struct midcomms_node *node); +unsigned long dlm_midcomms_flags(struct midcomms_node *node); +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); +uint32_t dlm_midcomms_version(struct midcomms_node *node); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f5b1bd65728d..5651933f54a4 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -27,25 +27,15 @@ static int rcom_response(struct dlm_ls *ls) return test_bit(LSFL_RCOM_READY, &ls->ls_flags); } -static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, char *mb, int mb_len) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; - char *mb; - int mb_len = sizeof(struct dlm_rcom) + len; - - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); - if (!mh) { - log_print("create_rcom to %d type %d len %d ENOBUFS", - to_nodeid, type, len); - return -ENOBUFS; - } rc = (struct dlm_rcom *) mb; rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.h_lockspace = ls->ls_global_id; + rc->rc_header.u.h_lockspace = ls->ls_global_id; rc->rc_header.h_nodeid = dlm_our_nodeid(); rc->rc_header.h_length = mb_len; rc->rc_header.h_cmd = DLM_RCOM; @@ -56,16 +46,67 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); - *mh_ret = mh; *rc_ret = rc; +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_mhandle *mh; + char *mb; + + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("%s to %d type %d len %d ENOBUFS", + __func__, to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + *mh_ret = mh; return 0; } +static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, + int len, struct dlm_rcom **rc_ret, + struct dlm_msg **msg_ret) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_msg *msg; + char *mb; + + msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb, + NULL, NULL); + if (!msg) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + *msg_ret = msg; + return 0; +} + +static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + dlm_rcom_out(rc); +} + static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_rcom_out(rc); - dlm_lowcomms_commit_buffer(mh); + _send_rcom(ls, rc); + dlm_midcomms_commit_mhandle(mh); +} + +static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg, + struct dlm_rcom *rc) +{ + _send_rcom(ls, rc); + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); } static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, @@ -141,7 +182,7 @@ static void disallow_sync_reply(struct dlm_ls *ls) int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; + struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; @@ -153,17 +194,17 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) } retry: - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, - sizeof(struct rcom_status), &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &msg); if (error) goto out; set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -191,11 +232,11 @@ retry: static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; struct rcom_status *rs; uint32_t status; int nodeid = rc_in->rc_header.h_nodeid; int len = sizeof(struct rcom_config); + struct dlm_msg *msg; int num_slots = 0; int error; @@ -218,8 +259,8 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) len += num_slots * sizeof(struct rcom_slot); do_create: - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, - len, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, + len, &rc, &msg); if (error) return; @@ -246,7 +287,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_recover_lock); do_send: - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); } static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -271,21 +312,22 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; + struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; retry: - error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &msg); if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -298,14 +340,15 @@ retry: static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; int error, inlen, outlen, nodeid; + struct dlm_msg *msg; nodeid = rc_in->rc_header.h_nodeid; inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom); + outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); - error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &msg); if (error) return; rc->rc_id = rc_in->rc_id; @@ -313,7 +356,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); } int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) @@ -342,10 +385,6 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid; int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); - if (error) - return; - /* Old code would send this special id to trigger a debug dump. */ if (rc_in->rc_id == 0xFFFFFFFF) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); @@ -353,6 +392,10 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) return; } + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + if (error) + return; + error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len, DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); if (error) @@ -458,14 +501,14 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) char *mb; int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); - mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; rc = (struct dlm_rcom *) mb; rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace; + rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace; rc->rc_header.h_nodeid = dlm_our_nodeid(); rc->rc_header.h_length = mb_len; rc->rc_header.h_cmd = DLM_RCOM; @@ -479,7 +522,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf->rf_lvblen = cpu_to_le32(~0U); dlm_rcom_out(rc); - dlm_lowcomms_commit_buffer(mh); + dlm_midcomms_commit_mhandle(mh); return 0; } diff --git a/fs/dlm/util.c b/fs/dlm/util.c index cfd0d00b19ae..58acbcc2081a 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -20,18 +20,20 @@ #define DLM_ERRNO_ETIMEDOUT 110 #define DLM_ERRNO_EINPROGRESS 115 -static void header_out(struct dlm_header *hd) +void header_out(struct dlm_header *hd) { hd->h_version = cpu_to_le32(hd->h_version); - hd->h_lockspace = cpu_to_le32(hd->h_lockspace); + /* does it for others u32 in union as well */ + hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace); hd->h_nodeid = cpu_to_le32(hd->h_nodeid); hd->h_length = cpu_to_le16(hd->h_length); } -static void header_in(struct dlm_header *hd) +void header_in(struct dlm_header *hd) { hd->h_version = le32_to_cpu(hd->h_version); - hd->h_lockspace = le32_to_cpu(hd->h_lockspace); + /* does it for others u32 in union as well */ + hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace); hd->h_nodeid = le32_to_cpu(hd->h_nodeid); hd->h_length = le16_to_cpu(hd->h_length); } diff --git a/fs/dlm/util.h b/fs/dlm/util.h index cc719ca9397e..d46f23c7a6a0 100644 --- a/fs/dlm/util.h +++ b/fs/dlm/util.h @@ -15,6 +15,8 @@ void dlm_message_out(struct dlm_message *ms); void dlm_message_in(struct dlm_message *ms); void dlm_rcom_out(struct dlm_rcom *rc); void dlm_rcom_in(struct dlm_rcom *rc); +void header_out(struct dlm_header *hd); +void header_in(struct dlm_header *hd); #endif