From 898efface1a5076cbae5af87b935212b1869971b Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:01:25 -0800 Subject: [PATCH 1/6] [PATCH] ocfs2: recheck recovery state after getting lock * after successfully taking the $RECOVERY lock in EX mode, recheck to make sure that recovery has not already begun or completed on another node Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmrecovery.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 186e9a76aa58..f9ce864966ec 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2032,6 +2032,30 @@ again: dlm->reco.new_master); status = -EEXIST; } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); status = dlm_send_begin_reco_message(dlm, dlm->reco.dead_node); /* this always succeeds */ From e2b5e4506f5c5187b91d7a79fbad28fe3ebd2fc5 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:02:56 -0800 Subject: [PATCH 2/6] [PATCH] ocfs2: fix release of ast never reserved * fix a bug in dlm_convert_lock_handler where dlm_lockres_release_ast was being called even if no ast was ever reserved Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmconvert.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 6001b22a997d..f5c2f1979ad3 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -421,7 +421,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; - int call_ast = 0, kick_thread = 0; + int call_ast = 0, kick_thread = 0, ast_reserved = 0; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -490,6 +490,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) status = __dlm_lockres_state_to_status(res); if (status == DLM_NORMAL) { __dlm_lockres_reserve_ast(res); + ast_reserved = 1; res->state |= DLM_LOCK_RES_IN_PROGRESS; status = __dlmconvert_master(dlm, res, lock, flags, cnv->requested_type, @@ -512,10 +513,10 @@ leave: else dlm_lock_put(lock); - /* either queue the ast or release it */ + /* either queue the ast or release it, if reserved */ if (call_ast) dlm_queue_ast(dlm, lock); - else + else if (ast_reserved) dlm_lockres_release_ast(dlm, res); if (kick_thread) From 44465a7daf7c4e34199b2b0ebb3c5101619dcb9d Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:05:38 -0800 Subject: [PATCH 3/6] [PATCH] ocfs2: add dlm_wait_for_node_death * add dlm_wait_for_node_death function to be used after receiving a network error. this will wait for the given timeout to allow the heartbeat callbacks to update the domain map. without this, some paths may spin and consume enough cpu that the heartbeat gets starved and never updates. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmcommon.h | 4 ++++ fs/ocfs2/dlm/dlmconvert.c | 5 +++++ fs/ocfs2/dlm/dlmlock.c | 14 +++++++++++++- fs/ocfs2/dlm/dlmrecovery.c | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 42eb53b5293b..23ceaa7127b4 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + #define DLM_PURGE_INTERVAL_MS (8 * 1000) struct dlm_lock_resource @@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index f5c2f1979ad3..f66e2d818ccd 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " "from convert message!\n", res->owner); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index d1a0038557a3..e709412e6e32 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -646,7 +646,19 @@ retry_lock: mlog(0, "retrying lock with migration/" "recovery/in progress\n"); msleep(100); - dlm_wait_for_recovery(dlm); + /* no waiting for dlm_reco_thread */ + if (recovery) { + if (status == DLM_RECOVERING) { + mlog(0, "%s: got RECOVERING " + "for $REOCVERY lock, master " + "was %u\n", dlm->name, + res->owner); + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } + } else { + dlm_wait_for_recovery(dlm); + } goto retry_lock; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f9ce864966ec..ed76bda1a534 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) return dead; } +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(ML_NOTICE, "%s: waiting %dms for notification of " + "death of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(ML_NOTICE, "%s: waiting indefinitely for notification " + "of death of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins From 558c70c59b75a5a53ba496fe3bccea80a9e3e6fb Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:07:47 -0800 Subject: [PATCH 4/6] [PATCH] ocfs2: manually grant remote recovery lock * fix a hang in recovery that occurred in dlmlock_remote. the $RECOVERY lock was never moved to the granted queue even after getting DLM_NORMAL back from the master node. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmlock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index e709412e6e32..671d4ff222cc 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, dlm_error(status); dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. */ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); From 745ae8ba29e729ec922393fa4d9448c385673599 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 9 Feb 2006 13:23:39 -0800 Subject: [PATCH 5/6] [PATCH] ocfs2: only checkpoint journal when asked to Disable automatic checkpointing of the journal - this is a relic from older ocfs2 days. Worth quite a bit of performance on longer running single node tests. Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 7 +++---- fs/ocfs2/journal.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index fa0bcac5ceae..d329c9df90ae 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1584,10 +1584,9 @@ static int ocfs2_commit_thread(void *arg) while (!(kthread_should_stop() && atomic_read(&journal->j_num_trans) == 0)) { - wait_event_interruptible_timeout(osb->checkpoint_event, - atomic_read(&journal->j_num_trans) - || kthread_should_stop(), - OCFS2_CHECKPOINT_INTERVAL); + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); status = ocfs2_commit_cache(osb); if (status < 0) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7d0a816184fa..2f3a6acdac45 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,8 +29,6 @@ #include #include -#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) - enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, OCFS2_JOURNAL_LOADED, From f671c09bce88ea253d576c842f8f39d9a2a29028 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Tue, 14 Feb 2006 11:45:21 -0800 Subject: [PATCH 6/6] [PATCH] ocfs2: detach from heartbeat events before freeing mle Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a3194fe173d9..2e2e95e69499 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2482,7 +2482,9 @@ top: atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - /* final put will take care of list removal */ + /* do not need events any longer, so detach + * from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); __dlm_put_mle(mle); } continue; @@ -2537,6 +2539,9 @@ top: spin_unlock(&res->spinlock); dlm_lockres_put(res); + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + /* dump the mle */ spin_lock(&dlm->master_lock); __dlm_put_mle(mle);