Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=3d7... Commit: 3d7b2195b78fa5c70e94024c777e1c178abd0282 Parent: d8d82c5f026b306dad35c50a1b038105c19c3324 Author: David Teigland teigland@redhat.com AuthorDate: Fri Mar 30 15:27:40 2012 -0500 Committer: David Teigland teigland@redhat.com CommitterDate: Fri Mar 30 15:27:40 2012 -0500
dlm_controld: log useful fence info
Log each fence request and its corresponding result, and log the final fence status when it is received.
Signed-off-by: David Teigland teigland@redhat.com --- dlm_controld/daemon_cpg.c | 38 ++++++++++++++++++++++++-------------- dlm_controld/dlm_daemon.h | 2 +- dlm_controld/fence.c | 34 ++++++++++++++++++++-------------- 3 files changed, 45 insertions(+), 29 deletions(-)
diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c index dc9e780..0339b5d 100644 --- a/dlm_controld/daemon_cpg.c +++ b/dlm_controld/daemon_cpg.c @@ -190,6 +190,8 @@ void log_ringid(const char *name, const char *reason_str(int reason) { switch (reason) { + case REASON_STARTUP_FENCING: + return "startup"; case CPG_REASON_JOIN: return "join"; case CPG_REASON_LEAVE: @@ -868,11 +870,15 @@ static void daemon_fence_work(void) continue; }
- log_debug("fence request %d", node->nodeid); + log_debug("fence request %d pos %d", + node->nodeid, node->fence_config.pos);
rv = fence_request(node->nodeid, - node->fail_walltime, node->fail_monotime, - &node->fence_config, &pid); + node->fail_walltime, + node->fail_monotime, + &node->fence_config, + node->left_reason, + &pid); if (rv < 0) { /* FIXME: keep ourself from retrying between here * and receiving this message */ @@ -1189,6 +1195,7 @@ static void receive_fence_result(struct dlm_header *hd, int len) { struct fence_result *fr; struct node_daemon *node; + uint64_t now; int count;
fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); @@ -1206,33 +1213,36 @@ static void receive_fence_result(struct dlm_header *hd, int len)
count = clear_startup_node(fr->nodeid, 0); if (count) { - log_debug("receive_fence_result from %d for %d clear startup", - hd->nodeid, fr->nodeid); + log_debug("receive_fence_result %d from %d clear startup", + fr->nodeid, hd->nodeid); }
node = get_node_daemon(fr->nodeid); if (!node) { - log_error("receive_fence_result from %d for %d no daemon node", - hd->nodeid, fr->nodeid); + log_error("receive_fence_result %d from %d result %d no daemon node", + fr->nodeid, hd->nodeid, fr->result); return; }
- log_debug("receive_fence_result from %d for %d result %d walltime %llu", - hd->nodeid, fr->nodeid, fr->result, - (unsigned long long)fr->fence_walltime); - if (!node->need_fencing) { /* should never happen */ - log_error("receive_fence_result from %d for %d result %d no need_fencing", - hd->nodeid, fr->nodeid, fr->result); + log_error("receive_fence_result %d from %d result %d no need_fencing", + fr->nodeid, hd->nodeid, fr->result); return; }
+ now = monotime(); + + log_error("fence status %d receive %d from %d walltime %llu local %llu", + fr->nodeid, fr->result, hd->nodeid, + (unsigned long long)fr->fence_walltime, + (unsigned long long)now); + if (!fr->result || (fr->result == -ECANCELED)) { node->need_fencing = 0; node->delay_fencing = 0; node->fence_walltime = fr->fence_walltime; - node->fence_monotime = monotime(); + node->fence_monotime = now; node->fence_actor_done = hd->nodeid; } else { /* causes the next lowest nodeid to request fencing */ diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h index 6adb16b..f0ce9b4 100644 --- a/dlm_controld/dlm_daemon.h +++ b/dlm_controld/dlm_daemon.h @@ -400,7 +400,7 @@ int setup_node_config(void);
/* fence.c */ int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, - struct fence_config *fc, int *pid_out); + struct fence_config *fc, int reason, int *pid_out); int fence_result(int nodeid, int pid, int *result);
/* netlink.c */ diff --git a/dlm_controld/fence.c b/dlm_controld/fence.c index 6d01be9..9807d66 100644 --- a/dlm_controld/fence.c +++ b/dlm_controld/fence.c @@ -78,7 +78,7 @@ static int run_agent(char *agent, char *args, int *pid_out) }
int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, - struct fence_config *fc, int *pid_out) + struct fence_config *fc, int reason, int *pid_out) { struct fence_device *dev; char args[FENCE_CONFIG_ARGS_MAX]; @@ -91,24 +91,28 @@ int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, snprintf(extra, sizeof(extra)-1, "fail_time=%llu\n", (unsigned long long)fail_walltime);
dev = fc->dev[fc->pos]; - if (!dev) + if (!dev) { + log_error("fence request %d no config pos %d", nodeid, fc->pos); return -1; + }
rv = fence_config_agent_args(fc, extra, args); if (rv < 0) { - log_error("fence_request %d args error %d", nodeid, rv); + log_error("fence request %d args error %d", nodeid, rv); return rv; }
rv = run_agent(dev->agent, args, &pid); if (rv < 0) { - log_error("fence_request %d pos %d name %s agent %s pid %d run error %d", - nodeid, fc->pos, dev->name, dev->agent, pid, rv); + log_error("fence request %d pid %d %s time %llu %s %s run error %d", + nodeid, pid, reason_str(reason), (unsigned long long)fail_walltime, + dev->name, dev->agent, rv); return rv; }
- log_debug("fence_request %d pos %d name %s agent %s pid %d running", - nodeid, fc->pos, dev->name, dev->agent, pid); + log_error("fence request %d pid %d %s time %llu %s %s", + nodeid, pid, reason_str(reason), (unsigned long long)fail_walltime, + dev->name, dev->agent);
*pid_out = pid; return 0; @@ -133,7 +137,7 @@ int fence_result(int nodeid, int pid, int *result)
if (rv < 0) { /* shouldn't happen */ - log_error("waitpid %d nodeid %d error rv %d errno %d", + log_error("fence result %d pid %d waitpid %d errno %d", pid, nodeid, rv, errno); return rv; } @@ -149,15 +153,17 @@ int fence_result(int nodeid, int pid, int *result) if (WIFEXITED(status)) { /* pid exited with an exit code */ *result = WEXITSTATUS(status); - log_error("waitpid %d nodeid %d exit status %d", - pid, nodeid, *result); + + log_error("fence result %d pid %d result %d exit status", + nodeid, pid, *result); return 0; } if (WIFSIGNALED(status)) { - /* pid exited due to a signal */ + /* pid terminated due to a signal */ *result = -1; - log_error("waitpid %d nodeid %d term signal %d", - pid, nodeid, WTERMSIG(status)); + + log_error("fence result %d pid %d result %d term signal %d", + nodeid, pid, *result, WTERMSIG(status)); return 0; }
@@ -166,7 +172,7 @@ int fence_result(int nodeid, int pid, int *result) }
/* shouldn't happen */ - log_error("waitpid %d nodeid %d error rv %d", pid, nodeid, rv); + log_error("fence result %d pid %d waitpid rv %d", nodeid, pid, rv); return -1; }
cluster-commits@lists.fedorahosted.org