Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=6a030a4efa08528c75... Commit: 6a030a4efa08528c75ab5d045a3b3752a4b85fb2 Parent: f15a2daed7137481c7161e1447b1796c308c7e08 Author: David Teigland teigland@redhat.com AuthorDate: Tue Jan 22 10:41:30 2013 -0600 Committer: David Teigland teigland@redhat.com CommitterDate: Tue Jan 22 10:41:30 2013 -0600
dlm_controld: limit log debug
Avoid filling the debug buffer with log_debug entries that occur within indefinite retries. Limit to 60 log_debug repetitions (once a second for a minute).
Also, after one minute, escalate these repeating log_debugs to log_error messages. The log_error messages repeat once an hour.
Signed-off-by: David Teigland teigland@redhat.com --- dlm_controld/cpg.c | 44 ++++++++++++++++++++++++++++--- dlm_controld/daemon_cpg.c | 61 +++++++++++++++++++++++++++++++++------------ dlm_controld/dlm_daemon.h | 4 ++- dlm_controld/main.c | 2 +- 4 files changed, 88 insertions(+), 23 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c index 0318b98..487887c 100644 --- a/dlm_controld/cpg.c +++ b/dlm_controld/cpg.c @@ -16,6 +16,16 @@ } \ })
+/* retries are once a second */ +#define log_retry(ls, fmt, args...) ({ \ + if (ls->wait_retry < 60) \ + log_group(ls, fmt, ##args); \ + else if (ls->wait_retry == 60) \ + log_erros(ls, fmt, ##args); \ + else if (!(ls->wait_retry % 3600)) \ + log_erros(ls, fmt, ##args); \ +}) + /* per lockspace cpg: ls->node_history */
struct node { @@ -591,30 +601,54 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq) static int wait_conditions_done(struct lockspace *ls) { if (!check_ringid_done(ls)) { - ls->wait_debug = DLMC_LS_WAIT_RINGID; + if (ls->wait_debug != DLMC_LS_WAIT_RINGID) { + ls->wait_debug = DLMC_LS_WAIT_RINGID; + ls->wait_retry = 0; + } + ls->wait_retry++; + /* the check function logs a message */ + return 0; }
if (opt(enable_quorum_lockspace_ind) && !cluster_quorate) { - log_group(ls, "wait for quorum"); - ls->wait_debug = DLMC_LS_WAIT_QUORUM; + if (ls->wait_debug != DLMC_LS_WAIT_QUORUM) { + ls->wait_debug = DLMC_LS_WAIT_QUORUM; + ls->wait_retry = 0; + } + ls->wait_retry++; + log_retry(ls, "wait for quorum"); + poll_lockspaces++; return 0; }
if (!check_fencing_done(ls)) { - ls->wait_debug = DLMC_LS_WAIT_FENCING; + if (ls->wait_debug != DLMC_LS_WAIT_FENCING) { + ls->wait_debug = DLMC_LS_WAIT_FENCING; + ls->wait_retry = 0; + } + ls->wait_retry++; + log_retry(ls, "wait for fencing"); + poll_lockspaces++; return 0; }
if (!check_fs_done(ls)) { - ls->wait_debug = DLMC_LS_WAIT_FSDONE; + if (ls->wait_debug != DLMC_LS_WAIT_FSDONE) { + ls->wait_debug = DLMC_LS_WAIT_FSDONE; + ls->wait_retry = 0; + } + ls->wait_retry++; + log_retry(ls, "wait for fsdone"); + poll_fs++; return 0; }
ls->wait_debug = 0; + ls->wait_retry = 0;
return 1; } diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c index 61d2da2..0eec5ce 100644 --- a/dlm_controld/daemon_cpg.c +++ b/dlm_controld/daemon_cpg.c @@ -11,6 +11,16 @@ /* protocol_version flags */ #define PV_STATEFUL 0x0001
+/* retries are once a second */ +#define log_retry(cur_count, fmt, args...) ({ \ + if (cur_count < 60) \ + log_debug(fmt, ##args); \ + else if (cur_count == 60) \ + log_error(fmt, ##args); \ + else if (!(cur_count % 3600)) \ + log_error(fmt, ##args); \ +}) + struct protocol_version { uint16_t major; uint16_t minor; @@ -106,6 +116,9 @@ static int fence_in_progress_unknown = 1; static int zombie_pids[MAX_ZOMBIES]; static int zombie_count;
+static int fence_result_pid; +static unsigned int fence_result_try; + static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime); static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime);
@@ -718,32 +731,33 @@ static void fence_pid_cancel(int nodeid, int pid) * later same as case B above */
-static void daemon_fence_work(void) +static int daemon_fence_work(void) { struct node_daemon *node, *safe; int rv, nodeid, pid, need, low, actor, result; + int retry = 0; uint32_t flags;
if (daemon_ringid_wait) { /* We've seen a nodedown confchg callback, but not the corresponding ringid callback. */ - log_debug("fence work wait for cpg ringid"); - return; + log_retry(retry_fencing, "fence work wait for cpg ringid"); + return retry; }
if (cluster_ringid_seq != daemon_ringid.seq) { /* wait for ringids to be in sync */ - log_debug("fence work wait for cluster ringid"); - return; + log_retry(retry_fencing, "fence work wait for cluster ringid"); + return retry; }
- /* poll_fencing++; */ + /* retry = 1; */
if (opt(enable_quorum_fencing_ind) && !cluster_quorate) { /* wait for quorum before doing any fencing, but if there is none, send_fence_clear below can unblock new nodes */ - log_debug("fence work wait for quorum"); - poll_fencing++; + log_retry(retry_fencing, "fence work wait for quorum"); + retry = 1; goto out_fipu; }
@@ -766,7 +780,7 @@ static void daemon_fence_work(void) log_debug("fence startup %d delay %d from %llu", node->nodeid, opt(post_join_delay_ind), (unsigned long long)daemon_last_join_monotime); - poll_fencing++; + retry = 1; continue; }
@@ -837,10 +851,10 @@ static void daemon_fence_work(void)
if (!opt(enable_concurrent_fencing_ind) && daemon_fence_pid) { /* run one agent at a time in case they need the same switch */ - log_debug("fence request %d delay for other pid %d", + log_retry(retry_fencing, "fence request %d delay for other pid %d", node->nodeid, daemon_fence_pid); node->delay_fencing = 1; - poll_fencing++; + retry = 1; continue; }
@@ -849,7 +863,7 @@ static void daemon_fence_work(void) node->nodeid, opt(post_join_delay_ind), (unsigned long long)cluster_last_join_monotime); node->delay_fencing = 1; - poll_fencing++; + retry = 1; continue; } node->delay_fencing = 0; @@ -947,12 +961,19 @@ static void daemon_fence_work(void) continue; }
- poll_fencing++; + retry = 1;
rv = fence_result(nodeid, pid, &result); if (rv == -EAGAIN) { /* agent pid is still running */ - log_debug("fence wait %d pid %d running", nodeid, pid); + + if (fence_result_pid != pid) { + fence_result_try = 0; + fence_result_pid = pid; + } + fence_result_try++; + + log_retry(fence_result_try, "fence wait %d pid %d running", nodeid, pid); continue; }
@@ -1083,12 +1104,20 @@ static void daemon_fence_work(void)
if (zombie_count) clear_zombies(); + + return retry; }
void process_fencing_changes(void) { - poll_fencing = 0; - daemon_fence_work(); + int retry; + + retry = daemon_fence_work(); + + if (retry) + retry_fencing++; + else + retry_fencing = 0; }
static void receive_fence_clear(struct dlm_header *hd, int len) diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h index ad99e17..662cc35 100644 --- a/dlm_controld/dlm_daemon.h +++ b/dlm_controld/dlm_daemon.h @@ -161,7 +161,7 @@ EXTERN struct dlm_option dlm_options[dlm_options_max]; EXTERN int daemon_quit; EXTERN int cluster_down; EXTERN int poll_lockspaces; -EXTERN int poll_fencing; +EXTERN unsigned int retry_fencing; EXTERN int poll_fs; EXTERN int poll_ignore_plock; EXTERN int poll_drop_plock; @@ -190,6 +190,7 @@ void log_level(char *name_in, uint32_t level_in, const char *fmt, ...);
#define log_error(fmt, args...) log_level(NULL, LOG_ERR, fmt, ##args) #define log_debug(fmt, args...) log_level(NULL, LOG_DEBUG, fmt, ##args) +#define log_erros(ls, fmt, args...) log_level((ls)->name, LOG_ERR, fmt, ##args) #define log_group(ls, fmt, args...) log_level((ls)->name, LOG_DEBUG, fmt, ##args)
#define log_plock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_NONE, fmt, ##args) @@ -258,6 +259,7 @@ struct lockspace { int kernel_stopped; int fs_registered; int wait_debug; /* for status/debugging */ + uint32_t wait_retry; /* for debug rate limiting */ uint32_t change_seq; uint32_t started_count; struct change *started_change; diff --git a/dlm_controld/main.c b/dlm_controld/main.c index a68e1b8..953cf1b 100644 --- a/dlm_controld/main.c +++ b/dlm_controld/main.c @@ -1059,7 +1059,7 @@ static void loop(void)
poll_timeout = -1;
- if (poll_fencing) { + if (retry_fencing) { process_fencing_changes(); poll_timeout = 1000; }