Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=6a030a4efa08528c75... Commit: 6a030a4efa08528c75ab5d045a3b3752a4b85fb2 Parent: f15a2daed7137481c7161e1447b1796c308c7e08 Author: David Teigland teigland@redhat.com AuthorDate: Tue Jan 22 10:41:30 2013 -0600 Committer: David Teigland teigland@redhat.com CommitterDate: Tue Jan 22 10:41:30 2013 -0600
dlm_controld: limit log debug
Avoid filling the debug buffer with log_debug entries that occur within indefinite retries. Limit to 60 log_debug repetitions (once a second for a minute).
Also, after one minute, escalate these repeating log_debugs to log_error messages. The log_error messages repeat once an hour.
Signed-off-by: David Teigland teigland@redhat.com --- dlm_controld/cpg.c | 44 ++++++++++++++++++++++++++++--- dlm_controld/daemon_cpg.c | 61 +++++++++++++++++++++++++++++++++------------ dlm_controld/dlm_daemon.h | 4 ++- dlm_controld/main.c | 2 +- 4 files changed, 88 insertions(+), 23 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c index 0318b98..487887c 100644 --- a/dlm_controld/cpg.c +++ b/dlm_controld/cpg.c @@ -16,6 +16,16 @@ } \ })
+/* retries are once a second */ +#define log_retry(ls, fmt, args...) ({ \ + if (ls->wait_retry < 60) \ + log_group(ls, fmt, ##args); \ + else if (ls->wait_retry == 60) \ + log_erros(ls, fmt, ##args); \ + else if (!(ls->wait_retry % 3600)) \ + log_erros(ls, fmt, ##args); \ +}) + /* per lockspace cpg: ls->node_history */
struct node { @@ -591,30 +601,54 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq) static int wait_conditions_done(struct lockspace *ls) { if (!check_ringid_done(ls)) { - ls->wait_debug = DLMC_LS_WAIT_RINGID; + if (ls->wait_debug != DLMC_LS_WAIT_RINGID) { + ls->wait_debug = DLMC_LS_WAIT_RINGID; + ls->wait_retry = 0; + } + ls->wait_retry++; + /* the check function logs a message */ + return 0; }
if (opt(enable_quorum_lockspace_ind) && !cluster_quorate) { - log_group(ls, "wait for quorum"); - ls->wait_debug = DLMC_LS_WAIT_QUORUM; + if (ls->wait_debug != DLMC_LS_WAIT_QUORUM) { + ls->wait_debug = DLMC_LS_WAIT_QUORUM; + ls->wait_retry = 0; + } + ls->wait_retry++; + log_retry(ls, "wait for quorum"); + poll_lockspaces++; return 0; }
if (!check_fencing_done(ls)) { - ls->wait_debug = DLMC_LS_WAIT_FENCING; + if (ls->wait_debug != DLMC_LS_WAIT_FENCING) { + ls->wait_debug = DLMC_LS_WAIT_FENCING; + ls->wait_retry = 0; + } + ls->wait_retry++; + log_retry(ls, "wait for fencing"); + poll_lockspaces++; return 0; }
if (!check_fs_done(ls)) { - ls->wait_debug = DLMC_LS_WAIT_FSDONE; + if (ls->wait_debug != DLMC_LS_WAIT_FSDONE) { + ls->wait_debug = DLMC_LS_WAIT_FSDONE; + ls->wait_retry = 0; + } + ls->wait_retry++; + log_retry(ls, "wait for fsdone"); + poll_fs++; return 0; }
ls->wait_debug = 0; + ls->wait_retry = 0;
return 1; } diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c index 61d2da2..0eec5ce 100644 --- a/dlm_controld/daemon_cpg.c +++ b/dlm_controld/daemon_cpg.c @@ -11,6 +11,16 @@ /* protocol_version flags */ #define PV_STATEFUL 0x0001
+/* retries are once a second */ +#define log_retry(cur_count, fmt, args...) ({ \ + if (cur_count < 60) \ + log_debug(fmt, ##args); \ + else if (cur_count == 60) \ + log_error(fmt, ##args); \ + else if (!(cur_count % 3600)) \ + log_error(fmt, ##args); \ +}) + struct protocol_version { uint16_t major; uint16_t minor; @@ -106,6 +116,9 @@ static int fence_in_progress_unknown = 1; static int zombie_pids[MAX_ZOMBIES]; static int zombie_count;
+static int fence_result_pid; +static unsigned int fence_result_try; + static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime); static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime);
@@ -718,32 +731,33 @@ static void fence_pid_cancel(int nodeid, int pid) * later same as case B above */
-static void daemon_fence_work(void) +static int daemon_fence_work(void) { struct node_daemon *node, *safe; int rv, nodeid, pid, need, low, actor, result; + int retry = 0; uint32_t flags;
if (daemon_ringid_wait) { /* We've seen a nodedown confchg callback, but not the corresponding ringid callback. */ - log_debug("fence work wait for cpg ringid"); - return; + log_retry(retry_fencing, "fence work wait for cpg ringid"); + return retry; }
if (cluster_ringid_seq != daemon_ringid.seq) { /* wait for ringids to be in sync */ - log_debug("fence work wait for cluster ringid"); - return; + log_retry(retry_fencing, "fence work wait for cluster ringid"); + return retry; }
- /* poll_fencing++; */ + /* retry = 1; */
if (opt(enable_quorum_fencing_ind) && !cluster_quorate) { /* wait for quorum before doing any fencing, but if there is none, send_fence_clear below can unblock new nodes */ - log_debug("fence work wait for quorum"); - poll_fencing++; + log_retry(retry_fencing, "fence work wait for quorum"); + retry = 1; goto out_fipu; }
@@ -766,7 +780,7 @@ static void daemon_fence_work(void) log_debug("fence startup %d delay %d from %llu", node->nodeid, opt(post_join_delay_ind), (unsigned long long)daemon_last_join_monotime); - poll_fencing++; + retry = 1; continue; }
@@ -837,10 +851,10 @@ static void daemon_fence_work(void)
if (!opt(enable_concurrent_fencing_ind) && daemon_fence_pid) { /* run one agent at a time in case they need the same switch */ - log_debug("fence request %d delay for other pid %d", + log_retry(retry_fencing, "fence request %d delay for other pid %d", node->nodeid, daemon_fence_pid); node->delay_fencing = 1; - poll_fencing++; + retry = 1; continue; }
@@ -849,7 +863,7 @@ static void daemon_fence_work(void) node->nodeid, opt(post_join_delay_ind), (unsigned long long)cluster_last_join_monotime); node->delay_fencing = 1; - poll_fencing++; + retry = 1; continue; } node->delay_fencing = 0; @@ -947,12 +961,19 @@ static void daemon_fence_work(void) continue; }
- poll_fencing++; + retry = 1;
rv = fence_result(nodeid, pid, &result); if (rv == -EAGAIN) { /* agent pid is still running */ - log_debug("fence wait %d pid %d running", nodeid, pid); + + if (fence_result_pid != pid) { + fence_result_try = 0; + fence_result_pid = pid; + } + fence_result_try++; + + log_retry(fence_result_try, "fence wait %d pid %d running", nodeid, pid); continue; }
@@ -1083,12 +1104,20 @@ static void daemon_fence_work(void)
if (zombie_count) clear_zombies(); + + return retry; }
void process_fencing_changes(void) { - poll_fencing = 0; - daemon_fence_work(); + int retry; + + retry = daemon_fence_work(); + + if (retry) + retry_fencing++; + else + retry_fencing = 0; }
static void receive_fence_clear(struct dlm_header *hd, int len) diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h index ad99e17..662cc35 100644 --- a/dlm_controld/dlm_daemon.h +++ b/dlm_controld/dlm_daemon.h @@ -161,7 +161,7 @@ EXTERN struct dlm_option dlm_options[dlm_options_max]; EXTERN int daemon_quit; EXTERN int cluster_down; EXTERN int poll_lockspaces; -EXTERN int poll_fencing; +EXTERN unsigned int retry_fencing; EXTERN int poll_fs; EXTERN int poll_ignore_plock; EXTERN int poll_drop_plock; @@ -190,6 +190,7 @@ void log_level(char *name_in, uint32_t level_in, const char *fmt, ...);
#define log_error(fmt, args...) log_level(NULL, LOG_ERR, fmt, ##args) #define log_debug(fmt, args...) log_level(NULL, LOG_DEBUG, fmt, ##args) +#define log_erros(ls, fmt, args...) log_level((ls)->name, LOG_ERR, fmt, ##args) #define log_group(ls, fmt, args...) log_level((ls)->name, LOG_DEBUG, fmt, ##args)
#define log_plock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_NONE, fmt, ##args) @@ -258,6 +259,7 @@ struct lockspace { int kernel_stopped; int fs_registered; int wait_debug; /* for status/debugging */ + uint32_t wait_retry; /* for debug rate limiting */ uint32_t change_seq; uint32_t started_count; struct change *started_change; diff --git a/dlm_controld/main.c b/dlm_controld/main.c index a68e1b8..953cf1b 100644 --- a/dlm_controld/main.c +++ b/dlm_controld/main.c @@ -1059,7 +1059,7 @@ static void loop(void)
poll_timeout = -1;
- if (poll_fencing) { + if (retry_fencing) { process_fencing_changes(); poll_timeout = 1000; }