Gitweb:
http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 305b7f1c01f195d0df069eadf805433d522e789e
Parent: 710af80f016ce390f957c3726c1825411598d2c8
Author: Eduardo Damato <edamato(a)redhat.com>
AuthorDate: Tue Sep 29 10:06:26 2009 -0400
Committer: Lon Hohberger <lhh(a)redhat.com>
CommitterDate: Tue Sep 29 10:06:26 2009 -0400
qdisk: Implement I/O timeout for read
This patch creates a timer for last successful read and
reboots the system if last successful read was more than
interval*tko ago.
Signed-off-by: Eduardo Damato <edamato(a)redhat.com>
Signed-off-by: Lon Hohberger <lhh(a)redhat.com>
---
cman/qdisk/main.c | 33 +++++++++++++++++++++++++++------
1 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 950a2d2..32740de 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -907,7 +907,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
int low_id, bid_pending = 0, score, score_max, score_req,
upgrade = 0, count, errors, error_cycles = 0;
memb_mask_t mask, master_mask;
- struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, lastok;
+ struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, rd_lastok,
wr_lastok;
ctx->qc_status = S_NONE;
@@ -917,8 +917,11 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
interval.tv_usec = 0;
interval.tv_sec = ctx->qc_interval;
- lastok.tv_usec = 0;
- lastok.tv_sec = 0;
+ rd_lastok.tv_usec = 0;
+ rd_lastok.tv_sec = 0;
+
+ wr_lastok.tv_usec = 0;
+ wr_lastok.tv_sec = 0;
get_my_score(&score, &score_max);
if (score_max < ctx->qc_scoremin) {
@@ -938,7 +941,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
get_time(&oldtime, (ctx->qc_flags&RF_UPTIME));
/* Read everyone else's status */
- errors = read_node_blocks(ctx, ni, max);
+ if ( (errors = read_node_blocks(ctx, ni, max) == 0 ))
+ get_time(&rd_lastok, ctx->qc_flags&RF_UPTIME);
/* Check for node transitions */
check_transitions(ctx, ni, max, mask);
@@ -1114,7 +1118,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
errors++; /* this value isn't really used
at this point */
} else {
- get_time(&lastok, ctx->qc_flags&RF_UPTIME);
+ get_time(&wr_lastok, ctx->qc_flags&RF_UPTIME);
}
/* write out our local status */
@@ -1127,7 +1131,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
/*
* Reboot if the last successful hearbeat was longer ago than interval*TKO_COUNT
*/
- _diff_tv(&diff, &lastok, &newtime);
+ _diff_tv(&diff, &wr_lastok, &newtime);
if (_cmp_tv(&maxtime, &diff) == 1 &&
ctx->qc_flags & RF_IOTIMEOUT) {
logt_print(LOG_EMERG, "Failed to send a heartbeat "
@@ -1139,6 +1143,23 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
if (!(ctx->qc_flags & RF_DEBUG))
reboot(RB_AUTOBOOT);
}
+
+ /*
+ * Reboot if the last successful hearbeat was longer ago than interval*TKO_COUNT
+ */
+ _diff_tv(&diff, &rd_lastok, &newtime);
+ if (_cmp_tv(&maxtime, &diff) == 1 &&
+ ctx->qc_flags & RF_IOTIMEOUT) {
+ logt_print(LOG_EMERG,
+ "Failed to read from qdisk within "
+ "%d second%s (%d.%06d) - REBOOTING\n",
+ (int)maxtime.tv_sec,
+ maxtime.tv_sec==1?"":"s",
+ (int)diff.tv_sec,
+ (int)diff.tv_usec);
+ if (!(ctx->qc_flags & RF_DEBUG))
+ reboot(RB_AUTOBOOT);
+ }
/*
* Reboot if we didn't send a heartbeat in interval*TKO_COUNT