Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=0445e89ec97595f283fefb4633c9e8e2800a8bf6
Commit: 0445e89ec97595f283fefb4633c9e8e2800a8bf6
Parent: 0a84466c0450d8c4810a5db3e6fab4ec146ff78c
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Thu Oct 22 16:04:23 2009 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Mon Oct 26 13:17:37 2009 -0400
rgmanager: Use RG_START_RECOVER after relo failure
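When a service is started with RG_ENABLE (e.g. clusvcadm -e), rgmanager correctly retries the start on the other nodes in the cluster if a node fails to start it. However, if a relocation operation fails, the service can end up stuck in the 'recovering' state. This is because we were not switching to the RG_START_RECOVER operation after the first relocation attempt failed. With this change, every attempt after the first failure uses RG_START_RECOVER, while the final "are we done?" check still looks at the request that was originally passed in.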
Resolves: rhbz530409
Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 rgmanager/src/daemons/rg_state.c | 19 +++++++++++++++----
 1 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index ab18202..4d53e63 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -1640,14 +1640,14 @@ svc_start_remote(const char *svcName, int request, uint32_t target) * @param new_owner Member who actually ends up owning the service. */ int -handle_relocate_req(char *svcName, int request, int preferred_target, +handle_relocate_req(char *svcName, int orig_request, int preferred_target, int *new_owner) { cluster_member_list_t *allowed_nodes = NULL, *backup = NULL; cman_node_t *m; - int target = preferred_target, me = my_id(); - int ret, x; rg_state_t svcStatus; + int target = preferred_target, me = my_id(); + int ret, x, request = orig_request; get_rg_state_local(svcName, &svcStatus); if (svcStatus.rs_state == RG_STATE_DISABLED || @@ -1749,6 +1749,13 @@ handle_relocate_req(char *svcName, int request, int preferred_target, */ return 0; } + + /* + * Failed to start on that node. + * Use the START_RECOVER operation on subsequent + * attempts. + */ + request = RG_START_RECOVER; } }
@@ -1783,6 +1790,10 @@ handle_relocate_req(char *svcName, int request, int preferred_target, return 0; case RG_EDEPEND: case RG_EFAIL: + /* Uh oh - we failed to relocate to this node. + ensure that we tell the next node to start it from + the 'recovering' state. */ + request = RG_START_RECOVER; memb_mark_down(allowed_nodes, target); continue; case RG_EABORT: @@ -1815,7 +1826,7 @@ handle_relocate_req(char *svcName, int request, int preferred_target, * We got sent here from handle_start_req. * We're DONE. */ - if (request == RG_START_RECOVER) { + if (orig_request == RG_START_RECOVER) { _svc_stop_finish(svcName, 0, RG_STATE_STOPPED); return RG_EFAIL; }
cluster-commits@lists.fedorahosted.org