Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=4d4d992a0a861f5dca7af437009667e0284f4ce8
Commit:        4d4d992a0a861f5dca7af437009667e0284f4ce8
Parent:        2acd52b50becbf98a45746f1f43d223d4365f521
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Thu Oct 22 16:04:23 2009 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Mon Oct 26 13:19:16 2009 -0400
rgmanager: Use RG_START_RECOVER after relo failure
rgmanager correctly tries to restart a service all around the cluster in the event of failures when using RG_ENABLE (e.g. clusvcadm -e), but if a relocation operation fails, the service can end up stuck in the 'recovering' state. This is because we were not switching to the RG_START_RECOVER operation after the first relocation attempt failed.
Resolves: rhbz530409
Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 rgmanager/src/daemons/rg_state.c |   19 +++++++++++++++----
 1 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index 3660b71..0b52f09 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -1727,14 +1727,14 @@ svc_start_remote(char *svcName, int request, uint32_t target) * @param new_owner Member who actually ends up owning the service. */ int -handle_relocate_req(char *svcName, int request, int preferred_target, +handle_relocate_req(char *svcName, int orig_request, int preferred_target, int *new_owner) { cluster_member_list_t *allowed_nodes = NULL, *backup = NULL; cman_node_t *m; - int target = preferred_target, me = my_id(); - int ret, x; rg_state_t svcStatus; + int target = preferred_target, me = my_id(); + int ret, x, request = orig_request; get_rg_state_local(svcName, &svcStatus); if (svcStatus.rs_state == RG_STATE_DISABLED || @@ -1836,6 +1836,13 @@ handle_relocate_req(char *svcName, int request, int preferred_target, */ return 0; } + + /* + * Failed to start on that node. + * Use the START_RECOVER operation on subsequent + * attempts. + */ + request = RG_START_RECOVER; } }
@@ -1870,6 +1877,10 @@ handle_relocate_req(char *svcName, int request, int preferred_target, return 0; case RG_EDEPEND: case RG_EFAIL: + /* Uh oh - we failed to relocate to this node. + ensure that we tell the next node to start it from + the 'recovering' state. */ + request = RG_START_RECOVER; memb_mark_down(allowed_nodes, target); continue; case RG_EABORT: @@ -1902,7 +1913,7 @@ handle_relocate_req(char *svcName, int request, int preferred_target, * We got sent here from handle_start_req. * We're DONE. */ - if (request == RG_START_RECOVER) { + if (orig_request == RG_START_RECOVER) { _svc_stop_finish(svcName, 0, RG_STATE_STOPPED); return RG_EFAIL; }