From fa450a837add4f1e1f75b09ae9ccbfa90d7a8c32 Mon Sep 17 00:00:00 2001 From: Stephen Gallagher Date: Fri, 7 Aug 2009 13:53:46 -0400 Subject: [PATCH] Fix startup race condition causing backend restarts When the backends start up, the monitor was immediately sending a getIdentity request. However, as we've added more processing to the initialization routines over time, we started introducing latency between when we open the connection and when we're able to process requests on that connection. I've updated the monitor to check for NoReply as an error message and queue the getIdentity check for retries until the service answers or one wallclock second (literally, ten tries at 10ms each) has passed. --- server/monitor/monitor.c | 78 +++++++++++++++++++++++++++++++++++++++------ 1 files changed, 67 insertions(+), 11 deletions(-) diff --git a/server/monitor/monitor.c b/server/monitor/monitor.c index 4e92213..c2e8fb2 100644 --- a/server/monitor/monitor.c +++ b/server/monitor/monitor.c @@ -1770,6 +1770,9 @@ static int mt_conn_destructor(void *ptr) return 0; } +static void do_identity_check (struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *ptr); /* * dbus_service_init * This function should initiate a query to the newly connected @@ -1782,15 +1785,11 @@ static int dbus_service_init(struct sbus_conn_ctx *conn_ctx, void *data) struct mt_ctx *ctx; struct mt_svc *svc; struct mt_conn *mt_conn; - DBusMessage *msg; - DBusPendingCall *pending_reply; - DBusConnection *conn; - dbus_bool_t dbret; + struct tevent_timer *te; DEBUG(3, ("Initializing D-BUS Service\n")); ctx = talloc_get_type(data, struct mt_ctx); - conn = sbus_get_connection(conn_ctx); /* hang off this memory to the connection so that when the connection * is freed we can call a destructor to clear up the structure and @@ -1818,6 +1817,31 @@ static int dbus_service_init(struct sbus_conn_ctx *conn_ctx, void *data) mt_conn->svc_ptr = svc; talloc_set_destructor((TALLOC_CTX *)mt_conn, mt_conn_destructor); + struct timeval tv = { 0, 0 }; + te = tevent_add_timer(svc->mt_ctx->ev, svc, tv, + do_identity_check, svc); + if (te == NULL) { + DEBUG(0, ("Failed to add do_identity_check event for [%s]!\n", + svc->name)); + sbus_disconnect(conn_ctx); + return EIO; + } + + return EOK; +} + +static void do_identity_check (struct tevent_context *ev, + struct tevent_timer *te, + struct timeval t, void *ptr) +{ + struct mt_svc *svc; + DBusMessage *msg; + DBusPendingCall *pending_reply; + dbus_bool_t dbret; + DBusConnection *conn; + + svc = talloc_get_type(ptr, struct mt_svc); + conn = sbus_get_connection(svc->mt_conn->conn_ctx); /* * Set up identity request * This should be a well-known path and method @@ -1829,11 +1853,11 @@ static int dbus_service_init(struct sbus_conn_ctx *conn_ctx, void *data) SERVICE_METHOD_IDENTITY); if (msg == NULL) { DEBUG(0,("Out of memory?!\n")); - talloc_free(conn_ctx); - return ENOMEM; + talloc_free(svc->mt_conn->conn_ctx); + return; } dbret = dbus_connection_send_with_reply(conn, msg, &pending_reply, - ctx->service_id_timeout); + svc->mt_ctx->service_id_timeout); if (!dbret || pending_reply == NULL) { /* * Critical Failure @@ -1842,15 +1866,15 @@ static int dbus_service_init(struct sbus_conn_ctx *conn_ctx, void *data) */ DEBUG(0, ("D-BUS send failed.\n")); dbus_message_unref(msg); - talloc_free(conn_ctx); - return EIO; + talloc_free(svc->mt_conn->conn_ctx); + return; } /* Set up the reply handler */ dbus_pending_call_set_notify(pending_reply, identity_check, svc, NULL); dbus_message_unref(msg); - return EOK; + return; } static void identity_check(DBusPendingCall *pending, void *data) @@ -1862,8 +1886,11 @@ static void identity_check(DBusPendingCall *pending, void *data) DBusError dbus_error; dbus_uint16_t svc_ver; char *svc_name; + const char *error_name; dbus_bool_t ret; int type; + struct timeval tv; + struct tevent_timer *te; fake_svc = talloc_get_type(data, struct mt_svc); conn_ctx = fake_svc->mt_conn->conn_ctx; @@ -1897,6 +1924,7 @@ static void identity_check(DBusPendingCall *pending, void *data) } DEBUG(4,("Received ID reply: (%s,%d)\n", svc_name, svc_ver)); + fake_svc->restarts = 0; /* search this service in the list */ svc = fake_svc->mt_ctx->svc_list; @@ -1924,6 +1952,34 @@ static void identity_check(DBusPendingCall *pending, void *data) break; case DBUS_MESSAGE_TYPE_ERROR: + /* If the error was NoReply, we might have hit a race condition where + * we requested getIdentity before the service entered its mainloop + * We'll retry every 10ms until 1s has passed before giving it up + * for dead. + */ + if (fake_svc->restarts >= 10) { + DEBUG(0, ("Too many retries on getIdentity. Killing service.\n")); + sbus_disconnect(conn_ctx); + return; + } + error_name = dbus_message_get_error_name(reply); + if (strcmp(error_name, DBUS_ERROR_NO_REPLY) == 0) { + /* Queue up another try in 10ms */ + fake_svc->restarts++; + gettimeofday(&tv, NULL); + tv.tv_usec += 1000; + te = tevent_add_timer(fake_svc->mt_ctx->ev, fake_svc, tv, + do_identity_check, fake_svc); + if (te == NULL) { + DEBUG(0, ("Failed to add do_identity_check event for [%s]!\n", + fake_svc->name)); + sbus_disconnect(conn_ctx); + return; + } + DEBUG(3, ("getIdentity reported NoReply. Retrying.\n")); + return; + } + DEBUG(0,("getIdentity returned an error [%s], closing connection.\n", dbus_message_get_error_name(reply))); /* Falling through to default intentionally*/ -- 1.6.2.5