ldap/servers/plugins/replication/repl5.h | 2 +-
ldap/servers/plugins/replication/repl5_connection.c | 10 +++++-----
ldap/servers/plugins/replication/repl5_inc_protocol.c | 2 +-
ldap/servers/plugins/replication/repl5_protocol_util.c | 7 ++++---
ldap/servers/plugins/replication/repl5_tot_protocol.c | 2 +-
5 files changed, 12 insertions(+), 11 deletions(-)
New commits:
commit 7dc1edf574c2f2008cccadd395dc4b9a6f9ede50
Author: Rich Megginson <rmeggins(a)redhat.com>
Date: Wed Jun 22 20:54:58 2011 -0600
Bug 697694 - rhds82 - incr update state stop_fatal_error "requires administrator
action", with extop_result: 9
https://bugzilla.redhat.com/show_bug.cgi?id=697694
Resolves: bug 697694
Bug Description: incr update state stop_fatal_error "requires administrator
action", with extop_result: 9
Reviewed by: nhosoi, nkinder (Thanks!)
Branch: master
Fix Description: Calling ldap_result with LDAP_RES_ANY will return the first
msgid available. Because the operation sending and receiving is async
done in separate threads, we may not get the msgid corresponding to the
request we sent. This is the cause of the "Bad parameter to an LDAP
routine"
errors. We call ldap_parse_result expecting an EXTENDED operation but we get
some other operation. This causes hard failures to eventually propagate up
and halt replication with fatal errors. The solution is to call
ldap_result with the actual msgid corresponding to the operation that was
sent, instead of LDAP_RES_ANY.
This is hard to reproduce. One way I have found to consistently reproduce
the error is to set up a 4-way MMR and run one of the masters in the
debugger. Break the debugger and let the server sit idle for several minutes
until you see errors in the errors logs of the other masters. Break and
continue like this several times, and you will eventually see "Bad parameter
to an LDAP routine" errors.
Platforms tested: RHEL6 x86_64
Flag Day: no
Doc impact: no
diff --git a/ldap/servers/plugins/replication/repl5.h
b/ldap/servers/plugins/replication/repl5.h
index 0b85306..1a3672f 100644
--- a/ldap/servers/plugins/replication/repl5.h
+++ b/ldap/servers/plugins/replication/repl5.h
@@ -416,7 +416,7 @@ void conn_set_timeout(Repl_Connection *conn, long timeout);
long conn_get_timeout(Repl_Connection *conn);
void conn_set_agmt_changed(Repl_Connection *conn);
ConnResult conn_read_result(Repl_Connection *conn, int *message_id);
-ConnResult conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval
**retdatap, LDAPControl ***returned_controls, int *message_id, int noblock);
+ConnResult conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval
**retdatap, LDAPControl ***returned_controls, int send_msgid, int *resp_msgid, int
noblock);
/* In repl5_protocol.c */
typedef struct repl_protocol Repl_Protocol;
diff --git a/ldap/servers/plugins/replication/repl5_connection.c
b/ldap/servers/plugins/replication/repl5_connection.c
index ebde0b6..c77819a 100644
--- a/ldap/servers/plugins/replication/repl5_connection.c
+++ b/ldap/servers/plugins/replication/repl5_connection.c
@@ -287,7 +287,7 @@ conn_get_error_ex(Repl_Connection *conn, int *operation, int *error,
char **erro
/* The _ex version handles a bunch of parameters (retoidp et al) that were present in the
original
* sync operation functions, but were never actually used) */
ConnResult
-conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retdatap,
LDAPControl ***returned_controls, int *message_id, int block)
+conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retdatap,
LDAPControl ***returned_controls, int send_msgid, int *resp_msgid, int block)
{
LDAPMessage *res = NULL;
int setlevel = 0;
@@ -326,7 +326,7 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct
berval **retda
break;
}
- rc = ldap_result(conn->ld, LDAP_RES_ANY , 1, &local_timeout, &res);
+ rc = ldap_result(conn->ld, send_msgid, 1, &local_timeout, &res);
PR_Unlock(conn->lock);
if (0 != rc)
@@ -415,9 +415,9 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct
berval **retda
char **referrals = NULL;
char *matched = NULL;
- if (message_id)
+ if (resp_msgid)
{
- *message_id = ldap_msgid(res);
+ *resp_msgid = ldap_msgid(res);
}
rc = ldap_parse_result(conn->ld, res, &err, &matched,
@@ -486,7 +486,7 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct
berval **retda
ConnResult
conn_read_result(Repl_Connection *conn, int *message_id)
{
- return conn_read_result_ex(conn,NULL,NULL,NULL,message_id,1);
+ return conn_read_result_ex(conn,NULL,NULL,NULL,LDAP_RES_ANY,message_id,1);
}
/* Because the SDK isn't really thread-safe (it can deadlock between
diff --git a/ldap/servers/plugins/replication/repl5_inc_protocol.c
b/ldap/servers/plugins/replication/repl5_inc_protocol.c
index dac03ba..0e05973 100644
--- a/ldap/servers/plugins/replication/repl5_inc_protocol.c
+++ b/ldap/servers/plugins/replication/repl5_inc_protocol.c
@@ -293,7 +293,7 @@ static void repl5_inc_result_threadmain(void *param)
while (!finished)
{
- conres = conn_read_result_ex(conn, NULL, NULL, NULL, &message_id, 0);
+ conres = conn_read_result_ex(conn, NULL, NULL, NULL, LDAP_RES_ANY, &message_id,
0);
slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: read result
for message_id %d\n", message_id);
/* Timeout here means that we didn't block, not a real timeout */
if (CONN_TIMEOUT == conres)
diff --git a/ldap/servers/plugins/replication/repl5_protocol_util.c
b/ldap/servers/plugins/replication/repl5_protocol_util.c
index 16032b2..732d952 100644
--- a/ldap/servers/plugins/replication/repl5_protocol_util.c
+++ b/ldap/servers/plugins/replication/repl5_protocol_util.c
@@ -236,6 +236,7 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV
**ruv)
if (NULL != current_csn)
{
struct berval *payload = NULL;
+ int send_msgid = 0;
if (prp->repl90consumer)
{
@@ -278,7 +279,7 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV
**ruv)
crc = conn_send_extended_operation(conn,
prp->repl90consumer ? REPL_START_NSDS90_REPLICATION_REQUEST_OID :
REPL_START_NSDS50_REPLICATION_REQUEST_OID, payload,
- NULL /* update control */, NULL /* Message ID */);
+ NULL /* update control */, &send_msgid /* Message ID */);
if (CONN_OPERATION_SUCCESS != crc)
{
int operation, error;
@@ -293,7 +294,7 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV
**ruv)
error ? ldap_err2string(error) : "unknown error");
}
/* Since the operation request is async, we need to wait for the response here */
- crc = conn_read_result_ex(conn,&retoid,&retdata,NULL,NULL,1);
+ crc = conn_read_result_ex(conn,&retoid,&retdata,NULL,send_msgid,NULL,1);
ber_bvfree(payload);
payload = NULL;
/* Look at the response we got. */
@@ -583,7 +584,7 @@ release_replica(Private_Repl_Protocol *prp)
goto error;
}
/* Since the operation request is async, we need to wait for the response here */
- conres =
conn_read_result_ex(prp->conn,&retoid,&retdata,NULL,&ret_message_id,1);
+ conres =
conn_read_result_ex(prp->conn,&retoid,&retdata,NULL,sent_message_id,&ret_message_id,1);
if (CONN_OPERATION_SUCCESS != conres)
{
int operation, error;
diff --git a/ldap/servers/plugins/replication/repl5_tot_protocol.c
b/ldap/servers/plugins/replication/repl5_tot_protocol.c
index f383f4c..ca151bf 100644
--- a/ldap/servers/plugins/replication/repl5_tot_protocol.c
+++ b/ldap/servers/plugins/replication/repl5_tot_protocol.c
@@ -153,7 +153,7 @@ static void repl5_tot_result_threadmain(void *param)
while (!finished)
{
- conres = conn_read_result_ex(conn, NULL, NULL, NULL, &message_id, 0);
+ conres = conn_read_result_ex(conn, NULL, NULL, NULL, LDAP_RES_ANY, &message_id,
0);
/* Timeout here means that we didn't block, not a real timeout */
if (CONN_TIMEOUT == conres)
{