fence-agents: master - fence_cisco_ucs: fence agent does not respect "delay" attribute
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=2424d8...
Commit: 2424d8c71413d6ba4ace2c002ee80c1f4517b4c8
Parent: 7d5ffffa76a33a76cf39b8cd266bd5b38d9ddca5
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 16:24:48 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 16:24:48 2013 +0100
fence_cisco_ucs: fence agent does not respect "delay" attribute
Resolves: rhbz#896603
---
fence/agents/cisco_ucs/fence_cisco_ucs.py | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/fence/agents/cisco_ucs/fence_cisco_ucs.py b/fence/agents/cisco_ucs/fence_cisco_ucs.py
index e66130e..9783250 100644
--- a/fence/agents/cisco_ucs/fence_cisco_ucs.py
+++ b/fence/agents/cisco_ucs/fence_cisco_ucs.py
@@ -122,6 +122,11 @@ used with Cisco UCS to fence machines."
docs["vendorurl"] = "http://www.cisco.com"
show_docs(options, docs)
+ ## Do the delay of the fence device before logging in
+ ## Delay is important for two-node clusters fencing but we do not need to delay 'status' operations
+ if options["--action"] in ["off", "reboot"]:
+ time.sleep(int(options["--delay"]))
+
### Login
res = send_command(options, "<aaaLogin inName=\"" + options["--username"] + "\" inPassword=\"" + options["--password"] + "\" />", int(options["--login-timeout"]))
result = RE_COOKIE.search(res)
11 years, 4 months
gfs2-utils: master - gfs2_lockcapture: Capture the status of the cluster nodes and find the clusternode name and id.
by shane bradley
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=e6100419...
Commit: e6100419b58041b85463a76283451ebce6041707
Parent: 4bbf3baa94249ce80449c451460938ca33a1dff4
Author: Shane Bradley <sbradley(a)redhat.com>
AuthorDate: Thu Jan 31 09:34:24 2013 -0500
Committer: Shane Bradley <sbradley(a)redhat.com>
CommitterDate: Wed Feb 6 09:13:54 2013 -0500
gfs2_lockcapture: Capture the status of the cluster nodes and find the clusternode name and id.
The status of the cluster will be captured and written to the file with respect
to version: cman_tool nodes, corosync-quorumtool -l. Added two new configuration
variables to the hostinformation.txt for the clusternode name and id. Updated man page.
Signed-off-by: Shane Bradley <sbradley(a)redhat.com>
---
gfs2/man/gfs2_lockcapture.8 | 5 +-
gfs2/scripts/gfs2_lockcapture | 102 ++++++++++++++++++++++++++++++----------
2 files changed, 79 insertions(+), 28 deletions(-)
diff --git a/gfs2/man/gfs2_lockcapture.8 b/gfs2/man/gfs2_lockcapture.8
index 854cd71..acd9113 100644
--- a/gfs2/man/gfs2_lockcapture.8
+++ b/gfs2/man/gfs2_lockcapture.8
@@ -5,7 +5,7 @@ gfs2_lockcapture \- will capture locking information from GFS2 file systems and
.SH SYNOPSIS
.B gfs2_lockcapture \fR[-dqyt] [-o \fIoutput directory]\fR [-r \fInumber of runs]\fR [-s \fIseconds to sleep]\fR [-n \fIname of GFS2 filesystem]\fP
-.PP
+.PP
.B gfs2_lockcapture \fR[-dqyi]
.SH DESCRIPTION
@@ -15,7 +15,7 @@ multiple times and how much time to sleep between each iteration of capturing
the data. By default all of the mounted GFS2 filesystems will have their data
collected unless GFS2 filesystems are specified.
.PP
-Please note that sysrq -t and -m events are trigger or the pid directories in /proc are
+Please note that sysrq -t and -m events are trigger or the pid directories in /proc are
collected on each iteration of capturing the data.
.SH OPTIONS
@@ -51,3 +51,4 @@ number of seconds to sleep between runs of capturing the lockdump data.
name of the GFS2 filesystem(s) that will have their lockdump data captured.
.
.SH SEE ALSO
+gfs2_lockanalyze(8)
diff --git a/gfs2/scripts/gfs2_lockcapture b/gfs2/scripts/gfs2_lockcapture
index 2b3421c..6a63fc8 100644
--- a/gfs2/scripts/gfs2_lockcapture
+++ b/gfs2/scripts/gfs2_lockcapture
@@ -45,12 +45,15 @@ class ClusterNode:
"""
This class represents a cluster node that is a current memeber in a cluster.
"""
- def __init__(self, clusternodeName, clusterName, mapOfMountedFilesystemLabels):
+ def __init__(self, clusternodeName, clusternodeID, clusterName, mapOfMountedFilesystemLabels):
"""
@param clusternodeName: The name of the cluster node.
@type clusternodeName: String
@param clusterName: The name of the cluster that this cluster node is a
member of.
+ @param clusternodeID: The id of the cluster node.
+ @type clusternodeID: Int
+ @param clusterName: The name of the cluster that this cluster node is a
@type clusterName: String
@param mapOfMountedFilesystemLabels: A map of filesystem labels(key) for
a mounted filesystem. The value is the line for the matching mounted
@@ -58,6 +61,7 @@ class ClusterNode:
@type mapOfMountedFilesystemLabels: Dict
"""
self.__clusternodeName = clusternodeName
+ self.__clusternodeID = clusternodeID
self.__clusterName = clusterName
self.__mapOfMountedFilesystemLabels = mapOfMountedFilesystemLabels
@@ -69,7 +73,7 @@ class ClusterNode:
@rtype: String
"""
rString = ""
- rString += "%s:%s" %(self.getClusterName(), self.getClusterNodeName())
+ rString += "%s:%s(id:%d)" %(self.getClusterName(), self.getClusterNodeName(), self.getClusterNodeID())
fsLabels = self.__mapOfMountedFilesystemLabels.keys()
fsLabels.sort()
for fsLabel in fsLabels:
@@ -85,6 +89,14 @@ class ClusterNode:
"""
return self.__clusternodeName
+ def getClusterNodeID(self):
+ """
+ Returns the id of the cluster node.
+ @return: Returns the id of the cluster node.
+ @rtype: String
+ """
+ return self.__clusternodeID
+
def getClusterName(self):
"""
Returns the name of cluster that this cluster node is a member of.
@@ -539,6 +551,7 @@ def getClusterNode(listOfGFS2Names):
# in the output, else return None.
clusterName = ""
clusternodeName = ""
+ clusternodeID = ""
if (runCommand("which", ["cman_tool"])):
stdout = runCommandOutput("cman_tool", ["status"])
if (not stdout == None):
@@ -550,6 +563,8 @@ def getClusterNode(listOfGFS2Names):
clusterName = line.split("Cluster Name:")[1].strip().rstrip()
if (line.startswith("Node name: ")):
clusternodeName = line.split("Node name:")[1].strip().rstrip()
+ if (line.startswith("Node ID: ")):
+ clusternodeID = line.split("Node ID: ")[1].strip().rstrip()
elif (runCommand("which", ["corosync-cmapctl"])):
# Another way to get the local cluster node is: $ crm_node -i; crm_node -l
# Get the name of the cluster.
@@ -559,14 +574,14 @@ def getClusterNode(listOfGFS2Names):
if (len(stdoutSplit) == 2):
clusterName = stdoutSplit[1].strip().rstrip()
# Get the id of the local cluster node so we can get the clusternode name
- thisNodeID = ""
+ clusternodeID = ""
stdout = runCommandOutput("corosync-cmapctl", ["-g", "runtime.votequorum.this_node_id"])
if (not stdout == None):
stdoutSplit = stdout.split("=")
if (len(stdoutSplit) == 2):
- thisNodeID = stdoutSplit[1].strip().rstrip()
+ clusternodeID = stdoutSplit[1].strip().rstrip()
# Now that we the nodeid then we can get the clusternode name.
- if (len(thisNodeID) > 0):
+ if (len(clusternodeID) > 0):
stdout = runCommandOutput("corosync-quorumtool", ["-l"])
if (not stdout == None):
for line in stdout.split("\n"):
@@ -588,7 +603,15 @@ def getClusterNode(listOfGFS2Names):
break
if ((not foundMatch) and (mapOfMountedFilesystemLabels.has_key(label))):
del(mapOfMountedFilesystemLabels[label])
- return ClusterNode(clusternodeName, clusterName, mapOfMountedFilesystemLabels)
+ # Cast the node id to an int, and default is 0 if node is not found or
+ # not castable.
+ clusternodeIDInt = 0
+ if (clusternodeID.isalnum()):
+ try:
+ clusternodeIDInt = int(clusternodeID)
+ except(ValueError):
+ pass
+ return ClusterNode(clusternodeName, clusternodeIDInt, clusterName, mapOfMountedFilesystemLabels)
else:
return None
@@ -701,6 +724,28 @@ def gatherGeneralInformation(pathToDSTDir):
message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Write the status of all the nodes in the cluster out.
+ if (runCommand("which", ["cman_tool"])):
+ command = "cman_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool_status")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["status"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ elif (runCommand("which", ["corosync-cmapctl"])):
+ command = "corosync-quorumtool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "corosync-quorumtool_l")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["-l"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
def isProcPidStackEnabled(pathToPidData):
"""
@@ -1067,26 +1112,6 @@ if __name__ == "__main__":
# script running.
writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True)
# #######################################################################
- # Verify they want to continue because this script will trigger sysrq events.
- # #######################################################################
- if (not cmdLineOpts.disableQuestions):
- valid = {"yes":True, "y":True, "no":False, "n":False}
- question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?"
- prompt = " [y/n] "
- while True:
- sys.stdout.write(question + prompt)
- choice = raw_input().lower()
- if (choice in valid):
- if (valid.get(choice)):
- # If yes, or y then exit loop and continue.
- break
- else:
- message = "The script will not continue since you chose not to continue."
- logging.getLogger(MAIN_LOGGER_NAME).error(message)
- exitScript(removePidFile=True, errorCode=1)
- else:
- sys.stdout.write("Please respond with '(y)es' or '(n)o'.\n")
- # #######################################################################
# Get the clusternode name and verify that mounted GFS2 filesystems were
# found.
# #######################################################################
@@ -1110,6 +1135,26 @@ if __name__ == "__main__":
print clusternode
exitScript()
# #######################################################################
+ # Verify they want to continue because this script will trigger sysrq events.
+ # #######################################################################
+ if (not cmdLineOpts.disableQuestions):
+ valid = {"yes":True, "y":True, "no":False, "n":False}
+ question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?"
+ prompt = " [y/n] "
+ while True:
+ sys.stdout.write(question + prompt)
+ choice = raw_input().lower()
+ if (choice in valid):
+ if (valid.get(choice)):
+ # If yes, or y then exit loop and continue.
+ break
+ else:
+ message = "The script will not continue since you chose not to continue."
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ exitScript(removePidFile=True, errorCode=1)
+ else:
+ sys.stdout.write("Please respond with '(y)es' or '(n)o'.\n")
+ # #######################################################################
# Create the output directory to verify it can be created before
# proceeding unless it is already created from a previous run data needs
# to be analyzed. Probably could add more debugging on if file or dir.
@@ -1178,6 +1223,11 @@ if __name__ == "__main__":
message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns)
logging.getLogger(MAIN_LOGGER_NAME).debug(message)
gatherGeneralInformation(pathToOutputRunDir)
+ # Write the clusternode name and id to the general information file.
+ writeToFile(os.path.join(pathToOutputRunDir, "hostinformation.txt"),
+ "NODE_NAME=%s\nNODE_ID=%d" %(clusternode.getClusterNodeName(), clusternode.getClusterNodeID()),
+ appendToFile=True, createFile=True)
+
# Going to sleep for 2 seconds, so that TIMESTAMP should be in the
# past in the logs so that capturing sysrq data will be guaranteed.
time.sleep(2)
11 years, 4 months
fence-agents: master - fence_vmware_soap: Fix previous patch - fix traceback when hostname cannot be resolved
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=7d5fff...
Commit: 7d5ffffa76a33a76cf39b8cd266bd5b38d9ddca5
Parent: 2d159cc3e800426d34c40f9fb53ea17e1b8d1ec7
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 14:34:09 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 14:34:09 2013 +0100
fence_vmware_soap: Fix previous patch - fix traceback when hostname cannot be resolved
Previous patch was not exactly copied from another branch.
---
fence/agents/vmware_soap/fence_vmware_soap.py | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fence/agents/vmware_soap/fence_vmware_soap.py b/fence/agents/vmware_soap/fence_vmware_soap.py
index b949dab..3c08eeb 100644
--- a/fence/agents/vmware_soap/fence_vmware_soap.py
+++ b/fence/agents/vmware_soap/fence_vmware_soap.py
@@ -20,10 +20,10 @@ def soap_login(options):
url = "http://"
url += options["--ip"] + ":" + str(options["--ipport"]) + "/sdk"
- conn = Client(url + "/vimService.wsdl")
- conn.set_options(location = url)
-
try:
+ conn = Client(url + "/vimService.wsdl")
+ conn.set_options(location = url)
+
mo_ServiceInstance = Property('ServiceInstance')
mo_ServiceInstance._type = 'ServiceInstance'
ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
11 years, 4 months
fence-agents: master - fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=2d159c...
Commit: 2d159cc3e800426d34c40f9fb53ea17e1b8d1ec7
Parent: 110e74e711d1a341d4f40dc3f253170bd6cb7651
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 14:06:53 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 14:06:53 2013 +0100
fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
Resolves: rhbz#902404
---
fence/agents/vmware_soap/fence_vmware_soap.py | 12 ++++++------
1 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/fence/agents/vmware_soap/fence_vmware_soap.py b/fence/agents/vmware_soap/fence_vmware_soap.py
index 16ce5b2..b949dab 100644
--- a/fence/agents/vmware_soap/fence_vmware_soap.py
+++ b/fence/agents/vmware_soap/fence_vmware_soap.py
@@ -23,13 +23,13 @@ def soap_login(options):
conn = Client(url + "/vimService.wsdl")
conn.set_options(location = url)
- mo_ServiceInstance = Property('ServiceInstance')
- mo_ServiceInstance._type = 'ServiceInstance'
- ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
- mo_SessionManager = Property(ServiceContent.sessionManager.value)
- mo_SessionManager._type = 'SessionManager'
-
try:
+ mo_ServiceInstance = Property('ServiceInstance')
+ mo_ServiceInstance._type = 'ServiceInstance'
+ ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
+ mo_SessionManager = Property(ServiceContent.sessionManager.value)
+ mo_SessionManager._type = 'SessionManager'
+
SessionManager = conn.service.Login(mo_SessionManager, options["--username"], options["--password"])
except Exception, ex:
fail(EC_LOGIN_DENIED)
11 years, 4 months
fence-agents: RHEL6 - fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=715b34...
Commit: 715b34f9c3770b46f49dfd20e3aaef9dfd461596
Parent: aef3292971598b39f237be8400f41d55e080219a
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 14:00:28 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 14:04:35 2013 +0100
fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
Resolves: rhbz#902404
---
fence/agents/vmware_soap/fence_vmware_soap.py | 19 ++++++++++---------
1 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/fence/agents/vmware_soap/fence_vmware_soap.py b/fence/agents/vmware_soap/fence_vmware_soap.py
index 0da7f0d..f01d37b 100644
--- a/fence/agents/vmware_soap/fence_vmware_soap.py
+++ b/fence/agents/vmware_soap/fence_vmware_soap.py
@@ -21,16 +21,17 @@ def soap_login(options):
url = "http://"
url += options["-a"] + ":" + str(options["-u"]) + "/sdk"
- conn = Client(url + "/vimService.wsdl")
- conn.set_options(location = url)
-
- mo_ServiceInstance = Property('ServiceInstance')
- mo_ServiceInstance._type = 'ServiceInstance'
- ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
- mo_SessionManager = Property(ServiceContent.sessionManager.value)
- mo_SessionManager._type = 'SessionManager'
-
+
try:
+ conn = Client(url + "/vimService.wsdl")
+ conn.set_options(location = url)
+
+ mo_ServiceInstance = Property('ServiceInstance')
+ mo_ServiceInstance._type = 'ServiceInstance'
+ ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
+ mo_SessionManager = Property(ServiceContent.sessionManager.value)
+ mo_SessionManager._type = 'SessionManager'
+
SessionManager = conn.service.Login(mo_SessionManager, options["-l"], options["-p"])
except Exception, ex:
fail(EC_LOGIN_DENIED)
11 years, 4 months
fence-agents: RHEL6 - fence_drac5: fix regression on Dell CMC and Dell DRAC5
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=aef329...
Commit: aef3292971598b39f237be8400f41d55e080219a
Parent: e0b9090cac46a1010f7b57f7aef9412753a312b6
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 13:54:05 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 13:59:21 2013 +0100
fence_drac5: fix regression on Dell CMC and Dell DRAC5
Standard EOL for agents connecting via ssh is CR/LF.
Some Dell devices represent CR/LF as a double-enter, which creates a problem in parsing of output.
This patch adds a check for double-enter. This can be detected in function which power on/off machine
because 'get power status' was run before and if we can find a line without any command we know that
there is a double-enter problem.
Resolves: rhbz#905478
---
fence/agents/drac5/fence_drac5.py | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/fence/agents/drac5/fence_drac5.py b/fence/agents/drac5/fence_drac5.py
index 3e9a8d2..7d9082a 100644
--- a/fence/agents/drac5/fence_drac5.py
+++ b/fence/agents/drac5/fence_drac5.py
@@ -52,7 +52,14 @@ def set_power_status(conn, options):
conn.send_eol("racadm serveraction " + action + " -m " + options["-m"])
elif options["model"] == "DRAC 5":
conn.send_eol("racadm serveraction " + action)
+
+ ## Fix issue with double-enter [CR/LF]
+ ## We need to read two additional command prompts (one from get + one from set command)
conn.log_expect(options, options["-c"], int(options["-g"]))
+ if len(conn.before.strip()) == 0:
+ options["eol"] = options["eol"][:-1]
+ conn.log_expect(options, options["-c"], int(options["-g"]))
+ conn.log_expect(options, options["-c"], int(options["-g"]))
except pexpect.EOF:
fail(EC_CONNECTION_LOST)
except pexpect.TIMEOUT:
11 years, 4 months
dlm: master - dlm_controld: retry fencing work while waiting for ringid
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=f65f3f67b0f87f5...
Commit: f65f3f67b0f87f52398ff1100aaede36e254abda
Parent: a9c0eb5fb07f25e36a8750abaa2678fd0344ae35
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 16:45:41 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 16:45:41 2013 -0600
dlm_controld: retry fencing work while waiting for ringid
An update to the cluster ringid won't necessarily cause
the fencing work to be scheduled, so force it to be
polled/retried.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/daemon_cpg.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
index b6532a3..af9b91d 100644
--- a/dlm_controld/daemon_cpg.c
+++ b/dlm_controld/daemon_cpg.c
@@ -742,17 +742,17 @@ static void daemon_fence_work(void)
/* We've seen a nodedown confchg callback, but not the
corresponding ringid callback. */
log_retry(retry_fencing, "fence work wait for cpg ringid");
+ retry = 1;
goto out;
}
if (cluster_ringid_seq != daemon_ringid.seq) {
/* wait for ringids to be in sync */
log_retry(retry_fencing, "fence work wait for cluster ringid");
+ retry = 1;
goto out;
}
- /* retry = 1; */
-
if (opt(enable_quorum_fencing_ind) && !cluster_quorate) {
/* wait for quorum before doing any fencing, but if there
is none, send_fence_clear below can unblock new nodes */
11 years, 4 months
dlm: master - dlm_controld: don't nack the first and only cg
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=a9c0eb5fb07f25e...
Commit: a9c0eb5fb07f25e36a8750abaa2678fd0344ae35
Parent: 23b48a5cf204bee8f362f68733430f5c1330c7d0
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 16:42:10 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 16:42:10 2013 -0600
dlm_controld: don't nack the first and only cg
I don't think there's any case where we want to nack it,
and I observed a case where it was nacked when we didn't
want it.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/cpg.c | 16 ++++++++++++++++
1 files changed, 16 insertions(+), 0 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index 82dae3d..f971158 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -865,6 +865,22 @@ static int match_change(struct lockspace *ls, struct change *cg,
if (members_mismatch)
return 0;
+ /* Not completely sure if this is a valid assertion or not, i.e. not
+ sure if we really never want to nack our first and only cg. I have
+ seen one case in which a node incorrectly accepted nacks for cg seq
+ 1 and ls change_seq 1. (It was the secondary effect of another bug.)
+
+ Or, it's possible that this should apply a little more broadly as:
+ don't nack our most recent cg, i.e. cg->seq == ls->change_seq (1 or
+ otherwise). I'm hoping to find a test case that will exercise this
+ to clarify the situation here, and then update this comment. */
+
+ if (cg->seq == 1 && ls->change_seq == 1 && (hd->flags & DLM_MFLG_NACK)) {
+ log_group(ls, "match_change %d:%u skip cg %u for nack",
+ hd->nodeid, seq, cg->seq);
+ return 0;
+ }
+
node->last_match_seq = cg->seq;
log_group(ls, "match_change %d:%u matches cg %u", hd->nodeid, seq,
11 years, 4 months
dlm: master - dlm_controld: fix handling of startup partition merge
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=23b48a5cf204bee...
Commit: 23b48a5cf204bee8f362f68733430f5c1330c7d0
Parent: 956ddcebf026e2c2b560c808984293502054ff16
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 13:09:29 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 13:10:06 2013 -0600
dlm_controld: fix handling of startup partition merge
Nack messages are not matched correctly with previous change
structs. Normal start messages should not be matched with
changes that preceeded the node's join time, but nack messages
should, so the match_change() function needs to distinguish.
This can occur when two nodes are joining a lockspace, but
neither has completed when a cluster partition+merge happens.
(copying fix from cluster.git
70b15e7afa730166ef0a3c81948d31ab1dd5aa93)
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/action.c | 6 +++++-
dlm_controld/cpg.c | 9 ++++++++-
2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/dlm_controld/action.c b/dlm_controld/action.c
index 64c3923..84637f1 100644
--- a/dlm_controld/action.c
+++ b/dlm_controld/action.c
@@ -336,7 +336,11 @@ int set_configfs_members(struct lockspace *ls, char *name,
if (rv) {
log_error("%s: renew rmdir failed: %d",
path, errno);
- goto out;
+
+ /* don't quit here, there's a case where
+ * this can happen, where a node identified
+ * for renewal was not really added
+ * previously */
}
}
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index b299c6f..82dae3d 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -818,7 +818,14 @@ static int match_change(struct lockspace *ls, struct change *cg,
"cluster add %llu", hd->nodeid, seq, cg->seq,
(unsigned long long)cg->create_time,
(unsigned long long)t);
- return 0;
+
+ /* nacks can apply to older cg's */
+ if (!(hd->flags & DLM_MFLG_NACK)) {
+ return 0;
+ } else {
+ log_group(ls, "match_change %d:%u unskip cg %u for nack",
+ hd->nodeid, seq, cg->seq);
+ }
}
if (node->last_match_seq > cg->seq) {
11 years, 4 months
dlm: master - dlm_controld: avoid mismatching messages with old cgs
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=956ddcebf026e2c...
Commit: 956ddcebf026e2c2b560c808984293502054ff16
Parent: 6368f44ec16c9ce8a61c2304729930cbae55388d
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 12:03:28 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 12:47:52 2013 -0600
dlm_controld: avoid mismatching messages with old cgs
Only match start messages with cg's that are newer than
previously matched cg's for the node.
(copying fix from cluster.git
2a68bd042a771483b8f78b70654694a3c1ea15ae)
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/cpg.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index f2582a8..b299c6f 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -41,6 +41,8 @@ struct node {
int lockspace_member;
int lockspace_fail_reason;
+ uint32_t last_match_seq;
+
uint64_t start_time;
int check_fs;
@@ -819,6 +821,12 @@ static int match_change(struct lockspace *ls, struct change *cg,
return 0;
}
+ if (node->last_match_seq > cg->seq) {
+ log_group(ls, "match_change %d:%u skip cg %u last matched cg %u",
+ hd->nodeid, seq, cg->seq, node->last_match_seq);
+ return 0;
+ }
+
/* verify this is the right change by matching the counts
and the nodeids of the current members */
@@ -850,6 +858,8 @@ static int match_change(struct lockspace *ls, struct change *cg,
if (members_mismatch)
return 0;
+ node->last_match_seq = cg->seq;
+
log_group(ls, "match_change %d:%u matches cg %u", hd->nodeid, seq,
cg->seq);
return 1;
11 years, 4 months