fence-agents: master - fence_cisco_ucs: fence agent does not respect "delay" attribute
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=2424d8...
Commit: 2424d8c71413d6ba4ace2c002ee80c1f4517b4c8
Parent: 7d5ffffa76a33a76cf39b8cd266bd5b38d9ddca5
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 16:24:48 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 16:24:48 2013 +0100
fence_cisco_ucs: fence agent does not respect "delay" attribute
Resolves: rhbz#896603
---
fence/agents/cisco_ucs/fence_cisco_ucs.py | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/fence/agents/cisco_ucs/fence_cisco_ucs.py b/fence/agents/cisco_ucs/fence_cisco_ucs.py
index e66130e..9783250 100644
--- a/fence/agents/cisco_ucs/fence_cisco_ucs.py
+++ b/fence/agents/cisco_ucs/fence_cisco_ucs.py
@@ -122,6 +122,11 @@ used with Cisco UCS to fence machines."
docs["vendorurl"] = "http://www.cisco.com"
show_docs(options, docs)
+ ## Do the delay of the fence device before logging in
+ ## Delay is important for two-node clusters fencing but we do not need to delay 'status' operations
+ if options["--action"] in ["off", "reboot"]:
+ time.sleep(int(options["--delay"]))
+
### Login
res = send_command(options, "<aaaLogin inName=\"" + options["--username"] + "\" inPassword=\"" + options["--password"] + "\" />", int(options["--login-timeout"]))
result = RE_COOKIE.search(res)
11 years, 4 months
gfs2-utils: master - gfs2_lockcapture: Capture the status of the cluster nodes and find the clusternode name and id.
by shane bradley
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=e6100419...
Commit: e6100419b58041b85463a76283451ebce6041707
Parent: 4bbf3baa94249ce80449c451460938ca33a1dff4
Author: Shane Bradley <sbradley(a)redhat.com>
AuthorDate: Thu Jan 31 09:34:24 2013 -0500
Committer: Shane Bradley <sbradley(a)redhat.com>
CommitterDate: Wed Feb 6 09:13:54 2013 -0500
gfs2_lockcapture: Capture the status of the cluster nodes and find the clusternode name and id.
The status of the cluster will be captured and written to the file with respect
to version: cman_tool nodes, corosync-quorumtool -l. Added two new configuration
variables to the hostinformation.txt for the clusternode name and id. Updated man page.
Signed-off-by: Shane Bradley <sbradley(a)redhat.com>
---
gfs2/man/gfs2_lockcapture.8 | 5 +-
gfs2/scripts/gfs2_lockcapture | 102 ++++++++++++++++++++++++++++++----------
2 files changed, 79 insertions(+), 28 deletions(-)
diff --git a/gfs2/man/gfs2_lockcapture.8 b/gfs2/man/gfs2_lockcapture.8
index 854cd71..acd9113 100644
--- a/gfs2/man/gfs2_lockcapture.8
+++ b/gfs2/man/gfs2_lockcapture.8
@@ -5,7 +5,7 @@ gfs2_lockcapture \- will capture locking information from GFS2 file systems and
.SH SYNOPSIS
.B gfs2_lockcapture \fR[-dqyt] [-o \fIoutput directory]\fR [-r \fInumber of runs]\fR [-s \fIseconds to sleep]\fR [-n \fIname of GFS2 filesystem]\fP
-.PP
+.PP
.B gfs2_lockcapture \fR[-dqyi]
.SH DESCRIPTION
@@ -15,7 +15,7 @@ multiple times and how much time to sleep between each iteration of capturing
the data. By default all of the mounted GFS2 filesystems will have their data
collected unless GFS2 filesystems are specified.
.PP
-Please note that sysrq -t and -m events are trigger or the pid directories in /proc are
+Please note that sysrq -t and -m events are trigger or the pid directories in /proc are
collected on each iteration of capturing the data.
.SH OPTIONS
@@ -51,3 +51,4 @@ number of seconds to sleep between runs of capturing the lockdump data.
name of the GFS2 filesystem(s) that will have their lockdump data captured.
.
.SH SEE ALSO
+gfs2_lockanalyze(8)
diff --git a/gfs2/scripts/gfs2_lockcapture b/gfs2/scripts/gfs2_lockcapture
index 2b3421c..6a63fc8 100644
--- a/gfs2/scripts/gfs2_lockcapture
+++ b/gfs2/scripts/gfs2_lockcapture
@@ -45,12 +45,15 @@ class ClusterNode:
"""
This class represents a cluster node that is a current memeber in a cluster.
"""
- def __init__(self, clusternodeName, clusterName, mapOfMountedFilesystemLabels):
+ def __init__(self, clusternodeName, clusternodeID, clusterName, mapOfMountedFilesystemLabels):
"""
@param clusternodeName: The name of the cluster node.
@type clusternodeName: String
@param clusterName: The name of the cluster that this cluster node is a
member of.
+ @param clusternodeID: The id of the cluster node.
+ @type clusternodeID: Int
+ @param clusterName: The name of the cluster that this cluster node is a
@type clusterName: String
@param mapOfMountedFilesystemLabels: A map of filesystem labels(key) for
a mounted filesystem. The value is the line for the matching mounted
@@ -58,6 +61,7 @@ class ClusterNode:
@type mapOfMountedFilesystemLabels: Dict
"""
self.__clusternodeName = clusternodeName
+ self.__clusternodeID = clusternodeID
self.__clusterName = clusterName
self.__mapOfMountedFilesystemLabels = mapOfMountedFilesystemLabels
@@ -69,7 +73,7 @@ class ClusterNode:
@rtype: String
"""
rString = ""
- rString += "%s:%s" %(self.getClusterName(), self.getClusterNodeName())
+ rString += "%s:%s(id:%d)" %(self.getClusterName(), self.getClusterNodeName(), self.getClusterNodeID())
fsLabels = self.__mapOfMountedFilesystemLabels.keys()
fsLabels.sort()
for fsLabel in fsLabels:
@@ -85,6 +89,14 @@ class ClusterNode:
"""
return self.__clusternodeName
+ def getClusterNodeID(self):
+ """
+ Returns the id of the cluster node.
+ @return: Returns the id of the cluster node.
+ @rtype: String
+ """
+ return self.__clusternodeID
+
def getClusterName(self):
"""
Returns the name of cluster that this cluster node is a member of.
@@ -539,6 +551,7 @@ def getClusterNode(listOfGFS2Names):
# in the output, else return None.
clusterName = ""
clusternodeName = ""
+ clusternodeID = ""
if (runCommand("which", ["cman_tool"])):
stdout = runCommandOutput("cman_tool", ["status"])
if (not stdout == None):
@@ -550,6 +563,8 @@ def getClusterNode(listOfGFS2Names):
clusterName = line.split("Cluster Name:")[1].strip().rstrip()
if (line.startswith("Node name: ")):
clusternodeName = line.split("Node name:")[1].strip().rstrip()
+ if (line.startswith("Node ID: ")):
+ clusternodeID = line.split("Node ID: ")[1].strip().rstrip()
elif (runCommand("which", ["corosync-cmapctl"])):
# Another way to get the local cluster node is: $ crm_node -i; crm_node -l
# Get the name of the cluster.
@@ -559,14 +574,14 @@ def getClusterNode(listOfGFS2Names):
if (len(stdoutSplit) == 2):
clusterName = stdoutSplit[1].strip().rstrip()
# Get the id of the local cluster node so we can get the clusternode name
- thisNodeID = ""
+ clusternodeID = ""
stdout = runCommandOutput("corosync-cmapctl", ["-g", "runtime.votequorum.this_node_id"])
if (not stdout == None):
stdoutSplit = stdout.split("=")
if (len(stdoutSplit) == 2):
- thisNodeID = stdoutSplit[1].strip().rstrip()
+ clusternodeID = stdoutSplit[1].strip().rstrip()
# Now that we the nodeid then we can get the clusternode name.
- if (len(thisNodeID) > 0):
+ if (len(clusternodeID) > 0):
stdout = runCommandOutput("corosync-quorumtool", ["-l"])
if (not stdout == None):
for line in stdout.split("\n"):
@@ -588,7 +603,15 @@ def getClusterNode(listOfGFS2Names):
break
if ((not foundMatch) and (mapOfMountedFilesystemLabels.has_key(label))):
del(mapOfMountedFilesystemLabels[label])
- return ClusterNode(clusternodeName, clusterName, mapOfMountedFilesystemLabels)
+ # Cast the node id to an int, and default is 0 if node is not found or
+ # not castable.
+ clusternodeIDInt = 0
+ if (clusternodeID.isalnum()):
+ try:
+ clusternodeIDInt = int(clusternodeID)
+ except(ValueError):
+ pass
+ return ClusterNode(clusternodeName, clusternodeIDInt, clusterName, mapOfMountedFilesystemLabels)
else:
return None
@@ -701,6 +724,28 @@ def gatherGeneralInformation(pathToDSTDir):
message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ # Write the status of all the nodes in the cluster out.
+ if (runCommand("which", ["cman_tool"])):
+ command = "cman_tool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "cman_tool_status")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["status"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ elif (runCommand("which", ["corosync-cmapctl"])):
+ command = "corosync-quorumtool"
+ pathToCommandOutput = os.path.join(pathToDSTDir, "corosync-quorumtool_l")
+ try:
+ fout = open(pathToCommandOutput, "w")
+ runCommand(command, ["-l"], standardOut=fout)
+ fout.close()
+ except IOError:
+ message = "There was an error the command output for %s to the file %s." %(command, pathToCommandOutput)
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+
def isProcPidStackEnabled(pathToPidData):
"""
@@ -1067,26 +1112,6 @@ if __name__ == "__main__":
# script running.
writeToFile(PATH_TO_PID_FILENAME, str(os.getpid()), createFile=True)
# #######################################################################
- # Verify they want to continue because this script will trigger sysrq events.
- # #######################################################################
- if (not cmdLineOpts.disableQuestions):
- valid = {"yes":True, "y":True, "no":False, "n":False}
- question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?"
- prompt = " [y/n] "
- while True:
- sys.stdout.write(question + prompt)
- choice = raw_input().lower()
- if (choice in valid):
- if (valid.get(choice)):
- # If yes, or y then exit loop and continue.
- break
- else:
- message = "The script will not continue since you chose not to continue."
- logging.getLogger(MAIN_LOGGER_NAME).error(message)
- exitScript(removePidFile=True, errorCode=1)
- else:
- sys.stdout.write("Please respond with '(y)es' or '(n)o'.\n")
- # #######################################################################
# Get the clusternode name and verify that mounted GFS2 filesystems were
# found.
# #######################################################################
@@ -1110,6 +1135,26 @@ if __name__ == "__main__":
print clusternode
exitScript()
# #######################################################################
+ # Verify they want to continue because this script will trigger sysrq events.
+ # #######################################################################
+ if (not cmdLineOpts.disableQuestions):
+ valid = {"yes":True, "y":True, "no":False, "n":False}
+ question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?"
+ prompt = " [y/n] "
+ while True:
+ sys.stdout.write(question + prompt)
+ choice = raw_input().lower()
+ if (choice in valid):
+ if (valid.get(choice)):
+ # If yes, or y then exit loop and continue.
+ break
+ else:
+ message = "The script will not continue since you chose not to continue."
+ logging.getLogger(MAIN_LOGGER_NAME).error(message)
+ exitScript(removePidFile=True, errorCode=1)
+ else:
+ sys.stdout.write("Please respond with '(y)es' or '(n)o'.\n")
+ # #######################################################################
# Create the output directory to verify it can be created before
# proceeding unless it is already created from a previous run data needs
# to be analyzed. Probably could add more debugging on if file or dir.
@@ -1178,6 +1223,11 @@ if __name__ == "__main__":
message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns)
logging.getLogger(MAIN_LOGGER_NAME).debug(message)
gatherGeneralInformation(pathToOutputRunDir)
+ # Write the clusternode name and id to the general information file.
+ writeToFile(os.path.join(pathToOutputRunDir, "hostinformation.txt"),
+ "NODE_NAME=%s\nNODE_ID=%d" %(clusternode.getClusterNodeName(), clusternode.getClusterNodeID()),
+ appendToFile=True, createFile=True)
+
# Going to sleep for 2 seconds, so that TIMESTAMP should be in the
# past in the logs so that capturing sysrq data will be guaranteed.
time.sleep(2)
11 years, 4 months
fence-agents: master - fence_vmware_soap: Fix previous patch - fix traceback when hostname cannot be resolved
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=7d5fff...
Commit: 7d5ffffa76a33a76cf39b8cd266bd5b38d9ddca5
Parent: 2d159cc3e800426d34c40f9fb53ea17e1b8d1ec7
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 14:34:09 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 14:34:09 2013 +0100
fence_vmware_soap: Fix previous patch - fix traceback when hostname cannot be resolved
Previous patch was not exactly copied from another branch.
---
fence/agents/vmware_soap/fence_vmware_soap.py | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fence/agents/vmware_soap/fence_vmware_soap.py b/fence/agents/vmware_soap/fence_vmware_soap.py
index b949dab..3c08eeb 100644
--- a/fence/agents/vmware_soap/fence_vmware_soap.py
+++ b/fence/agents/vmware_soap/fence_vmware_soap.py
@@ -20,10 +20,10 @@ def soap_login(options):
url = "http://"
url += options["--ip"] + ":" + str(options["--ipport"]) + "/sdk"
- conn = Client(url + "/vimService.wsdl")
- conn.set_options(location = url)
-
try:
+ conn = Client(url + "/vimService.wsdl")
+ conn.set_options(location = url)
+
mo_ServiceInstance = Property('ServiceInstance')
mo_ServiceInstance._type = 'ServiceInstance'
ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
11 years, 4 months
fence-agents: master - fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=2d159c...
Commit: 2d159cc3e800426d34c40f9fb53ea17e1b8d1ec7
Parent: 110e74e711d1a341d4f40dc3f253170bd6cb7651
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 14:06:53 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 14:06:53 2013 +0100
fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
Resolves: rhbz#902404
---
fence/agents/vmware_soap/fence_vmware_soap.py | 12 ++++++------
1 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/fence/agents/vmware_soap/fence_vmware_soap.py b/fence/agents/vmware_soap/fence_vmware_soap.py
index 16ce5b2..b949dab 100644
--- a/fence/agents/vmware_soap/fence_vmware_soap.py
+++ b/fence/agents/vmware_soap/fence_vmware_soap.py
@@ -23,13 +23,13 @@ def soap_login(options):
conn = Client(url + "/vimService.wsdl")
conn.set_options(location = url)
- mo_ServiceInstance = Property('ServiceInstance')
- mo_ServiceInstance._type = 'ServiceInstance'
- ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
- mo_SessionManager = Property(ServiceContent.sessionManager.value)
- mo_SessionManager._type = 'SessionManager'
-
try:
+ mo_ServiceInstance = Property('ServiceInstance')
+ mo_ServiceInstance._type = 'ServiceInstance'
+ ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
+ mo_SessionManager = Property(ServiceContent.sessionManager.value)
+ mo_SessionManager._type = 'SessionManager'
+
SessionManager = conn.service.Login(mo_SessionManager, options["--username"], options["--password"])
except Exception, ex:
fail(EC_LOGIN_DENIED)
11 years, 4 months
fence-agents: RHEL6 - fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=715b34...
Commit: 715b34f9c3770b46f49dfd20e3aaef9dfd461596
Parent: aef3292971598b39f237be8400f41d55e080219a
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 14:00:28 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 14:04:35 2013 +0100
fence_vmware_soap: Fix traceback when hostname cannot be resolved to IP address
Resolves: rhbz#902404
---
fence/agents/vmware_soap/fence_vmware_soap.py | 19 ++++++++++---------
1 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/fence/agents/vmware_soap/fence_vmware_soap.py b/fence/agents/vmware_soap/fence_vmware_soap.py
index 0da7f0d..f01d37b 100644
--- a/fence/agents/vmware_soap/fence_vmware_soap.py
+++ b/fence/agents/vmware_soap/fence_vmware_soap.py
@@ -21,16 +21,17 @@ def soap_login(options):
url = "http://"
url += options["-a"] + ":" + str(options["-u"]) + "/sdk"
- conn = Client(url + "/vimService.wsdl")
- conn.set_options(location = url)
-
- mo_ServiceInstance = Property('ServiceInstance')
- mo_ServiceInstance._type = 'ServiceInstance'
- ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
- mo_SessionManager = Property(ServiceContent.sessionManager.value)
- mo_SessionManager._type = 'SessionManager'
-
+
try:
+ conn = Client(url + "/vimService.wsdl")
+ conn.set_options(location = url)
+
+ mo_ServiceInstance = Property('ServiceInstance')
+ mo_ServiceInstance._type = 'ServiceInstance'
+ ServiceContent = conn.service.RetrieveServiceContent(mo_ServiceInstance)
+ mo_SessionManager = Property(ServiceContent.sessionManager.value)
+ mo_SessionManager._type = 'SessionManager'
+
SessionManager = conn.service.Login(mo_SessionManager, options["-l"], options["-p"])
except Exception, ex:
fail(EC_LOGIN_DENIED)
11 years, 4 months
fence-agents: RHEL6 - fence_drac5: fix regression on Dell CMC and Dell DRAC5
by Marek Grác
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=aef329...
Commit: aef3292971598b39f237be8400f41d55e080219a
Parent: e0b9090cac46a1010f7b57f7aef9412753a312b6
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Wed Feb 6 13:54:05 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Wed Feb 6 13:59:21 2013 +0100
fence_drac5: fix regression on Dell CMC and Dell DRAC5
Standard EOL for agents connecting via ssh is CR/LF.
Some Dell devices represent CR/LF as a double-enter, which creates a problem in parsing of output.
This patch adds a check for double-enter. This can be detected in function which power on/off machine
because 'get power status' was run before and if we can find a line without any command we know that
there is a double-enter problem.
Resolves: rhbz#905478
---
fence/agents/drac5/fence_drac5.py | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/fence/agents/drac5/fence_drac5.py b/fence/agents/drac5/fence_drac5.py
index 3e9a8d2..7d9082a 100644
--- a/fence/agents/drac5/fence_drac5.py
+++ b/fence/agents/drac5/fence_drac5.py
@@ -52,7 +52,14 @@ def set_power_status(conn, options):
conn.send_eol("racadm serveraction " + action + " -m " + options["-m"])
elif options["model"] == "DRAC 5":
conn.send_eol("racadm serveraction " + action)
+
+ ## Fix issue with double-enter [CR/LF]
+ ## We need to read two additional command prompts (one from get + one from set command)
conn.log_expect(options, options["-c"], int(options["-g"]))
+ if len(conn.before.strip()) == 0:
+ options["eol"] = options["eol"][:-1]
+ conn.log_expect(options, options["-c"], int(options["-g"]))
+ conn.log_expect(options, options["-c"], int(options["-g"]))
except pexpect.EOF:
fail(EC_CONNECTION_LOST)
except pexpect.TIMEOUT:
11 years, 4 months
dlm: master - dlm_controld: retry fencing work while waiting for ringid
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=f65f3f67b0f87f5...
Commit: f65f3f67b0f87f52398ff1100aaede36e254abda
Parent: a9c0eb5fb07f25e36a8750abaa2678fd0344ae35
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 16:45:41 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 16:45:41 2013 -0600
dlm_controld: retry fencing work while waiting for ringid
An update to the cluster ringid won't necessarily cause
the fencing work to be scheduled, so force it to be
polled/retried.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/daemon_cpg.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
index b6532a3..af9b91d 100644
--- a/dlm_controld/daemon_cpg.c
+++ b/dlm_controld/daemon_cpg.c
@@ -742,17 +742,17 @@ static void daemon_fence_work(void)
/* We've seen a nodedown confchg callback, but not the
corresponding ringid callback. */
log_retry(retry_fencing, "fence work wait for cpg ringid");
+ retry = 1;
goto out;
}
if (cluster_ringid_seq != daemon_ringid.seq) {
/* wait for ringids to be in sync */
log_retry(retry_fencing, "fence work wait for cluster ringid");
+ retry = 1;
goto out;
}
- /* retry = 1; */
-
if (opt(enable_quorum_fencing_ind) && !cluster_quorate) {
/* wait for quorum before doing any fencing, but if there
is none, send_fence_clear below can unblock new nodes */
11 years, 4 months
dlm: master - dlm_controld: don't nack the first and only cg
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=a9c0eb5fb07f25e...
Commit: a9c0eb5fb07f25e36a8750abaa2678fd0344ae35
Parent: 23b48a5cf204bee8f362f68733430f5c1330c7d0
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 16:42:10 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 16:42:10 2013 -0600
dlm_controld: don't nack the first and only cg
I don't think there's any case where we want to nack it,
and I observed a case where it was nacked when we didn't
want it.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/cpg.c | 16 ++++++++++++++++
1 files changed, 16 insertions(+), 0 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index 82dae3d..f971158 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -865,6 +865,22 @@ static int match_change(struct lockspace *ls, struct change *cg,
if (members_mismatch)
return 0;
+ /* Not completely sure if this is a valid assertion or not, i.e. not
+ sure if we really never want to nack our first and only cg. I have
+ seen one case in which a node incorrectly accepted nacks for cg seq
+ 1 and ls change_seq 1. (It was the secondary effect of another bug.)
+
+ Or, it's possible that this should apply a little more broadly as:
+ don't nack our most recent cg, i.e. cg->seq == ls->change_seq (1 or
+ otherwise). I'm hoping to find a test case that will exercise this
+ to clarify the situation here, and then update this comment. */
+
+ if (cg->seq == 1 && ls->change_seq == 1 && (hd->flags & DLM_MFLG_NACK)) {
+ log_group(ls, "match_change %d:%u skip cg %u for nack",
+ hd->nodeid, seq, cg->seq);
+ return 0;
+ }
+
node->last_match_seq = cg->seq;
log_group(ls, "match_change %d:%u matches cg %u", hd->nodeid, seq,
11 years, 4 months
dlm: master - dlm_controld: fix handling of startup partition merge
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=23b48a5cf204bee...
Commit: 23b48a5cf204bee8f362f68733430f5c1330c7d0
Parent: 956ddcebf026e2c2b560c808984293502054ff16
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 13:09:29 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 13:10:06 2013 -0600
dlm_controld: fix handling of startup partition merge
Nack messages are not matched correctly with previous change
structs. Normal start messages should not be matched with
changes that preceeded the node's join time, but nack messages
should, so the match_change() function needs to distinguish.
This can occur when two nodes are joining a lockspace, but
neither has completed when a cluster partition+merge happens.
(copying fix from cluster.git
70b15e7afa730166ef0a3c81948d31ab1dd5aa93)
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/action.c | 6 +++++-
dlm_controld/cpg.c | 9 ++++++++-
2 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/dlm_controld/action.c b/dlm_controld/action.c
index 64c3923..84637f1 100644
--- a/dlm_controld/action.c
+++ b/dlm_controld/action.c
@@ -336,7 +336,11 @@ int set_configfs_members(struct lockspace *ls, char *name,
if (rv) {
log_error("%s: renew rmdir failed: %d",
path, errno);
- goto out;
+
+ /* don't quit here, there's a case where
+ * this can happen, where a node identified
+ * for renewal was not really added
+ * previously */
}
}
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index b299c6f..82dae3d 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -818,7 +818,14 @@ static int match_change(struct lockspace *ls, struct change *cg,
"cluster add %llu", hd->nodeid, seq, cg->seq,
(unsigned long long)cg->create_time,
(unsigned long long)t);
- return 0;
+
+ /* nacks can apply to older cg's */
+ if (!(hd->flags & DLM_MFLG_NACK)) {
+ return 0;
+ } else {
+ log_group(ls, "match_change %d:%u unskip cg %u for nack",
+ hd->nodeid, seq, cg->seq);
+ }
}
if (node->last_match_seq > cg->seq) {
11 years, 4 months
dlm: master - dlm_controld: avoid mismatching messages with old cgs
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=956ddcebf026e2c...
Commit: 956ddcebf026e2c2b560c808984293502054ff16
Parent: 6368f44ec16c9ce8a61c2304729930cbae55388d
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Jan 31 12:03:28 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Jan 31 12:47:52 2013 -0600
dlm_controld: avoid mismatching messages with old cgs
Only match start messages with cg's that are newer than
previously matched cg's for the node.
(copying fix from cluster.git
2a68bd042a771483b8f78b70654694a3c1ea15ae)
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/cpg.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index f2582a8..b299c6f 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -41,6 +41,8 @@ struct node {
int lockspace_member;
int lockspace_fail_reason;
+ uint32_t last_match_seq;
+
uint64_t start_time;
int check_fs;
@@ -819,6 +821,12 @@ static int match_change(struct lockspace *ls, struct change *cg,
return 0;
}
+ if (node->last_match_seq > cg->seq) {
+ log_group(ls, "match_change %d:%u skip cg %u last matched cg %u",
+ hd->nodeid, seq, cg->seq, node->last_match_seq);
+ return 0;
+ }
+
/* verify this is the right change by matching the counts
and the nodeids of the current members */
@@ -850,6 +858,8 @@ static int match_change(struct lockspace *ls, struct change *cg,
if (members_mismatch)
return 0;
+ node->last_match_seq = cg->seq;
+
log_group(ls, "match_change %d:%u matches cg %u", hd->nodeid, seq,
cg->seq);
return 1;
11 years, 4 months