This patchset fixes a bug in background process handling that results in issue https://github.com/jpirko/lnst/issues/67
Jan Tluka (5): Controller: add more comprehensive explanation about proc_id and bg_id Common: Move DEFAULT_TIMEOUT to NetTestCommand Common: add variable to store the time when a command started Slave: add get_remaining_time method Controller: fix the handling of background process timeouts
lnst/Common/NetTestCommand.py | 3 +++ lnst/Controller/Machine.py | 30 ++++++++++++++++++++---------- lnst/Controller/NetTestController.py | 5 +++-- lnst/Slave/NetTestSlave.py | 20 +++++++++++++++++++- 4 files changed, 45 insertions(+), 13 deletions(-)
Signed-off-by: Jan Tluka jtluka@redhat.com --- lnst/Controller/NetTestController.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/lnst/Controller/NetTestController.py b/lnst/Controller/NetTestController.py index f3cdc60..56649a3 100644 --- a/lnst/Controller/NetTestController.py +++ b/lnst/Controller/NetTestController.py @@ -401,8 +401,9 @@ class NetTestController: if "from" in cmd_data: cmd["from"] = cmd_data["from"] elif cmd["type"] in ["wait", "intr", "kill"]: - # XXX The internal name (proc_id) is different, because - # bg_id is already used by LNST in a different context + # 'proc_id' is used to store bg_id for wait/kill/intr + # 'bg_id' is used for test/exec + # this is used to distinguish between the two in NetTestSlave code cmd["proc_id"] = cmd_data["bg_id"] elif cmd["type"] == "config": cmd["persistent"] = False
The DEFAULT_TIMEOUT variable will be used from different modules so make this variable available in common code.
Signed-off-by: Jan Tluka jtluka@redhat.com --- lnst/Common/NetTestCommand.py | 2 ++ lnst/Controller/Machine.py | 3 +-- lnst/Slave/NetTestSlave.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/lnst/Common/NetTestCommand.py b/lnst/Common/NetTestCommand.py index 7b45b33..fbb15a2 100644 --- a/lnst/Common/NetTestCommand.py +++ b/lnst/Common/NetTestCommand.py @@ -21,6 +21,8 @@ import re from lnst.Common.ExecCmd import exec_cmd, ExecCmdFail from lnst.Common.ConnectionHandler import recv_data, send_data
+DEFAULT_TIMEOUT = 60 + def str_command(command): attrs = ["type(%s)" % command["type"]] if command["type"] == "test": diff --git a/lnst/Controller/Machine.py b/lnst/Controller/Machine.py index eefb3aa..530aef8 100644 --- a/lnst/Controller/Machine.py +++ b/lnst/Controller/Machine.py @@ -28,13 +28,12 @@ from lnst.Common.Utils import wait_for, md5sum, dir_md5sum, create_tar_archive from lnst.Common.Utils import check_process_running from lnst.Common.ConnectionHandler import send_data, recv_data from lnst.Common.ConnectionHandler import ConnectionHandler +from lnst.Common.NetTestCommand import DEFAULT_TIMEOUT
# conditional support for libvirt if check_process_running("libvirtd"): from lnst.Controller.VirtUtils import VirtNetCtl, VirtDomainCtl
-DEFAULT_TIMEOUT = 60 - class MachineError(Exception): pass
diff --git a/lnst/Slave/NetTestSlave.py b/lnst/Slave/NetTestSlave.py index d2f7834..7819b82 100644 --- a/lnst/Slave/NetTestSlave.py +++ b/lnst/Slave/NetTestSlave.py @@ -32,6 +32,7 @@ from lnst.Common.ExecCmd import exec_cmd, ExecCmdFail from lnst.Common.ResourceCache import ResourceCache from lnst.Common.NetTestCommand import NetTestCommandContext from lnst.Common.NetTestCommand import CommandException, NetTestCommand +from lnst.Common.NetTestCommand import DEFAULT_TIMEOUT from lnst.Slave.NmConfigDevice import is_nm_managed_by_name from lnst.Common.Utils import check_process_running from lnst.Common.ConnectionHandler import recv_data, send_data
Signed-off-by: Jan Tluka jtluka@redhat.com --- lnst/Common/NetTestCommand.py | 1 + 1 file changed, 1 insertion(+)
diff --git a/lnst/Common/NetTestCommand.py b/lnst/Common/NetTestCommand.py index fbb15a2..98605f4 100644 --- a/lnst/Common/NetTestCommand.py +++ b/lnst/Common/NetTestCommand.py @@ -94,6 +94,7 @@ class NetTestCommand: self._control_cmd= None self._result = None self._log_ctl = log_ctl + self._start_time = None
if "bg_id" not in self._command: self._id = None
This patch adds get_remaining_time() method to NetTestSlave. It's used to make a query from the controller to check if a command running in background has timeouted.
Signed-off-by: Jan Tluka jtluka@redhat.com --- lnst/Slave/NetTestSlave.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/lnst/Slave/NetTestSlave.py b/lnst/Slave/NetTestSlave.py index 7819b82..15292b6 100644 --- a/lnst/Slave/NetTestSlave.py +++ b/lnst/Slave/NetTestSlave.py @@ -20,7 +20,7 @@ import socket import dbus import ctypes import multiprocessing -from time import sleep +from time import sleep, time from xmlrpclib import Binary from tempfile import NamedTemporaryFile from lnst.Common.Logs import log_exc_traceback @@ -331,9 +331,26 @@ class SlaveMethods: self._system_config = {} return True
+ def get_remaining_time(self, bg_id): + cmd = self._command_context.get_cmd(bg_id) + if "timeout" in cmd._command: + cmd_timeout = cmd._command["timeout"] + else: + cmd_timeout = DEFAULT_TIMEOUT + + start_time = cmd._start_time + current_time = time() + + remaining = cmd_timeout - (current_time - start_time) + if remaining < 0: + remaining = 0 + + return int(remaining) + def run_command(self, command): cmd = NetTestCommand(self._command_context, command, self._resource_table, self._log_ctl) + cmd._start_time = time() self._command_context.add_cmd(cmd)
res = cmd.run()
This patch fixes an incorrect key 'bg_id' to 'proc_id' for handling wait command.
Futher it changes how the commands running in background are handled when wait command is called. When a user calls wait command the controller first queries the remaining time of the associated bg process. If it has not timeouted yet the alarm is set to trigger on time calculated from the returned value of get_remaining_time(). If the bg process already timeouted the alarm is set to two seconds just to give some time to reach the try/exception block. Not 100% clean but works reliably than before.
Tested by following recipe:
<lnstrecipe> <network> <host id="m1"> <interfaces> <eth id="x" label="netA"/> </interfaces> </host> </network>
<task> <run host="m1" command="sleep 5" timeout="2"/> <run host="m1" command="sleep 5" timeout="2" bg_id="1"/> <wait host="m1" bg_id="1"/> <run host="m1" command="sleep 5" timeout="6" bg_id="2"/> <wait host="m1" bg_id="2"/> <run host="m1" command="sleep 5" timeout="2" bg_id="3"/> <run host="m1" command="sleep 5" timeout="2" bg_id="4"/> <wait host="m1" bg_id="3"/> <wait host="m1" bg_id="4"/> </task> </lnstrecipe>
Signed-off-by: Jan Tluka jtluka@redhat.com --- lnst/Controller/Machine.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/lnst/Controller/Machine.py b/lnst/Controller/Machine.py index 530aef8..5c53379 100644 --- a/lnst/Controller/Machine.py +++ b/lnst/Controller/Machine.py @@ -244,13 +244,24 @@ class Machine(object):
prev_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
- if "timeout" in command: - timeout = command["timeout"] - logging.debug("Setting timeout to "%d"", timeout) - signal.alarm(timeout) + if command["type"] == "wait": + logging.debug("Get the uptime of bg process with bg_id == %s" % command["proc_id"]) + remaining_time = self._rpc_call("get_remaining_time", command["proc_id"]) + logging.debug("Setting timeout to %d", remaining_time) + if remaining_time > 0: + signal.alarm(remaining_time) + else: + # 2 seconds is enough time to do wait via RPC and collect + # the result + signal.alarm(2) else: - logging.debug("Setting default timeout (%ds)." % DEFAULT_TIMEOUT) - signal.alarm(DEFAULT_TIMEOUT) + if "timeout" in command: + timeout = command["timeout"] + logging.debug("Setting timeout to "%d"", timeout) + signal.alarm(timeout) + else: + logging.debug("Setting default timeout (%ds)." % DEFAULT_TIMEOUT) + signal.alarm(DEFAULT_TIMEOUT)
try: if 'netns' in command and command['netns'] != None: @@ -259,8 +270,8 @@ class Machine(object): else: cmd_res = self._rpc_call("run_command", command) except MachineError as exc: - if "bg_id" in command: - cmd_res = self._rpc_call("kill_command", command["bg_id"]) + if "proc_id" in command: + cmd_res = self._rpc_call("kill_command", command["proc_id"]) else: cmd_res = self._rpc_call("kill_command", None) cmd_res["passed"] = False
lnst-developers@lists.fedorahosted.org