Vinzenz Feenstra has uploaded a new change for review.
Change subject: virt: Try to detect non-guest-initiated shutdowns ......................................................................
virt: Try to detect non-guest-initiated shutdowns
When a host system shuts down for any of a variety of reasons (UPS, fencing...), the VMs running on that system get stopped before the VDSM and libvirtd services are stopped. This is due to the way they are registered with machined.
This results in the qemu processes being signaled with SIGTERM and exiting. To libvirt this looks the same as if the shutdown had been initiated from within the VM, and it therefore emits a lifecycle event that looks to VDSM as if the VM was stopped from within. VDSM in this case reports the exit reason as user shutdown. This is a problem for HA VMs, which will not be automatically rescheduled when a user shutdown exit reason is reported.
This patch attempts to detect non-user shutdowns. It requires the VM to have the oVirt guest agent running so that these scenarios can be detected properly.
Change-Id: Ie04b9806fbf0a81dc576aa28cfdda5edb079ce29 Signed-off-by: Vinzenz Feenstra vfeenstr@redhat.com --- M lib/vdsm/virt/guestagent.py M vdsm/virt/vm.py 2 files changed, 20 insertions(+), 1 deletion(-)
git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/91/64991/1
diff --git a/lib/vdsm/virt/guestagent.py b/lib/vdsm/virt/guestagent.py index f2d45a3..487e4d8 100644 --- a/lib/vdsm/virt/guestagent.py +++ b/lib/vdsm/virt/guestagent.py @@ -33,6 +33,7 @@ from vdsm import supervdsm from vdsm import utils from vdsm.common import filecontrol +from vdsm.config import config from vdsm.virt import vmstatus
_MAX_SUPPORTED_API_VERSION = 3 @@ -150,6 +151,8 @@
class GuestAgent(object): MAX_MESSAGE_SIZE = 2 ** 20 # 1 MiB for now + SEEN_SHUTDOWN_TIMEOUT = config.get_int('general', + 'sys_shutdown_timeout', 120) * 2
def __init__(self, socketName, channelListener, log, onStatusChange, api_version=None, user='Unknown', ips=''): @@ -181,6 +184,13 @@ self._completion_lock = threading.Lock() self._completion_events = {} self._first_connect = threading.Event() + self._seen_shutdown = False + + def has_seen_shutdown(self): + diff = time.time() - self._agentTimestamp + if diff < GuestAgent.SEEN_SHUTDOWN_TIMEOUT: + return self._seen_shutdown + return False
def _on_completion(self, reply_id): with self._completion_lock: @@ -345,6 +355,7 @@ # Only change the state AFTER all data of the heartbeat has been # consumed self.guestStatus = vmstatus.UP + self._seen_shutdown = False elif message == 'host-name': self.guestInfo['guestName'] = args['name'] elif message == 'os-version': @@ -387,10 +398,12 @@ self.log.debug("guest agent was uninstalled.") self.guestInfo['appsList'] = () elif message == 'session-startup': + self._seen_shutdown = False self.log.debug("Guest system is started or restarted.") elif message == 'fqdn': self.guestInfo['guestFQDN'] = args['fqdn'] elif message == 'session-shutdown': + self._seen_shutdown = True self.log.debug("Guest system shuts down.") elif message == 'containers': self.guestInfo['guestContainers'] = args['list'] diff --git a/vdsm/virt/vm.py b/vdsm/virt/vm.py index 6109c43..2f53cd8 100644 --- a/vdsm/virt/vm.py +++ b/vdsm/virt/vm.py @@ -4126,8 +4126,14 @@ hooks.after_vm_hibernate(self._domain.xml, self.conf) else: if detail == libvirt.VIR_DOMAIN_EVENT_STOPPED_SHUTDOWN: + # seen_shutdown is used to detect VMs that have been + # stopped by sending them SIG_TERM (e.g. system shutdown) + # In that case libvirt and qemu report a user initiated + # shutdown that is not correct. + seen_shutdown = not self.guestAgent or \ + self.guestAgent.has_seen_shutdown() with self._shutdownLock: - if self._shutdownReason is None: + if self._shutdownReason is None and seen_shutdown: # do not overwrite admin shutdown, if present self._shutdownReason = vmexitreason.USER_SHUTDOWN self._onQemuDeath()