Adam Litke has uploaded a new change for review.
Change subject: jobs: Fix abort semantics
......................................................................
jobs: Fix abort semantics
Prior to this commit jobs.abort was treated as an synchronous operation
and if it returned successfully the caller could assume that no
operations related to the job were still running. Unforatunately this
assumption is wrong since most underlying operations are aborted
asynchronously (ie. by sending a signal to a process).
To fix this we must adopt async abort sementics in the jobs API. If a
job is pending and abort is called we can simply move it to aborted
state. If the job is running we move it to aborting state, call the
abort helper (_abort) and return. When run() finishes it will move an
aborting job to aborted. We must not autodelete or otherwise change the
state of jobs with aborting status because it could mask the fact that a
process is still changing the host or storage.
Change-Id: I801082c50b10cf0571210d65cd3a5cec0d282a5c
Signed-off-by: Adam Litke <alitke(a)redhat.com>
---
M lib/vdsm/jobs.py
M tests/jobsTests.py
2 files changed, 53 insertions(+), 38 deletions(-)
git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/02/65102/1
diff --git a/lib/vdsm/jobs.py b/lib/vdsm/jobs.py
index b92779d..2c6c27c 100644
--- a/lib/vdsm/jobs.py
+++ b/lib/vdsm/jobs.py
@@ -33,11 +33,12 @@
class STATUS:
- PENDING = 'pending' # Job has not started yet
- RUNNING = 'running' # Job is running
- DONE = 'done' # Job has finished successfully
- ABORTED = 'aborted' # Job was aborted by user request
- FAILED = 'failed' # Job has failed
+ PENDING = 'pending' # Job has not started yet
+ RUNNING = 'running' # Job is running
+ DONE = 'done' # Job has finished successfully
+ ABORTING = 'aborting' # Job is running but abort is in progress
+ ABORTED = 'aborted' # Job was aborted by user request
+ FAILED = 'failed' # Job has failed
class ClientError(Exception):
@@ -128,16 +129,20 @@
def abort(self):
with self._status_lock:
- if not self.active:
+ if self.status == STATUS.PENDING:
+ self._status = STATUS.ABORTED
+ if self.autodelete:
+ self._autodelete()
+ elif self.status == STATUS.RUNNING:
+ self._status = STATUS.ABORTING
+ logging.info("Aborting job %r...", self.id)
+ self._abort()
+ # Autodelete is handled by run when the operation is terminated
+ elif self.status == STATUS.ABORTING:
+ logging.info("Retrying abort job %r...", self.id)
+ self._abort()
+ else:
raise JobNotActive()
- logging.info('Job %r aborting...', self._id)
- self._abort()
- self._status = STATUS.ABORTED
-
- # We MUST NOT autodelete a job if abort failed. Otherwise there could
- # still be ongoing operations on storage without any associated job.
- if self.autodelete:
- self._autodelete()
def run(self):
with self._status_lock:
@@ -150,25 +155,37 @@
self._status = STATUS.RUNNING
try:
self._run()
- status = STATUS.DONE
except Exception as e:
- status = STATUS.FAILED
- logging.exception("Job (id=%s desc=%s) failed",
- self.id, self.description)
- if not isinstance(e, exception.VdsmException):
- e = exception.GeneralException(str(e))
- self._error = e
- finally:
with self._status_lock:
- if self.status == STATUS.ABORTED:
- return
- self._status = status
- if self.autodelete:
- self._autodelete()
+ if self.status == STATUS.ABORTING:
+ logging.exception("Failed to abort job (id=%s desc=%s)",
+ self.id, self.description)
+ # We do not autodelete failed aborting jobs because it is
+ # likely that something is still running on the system.
+ else:
+ self._status = STATUS.FAILED
+ logging.exception("Job (id=%s desc=%s) failed",
+ self.id, self.description)
+ if self.autodelete:
+ self._autodelete()
+ if not isinstance(e, exception.VdsmException):
+ e = exception.GeneralException(str(e))
+ self._error = e
+ else:
+ with self._status_lock:
+ if self.status == STATUS.ABORTING:
+ self._status = STATUS.ABORTED
+ else:
+ self._status = STATUS.DONE
+ if self.autodelete:
+ self._autodelete()
def _abort(self):
"""
- May be implemented by child class
+ May be implemented by child class. This is an asynchronous operation
+ which should trigger the abort quickly and return. A successful return
+ does not mean the job has stopped. The caller must wait for the job
+ status to change to aborted.
- Must raise if the job could not be aborted
- Must not raise if the job was aborted
"""
diff --git a/tests/jobsTests.py b/tests/jobsTests.py
index 3b39277..6c6d49a 100644
--- a/tests/jobsTests.py
+++ b/tests/jobsTests.py
@@ -182,13 +182,16 @@
self.assertEqual({}, jobs.info(job_type=bar.job_type,
job_ids=[foo.id]))
- @permutations([[jobs.STATUS.PENDING], [jobs.STATUS.RUNNING]])
- def test_abort_job(self, status):
+ @permutations([
+ [jobs.STATUS.PENDING, jobs.STATUS.ABORTED, False],
+ [jobs.STATUS.RUNNING, jobs.STATUS.ABORTING, True],
+ ])
+ def test_abort_job(self, status, aborted_status, did_abort):
job = TestingJob(status)
jobs.add(job)
jobs.abort(job.id)
- self.assertEqual(jobs.STATUS.ABORTED, job.status)
- self.assertTrue(job._aborted)
+ self.assertEqual(aborted_status, job.status)
+ self.assertEqual(did_abort, job._aborted)
def test_abort_running_job(self):
job = StuckJob()
@@ -217,15 +220,10 @@
def test_abort_not_supported(self):
job = jobs.Job(str(uuid.uuid4()))
+ job._status = jobs.STATUS.RUNNING
jobs.add(job)
self.assertEqual(response.error(jobs.AbortNotSupported.name),
jobs.abort(job.id))
-
- def test_abort_failed(self):
- job = jobs.Job(str(uuid.uuid4()))
- jobs.add(job)
- jobs.abort(job.id)
- self.assertEqual(jobs.STATUS.PENDING, job.status)
@permutations([
[jobs.STATUS.PENDING, True],
--
To view, visit
https://gerrit.ovirt.org/65102
To unsubscribe, visit
https://gerrit.ovirt.org/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I801082c50b10cf0571210d65cd3a5cec0d282a5c
Gerrit-PatchSet: 1
Gerrit-Project: vdsm
Gerrit-Branch: master
Gerrit-Owner: Adam Litke <alitke(a)redhat.com>