Adam Litke has uploaded a new change for review.
Change subject: storage: Introduce VolumeManifest.datapath_session ......................................................................
storage: Introduce VolumeManifest.datapath_session
When performing datapath operations on a volume (eg. copying data) we mark the volume ILLEGAL before starting the operation and only mark the volume LEGAL again once the operation is finished. As long as this is all done with the volume lease held the engine can poll the volume from any host to determine if the operation is running (lease held) and can detect an interrupted/failed operation (lease free and volume ILLEGAL).
Later this contextmanager will be expanded to support volume generation incrementation when exiting successfully which will also allow engine to determine if an operation was completed successfully.
All metadata updates must be performed to a single block with one write in order to ensure atomicity.
Change-Id: I30a3ac2971411778d24e007aac9fcb3009edb4c4 Signed-off-by: Adam Litke alitke@redhat.com --- M tests/storage_sdm_copy_data_test.py M tests/storage_volume_test.py M vdsm/storage/sdm/api/copy_data.py M vdsm/storage/volume.py 4 files changed, 104 insertions(+), 8 deletions(-)
git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/62/64362/1
diff --git a/tests/storage_sdm_copy_data_test.py b/tests/storage_sdm_copy_data_test.py index c0ec533..93f55c6 100644 --- a/tests/storage_sdm_copy_data_test.py +++ b/tests/storage_sdm_copy_data_test.py @@ -221,7 +221,40 @@ # Qemu pads the file to a 1k boundary with null bytes self.assertTrue(f.read().startswith(vm_conf_data))
+ @permutations((('file',), ('block',))) + def test_datapath_session(self, env_type): + job_id = str(uuid.uuid4()) + fmt = sc.RAW_FORMAT + with self.get_vols(env_type, fmt, fmt) as (src_chain, dst_chain): + src_vol = src_chain[0] + dst_vol = dst_chain[0] + source = dict(endpoint_type='div', sd_id=src_vol.sdUUID, + img_id=src_vol.imgUUID, vol_id=src_vol.volUUID) + dest = dict(endpoint_type='div', sd_id=dst_vol.sdUUID, + img_id=dst_vol.imgUUID, vol_id=dst_vol.volUUID) + self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality()) + self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) + fake_convert = FakeQemuConvertChecker(src_vol, dst_vol) + with MonkeyPatchScope([(qemuimg, 'convert', fake_convert)]): + job = storage.sdm.api.copy_data.Job(job_id, 0, source, dest) + job.run() + wait_for_job(job) + self.assertEqual(jobs.STATUS.DONE, job.status) + self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality()) + self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) + # TODO: Missing tests: # Copy between 2 different domains # Abort before copy # Abort during copy + + +class FakeQemuConvertChecker(object): + def __init__(self, src_vol, dst_vol): + self.src_vol = src_vol + self.dst_vol = dst_vol + + def __call__(self, *args, **kwargs): + assert sc.LEGAL_VOL == self.src_vol.getLegality() + assert sc.ILLEGAL_VOL == self.dst_vol.getLegality() + return qemuimg.QemuImgOperation(['/bin/true']) diff --git a/tests/storage_volume_test.py b/tests/storage_volume_test.py index 79cae70..2231148 100644 --- a/tests/storage_volume_test.py +++ b/tests/storage_volume_test.py @@ -19,10 +19,12 @@ #
from __future__ import absolute_import +import uuid
from monkeypatch import MonkeyPatchScope from storagefakelib import FakeStorageDomainCache from storagetestlib import FakeSD +from storagetestlib import fake_env from testlib import expandPermutations, permutations from testlib import recorded from testlib import VdsmTestCase @@ -34,6 +36,7 @@ from storage import volume
HOST_ID = 1 +MB = 1048576
class FakeSDManifest(object): @@ -92,3 +95,45 @@ self.assertEqual(expected[:1], manifest.__calls__) lock.release() self.assertEqual(expected, manifest.__calls__) + + +@expandPermutations +class VolumeManifestTest(VdsmTestCase): + + def test_datapath_session(self): + img_id = str(uuid.uuid4()) + vol_id = str(uuid.uuid4()) + + with fake_env('file') as env: + env.make_volume(MB, img_id, vol_id) + vol = env.sd_manifest.produceVolume(img_id, vol_id) + vol.setMetadata = CountedInstanceMethod(vol.setMetadata) + self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) + with vol.datapath_session(): + self.assertEqual(sc.ILLEGAL_VOL, vol.getLegality()) + self.assertEqual(1, vol.setMetadata.nr_calls) + self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) + self.assertEqual(2, vol.setMetadata.nr_calls) + + def test_datapath_session_fail_inside_context(self): + img_id = str(uuid.uuid4()) + vol_id = str(uuid.uuid4()) + + with fake_env('file') as env: + env.make_volume(MB, img_id, vol_id) + vol = env.sd_manifest.produceVolume(img_id, vol_id) + self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) + with self.assertRaises(ValueError): + with vol.datapath_session(): + raise ValueError() + self.assertEqual(sc.ILLEGAL_VOL, vol.getLegality()) + + +class CountedInstanceMethod(object): + def __init__(self, method): + self._method = method + self.nr_calls = 0 + + def __call__(self, *args, **kwargs): + self.nr_calls += 1 + return self._method(*args, **kwargs) diff --git a/vdsm/storage/sdm/api/copy_data.py b/vdsm/storage/sdm/api/copy_data.py index d55e651..0304fed 100644 --- a/vdsm/storage/sdm/api/copy_data.py +++ b/vdsm/storage/sdm/api/copy_data.py @@ -73,14 +73,15 @@ src_format = self._source.qemu_format dst_format = self._dest.qemu_format
- self._operation = qemuimg.convert( - self._source.path, - self._dest.path, - srcFormat=src_format, - dstFormat=dst_format, - backing=self._dest.backing_path, - backingFormat=self._dest.backing_qemu_format) - self._operation.wait_for_completion() + with self._dest.datapath_session(): + self._operation = qemuimg.convert( + self._source.path, + self._dest.path, + srcFormat=src_format, + dstFormat=dst_format, + backing=self._dest.backing_path, + backingFormat=self._dest.backing_qemu_format) + self._operation.wait_for_completion()
def _create_endpoint(params, host_id, writable): @@ -103,6 +104,7 @@ self._host_id = host_id self._writable = writable self._vol = None + self.datapath_session = None
@property def locks(self): @@ -146,7 +148,9 @@ dom = sdCache.produce_manifest(self.sd_id) self._vol = dom.produceVolume(self.img_id, self.vol_id) self._vol.prepare(rw=self._writable, justme=True) + self.datapath_session = self._vol.datapath_session try: yield finally: + self.datapath_session = None self._vol.teardown(self.sd_id, self.vol_id, justme=True) diff --git a/vdsm/storage/volume.py b/vdsm/storage/volume.py index 1339d95..0c01e1c 100644 --- a/vdsm/storage/volume.py +++ b/vdsm/storage/volume.py @@ -497,6 +497,20 @@ """ pass
+ @contextmanager + def datapath_session(self): + """ + Must be called with the Volume Lease held. + + In order to detect interrupted datapath operations a volume should be + marked ILLEGAL prior to the first modification of data and subsequently + marked LEGAL again once the operation has completed. Thus, if an + interruption occurs the volume will remain in an ILLEGAL state. + """ + self.setLegality(sc.ILLEGAL_VOL) + yield + self.setLegality(sc.LEGAL_VOL) +
class Volume(object): log = logging.getLogger('storage.Volume')
gerrit-hooks has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.datapath_session ......................................................................
Patch Set 1:
* Update tracker: IGNORE, no Bug-Url found * Check Bug-Url::WARN, no bug url found, make sure header matches 'Bug-Url: ' and is a valid url. * Check merged to previous::IGNORE, Not in stable branch (['ovirt-3.6', 'ovirt-4.0'])
Nir Soffer has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.datapath_session ......................................................................
Patch Set 1: Code-Review-1
(8 comments)
https://gerrit.ovirt.org/#/c/64362/1//COMMIT_MSG Commit Message:
Line 17: incrementation when exiting successfully which will also allow engine to Line 18: determine if an operation was completed successfully. Line 19: Line 20: All metadata updates must be performed to a single block with one write Line 21: in order to ensure atomicity. Maybe separate the new context manger and its tests from the usage of this infrastructure? Line 22: Line 23: Change-Id: I30a3ac2971411778d24e007aac9fcb3009edb4c4
https://gerrit.ovirt.org/#/c/64362/1/tests/storage_sdm_copy_data_test.py File tests/storage_sdm_copy_data_test.py:
Line 231: source = dict(endpoint_type='div', sd_id=src_vol.sdUUID, Line 232: img_id=src_vol.imgUUID, vol_id=src_vol.volUUID) Line 233: dest = dict(endpoint_type='div', sd_id=dst_vol.sdUUID, Line 234: img_id=dst_vol.imgUUID, vol_id=dst_vol.volUUID) Line 235: self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality()) We don't need this assert, since we don't except any change in the source volume. Line 236: self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) Line 237: fake_convert = FakeQemuConvertChecker(src_vol, dst_vol) Line 238: with MonkeyPatchScope([(qemuimg, 'convert', fake_convert)]): Line 239: job = storage.sdm.api.copy_data.Job(job_id, 0, source, dest)
Line 239: job = storage.sdm.api.copy_data.Job(job_id, 0, source, dest) Line 240: job.run() Line 241: wait_for_job(job) Line 242: self.assertEqual(jobs.STATUS.DONE, job.status) Line 243: self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality()) Same.
If you want to check the source volume, maybe add another test? Line 244: self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) Line 245: Line 246: # TODO: Missing tests: Line 247: # Copy between 2 different domains
Line 240: job.run() Line 241: wait_for_job(job) Line 242: self.assertEqual(jobs.STATUS.DONE, job.status) Line 243: self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality()) Line 244: self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) It would be nice to test that volume remain illegal when the operation fails. Later we will verify also the generation. Line 245: Line 246: # TODO: Missing tests: Line 247: # Copy between 2 different domains Line 248: # Abort before copy
Line 249: # Abort during copy Line 250: Line 251: Line 252: class FakeQemuConvertChecker(object): Line 253: def __init__(self, src_vol, dst_vol): We can make this more useful by accepting the command to run. To simulate success, call with /bin/true, and to simulate failure, /bin/false. Line 254: self.src_vol = src_vol Line 255: self.dst_vol = dst_vol Line 256: Line 257: def __call__(self, *args, **kwargs):
https://gerrit.ovirt.org/#/c/64362/1/tests/storage_volume_test.py File tests/storage_volume_test.py:
Line 110: vol.setMetadata = CountedInstanceMethod(vol.setMetadata) Line 111: self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) Line 112: with vol.datapath_session(): Line 113: self.assertEqual(sc.ILLEGAL_VOL, vol.getLegality()) Line 114: self.assertEqual(1, vol.setMetadata.nr_calls) I don't think this verification is needed, documenting how the method should be implemented is good enough.
Also we don't care if this method will perform several metadata calls. We care only about setting the volume to illegal when entering the context, and setting it back only if no exception was raised. Line 115: self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) Line 116: self.assertEqual(2, vol.setMetadata.nr_calls) Line 117: Line 118: def test_datapath_session_fail_inside_context(self):
https://gerrit.ovirt.org/#/c/64362/1/vdsm/storage/sdm/api/copy_data.py File vdsm/storage/sdm/api/copy_data.py:
Line 139 Line 140 Line 141 Line 142 Line 143 I think a property would make this much simpler:
@property def datapath_session(self): return self._vol.datapath_session
https://gerrit.ovirt.org/#/c/64362/1/vdsm/storage/volume.py File vdsm/storage/volume.py:
Line 497: """ Line 498: pass Line 499: Line 500: @contextmanager Line 501: def datapath_session(self): I'm not sure about the name.
How about:
- data_operation - operation - transaction
One word describing this concept would be best.
We allready using the concept of operation, (e.g. QemuOperaiton), so having to do operations on volumes inside a volume.operation context manager makes sense. Line 502: """ Line 503: Must be called with the Volume Lease held. Line 504: Line 505: In order to detect interrupted datapath operations a volume should be
Adam Litke has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.datapath_session ......................................................................
Patch Set 1:
(8 comments)
https://gerrit.ovirt.org/#/c/64362/1//COMMIT_MSG Commit Message:
Line 17: incrementation when exiting successfully which will also allow engine to Line 18: determine if an operation was completed successfully. Line 19: Line 20: All metadata updates must be performed to a single block with one write Line 21: in order to ensure atomicity.
Maybe separate the new context manger and its tests from the usage of this
Sure. Will do. Line 22: Line 23: Change-Id: I30a3ac2971411778d24e007aac9fcb3009edb4c4
https://gerrit.ovirt.org/#/c/64362/1/tests/storage_sdm_copy_data_test.py File tests/storage_sdm_copy_data_test.py:
Line 231: source = dict(endpoint_type='div', sd_id=src_vol.sdUUID, Line 232: img_id=src_vol.imgUUID, vol_id=src_vol.volUUID) Line 233: dest = dict(endpoint_type='div', sd_id=dst_vol.sdUUID, Line 234: img_id=dst_vol.imgUUID, vol_id=dst_vol.volUUID) Line 235: self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality())
We don't need this assert, since we don't except any change in the source v
Done Line 236: self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) Line 237: fake_convert = FakeQemuConvertChecker(src_vol, dst_vol) Line 238: with MonkeyPatchScope([(qemuimg, 'convert', fake_convert)]): Line 239: job = storage.sdm.api.copy_data.Job(job_id, 0, source, dest)
Line 239: job = storage.sdm.api.copy_data.Job(job_id, 0, source, dest) Line 240: job.run() Line 241: wait_for_job(job) Line 242: self.assertEqual(jobs.STATUS.DONE, job.status) Line 243: self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality())
Same.
Done Line 244: self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality()) Line 245: Line 246: # TODO: Missing tests: Line 247: # Copy between 2 different domains
Line 240: job.run() Line 241: wait_for_job(job) Line 242: self.assertEqual(jobs.STATUS.DONE, job.status) Line 243: self.assertEqual(sc.LEGAL_VOL, src_vol.getLegality()) Line 244: self.assertEqual(sc.LEGAL_VOL, dst_vol.getLegality())
It would be nice to test that volume remain illegal when the operation fail
Done Line 245: Line 246: # TODO: Missing tests: Line 247: # Copy between 2 different domains Line 248: # Abort before copy
Line 249: # Abort during copy Line 250: Line 251: Line 252: class FakeQemuConvertChecker(object): Line 253: def __init__(self, src_vol, dst_vol):
We can make this more useful by accepting the command to run. To simulate s
Done Line 254: self.src_vol = src_vol Line 255: self.dst_vol = dst_vol Line 256: Line 257: def __call__(self, *args, **kwargs):
https://gerrit.ovirt.org/#/c/64362/1/tests/storage_volume_test.py File tests/storage_volume_test.py:
Line 110: vol.setMetadata = CountedInstanceMethod(vol.setMetadata) Line 111: self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) Line 112: with vol.datapath_session(): Line 113: self.assertEqual(sc.ILLEGAL_VOL, vol.getLegality()) Line 114: self.assertEqual(1, vol.setMetadata.nr_calls)
I don't think this verification is needed, documenting how the method shoul
Actually, I am trying to validate that all metadata changes must be completed with a single write to storage so that we have atomic transitions. Line 115: self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) Line 116: self.assertEqual(2, vol.setMetadata.nr_calls) Line 117: Line 118: def test_datapath_session_fail_inside_context(self):
https://gerrit.ovirt.org/#/c/64362/1/vdsm/storage/sdm/api/copy_data.py File vdsm/storage/sdm/api/copy_data.py:
Line 139 Line 140 Line 141 Line 142 Line 143
I think a property would make this much simpler:
Done
https://gerrit.ovirt.org/#/c/64362/1/vdsm/storage/volume.py File vdsm/storage/volume.py:
Line 497: """ Line 498: pass Line 499: Line 500: @contextmanager Line 501: def datapath_session(self):
I'm not sure about the name.
Renamed to operation Line 502: """ Line 503: Must be called with the Volume Lease held. Line 504: Line 505: In order to detect interrupted datapath operations a volume should be
gerrit-hooks has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 2:
* update_tracker: OK * Check Bug-Url::WARN, no bug url found, make sure header matches 'Bug-Url: ' and is a valid url. * Check merged to previous::IGNORE, Not in stable branch (['ovirt-3.6', 'ovirt-4.0'])
Nir Soffer has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 2:
(1 comment)
Nice, needs more time to review the tests.
https://gerrit.ovirt.org/#/c/64362/2/tests/storage_volume_test.py File tests/storage_volume_test.py:
Line 116: self.assertEqual(2, vol.setMetadata.nr_calls) Line 117: Line 118: def test_operation_fail_inside_context(self): Line 119: img_id = str(uuid.uuid4()) Line 120: vol_id = str(uuid.uuid4()) I think its time to introduce a helper to create uuid string without repeating this boilerplate everywhere. Line 121: Line 122: with fake_env('file') as env: Line 123: env.make_volume(MB, img_id, vol_id) Line 124: vol = env.sd_manifest.produceVolume(img_id, vol_id)
Adam Litke has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 2:
(1 comment)
https://gerrit.ovirt.org/#/c/64362/2/tests/storage_volume_test.py File tests/storage_volume_test.py:
Line 116: self.assertEqual(2, vol.setMetadata.nr_calls) Line 117: Line 118: def test_operation_fail_inside_context(self): Line 119: img_id = str(uuid.uuid4()) Line 120: vol_id = str(uuid.uuid4())
I think its time to introduce a helper to create uuid string without repeat
Done Line 121: Line 122: with fake_env('file') as env: Line 123: env.make_volume(MB, img_id, vol_id) Line 124: vol = env.sd_manifest.produceVolume(img_id, vol_id)
gerrit-hooks has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 3:
* update_tracker: OK * Check Bug-Url::WARN, no bug url found, make sure header matches 'Bug-Url: ' and is a valid url. * Check merged to previous::IGNORE, Not in stable branch (['ovirt-3.6', 'ovirt-4.0'])
gerrit-hooks has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 4:
* Update tracker: IGNORE, no Bug-Url found * Check Bug-Url::WARN, no bug url found, make sure header matches 'Bug-Url: ' and is a valid url. * Check merged to previous::IGNORE, Not in stable branch (['ovirt-3.6', 'ovirt-4.0'])
Nir Soffer has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 4: Code-Review+2
(1 comment)
https://gerrit.ovirt.org/#/c/64362/4/tests/storage_volume_test.py File tests/storage_volume_test.py:
Line 106: Line 107: with fake_env('file') as env: Line 108: env.make_volume(MB, img_id, vol_id) Line 109: vol = env.sd_manifest.produceVolume(img_id, vol_id) Line 110: vol.setMetadata = CountedInstanceMethod(vol.setMetadata) I don't think we need this, this will only make the test break if we change the implementation later to read another value, but lets take it as is. Line 111: self.assertEqual(sc.LEGAL_VOL, vol.getLegality()) Line 112: with vol.operation(): Line 113: self.assertEqual(sc.ILLEGAL_VOL, vol.getLegality()) Line 114: self.assertEqual(1, vol.setMetadata.nr_calls)
Nir Soffer has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 4: Verified+1
Verified by the tests.
Nir Soffer has submitted this change and it was merged.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
storage: Introduce VolumeManifest.operation context
When performing datapath operations on a volume (eg. copying data) we mark the volume ILLEGAL before starting the operation and only mark the volume LEGAL again once the operation is finished. As long as this is all done with the volume lease held the engine can poll the volume from any host to determine if the operation is running (lease held) and can detect an interrupted/failed operation (lease free and volume ILLEGAL).
Later this contextmanager will be expanded to support volume generation incrementation when exiting successfully which will also allow engine to determine if an operation was completed successfully.
All metadata updates must be performed to a single block with one write in order to ensure atomicity.
Change-Id: I30a3ac2971411778d24e007aac9fcb3009edb4c4 Signed-off-by: Adam Litke alitke@redhat.com Reviewed-on: https://gerrit.ovirt.org/64362 Continuous-Integration: Jenkins CI Reviewed-by: Nir Soffer nsoffer@redhat.com Tested-by: Nir Soffer nsoffer@redhat.com --- M tests/storage_volume_test.py M vdsm/storage/volume.py 2 files changed, 59 insertions(+), 0 deletions(-)
Approvals: Nir Soffer: Verified; Looks good to me, approved Jenkins CI: Passed CI tests
gerrit-hooks has posted comments on this change.
Change subject: storage: Introduce VolumeManifest.operation context ......................................................................
Patch Set 5:
* #64362::Update tracker: OK * Set MODIFIED::IGNORE, no Bug-Url found.
vdsm-patches@lists.fedorahosted.org