[xen/f19] Long latency virtual-mmu operations are not preemptible
myoung
myoung at fedoraproject.org
Thu Aug 14 14:51:01 UTC 2014
commit 72532ffb1e9bd6c040a38427ae903d575a05018f
Author: Michael Young <m.a.young at durham.ac.uk>
Date: Thu Aug 14 15:50:29 2014 +0100
Long latency virtual-mmu operations are not preemptible
xen.spec | 10 +-
xsa97-hap-4.2-prereq.patch | 466 ++++++++++++++++++++++++++++++++++++++++++
xsa97-hap-4.2.patch | 485 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 960 insertions(+), 1 deletions(-)
---
diff --git a/xen.spec b/xen.spec
index be026d9..17d555b 100644
--- a/xen.spec
+++ b/xen.spec
@@ -27,7 +27,7 @@
Summary: Xen is a virtual machine monitor
Name: xen
Version: 4.2.4
-Release: 6%{?dist}
+Release: 7%{?dist}
Group: Development/Libraries
License: GPLv2+ and LGPLv2+ and BSD
URL: http://xen.org/
@@ -87,6 +87,8 @@ Patch109: xsa92-4.2.patch
Patch110: xsa96.patch
Patch111: xsa100.patch
Patch112: xen.git-6b4d71d028f445cba7426a144751fddc8bfdd67b.patch
+Patch113: xsa97-hap-4.2-prereq.patch
+Patch114: xsa97-hap-4.2.patch
Patch100: xen-configure-xend.patch
@@ -264,6 +266,8 @@ manage Xen virtual machines.
%patch110 -p1
%patch111 -p1
%patch112 -p1
+%patch113 -p1
+%patch114 -p1
%patch100 -p1
@@ -757,6 +761,10 @@ rm -rf %{buildroot}
%endif
%changelog
+* Thu Aug 14 2014 Michael Young <m.a.young at durham.ac.uk> - 4.2.4-7
+- Long latency virtual-mmu operations are not preemptible
+ [XSA-97, CVE-2014-5146]
+
* Fri Jun 20 2014 Michael Young <m.a.young at durham.ac.uk> - 4.2.4-6
- Hypervisor heap contents leaked to guest [XSA-100, CVE-2014-4021]
(#1110316) with extra patch to avoid regression
diff --git a/xsa97-hap-4.2-prereq.patch b/xsa97-hap-4.2-prereq.patch
new file mode 100644
index 0000000..ce2240a
--- /dev/null
+++ b/xsa97-hap-4.2-prereq.patch
@@ -0,0 +1,466 @@
+x86/mm/hap: Adjust vram tracking to play nicely with log-dirty.
+
+The previous code assumed the guest would be in one of three mutually exclusive
+modes for bookkeeping dirty pages: (1) shadow, (2) hap utilizing the log dirty
+bitmap to support functionality such as live migrate, (3) hap utilizing the
+log dirty bitmap to track dirty vram pages.
+Races arose when a guest attempted to track dirty vram while performing live
+migrate. (The dispatch table managed by paging_log_dirty_init() might change
+in the middle of a log dirty or a vram tracking function.)
+
+This change allows hap log dirty and hap vram tracking to be concurrent.
+Vram tracking no longer uses the log dirty bitmap. Instead it detects
+dirty vram pages by examining their p2m type. The log dirty bitmap is only
+used by the log dirty code. Because the two operations use different
+mechanisms, they are no longer mutually exclusive.
+
+Signed-Off-By: Robert Phillips <robert.phillips at citrix.com>
+Acked-by: Tim Deegan <tim at xen.org>
+
+Minor whitespace changes to conform with coding style
+Signed-off-by: Tim Deegan <tim at xen.org>
+
+Committed-by: Tim Deegan <tim at xen.org>
+master commit: fd91a2a662bc59677e0f217423a7a155d5465886
+master date: 2012-12-13 12:10:14 +0000
+
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -56,132 +56,110 @@
+ /* HAP VRAM TRACKING SUPPORT */
+ /************************************************/
+
+-static int hap_enable_vram_tracking(struct domain *d)
+-{
+- struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-
+- if ( !dirty_vram )
+- return -EINVAL;
+-
+- /* turn on PG_log_dirty bit in paging mode */
+- paging_lock(d);
+- d->arch.paging.mode |= PG_log_dirty;
+- paging_unlock(d);
+-
+- /* set l1e entries of P2M table to be read-only. */
+- p2m_change_type_range(d, dirty_vram->begin_pfn, dirty_vram->end_pfn,
+- p2m_ram_rw, p2m_ram_logdirty);
+-
+- flush_tlb_mask(d->domain_dirty_cpumask);
+- return 0;
+-}
+-
+-static int hap_disable_vram_tracking(struct domain *d)
+-{
+- struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-
+- if ( !dirty_vram )
+- return -EINVAL;
+-
+- paging_lock(d);
+- d->arch.paging.mode &= ~PG_log_dirty;
+- paging_unlock(d);
+-
+- /* set l1e entries of P2M table with normal mode */
+- p2m_change_type_range(d, dirty_vram->begin_pfn, dirty_vram->end_pfn,
+- p2m_ram_logdirty, p2m_ram_rw);
+-
+- flush_tlb_mask(d->domain_dirty_cpumask);
+- return 0;
+-}
+-
+-static void hap_clean_vram_tracking(struct domain *d)
+-{
+- struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-
+- if ( !dirty_vram )
+- return;
+-
+- /* set l1e entries of P2M table to be read-only. */
+- p2m_change_type_range(d, dirty_vram->begin_pfn, dirty_vram->end_pfn,
+- p2m_ram_rw, p2m_ram_logdirty);
+-
+- flush_tlb_mask(d->domain_dirty_cpumask);
+-}
+-
+-static void hap_vram_tracking_init(struct domain *d)
+-{
+- paging_log_dirty_init(d, hap_enable_vram_tracking,
+- hap_disable_vram_tracking,
+- hap_clean_vram_tracking);
+-}
++/*
++ * hap_track_dirty_vram()
++ * Create the domain's dv_dirty_vram struct on demand.
++ * Create a dirty vram range on demand when some [begin_pfn:begin_pfn+nr] is
++ * first encountered.
++ * Collect the guest_dirty bitmask, a bit mask of the dirty vram pages, by
++ * calling paging_log_dirty_range(), which interrogates each vram
++ * page's p2m type looking for pages that have been made writable.
++ */
+
+ int hap_track_dirty_vram(struct domain *d,
+ unsigned long begin_pfn,
+ unsigned long nr,
+- XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
++ XEN_GUEST_HANDLE_64(uint8) guest_dirty_bitmap)
+ {
+ long rc = 0;
+- struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
++ struct sh_dirty_vram *dirty_vram;
++ uint8_t *dirty_bitmap = NULL;
+
+ if ( nr )
+ {
+- if ( paging_mode_log_dirty(d) && dirty_vram )
++ int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
++
++ if ( !paging_mode_log_dirty(d) )
+ {
+- if ( begin_pfn != dirty_vram->begin_pfn ||
+- begin_pfn + nr != dirty_vram->end_pfn )
+- {
+- paging_log_dirty_disable(d);
+- dirty_vram->begin_pfn = begin_pfn;
+- dirty_vram->end_pfn = begin_pfn + nr;
+- rc = paging_log_dirty_enable(d);
+- if (rc != 0)
+- goto param_fail;
+- }
++ hap_logdirty_init(d);
++ rc = paging_log_dirty_enable(d);
++ if ( rc )
++ goto out;
+ }
+- else if ( !paging_mode_log_dirty(d) && !dirty_vram )
++
++ rc = -ENOMEM;
++ dirty_bitmap = xzalloc_bytes(size);
++ if ( !dirty_bitmap )
++ goto out;
++
++ paging_lock(d);
++
++ dirty_vram = d->arch.hvm_domain.dirty_vram;
++ if ( !dirty_vram )
+ {
+ rc = -ENOMEM;
+- if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
+- goto param_fail;
++ if ( (dirty_vram = xzalloc(struct sh_dirty_vram)) == NULL )
++ {
++ paging_unlock(d);
++ goto out;
++ }
+
++ d->arch.hvm_domain.dirty_vram = dirty_vram;
++ }
++
++ if ( begin_pfn != dirty_vram->begin_pfn ||
++ begin_pfn + nr != dirty_vram->end_pfn )
++ {
+ dirty_vram->begin_pfn = begin_pfn;
+ dirty_vram->end_pfn = begin_pfn + nr;
+- d->arch.hvm_domain.dirty_vram = dirty_vram;
+- hap_vram_tracking_init(d);
+- rc = paging_log_dirty_enable(d);
+- if (rc != 0)
+- goto param_fail;
++
++ paging_unlock(d);
++
++ /* set l1e entries of range within P2M table to be read-only. */
++ p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
++ p2m_ram_rw, p2m_ram_logdirty);
++
++ flush_tlb_mask(d->domain_dirty_cpumask);
++
++ memset(dirty_bitmap, 0xff, size); /* consider all pages dirty */
+ }
+ else
+ {
+- if ( !paging_mode_log_dirty(d) && dirty_vram )
+- rc = -EINVAL;
+- else
+- rc = -ENODATA;
+- goto param_fail;
++ paging_unlock(d);
++
++ domain_pause(d);
++
++ /* get the bitmap */
++ paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
++
++ domain_unpause(d);
+ }
+- /* get the bitmap */
+- rc = paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
++
++ rc = -EFAULT;
++ if ( copy_to_guest(guest_dirty_bitmap, dirty_bitmap, size) == 0 )
++ rc = 0;
+ }
+ else
+ {
+- if ( paging_mode_log_dirty(d) && dirty_vram ) {
+- rc = paging_log_dirty_disable(d);
+- xfree(dirty_vram);
+- dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
+- } else
+- rc = 0;
+- }
++ paging_lock(d);
+
+- return rc;
++ dirty_vram = d->arch.hvm_domain.dirty_vram;
++ if ( dirty_vram )
++ {
++ /*
++ * If zero pages specified while tracking dirty vram
++ * then stop tracking
++ */
++ xfree(dirty_vram);
++ d->arch.hvm_domain.dirty_vram = NULL;
++ }
+
+-param_fail:
+- if ( dirty_vram )
+- {
+- xfree(dirty_vram);
+- dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
++ paging_unlock(d);
+ }
++out:
++ if ( dirty_bitmap )
++ xfree(dirty_bitmap);
++
+ return rc;
+ }
+
+@@ -223,13 +201,6 @@ static void hap_clean_dirty_bitmap(struc
+
+ void hap_logdirty_init(struct domain *d)
+ {
+- struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+- if ( paging_mode_log_dirty(d) && dirty_vram )
+- {
+- paging_log_dirty_disable(d);
+- xfree(dirty_vram);
+- dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
+- }
+
+ /* Reinitialize logdirty mechanism */
+ paging_log_dirty_init(d, hap_enable_log_dirty,
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -447,157 +447,38 @@ int paging_log_dirty_op(struct domain *d
+ return rv;
+ }
+
+-int paging_log_dirty_range(struct domain *d,
+- unsigned long begin_pfn,
+- unsigned long nr,
+- XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
+-{
+- int rv = 0;
+- unsigned long pages = 0;
+- mfn_t *l4, *l3, *l2;
+- unsigned long *l1;
+- int b1, b2, b3, b4;
+- int i2, i3, i4;
+-
+- d->arch.paging.log_dirty.clean_dirty_bitmap(d);
+- paging_lock(d);
+-
+- PAGING_DEBUG(LOGDIRTY, "log-dirty-range: dom %u faults=%u dirty=%u\n",
+- d->domain_id,
+- d->arch.paging.log_dirty.fault_count,
+- d->arch.paging.log_dirty.dirty_count);
+-
+- if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+- printk("%s: %d failed page allocs while logging dirty pages\n",
+- __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+- rv = -ENOMEM;
+- goto out;
+- }
++void paging_log_dirty_range(struct domain *d,
++ unsigned long begin_pfn,
++ unsigned long nr,
++ uint8_t *dirty_bitmap)
++{
++ struct p2m_domain *p2m = p2m_get_hostp2m(d);
++ int i;
++ unsigned long pfn;
++
++ /*
++ * Set l1e entries of P2M table to be read-only.
++ *
++ * On first write, it page faults, its entry is changed to read-write,
++ * and on retry the write succeeds.
++ *
++ * We populate dirty_bitmap by looking for entries that have been
++ * switched to read-write.
++ */
+
+- if ( !d->arch.paging.log_dirty.fault_count &&
+- !d->arch.paging.log_dirty.dirty_count ) {
+- unsigned int size = BITS_TO_LONGS(nr);
+-
+- if ( clear_guest(dirty_bitmap, size * BYTES_PER_LONG) != 0 )
+- rv = -EFAULT;
+- goto out;
+- }
+- d->arch.paging.log_dirty.fault_count = 0;
+- d->arch.paging.log_dirty.dirty_count = 0;
++ p2m_lock(p2m);
+
+- b1 = L1_LOGDIRTY_IDX(begin_pfn);
+- b2 = L2_LOGDIRTY_IDX(begin_pfn);
+- b3 = L3_LOGDIRTY_IDX(begin_pfn);
+- b4 = L4_LOGDIRTY_IDX(begin_pfn);
+- l4 = paging_map_log_dirty_bitmap(d);
+-
+- for ( i4 = b4;
+- (pages < nr) && (i4 < LOGDIRTY_NODE_ENTRIES);
+- i4++ )
++ for ( i = 0, pfn = begin_pfn; pfn < begin_pfn + nr; i++, pfn++ )
+ {
+- l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+- for ( i3 = b3;
+- (pages < nr) && (i3 < LOGDIRTY_NODE_ENTRIES);
+- i3++ )
+- {
+- l2 = ((l3 && mfn_valid(l3[i3])) ?
+- map_domain_page(mfn_x(l3[i3])) : NULL);
+- for ( i2 = b2;
+- (pages < nr) && (i2 < LOGDIRTY_NODE_ENTRIES);
+- i2++ )
+- {
+- unsigned int bytes = PAGE_SIZE;
+- uint8_t *s;
+- l1 = ((l2 && mfn_valid(l2[i2])) ?
+- map_domain_page(mfn_x(l2[i2])) : NULL);
+-
+- s = ((uint8_t*)l1) + (b1 >> 3);
+- bytes -= b1 >> 3;
+-
+- if ( likely(((nr - pages + 7) >> 3) < bytes) )
+- bytes = (unsigned int)((nr - pages + 7) >> 3);
+-
+- if ( !l1 )
+- {
+- if ( clear_guest_offset(dirty_bitmap, pages >> 3,
+- bytes) != 0 )
+- {
+- rv = -EFAULT;
+- goto out;
+- }
+- }
+- /* begin_pfn is not 32K aligned, hence we have to bit
+- * shift the bitmap */
+- else if ( b1 & 0x7 )
+- {
+- int i, j;
+- uint32_t *l = (uint32_t*) s;
+- int bits = b1 & 0x7;
+- int bitmask = (1 << bits) - 1;
+- int size = (bytes + BYTES_PER_LONG - 1) / BYTES_PER_LONG;
+- unsigned long bitmap[size];
+- static unsigned long printed = 0;
+-
+- if ( printed != begin_pfn )
+- {
+- dprintk(XENLOG_DEBUG, "%s: begin_pfn %lx is not 32K aligned!\n",
+- __FUNCTION__, begin_pfn);
+- printed = begin_pfn;
+- }
+-
+- for ( i = 0; i < size - 1; i++, l++ ) {
+- bitmap[i] = ((*l) >> bits) |
+- (((*((uint8_t*)(l + 1))) & bitmask) << (sizeof(*l) * 8 - bits));
+- }
+- s = (uint8_t*) l;
+- size = BYTES_PER_LONG - ((b1 >> 3) & 0x3);
+- bitmap[i] = 0;
+- for ( j = 0; j < size; j++, s++ )
+- bitmap[i] |= (*s) << (j * 8);
+- bitmap[i] = (bitmap[i] >> bits) | (bitmask << (size * 8 - bits));
+- if ( copy_to_guest_offset(dirty_bitmap, (pages >> 3),
+- (uint8_t*) bitmap, bytes) != 0 )
+- {
+- rv = -EFAULT;
+- goto out;
+- }
+- }
+- else
+- {
+- if ( copy_to_guest_offset(dirty_bitmap, pages >> 3,
+- s, bytes) != 0 )
+- {
+- rv = -EFAULT;
+- goto out;
+- }
+- }
+-
+- pages += bytes << 3;
+- if ( l1 )
+- {
+- clear_page(l1);
+- unmap_domain_page(l1);
+- }
+- b1 = b1 & 0x7;
+- }
+- b2 = 0;
+- if ( l2 )
+- unmap_domain_page(l2);
+- }
+- b3 = 0;
+- if ( l3 )
+- unmap_domain_page(l3);
++ p2m_type_t pt;
++ pt = p2m_change_type(d, pfn, p2m_ram_rw, p2m_ram_logdirty);
++ if ( pt == p2m_ram_rw )
++ dirty_bitmap[i >> 3] |= (1 << (i & 7));
+ }
+- if ( l4 )
+- unmap_domain_page(l4);
+-
+- paging_unlock(d);
+
+- return rv;
++ p2m_unlock(p2m);
+
+- out:
+- paging_unlock(d);
+- return rv;
++ flush_tlb_mask(d->domain_dirty_cpumask);
+ }
+
+ /* Note that this function takes three function pointers. Callers must supply
+--- a/xen/include/asm-x86/config.h
++++ b/xen/include/asm-x86/config.h
+@@ -17,6 +17,7 @@
+
+ #define BYTES_PER_LONG (1 << LONG_BYTEORDER)
+ #define BITS_PER_LONG (BYTES_PER_LONG << 3)
++#define BITS_PER_BYTE 8
+
+ #define CONFIG_X86 1
+ #define CONFIG_X86_HT 1
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -145,10 +145,10 @@ struct paging_mode {
+ void paging_free_log_dirty_bitmap(struct domain *d);
+
+ /* get the dirty bitmap for a specific range of pfns */
+-int paging_log_dirty_range(struct domain *d,
+- unsigned long begin_pfn,
+- unsigned long nr,
+- XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
++void paging_log_dirty_range(struct domain *d,
++ unsigned long begin_pfn,
++ unsigned long nr,
++ uint8_t *dirty_bitmap);
+
+ /* enable log dirty */
+ int paging_log_dirty_enable(struct domain *d);
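The core idea of the prerequisite patch above: rather than walking a
multi-level log-dirty bitmap, dirty vram pages are found by asking the
p2m layer whether each pfn has faulted from read-only (logdirty) back to
read-write since the last scan. As a rough illustration, the loop at the
heart of the reworked paging_log_dirty_range() boils down to the
following minimal sketch (p2m_change_type() is the real Xen 4.2 helper
the patch calls; the wrapper function itself is hypothetical, and the
p2m locking and TLB flush the real code performs are omitted):

    /* Sketch: collect dirty pages by p2m type, re-arming the write trap.
     * p2m_change_type() returns the old type and resets the page to
     * read-only, so the guest's next write faults and marks it dirty. */
    static void sketch_collect_dirty(struct domain *d,
                                     unsigned long begin_pfn,
                                     unsigned long nr,
                                     uint8_t *dirty_bitmap)
    {
        unsigned long i;

        for ( i = 0; i < nr; i++ )
            /* An old type of p2m_ram_rw means the page was written. */
            if ( p2m_change_type(d, begin_pfn + i,
                                 p2m_ram_rw, p2m_ram_logdirty) == p2m_ram_rw )
                dirty_bitmap[i >> 3] |= 1 << (i & 7);
    }

Because the dirty state now lives in the p2m types rather than in the
shared log-dirty bitmap, vram tracking and live-migration log-dirty can
run concurrently, which removes the race described in the commit message.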
diff --git a/xsa97-hap-4.2.patch b/xsa97-hap-4.2.patch
new file mode 100644
index 0000000..7032cdb
--- /dev/null
+++ b/xsa97-hap-4.2.patch
@@ -0,0 +1,485 @@
+x86/paging: make log-dirty operations preemptible
+
+Both the freeing and the inspection of the bitmap get done in (nested)
+loops which - besides having a rather high iteration count in general,
+albeit that would be covered by XSA-77 - have the number of non-trivial
+iterations they need to perform (indirectly) controllable by both the
+guest they are for and any domain controlling the guest (including the
+one running qemu for it).
+
+This is XSA-97.
+
+Signed-off-by: Jan Beulich <jbeulich at suse.com>
+Reviewed-by: Tim Deegan <tim at xen.org>
+
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -2136,7 +2136,9 @@ int domain_relinquish_resources(struct d
+ pci_release_devices(d);
+
+ /* Tear down paging-assistance stuff. */
+- paging_teardown(d);
++ ret = paging_teardown(d);
++ if ( ret )
++ return ret;
+
+ /* Drop the in-use references to page-table bases. */
+ for_each_vcpu ( d, v )
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -66,6 +66,9 @@ long arch_do_domctl(
+ &domctl->u.shadow_op,
+ guest_handle_cast(u_domctl, void));
+ rcu_unlock_domain(d);
++ if ( ret == -EAGAIN )
++ return hypercall_create_continuation(__HYPERVISOR_domctl,
++ "h", u_domctl);
+ copy_to_guest(u_domctl, domctl, 1);
+ }
+ }
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -678,8 +678,7 @@ int hap_domctl(struct domain *d, xen_dom
+ paging_unlock(d);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+- rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+- u_domctl);
++ rc = -EAGAIN;
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = hap_get_allocation(d);
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -26,6 +26,7 @@
+ #include <asm/shadow.h>
+ #include <asm/p2m.h>
+ #include <asm/hap.h>
++#include <asm/event.h>
+ #include <asm/hvm/nestedhvm.h>
+ #include <xen/numa.h>
+ #include <xsm/xsm.h>
+@@ -116,26 +117,46 @@ static void paging_free_log_dirty_page(s
+ d->arch.paging.free_page(d, mfn_to_page(mfn));
+ }
+
+-void paging_free_log_dirty_bitmap(struct domain *d)
++static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
+ {
+ mfn_t *l4, *l3, *l2;
+ int i4, i3, i2;
+
++ paging_lock(d);
++
+ if ( !mfn_valid(d->arch.paging.log_dirty.top) )
+- return;
++ {
++ paging_unlock(d);
++ return 0;
++ }
+
+- paging_lock(d);
++ if ( !d->arch.paging.preempt.vcpu )
++ {
++ memset(&d->arch.paging.preempt.log_dirty, 0,
++ sizeof(d->arch.paging.preempt.log_dirty));
++ ASSERT(rc <= 0);
++ d->arch.paging.preempt.log_dirty.done = -rc;
++ }
++ else if ( d->arch.paging.preempt.vcpu != current ||
++ d->arch.paging.preempt.op != XEN_DOMCTL_SHADOW_OP_OFF )
++ {
++ paging_unlock(d);
++ return -EBUSY;
++ }
+
+ l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
++ i4 = d->arch.paging.preempt.log_dirty.i4;
++ i3 = d->arch.paging.preempt.log_dirty.i3;
++ rc = 0;
+
+- for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ )
++ for ( ; i4 < LOGDIRTY_NODE_ENTRIES; i4++, i3 = 0 )
+ {
+ if ( !mfn_valid(l4[i4]) )
+ continue;
+
+ l3 = map_domain_page(mfn_x(l4[i4]));
+
+- for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
++ for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
+ {
+ if ( !mfn_valid(l3[i3]) )
+ continue;
+@@ -148,20 +169,54 @@ void paging_free_log_dirty_bitmap(struct
+
+ unmap_domain_page(l2);
+ paging_free_log_dirty_page(d, l3[i3]);
++ l3[i3] = _mfn(INVALID_MFN);
++
++ if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++ d->arch.paging.preempt.log_dirty.i4 = i4;
++ rc = -EAGAIN;
++ break;
++ }
+ }
+
+ unmap_domain_page(l3);
++ if ( rc )
++ break;
+ paging_free_log_dirty_page(d, l4[i4]);
++ l4[i4] = _mfn(INVALID_MFN);
++
++ if ( i4 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i3 = 0;
++ d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++ rc = -EAGAIN;
++ break;
++ }
+ }
+
+ unmap_domain_page(l4);
+- paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+- d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+
+- ASSERT(d->arch.paging.log_dirty.allocs == 0);
+- d->arch.paging.log_dirty.failed_allocs = 0;
++ if ( !rc )
++ {
++ paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
++ d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
++
++ ASSERT(d->arch.paging.log_dirty.allocs == 0);
++ d->arch.paging.log_dirty.failed_allocs = 0;
++
++ rc = -d->arch.paging.preempt.log_dirty.done;
++ d->arch.paging.preempt.vcpu = NULL;
++ }
++ else
++ {
++ d->arch.paging.preempt.vcpu = current;
++ d->arch.paging.preempt.op = XEN_DOMCTL_SHADOW_OP_OFF;
++ }
+
+ paging_unlock(d);
++
++ return rc;
+ }
+
+ int paging_log_dirty_enable(struct domain *d)
+@@ -178,15 +233,25 @@ int paging_log_dirty_enable(struct domai
+ return ret;
+ }
+
+-int paging_log_dirty_disable(struct domain *d)
++static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
+ {
+- int ret;
++ int ret = 1;
++
++ if ( !resuming )
++ {
++ domain_pause(d);
++ /* Safe because the domain is paused. */
++ ret = d->arch.paging.log_dirty.disable_log_dirty(d);
++ ASSERT(ret <= 0);
++ }
+
+- domain_pause(d);
+- /* Safe because the domain is paused. */
+- ret = d->arch.paging.log_dirty.disable_log_dirty(d);
+ if ( !paging_mode_log_dirty(d) )
+- paging_free_log_dirty_bitmap(d);
++ {
++ ret = paging_free_log_dirty_bitmap(d, ret);
++ if ( ret == -EAGAIN )
++ return ret;
++ }
++
+ domain_unpause(d);
+
+ return ret;
+@@ -326,7 +391,9 @@ int paging_mfn_is_dirty(struct domain *d
+
+ /* Read a domain's log-dirty bitmap and stats. If the operation is a CLEAN,
+ * clear the bitmap and stats as well. */
+-int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
++static int paging_log_dirty_op(struct domain *d,
++ struct xen_domctl_shadow_op *sc,
++ bool_t resuming)
+ {
+ int rv = 0, clean = 0, peek = 1;
+ unsigned long pages = 0;
+@@ -334,9 +401,22 @@ int paging_log_dirty_op(struct domain *d
+ unsigned long *l1 = NULL;
+ int i4, i3, i2;
+
+- domain_pause(d);
++ if ( !resuming )
++ domain_pause(d);
+ paging_lock(d);
+
++ if ( !d->arch.paging.preempt.vcpu )
++ memset(&d->arch.paging.preempt.log_dirty, 0,
++ sizeof(d->arch.paging.preempt.log_dirty));
++ else if ( d->arch.paging.preempt.vcpu != current ||
++ d->arch.paging.preempt.op != sc->op )
++ {
++ paging_unlock(d);
++ ASSERT(!resuming);
++ domain_unpause(d);
++ return -EBUSY;
++ }
++
+ clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+
+ PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+@@ -365,17 +445,15 @@ int paging_log_dirty_op(struct domain *d
+ goto out;
+ }
+
+- pages = 0;
+ l4 = paging_map_log_dirty_bitmap(d);
++ i4 = d->arch.paging.preempt.log_dirty.i4;
++ i3 = d->arch.paging.preempt.log_dirty.i3;
++ pages = d->arch.paging.preempt.log_dirty.done;
+
+- for ( i4 = 0;
+- (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES);
+- i4++ )
++ for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
+ {
+ l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+- for ( i3 = 0;
+- (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES);
+- i3++ )
++ for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
+ {
+ l2 = ((l3 && mfn_valid(l3[i3])) ?
+ map_domain_page(mfn_x(l3[i3])) : NULL);
+@@ -410,18 +488,51 @@ int paging_log_dirty_op(struct domain *d
+ }
+ if ( l2 )
+ unmap_domain_page(l2);
++
++ if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i4 = i4;
++ d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++ rv = -EAGAIN;
++ break;
++ }
+ }
+ if ( l3 )
+ unmap_domain_page(l3);
++
++ if ( !rv && i4 < LOGDIRTY_NODE_ENTRIES - 1 &&
++ hypercall_preempt_check() )
++ {
++ d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++ d->arch.paging.preempt.log_dirty.i3 = 0;
++ rv = -EAGAIN;
++ }
++ if ( rv )
++ break;
+ }
+ if ( l4 )
+ unmap_domain_page(l4);
+
+- if ( pages < sc->pages )
+- sc->pages = pages;
++ if ( !rv )
++ d->arch.paging.preempt.vcpu = NULL;
++ else
++ {
++ d->arch.paging.preempt.vcpu = current;
++ d->arch.paging.preempt.op = sc->op;
++ d->arch.paging.preempt.log_dirty.done = pages;
++ }
+
+ paging_unlock(d);
+
++ if ( rv )
++ {
++ /* Never leave the domain paused for other errors. */
++ ASSERT(rv == -EAGAIN);
++ return rv;
++ }
++
++ if ( pages < sc->pages )
++ sc->pages = pages;
+ if ( clean )
+ {
+ /* We need to further call clean_dirty_bitmap() functions of specific
+@@ -432,6 +543,7 @@ int paging_log_dirty_op(struct domain *d
+ return rv;
+
+ out:
++ d->arch.paging.preempt.vcpu = NULL;
+ paging_unlock(d);
+ domain_unpause(d);
+
+@@ -498,12 +610,6 @@ void paging_log_dirty_init(struct domain
+ d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+ }
+
+-/* This function fress log dirty bitmap resources. */
+-static void paging_log_dirty_teardown(struct domain*d)
+-{
+- paging_free_log_dirty_bitmap(d);
+-}
+-
+ /************************************************/
+ /* CODE FOR PAGING SUPPORT */
+ /************************************************/
+@@ -547,6 +653,7 @@ void paging_vcpu_init(struct vcpu *v)
+ int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
+ XEN_GUEST_HANDLE(void) u_domctl)
+ {
++ bool_t resuming = 0;
+ int rc;
+
+ if ( unlikely(d == current->domain) )
+@@ -569,6 +676,20 @@ int paging_domctl(struct domain *d, xen_
+ return -EINVAL;
+ }
+
++ if ( d->arch.paging.preempt.vcpu )
++ {
++ if ( d->arch.paging.preempt.vcpu != current ||
++ d->arch.paging.preempt.op != sc->op )
++ {
++ printk(XENLOG_G_DEBUG
++ "d%d:v%d: Paging op %#x on Dom%u with unfinished prior op %#x\n",
++ current->domain->domain_id, current->vcpu_id,
++ sc->op, d->domain_id, d->arch.paging.preempt.op);
++ return -EBUSY;
++ }
++ resuming = 1;
++ }
++
+ rc = xsm_shadow_control(d, sc->op);
+ if ( rc )
+ return rc;
+@@ -594,13 +714,13 @@ int paging_domctl(struct domain *d, xen_
+
+ case XEN_DOMCTL_SHADOW_OP_OFF:
+ if ( paging_mode_log_dirty(d) )
+- if ( (rc = paging_log_dirty_disable(d)) != 0 )
++ if ( (rc = paging_log_dirty_disable(d, resuming)) != 0 )
+ return rc;
+ break;
+
+ case XEN_DOMCTL_SHADOW_OP_CLEAN:
+ case XEN_DOMCTL_SHADOW_OP_PEEK:
+- return paging_log_dirty_op(d, sc);
++ return paging_log_dirty_op(d, sc, resuming);
+ }
+
+ /* Here, dispatch domctl to the appropriate paging code */
+@@ -611,18 +731,24 @@ int paging_domctl(struct domain *d, xen_
+ }
+
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d)
++int paging_teardown(struct domain *d)
+ {
++ int rc;
++
+ if ( hap_enabled(d) )
+ hap_teardown(d);
+ else
+ shadow_teardown(d);
+
+ /* clean up log dirty resources. */
+- paging_log_dirty_teardown(d);
++ rc = paging_free_log_dirty_bitmap(d, 0);
++ if ( rc == -EAGAIN )
++ return rc;
+
+ /* Move populate-on-demand cache back to domain_list for destruction */
+ p2m_pod_empty_cache(d);
++
++ return rc;
+ }
+
+ /* Call once all of the references to the domain have gone away */
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -3829,8 +3829,7 @@ int shadow_domctl(struct domain *d,
+ paging_unlock(d);
+ if ( preempted )
+ /* Not finished. Set up to re-run the call. */
+- rc = hypercall_create_continuation(
+- __HYPERVISOR_domctl, "h", u_domctl);
++ rc = -EAGAIN;
+ else
+ /* Finished. Return the new allocation */
+ sc->mb = shadow_get_allocation(d);
+--- a/xen/common/domain.c
++++ b/xen/common/domain.c
+@@ -479,7 +479,6 @@ int domain_kill(struct domain *d)
+ rc = domain_relinquish_resources(d);
+ if ( rc != 0 )
+ {
+- BUG_ON(rc != -EAGAIN);
+ break;
+ }
+ d->is_dying = DOMDYING_dead;
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -193,6 +193,20 @@ struct paging_domain {
+ struct hap_domain hap;
+ /* log dirty support */
+ struct log_dirty_domain log_dirty;
++
++ /* preemption handling */
++ struct {
++ struct vcpu *vcpu;
++ unsigned int op;
++ union {
++ struct {
++ unsigned long done:PADDR_BITS - PAGE_SHIFT;
++ unsigned long i4:PAGETABLE_ORDER;
++ unsigned long i3:PAGETABLE_ORDER;
++ } log_dirty;
++ };
++ } preempt;
++
+ /* alloc/free pages from the pool for paging-assistance structures
+ * (used by p2m and log-dirty code for their tries) */
+ struct page_info * (*alloc_page)(struct domain *d);
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -141,9 +141,6 @@ struct paging_mode {
+ /*****************************************************************************
+ * Log dirty code */
+
+-/* free log dirty bitmap resource */
+-void paging_free_log_dirty_bitmap(struct domain *d);
+-
+ /* get the dirty bitmap for a specific range of pfns */
+ void paging_log_dirty_range(struct domain *d,
+ unsigned long begin_pfn,
+@@ -153,9 +150,6 @@ void paging_log_dirty_range(struct domai
+ /* enable log dirty */
+ int paging_log_dirty_enable(struct domain *d);
+
+-/* disable log dirty */
+-int paging_log_dirty_disable(struct domain *d);
+-
+ /* log dirty initialization */
+ void paging_log_dirty_init(struct domain *d,
+ int (*enable_log_dirty)(struct domain *d),
+@@ -218,7 +212,7 @@ int paging_domctl(struct domain *d, xen_
+ XEN_GUEST_HANDLE(void) u_domctl);
+
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d);
++int paging_teardown(struct domain *d);
+
+ /* Call once all of the references to the domain have gone away */
+ void paging_final_teardown(struct domain *d);
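The XSA-97 fix above applies Xen's standard preemption idiom to the
log-dirty paths: a long-running loop checks hypercall_preempt_check()
between iterations, records its progress in d->arch.paging.preempt, and
returns -EAGAIN; the domctl layer then turns -EAGAIN into a hypercall
continuation so the toolstack transparently re-enters the operation. A
minimal sketch of the idiom in isolation (NR_ENTRIES, do_one_entry() and
resume_idx below are hypothetical; hypercall_preempt_check() and
hypercall_create_continuation() are the real Xen primitives, used here
the same way the patch uses them):

    #define NR_ENTRIES 512                  /* hypothetical table size */

    /* Hypothetical bounded unit of work for one index. */
    static void do_one_entry(struct domain *d, unsigned int i);

    /* Walk all entries, yielding between iterations when needed. */
    static int process_entries(struct domain *d, unsigned int *resume_idx)
    {
        unsigned int i;

        for ( i = *resume_idx; i < NR_ENTRIES; i++ )
        {
            do_one_entry(d, i);
            /* Softirq pending? Stash progress and ask to be re-run. */
            if ( i < NR_ENTRIES - 1 && hypercall_preempt_check() )
            {
                *resume_idx = i + 1;
                return -EAGAIN;
            }
        }
        *resume_idx = 0;
        return 0;
    }

    /* At the hypercall entry point, -EAGAIN becomes a continuation
     * (this mirrors the arch_do_domctl hunk in the patch): */
        if ( rc == -EAGAIN )
            rc = hypercall_create_continuation(__HYPERVISOR_domctl,
                                               "h", u_domctl);

Saving the resume point in per-domain state rather than in the hypercall
arguments is also why the patch must reject concurrent paging ops from
other vcpus, or with a different op code, with -EBUSY.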