[xen/f19] Long latency virtual-mmu operations are not preemptible

myoung myoung at fedoraproject.org
Thu Aug 14 14:51:01 UTC 2014


commit 72532ffb1e9bd6c040a38427ae903d575a05018f
Author: Michael Young <m.a.young at durham.ac.uk>
Date:   Thu Aug 14 15:50:29 2014 +0100

    Long latency virtual-mmu operations are not preemptible

 xen.spec                   |   10 +-
 xsa97-hap-4.2-prereq.patch |  466 ++++++++++++++++++++++++++++++++++++++++++
 xsa97-hap-4.2.patch        |  485 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 960 insertions(+), 1 deletions(-)
---
diff --git a/xen.spec b/xen.spec
index be026d9..17d555b 100644
--- a/xen.spec
+++ b/xen.spec
@@ -27,7 +27,7 @@
 Summary: Xen is a virtual machine monitor
 Name:    xen
 Version: 4.2.4
-Release: 6%{?dist}
+Release: 7%{?dist}
 Group:   Development/Libraries
 License: GPLv2+ and LGPLv2+ and BSD
 URL:     http://xen.org/
@@ -87,6 +87,8 @@ Patch109: xsa92-4.2.patch
 Patch110: xsa96.patch
 Patch111: xsa100.patch
 Patch112: xen.git-6b4d71d028f445cba7426a144751fddc8bfdd67b.patch
+Patch113: xsa97-hap-4.2-prereq.patch
+Patch114: xsa97-hap-4.2.patch
 
 Patch100: xen-configure-xend.patch
 
@@ -264,6 +266,8 @@ manage Xen virtual machines.
 %patch110 -p1
 %patch111 -p1
 %patch112 -p1
+%patch113 -p1
+%patch114 -p1
 
 %patch100 -p1
 
@@ -757,6 +761,10 @@ rm -rf %{buildroot}
 %endif
 
 %changelog
+* Thu Aug 14 2014 Michael Young <m.a.young at durham.ac.uk> - 4.2.4-7
+- Long latency virtual-mmu operations are not preemptible
+	[XSA-97, CVE-2014-5146]
+
 * Fri Jun 20 2014 Michael Young <m.a.young at durham.ac.uk> - 4.2.4-6
 - Hypervisor heap contents leaked to guest [XSA-100, CVE-2014-4021]
 	(#1110316) with extra patch to avoid regression
diff --git a/xsa97-hap-4.2-prereq.patch b/xsa97-hap-4.2-prereq.patch
new file mode 100644
index 0000000..ce2240a
--- /dev/null
+++ b/xsa97-hap-4.2-prereq.patch
@@ -0,0 +1,466 @@
+x86/mm/hap: Adjust vram tracking to play nicely with log-dirty.
+
+The previous code assumed the guest would be in one of three mutually exclusive
+modes for bookkeeping dirty pages: (1) shadow, (2) hap utilizing the log dirty
+bitmap to support functionality such as live migrate, (3) hap utilizing the
+log dirty bitmap to track dirty vram pages.
+Races arose when a guest attempted to track dirty vram while performing live
+migrate.  (The dispatch table managed by paging_log_dirty_init() might change
+in the middle of a log dirty or a vram tracking function.)
+
+This change allows hap log dirty and hap vram tracking to be concurrent.
+Vram tracking no longer uses the log dirty bitmap.  Instead it detects
+dirty vram pages by examining their p2m type.  The log dirty bitmap is only
+used by the log dirty code.  Because the two operations use different
+mechanisms, they are no longer mutually exclusive.
+
+Signed-Off-By: Robert Phillips <robert.phillips at citrix.com>
+Acked-by: Tim Deegan <tim at xen.org>
+
+Minor whitespace changes to conform with coding style
+Signed-off-by: Tim Deegan <tim at xen.org>
+
+Committed-by: Tim Deegan <tim at xen.org>
+master commit: fd91a2a662bc59677e0f217423a7a155d5465886
+master date: 2012-12-13 12:10:14 +0000
+
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -56,132 +56,110 @@
+ /*          HAP VRAM TRACKING SUPPORT           */
+ /************************************************/
+ 
+-static int hap_enable_vram_tracking(struct domain *d)
+-{
+-    struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-
+-    if ( !dirty_vram )
+-        return -EINVAL;
+-
+-    /* turn on PG_log_dirty bit in paging mode */
+-    paging_lock(d);
+-    d->arch.paging.mode |= PG_log_dirty;
+-    paging_unlock(d);
+-
+-    /* set l1e entries of P2M table to be read-only. */
+-    p2m_change_type_range(d, dirty_vram->begin_pfn, dirty_vram->end_pfn, 
+-                          p2m_ram_rw, p2m_ram_logdirty);
+-
+-    flush_tlb_mask(d->domain_dirty_cpumask);
+-    return 0;
+-}
+-
+-static int hap_disable_vram_tracking(struct domain *d)
+-{
+-    struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-
+-    if ( !dirty_vram )
+-        return -EINVAL;
+-
+-    paging_lock(d);
+-    d->arch.paging.mode &= ~PG_log_dirty;
+-    paging_unlock(d);
+-
+-    /* set l1e entries of P2M table with normal mode */
+-    p2m_change_type_range(d, dirty_vram->begin_pfn, dirty_vram->end_pfn, 
+-                          p2m_ram_logdirty, p2m_ram_rw);
+-
+-    flush_tlb_mask(d->domain_dirty_cpumask);
+-    return 0;
+-}
+-
+-static void hap_clean_vram_tracking(struct domain *d)
+-{
+-    struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-
+-    if ( !dirty_vram )
+-        return;
+-
+-    /* set l1e entries of P2M table to be read-only. */
+-    p2m_change_type_range(d, dirty_vram->begin_pfn, dirty_vram->end_pfn, 
+-                          p2m_ram_rw, p2m_ram_logdirty);
+-
+-    flush_tlb_mask(d->domain_dirty_cpumask);
+-}
+-
+-static void hap_vram_tracking_init(struct domain *d)
+-{
+-    paging_log_dirty_init(d, hap_enable_vram_tracking,
+-                          hap_disable_vram_tracking,
+-                          hap_clean_vram_tracking);
+-}
++/*
++ * hap_track_dirty_vram()
++ * Create the domain's dv_dirty_vram struct on demand.
++ * Create a dirty vram range on demand when some [begin_pfn:begin_pfn+nr] is
++ * first encountered.
++ * Collect the guest_dirty bitmask, a bit mask of the dirty vram pages, by
++ * calling paging_log_dirty_range(), which interrogates each vram
++ * page's p2m type looking for pages that have been made writable.
++ */
+ 
+ int hap_track_dirty_vram(struct domain *d,
+                          unsigned long begin_pfn,
+                          unsigned long nr,
+-                         XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
++                         XEN_GUEST_HANDLE_64(uint8) guest_dirty_bitmap)
+ {
+     long rc = 0;
+-    struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
++    struct sh_dirty_vram *dirty_vram;
++    uint8_t *dirty_bitmap = NULL;
+ 
+     if ( nr )
+     {
+-        if ( paging_mode_log_dirty(d) && dirty_vram )
++        int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
++
++        if ( !paging_mode_log_dirty(d) )
+         {
+-            if ( begin_pfn != dirty_vram->begin_pfn ||
+-                 begin_pfn + nr != dirty_vram->end_pfn )
+-            {
+-                paging_log_dirty_disable(d);
+-                dirty_vram->begin_pfn = begin_pfn;
+-                dirty_vram->end_pfn = begin_pfn + nr;
+-                rc = paging_log_dirty_enable(d);
+-                if (rc != 0)
+-                    goto param_fail;
+-            }
++            hap_logdirty_init(d);
++            rc = paging_log_dirty_enable(d);
++            if ( rc )
++                goto out;
+         }
+-        else if ( !paging_mode_log_dirty(d) && !dirty_vram )
++
++        rc = -ENOMEM;
++        dirty_bitmap = xzalloc_bytes(size);
++        if ( !dirty_bitmap )
++            goto out;
++
++        paging_lock(d);
++
++        dirty_vram = d->arch.hvm_domain.dirty_vram;
++        if ( !dirty_vram )
+         {
+             rc = -ENOMEM;
+-            if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL )
+-                goto param_fail;
++            if ( (dirty_vram = xzalloc(struct sh_dirty_vram)) == NULL )
++            {
++                paging_unlock(d);
++                goto out;
++            }
+ 
++            d->arch.hvm_domain.dirty_vram = dirty_vram;
++        }
++
++        if ( begin_pfn != dirty_vram->begin_pfn ||
++             begin_pfn + nr != dirty_vram->end_pfn )
++        {
+             dirty_vram->begin_pfn = begin_pfn;
+             dirty_vram->end_pfn = begin_pfn + nr;
+-            d->arch.hvm_domain.dirty_vram = dirty_vram;
+-            hap_vram_tracking_init(d);
+-            rc = paging_log_dirty_enable(d);
+-            if (rc != 0)
+-                goto param_fail;
++
++            paging_unlock(d);
++
++            /* set l1e entries of range within P2M table to be read-only. */
++            p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
++                                  p2m_ram_rw, p2m_ram_logdirty);
++
++            flush_tlb_mask(d->domain_dirty_cpumask);
++
++            memset(dirty_bitmap, 0xff, size); /* consider all pages dirty */
+         }
+         else
+         {
+-            if ( !paging_mode_log_dirty(d) && dirty_vram )
+-                rc = -EINVAL;
+-            else
+-                rc = -ENODATA;
+-            goto param_fail;
++            paging_unlock(d);
++
++            domain_pause(d);
++
++            /* get the bitmap */
++            paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
++
++            domain_unpause(d);
+         }
+-        /* get the bitmap */
+-        rc = paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);
++
++        rc = -EFAULT;
++        if ( copy_to_guest(guest_dirty_bitmap, dirty_bitmap, size) == 0 )
++            rc = 0;
+     }
+     else
+     {
+-        if ( paging_mode_log_dirty(d) && dirty_vram ) {
+-            rc = paging_log_dirty_disable(d);
+-            xfree(dirty_vram);
+-            dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
+-        } else
+-            rc = 0;
+-    }
++        paging_lock(d);
+ 
+-    return rc;
++        dirty_vram = d->arch.hvm_domain.dirty_vram;
++        if ( dirty_vram )
++        {
++            /*
++             * If zero pages specified while tracking dirty vram
++             * then stop tracking
++             */
++            xfree(dirty_vram);
++            d->arch.hvm_domain.dirty_vram = NULL;
++        }
+ 
+-param_fail:
+-    if ( dirty_vram )
+-    {
+-        xfree(dirty_vram);
+-        dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
++        paging_unlock(d);
+     }
++out:
++    if ( dirty_bitmap )
++        xfree(dirty_bitmap);
++
+     return rc;
+ }
+ 
+@@ -223,13 +201,6 @@ static void hap_clean_dirty_bitmap(struc
+ 
+ void hap_logdirty_init(struct domain *d)
+ {
+-    struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
+-    if ( paging_mode_log_dirty(d) && dirty_vram )
+-    {
+-        paging_log_dirty_disable(d);
+-        xfree(dirty_vram);
+-        dirty_vram = d->arch.hvm_domain.dirty_vram = NULL;
+-    }
+ 
+     /* Reinitialize logdirty mechanism */
+     paging_log_dirty_init(d, hap_enable_log_dirty,
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -447,157 +447,38 @@ int paging_log_dirty_op(struct domain *d
+     return rv;
+ }
+ 
+-int paging_log_dirty_range(struct domain *d,
+-                            unsigned long begin_pfn,
+-                            unsigned long nr,
+-                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap)
+-{
+-    int rv = 0;
+-    unsigned long pages = 0;
+-    mfn_t *l4, *l3, *l2;
+-    unsigned long *l1;
+-    int b1, b2, b3, b4;
+-    int i2, i3, i4;
+-
+-    d->arch.paging.log_dirty.clean_dirty_bitmap(d);
+-    paging_lock(d);
+-
+-    PAGING_DEBUG(LOGDIRTY, "log-dirty-range: dom %u faults=%u dirty=%u\n",
+-                 d->domain_id,
+-                 d->arch.paging.log_dirty.fault_count,
+-                 d->arch.paging.log_dirty.dirty_count);
+-
+-    if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+-        printk("%s: %d failed page allocs while logging dirty pages\n",
+-               __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+-        rv = -ENOMEM;
+-        goto out;
+-    }
++void paging_log_dirty_range(struct domain *d,
++                           unsigned long begin_pfn,
++                           unsigned long nr,
++                           uint8_t *dirty_bitmap)
++{
++    struct p2m_domain *p2m = p2m_get_hostp2m(d);
++    int i;
++    unsigned long pfn;
++
++    /*
++     * Set l1e entries of P2M table to be read-only.
++     *
++     * On first write, it page faults, its entry is changed to read-write,
++     * and on retry the write succeeds.
++     *
++     * We populate dirty_bitmap by looking for entries that have been
++     * switched to read-write.
++     */
+ 
+-    if ( !d->arch.paging.log_dirty.fault_count &&
+-         !d->arch.paging.log_dirty.dirty_count ) {
+-        unsigned int size = BITS_TO_LONGS(nr);
+-
+-        if ( clear_guest(dirty_bitmap, size * BYTES_PER_LONG) != 0 )
+-            rv = -EFAULT;
+-        goto out;
+-    }
+-    d->arch.paging.log_dirty.fault_count = 0;
+-    d->arch.paging.log_dirty.dirty_count = 0;
++    p2m_lock(p2m);
+ 
+-    b1 = L1_LOGDIRTY_IDX(begin_pfn);
+-    b2 = L2_LOGDIRTY_IDX(begin_pfn);
+-    b3 = L3_LOGDIRTY_IDX(begin_pfn);
+-    b4 = L4_LOGDIRTY_IDX(begin_pfn);
+-    l4 = paging_map_log_dirty_bitmap(d);
+-
+-    for ( i4 = b4;
+-          (pages < nr) && (i4 < LOGDIRTY_NODE_ENTRIES);
+-          i4++ )
++    for ( i = 0, pfn = begin_pfn; pfn < begin_pfn + nr; i++, pfn++ )
+     {
+-        l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+-        for ( i3 = b3;
+-              (pages < nr) && (i3 < LOGDIRTY_NODE_ENTRIES);
+-              i3++ )
+-        {
+-            l2 = ((l3 && mfn_valid(l3[i3])) ?
+-                  map_domain_page(mfn_x(l3[i3])) : NULL);
+-            for ( i2 = b2;
+-                  (pages < nr) && (i2 < LOGDIRTY_NODE_ENTRIES);
+-                  i2++ )
+-            {
+-                unsigned int bytes = PAGE_SIZE;
+-                uint8_t *s;
+-                l1 = ((l2 && mfn_valid(l2[i2])) ?
+-                      map_domain_page(mfn_x(l2[i2])) : NULL);
+-
+-                s = ((uint8_t*)l1) + (b1 >> 3);
+-                bytes -= b1 >> 3;
+-
+-                if ( likely(((nr - pages + 7) >> 3) < bytes) )
+-                    bytes = (unsigned int)((nr - pages + 7) >> 3);
+-
+-                if ( !l1 )
+-                {
+-                    if ( clear_guest_offset(dirty_bitmap, pages >> 3,
+-                                            bytes) != 0 )
+-                    {
+-                        rv = -EFAULT;
+-                        goto out;
+-                    }
+-                }
+-                /* begin_pfn is not 32K aligned, hence we have to bit
+-                 * shift the bitmap */
+-                else if ( b1 & 0x7 )
+-                {
+-                    int i, j;
+-                    uint32_t *l = (uint32_t*) s;
+-                    int bits = b1 & 0x7;
+-                    int bitmask = (1 << bits) - 1;
+-                    int size = (bytes + BYTES_PER_LONG - 1) / BYTES_PER_LONG;
+-                    unsigned long bitmap[size];
+-                    static unsigned long printed = 0;
+-
+-                    if ( printed != begin_pfn )
+-                    {
+-                        dprintk(XENLOG_DEBUG, "%s: begin_pfn %lx is not 32K aligned!\n",
+-                                __FUNCTION__, begin_pfn);
+-                        printed = begin_pfn;
+-                    }
+-
+-                    for ( i = 0; i < size - 1; i++, l++ ) {
+-                        bitmap[i] = ((*l) >> bits) |
+-                            (((*((uint8_t*)(l + 1))) & bitmask) << (sizeof(*l) * 8 - bits));
+-                    }
+-                    s = (uint8_t*) l;
+-                    size = BYTES_PER_LONG - ((b1 >> 3) & 0x3);
+-                    bitmap[i] = 0;
+-                    for ( j = 0; j < size; j++, s++ )
+-                        bitmap[i] |= (*s) << (j * 8);
+-                    bitmap[i] = (bitmap[i] >> bits) | (bitmask << (size * 8 - bits));
+-                    if ( copy_to_guest_offset(dirty_bitmap, (pages >> 3),
+-                                (uint8_t*) bitmap, bytes) != 0 )
+-                    {
+-                        rv = -EFAULT;
+-                        goto out;
+-                    }
+-                }
+-                else
+-                {
+-                    if ( copy_to_guest_offset(dirty_bitmap, pages >> 3,
+-                                              s, bytes) != 0 )
+-                    {
+-                        rv = -EFAULT;
+-                        goto out;
+-                    }
+-                }
+-
+-                pages += bytes << 3;
+-                if ( l1 )
+-                {
+-                    clear_page(l1);
+-                    unmap_domain_page(l1);
+-                }
+-                b1 = b1 & 0x7;
+-            }
+-            b2 = 0;
+-            if ( l2 )
+-                unmap_domain_page(l2);
+-        }
+-        b3 = 0;
+-        if ( l3 )
+-            unmap_domain_page(l3);
++        p2m_type_t pt;
++        pt = p2m_change_type(d, pfn, p2m_ram_rw, p2m_ram_logdirty);
++        if ( pt == p2m_ram_rw )
++            dirty_bitmap[i >> 3] |= (1 << (i & 7));
+     }
+-    if ( l4 )
+-        unmap_domain_page(l4);
+-
+-    paging_unlock(d);
+ 
+-    return rv;
++    p2m_unlock(p2m);
+ 
+- out:
+-    paging_unlock(d);
+-    return rv;
++    flush_tlb_mask(d->domain_dirty_cpumask);
+ }
+ 
+ /* Note that this function takes three function pointers. Callers must supply
+--- a/xen/include/asm-x86/config.h
++++ b/xen/include/asm-x86/config.h
+@@ -17,6 +17,7 @@
+ 
+ #define BYTES_PER_LONG (1 << LONG_BYTEORDER)
+ #define BITS_PER_LONG (BYTES_PER_LONG << 3)
++#define BITS_PER_BYTE 8
+ 
+ #define CONFIG_X86 1
+ #define CONFIG_X86_HT 1
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -145,10 +145,10 @@ struct paging_mode {
+ void paging_free_log_dirty_bitmap(struct domain *d);
+ 
+ /* get the dirty bitmap for a specific range of pfns */
+-int paging_log_dirty_range(struct domain *d,
+-                           unsigned long begin_pfn,
+-                           unsigned long nr,
+-                           XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
++void paging_log_dirty_range(struct domain *d,
++                            unsigned long begin_pfn,
++                            unsigned long nr,
++                            uint8_t *dirty_bitmap);
+ 
+ /* enable log dirty */
+ int paging_log_dirty_enable(struct domain *d);
diff --git a/xsa97-hap-4.2.patch b/xsa97-hap-4.2.patch
new file mode 100644
index 0000000..7032cdb
--- /dev/null
+++ b/xsa97-hap-4.2.patch
@@ -0,0 +1,485 @@
+x86/paging: make log-dirty operations preemptible
+
+Both the freeing and the inspection of the bitmap get done in (nested)
+loops which - besides having a rather high iteration count in general,
+albeit that would be covered by XSA-77 - have the number of non-trivial
+iterations they need to perform (indirectly) controllable by both the
+guest they are for and any domain controlling the guest (including the
+one running qemu for it).
+
+This is XSA-97.
+
+Signed-off-by: Jan Beulich <jbeulich at suse.com>
+Reviewed-by: Tim Deegan <tim at xen.org>
+
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -2136,7 +2136,9 @@ int domain_relinquish_resources(struct d
+         pci_release_devices(d);
+ 
+         /* Tear down paging-assistance stuff. */
+-        paging_teardown(d);
++        ret = paging_teardown(d);
++        if ( ret )
++            return ret;
+ 
+         /* Drop the in-use references to page-table bases. */
+         for_each_vcpu ( d, v )
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -66,6 +66,9 @@ long arch_do_domctl(
+                                 &domctl->u.shadow_op,
+                                 guest_handle_cast(u_domctl, void));
+             rcu_unlock_domain(d);
++            if ( ret == -EAGAIN )
++                return hypercall_create_continuation(__HYPERVISOR_domctl,
++                                                     "h", u_domctl);
+             copy_to_guest(u_domctl, domctl, 1);
+         } 
+     }
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -678,8 +678,7 @@ int hap_domctl(struct domain *d, xen_dom
+         paging_unlock(d);
+         if ( preempted )
+             /* Not finished.  Set up to re-run the call. */
+-            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+-                                               u_domctl);
++            rc = -EAGAIN;
+         else
+             /* Finished.  Return the new allocation */
+             sc->mb = hap_get_allocation(d);
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -26,6 +26,7 @@
+ #include <asm/shadow.h>
+ #include <asm/p2m.h>
+ #include <asm/hap.h>
++#include <asm/event.h>
+ #include <asm/hvm/nestedhvm.h>
+ #include <xen/numa.h>
+ #include <xsm/xsm.h>
+@@ -116,26 +117,46 @@ static void paging_free_log_dirty_page(s
+     d->arch.paging.free_page(d, mfn_to_page(mfn));
+ }
+ 
+-void paging_free_log_dirty_bitmap(struct domain *d)
++static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
+ {
+     mfn_t *l4, *l3, *l2;
+     int i4, i3, i2;
+ 
++    paging_lock(d);
++
+     if ( !mfn_valid(d->arch.paging.log_dirty.top) )
+-        return;
++    {
++        paging_unlock(d);
++        return 0;
++    }
+ 
+-    paging_lock(d);
++    if ( !d->arch.paging.preempt.vcpu )
++    {
++        memset(&d->arch.paging.preempt.log_dirty, 0,
++               sizeof(d->arch.paging.preempt.log_dirty));
++        ASSERT(rc <= 0);
++        d->arch.paging.preempt.log_dirty.done = -rc;
++    }
++    else if ( d->arch.paging.preempt.vcpu != current ||
++              d->arch.paging.preempt.op != XEN_DOMCTL_SHADOW_OP_OFF )
++    {
++        paging_unlock(d);
++        return -EBUSY;
++    }
+ 
+     l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
++    i4 = d->arch.paging.preempt.log_dirty.i4;
++    i3 = d->arch.paging.preempt.log_dirty.i3;
++    rc = 0;
+ 
+-    for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ )
++    for ( ; i4 < LOGDIRTY_NODE_ENTRIES; i4++, i3 = 0 )
+     {
+         if ( !mfn_valid(l4[i4]) )
+             continue;
+ 
+         l3 = map_domain_page(mfn_x(l4[i4]));
+ 
+-        for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
++        for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
+         {
+             if ( !mfn_valid(l3[i3]) )
+                 continue;
+@@ -148,20 +169,54 @@ void paging_free_log_dirty_bitmap(struct
+ 
+             unmap_domain_page(l2);
+             paging_free_log_dirty_page(d, l3[i3]);
++            l3[i3] = _mfn(INVALID_MFN);
++
++            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++            {
++                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++                d->arch.paging.preempt.log_dirty.i4 = i4;
++                rc = -EAGAIN;
++                break;
++            }
+         }
+ 
+         unmap_domain_page(l3);
++        if ( rc )
++            break;
+         paging_free_log_dirty_page(d, l4[i4]);
++        l4[i4] = _mfn(INVALID_MFN);
++
++        if ( i4 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++        {
++            d->arch.paging.preempt.log_dirty.i3 = 0;
++            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++            rc = -EAGAIN;
++            break;
++        }
+     }
+ 
+     unmap_domain_page(l4);
+-    paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+-    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+ 
+-    ASSERT(d->arch.paging.log_dirty.allocs == 0);
+-    d->arch.paging.log_dirty.failed_allocs = 0;
++    if ( !rc )
++    {
++        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
++        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
++
++        ASSERT(d->arch.paging.log_dirty.allocs == 0);
++        d->arch.paging.log_dirty.failed_allocs = 0;
++
++        rc = -d->arch.paging.preempt.log_dirty.done;
++        d->arch.paging.preempt.vcpu = NULL;
++    }
++    else
++    {
++        d->arch.paging.preempt.vcpu = current;
++        d->arch.paging.preempt.op = XEN_DOMCTL_SHADOW_OP_OFF;
++    }
+ 
+     paging_unlock(d);
++
++    return rc;
+ }
+ 
+ int paging_log_dirty_enable(struct domain *d)
+@@ -178,15 +233,25 @@ int paging_log_dirty_enable(struct domai
+     return ret;
+ }
+ 
+-int paging_log_dirty_disable(struct domain *d)
++static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
+ {
+-    int ret;
++    int ret = 1;
++
++    if ( !resuming )
++    {
++        domain_pause(d);
++        /* Safe because the domain is paused. */
++        ret = d->arch.paging.log_dirty.disable_log_dirty(d);
++        ASSERT(ret <= 0);
++    }
+ 
+-    domain_pause(d);
+-    /* Safe because the domain is paused. */
+-    ret = d->arch.paging.log_dirty.disable_log_dirty(d);
+     if ( !paging_mode_log_dirty(d) )
+-        paging_free_log_dirty_bitmap(d);
++    {
++        ret = paging_free_log_dirty_bitmap(d, ret);
++        if ( ret == -EAGAIN )
++            return ret;
++    }
++
+     domain_unpause(d);
+ 
+     return ret;
+@@ -326,7 +391,9 @@ int paging_mfn_is_dirty(struct domain *d
+ 
+ /* Read a domain's log-dirty bitmap and stats.  If the operation is a CLEAN,
+  * clear the bitmap and stats as well. */
+-int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
++static int paging_log_dirty_op(struct domain *d,
++                               struct xen_domctl_shadow_op *sc,
++                               bool_t resuming)
+ {
+     int rv = 0, clean = 0, peek = 1;
+     unsigned long pages = 0;
+@@ -334,9 +401,22 @@ int paging_log_dirty_op(struct domain *d
+     unsigned long *l1 = NULL;
+     int i4, i3, i2;
+ 
+-    domain_pause(d);
++    if ( !resuming )
++        domain_pause(d);
+     paging_lock(d);
+ 
++    if ( !d->arch.paging.preempt.vcpu )
++        memset(&d->arch.paging.preempt.log_dirty, 0,
++               sizeof(d->arch.paging.preempt.log_dirty));
++    else if ( d->arch.paging.preempt.vcpu != current ||
++              d->arch.paging.preempt.op != sc->op )
++    {
++        paging_unlock(d);
++        ASSERT(!resuming);
++        domain_unpause(d);
++        return -EBUSY;
++    }
++
+     clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+ 
+     PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+@@ -365,17 +445,15 @@ int paging_log_dirty_op(struct domain *d
+         goto out;
+     }
+ 
+-    pages = 0;
+     l4 = paging_map_log_dirty_bitmap(d);
++    i4 = d->arch.paging.preempt.log_dirty.i4;
++    i3 = d->arch.paging.preempt.log_dirty.i3;
++    pages = d->arch.paging.preempt.log_dirty.done;
+ 
+-    for ( i4 = 0;
+-          (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES);
+-          i4++ )
++    for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
+     {
+         l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+-        for ( i3 = 0;
+-              (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES);
+-              i3++ )
++        for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
+         {
+             l2 = ((l3 && mfn_valid(l3[i3])) ?
+                   map_domain_page(mfn_x(l3[i3])) : NULL);
+@@ -410,18 +488,51 @@ int paging_log_dirty_op(struct domain *d
+             }
+             if ( l2 )
+                 unmap_domain_page(l2);
++
++            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++            {
++                d->arch.paging.preempt.log_dirty.i4 = i4;
++                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++                rv = -EAGAIN;
++                break;
++            }
+         }
+         if ( l3 )
+             unmap_domain_page(l3);
++
++        if ( !rv && i4 < LOGDIRTY_NODE_ENTRIES - 1 &&
++             hypercall_preempt_check() )
++        {
++            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++            d->arch.paging.preempt.log_dirty.i3 = 0;
++            rv = -EAGAIN;
++        }
++        if ( rv )
++            break;
+     }
+     if ( l4 )
+         unmap_domain_page(l4);
+ 
+-    if ( pages < sc->pages )
+-        sc->pages = pages;
++    if ( !rv )
++        d->arch.paging.preempt.vcpu = NULL;
++    else
++    {
++        d->arch.paging.preempt.vcpu = current;
++        d->arch.paging.preempt.op = sc->op;
++        d->arch.paging.preempt.log_dirty.done = pages;
++    }
+ 
+     paging_unlock(d);
+ 
++    if ( rv )
++    {
++        /* Never leave the domain paused for other errors. */
++        ASSERT(rv == -EAGAIN);
++        return rv;
++    }
++
++    if ( pages < sc->pages )
++        sc->pages = pages;
+     if ( clean )
+     {
+         /* We need to further call clean_dirty_bitmap() functions of specific
+@@ -432,6 +543,7 @@ int paging_log_dirty_op(struct domain *d
+     return rv;
+ 
+  out:
++    d->arch.paging.preempt.vcpu = NULL;
+     paging_unlock(d);
+     domain_unpause(d);
+ 
+@@ -498,12 +610,6 @@ void paging_log_dirty_init(struct domain
+     d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+ }
+ 
+-/* This function fress log dirty bitmap resources. */
+-static void paging_log_dirty_teardown(struct domain*d)
+-{
+-    paging_free_log_dirty_bitmap(d);
+-}
+-
+ /************************************************/
+ /*           CODE FOR PAGING SUPPORT            */
+ /************************************************/
+@@ -547,6 +653,7 @@ void paging_vcpu_init(struct vcpu *v)
+ int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
+                   XEN_GUEST_HANDLE(void) u_domctl)
+ {
++    bool_t resuming = 0;
+     int rc;
+ 
+     if ( unlikely(d == current->domain) )
+@@ -569,6 +676,20 @@ int paging_domctl(struct domain *d, xen_
+         return -EINVAL;
+     }
+ 
++    if ( d->arch.paging.preempt.vcpu )
++    {
++        if ( d->arch.paging.preempt.vcpu != current ||
++             d->arch.paging.preempt.op != sc->op )
++        {
++            printk(XENLOG_G_DEBUG
++                   "d%d:v%d: Paging op %#x on Dom%u with unfinished prior op %#x\n",
++                   current->domain->domain_id, current->vcpu_id,
++                   sc->op, d->domain_id, d->arch.paging.preempt.op);
++            return -EBUSY;
++        }
++        resuming = 1;
++    }
++
+     rc = xsm_shadow_control(d, sc->op);
+     if ( rc )
+         return rc;
+@@ -594,13 +714,13 @@ int paging_domctl(struct domain *d, xen_
+ 
+     case XEN_DOMCTL_SHADOW_OP_OFF:
+         if ( paging_mode_log_dirty(d) )
+-            if ( (rc = paging_log_dirty_disable(d)) != 0 )
++            if ( (rc = paging_log_dirty_disable(d, resuming)) != 0 )
+                 return rc;
+         break;
+ 
+     case XEN_DOMCTL_SHADOW_OP_CLEAN:
+     case XEN_DOMCTL_SHADOW_OP_PEEK:
+-        return paging_log_dirty_op(d, sc);
++        return paging_log_dirty_op(d, sc, resuming);
+     }
+ 
+     /* Here, dispatch domctl to the appropriate paging code */
+@@ -611,18 +731,24 @@ int paging_domctl(struct domain *d, xen_
+ }
+ 
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d)
++int paging_teardown(struct domain *d)
+ {
++    int rc;
++
+     if ( hap_enabled(d) )
+         hap_teardown(d);
+     else
+         shadow_teardown(d);
+ 
+     /* clean up log dirty resources. */
+-    paging_log_dirty_teardown(d);
++    rc = paging_free_log_dirty_bitmap(d, 0);
++    if ( rc == -EAGAIN )
++        return rc;
+ 
+     /* Move populate-on-demand cache back to domain_list for destruction */
+     p2m_pod_empty_cache(d);
++
++    return rc;
+ }
+ 
+ /* Call once all of the references to the domain have gone away */
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -3829,8 +3829,7 @@ int shadow_domctl(struct domain *d, 
+         paging_unlock(d);
+         if ( preempted )
+             /* Not finished.  Set up to re-run the call. */
+-            rc = hypercall_create_continuation(
+-                __HYPERVISOR_domctl, "h", u_domctl);
++            rc = -EAGAIN;
+         else 
+             /* Finished.  Return the new allocation */
+             sc->mb = shadow_get_allocation(d);
+--- a/xen/common/domain.c
++++ b/xen/common/domain.c
+@@ -479,7 +479,6 @@ int domain_kill(struct domain *d)
+         rc = domain_relinquish_resources(d);
+         if ( rc != 0 )
+         {
+-            BUG_ON(rc != -EAGAIN);
+             break;
+         }
+         d->is_dying = DOMDYING_dead;
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -193,6 +193,20 @@ struct paging_domain {
+     struct hap_domain       hap;
+     /* log dirty support */
+     struct log_dirty_domain log_dirty;
++
++    /* preemption handling */
++    struct {
++        struct vcpu *vcpu;
++        unsigned int op;
++        union {
++            struct {
++                unsigned long done:PADDR_BITS - PAGE_SHIFT;
++                unsigned long i4:PAGETABLE_ORDER;
++                unsigned long i3:PAGETABLE_ORDER;
++            } log_dirty;
++        };
++    } preempt;
++
+     /* alloc/free pages from the pool for paging-assistance structures
+      * (used by p2m and log-dirty code for their tries) */
+     struct page_info * (*alloc_page)(struct domain *d);
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -141,9 +141,6 @@ struct paging_mode {
+ /*****************************************************************************
+  * Log dirty code */
+ 
+-/* free log dirty bitmap resource */
+-void paging_free_log_dirty_bitmap(struct domain *d);
+-
+ /* get the dirty bitmap for a specific range of pfns */
+ void paging_log_dirty_range(struct domain *d,
+                             unsigned long begin_pfn,
+@@ -153,9 +150,6 @@ void paging_log_dirty_range(struct domai
+ /* enable log dirty */
+ int paging_log_dirty_enable(struct domain *d);
+ 
+-/* disable log dirty */
+-int paging_log_dirty_disable(struct domain *d);
+-
+ /* log dirty initialization */
+ void paging_log_dirty_init(struct domain *d,
+                            int  (*enable_log_dirty)(struct domain *d),
+@@ -218,7 +212,7 @@ int paging_domctl(struct domain *d, xen_
+                   XEN_GUEST_HANDLE(void) u_domctl);
+ 
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d);
++int paging_teardown(struct domain *d);
+ 
+ /* Call once all of the references to the domain have gone away */
+ void paging_final_teardown(struct domain *d);
