[xen] Long latency virtual-mmu operations are not preemptible

myoung myoung at fedoraproject.org
Tue Aug 12 21:14:29 UTC 2014


commit 5d36bf8069d7f75ecb35c94afc7bb88ccf6d226f
Author: Michael Young <m.a.young at durham.ac.uk>
Date:   Tue Aug 12 22:14:20 2014 +0100

    Long latency virtual-mmu operations are not preemptible

 xen.spec            |    8 +-
 xsa97-hap-4.4.patch |  485 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 492 insertions(+), 1 deletions(-)
---
diff --git a/xen.spec b/xen.spec
index b6aa2f5..0cf9137 100644
--- a/xen.spec
+++ b/xen.spec
@@ -53,7 +53,7 @@
 Summary: Xen is a virtual machine monitor
 Name:    xen
 Version: 4.4.0
-Release: 9%{?dist}
+Release: 10%{?dist}
 Group:   Development/Libraries
 License: GPLv2+ and LGPLv2+ and BSD
 URL:     http://xen.org/
@@ -106,6 +106,7 @@ Patch24: xsa92.patch
 Patch25: xsa96.patch
 Patch26: xsa100.patch
 Patch27: xen.git-6b4d71d028f445cba7426a144751fddc8bfdd67b.patch
+Patch28: xsa97-hap-4.4.patch
 
 Patch99: localgcc490fix.patch
 Patch100: xen-configure-xend.patch
@@ -297,6 +298,7 @@ manage Xen virtual machines.
 %patch25 -p1
 %patch26 -p1
 %patch27 -p1
+%patch28 -p1
 
 %patch99 -p1
 %patch100 -p1
@@ -894,6 +896,10 @@ rm -rf %{buildroot}
 %endif
 
 %changelog
+* Tue Aug 12 2014 Michael Young <m.a.young at durham.ac.uk> - 4.4.0-10
+- Long latency virtual-mmu operations are not preemptible
+	[XSA-97, CVE-2014-5146]
+
 * Thu Aug 07 2014 Richard W.M. Jones <rjones at redhat.com> - 4.4.0-9
 - ocaml-4.02.0-0.8.git10e45753.fc22 rebuild.
 
diff --git a/xsa97-hap-4.4.patch b/xsa97-hap-4.4.patch
new file mode 100644
index 0000000..d005efc
--- /dev/null
+++ b/xsa97-hap-4.4.patch
@@ -0,0 +1,485 @@
+x86/paging: make log-dirty operations preemptible
+
+Both the freeing and the inspection of the bitmap get done in (nested)
+loops which - besides having a rather high iteration count in general,
+albeit that would be covered by XSA-77 - have the number of non-trivial
+iterations they need to perform (indirectly) controllable by both the
+guest they are for and any domain controlling the guest (including the
+one running qemu for it).
+
+This is XSA-97.
+
+Signed-off-by: Jan Beulich <jbeulich at suse.com>
+Reviewed-by: Tim Deegan <tim at xen.org>
+
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -1915,7 +1915,9 @@ int domain_relinquish_resources(struct d
+         pci_release_devices(d);
+ 
+         /* Tear down paging-assistance stuff. */
+-        paging_teardown(d);
++        ret = paging_teardown(d);
++        if ( ret )
++            return ret;
+ 
+         /* Drop the in-use references to page-table bases. */
+         for_each_vcpu ( d, v )
+--- a/xen/arch/x86/domctl.c
++++ b/xen/arch/x86/domctl.c
+@@ -61,6 +61,9 @@ long arch_do_domctl(
+         ret = paging_domctl(d,
+                             &domctl->u.shadow_op,
+                             guest_handle_cast(u_domctl, void));
++        if ( ret == -EAGAIN )
++            return hypercall_create_continuation(__HYPERVISOR_domctl,
++                                                 "h", u_domctl);
+         copyback = 1;
+     }
+     break;
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -572,8 +572,7 @@ int hap_domctl(struct domain *d, xen_dom
+         paging_unlock(d);
+         if ( preempted )
+             /* Not finished.  Set up to re-run the call. */
+-            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+-                                               u_domctl);
++            rc = -EAGAIN;
+         else
+             /* Finished.  Return the new allocation */
+             sc->mb = hap_get_allocation(d);
+--- a/xen/arch/x86/mm/paging.c
++++ b/xen/arch/x86/mm/paging.c
+@@ -26,6 +26,7 @@
+ #include <asm/shadow.h>
+ #include <asm/p2m.h>
+ #include <asm/hap.h>
++#include <asm/event.h>
+ #include <asm/hvm/nestedhvm.h>
+ #include <xen/numa.h>
+ #include <xsm/xsm.h>
+@@ -116,26 +117,46 @@ static void paging_free_log_dirty_page(s
+     d->arch.paging.free_page(d, mfn_to_page(mfn));
+ }
+ 
+-void paging_free_log_dirty_bitmap(struct domain *d)
++static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
+ {
+     mfn_t *l4, *l3, *l2;
+     int i4, i3, i2;
+ 
++    paging_lock(d);
++
+     if ( !mfn_valid(d->arch.paging.log_dirty.top) )
+-        return;
++    {
++        paging_unlock(d);
++        return 0;
++    }
+ 
+-    paging_lock(d);
++    if ( !d->arch.paging.preempt.vcpu )
++    {
++        memset(&d->arch.paging.preempt.log_dirty, 0,
++               sizeof(d->arch.paging.preempt.log_dirty));
++        ASSERT(rc <= 0);
++        d->arch.paging.preempt.log_dirty.done = -rc;
++    }
++    else if ( d->arch.paging.preempt.vcpu != current ||
++              d->arch.paging.preempt.op != XEN_DOMCTL_SHADOW_OP_OFF )
++    {
++        paging_unlock(d);
++        return -EBUSY;
++    }
+ 
+     l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
++    i4 = d->arch.paging.preempt.log_dirty.i4;
++    i3 = d->arch.paging.preempt.log_dirty.i3;
++    rc = 0;
+ 
+-    for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ )
++    for ( ; i4 < LOGDIRTY_NODE_ENTRIES; i4++, i3 = 0 )
+     {
+         if ( !mfn_valid(l4[i4]) )
+             continue;
+ 
+         l3 = map_domain_page(mfn_x(l4[i4]));
+ 
+-        for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
++        for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
+         {
+             if ( !mfn_valid(l3[i3]) )
+                 continue;
+@@ -148,20 +169,54 @@ void paging_free_log_dirty_bitmap(struct
+ 
+             unmap_domain_page(l2);
+             paging_free_log_dirty_page(d, l3[i3]);
++            l3[i3] = _mfn(INVALID_MFN);
++
++            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++            {
++                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++                d->arch.paging.preempt.log_dirty.i4 = i4;
++                rc = -EAGAIN;
++                break;
++            }
+         }
+ 
+         unmap_domain_page(l3);
++        if ( rc )
++            break;
+         paging_free_log_dirty_page(d, l4[i4]);
++        l4[i4] = _mfn(INVALID_MFN);
++
++        if ( i4 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++        {
++            d->arch.paging.preempt.log_dirty.i3 = 0;
++            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++            rc = -EAGAIN;
++            break;
++        }
+     }
+ 
+     unmap_domain_page(l4);
+-    paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+-    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+ 
+-    ASSERT(d->arch.paging.log_dirty.allocs == 0);
+-    d->arch.paging.log_dirty.failed_allocs = 0;
++    if ( !rc )
++    {
++        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
++        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
++
++        ASSERT(d->arch.paging.log_dirty.allocs == 0);
++        d->arch.paging.log_dirty.failed_allocs = 0;
++
++        rc = -d->arch.paging.preempt.log_dirty.done;
++        d->arch.paging.preempt.vcpu = NULL;
++    }
++    else
++    {
++        d->arch.paging.preempt.vcpu = current;
++        d->arch.paging.preempt.op = XEN_DOMCTL_SHADOW_OP_OFF;
++    }
+ 
+     paging_unlock(d);
++
++    return rc;
+ }
+ 
+ int paging_log_dirty_enable(struct domain *d, bool_t log_global)
+@@ -178,15 +233,25 @@ int paging_log_dirty_enable(struct domai
+     return ret;
+ }
+ 
+-int paging_log_dirty_disable(struct domain *d)
++static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
+ {
+-    int ret;
++    int ret = 1;
++
++    if ( !resuming )
++    {
++        domain_pause(d);
++        /* Safe because the domain is paused. */
++        ret = d->arch.paging.log_dirty.disable_log_dirty(d);
++        ASSERT(ret <= 0);
++    }
+ 
+-    domain_pause(d);
+-    /* Safe because the domain is paused. */
+-    ret = d->arch.paging.log_dirty.disable_log_dirty(d);
+     if ( !paging_mode_log_dirty(d) )
+-        paging_free_log_dirty_bitmap(d);
++    {
++        ret = paging_free_log_dirty_bitmap(d, ret);
++        if ( ret == -EAGAIN )
++            return ret;
++    }
++
+     domain_unpause(d);
+ 
+     return ret;
+@@ -326,7 +391,9 @@ int paging_mfn_is_dirty(struct domain *d
+ 
+ /* Read a domain's log-dirty bitmap and stats.  If the operation is a CLEAN,
+  * clear the bitmap and stats as well. */
+-int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
++static int paging_log_dirty_op(struct domain *d,
++                               struct xen_domctl_shadow_op *sc,
++                               bool_t resuming)
+ {
+     int rv = 0, clean = 0, peek = 1;
+     unsigned long pages = 0;
+@@ -334,9 +401,22 @@ int paging_log_dirty_op(struct domain *d
+     unsigned long *l1 = NULL;
+     int i4, i3, i2;
+ 
+-    domain_pause(d);
++    if ( !resuming )
++        domain_pause(d);
+     paging_lock(d);
+ 
++    if ( !d->arch.paging.preempt.vcpu )
++        memset(&d->arch.paging.preempt.log_dirty, 0,
++               sizeof(d->arch.paging.preempt.log_dirty));
++    else if ( d->arch.paging.preempt.vcpu != current ||
++              d->arch.paging.preempt.op != sc->op )
++    {
++        paging_unlock(d);
++        ASSERT(!resuming);
++        domain_unpause(d);
++        return -EBUSY;
++    }
++
+     clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+ 
+     PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
+@@ -365,17 +445,15 @@ int paging_log_dirty_op(struct domain *d
+         goto out;
+     }
+ 
+-    pages = 0;
+     l4 = paging_map_log_dirty_bitmap(d);
++    i4 = d->arch.paging.preempt.log_dirty.i4;
++    i3 = d->arch.paging.preempt.log_dirty.i3;
++    pages = d->arch.paging.preempt.log_dirty.done;
+ 
+-    for ( i4 = 0;
+-          (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES);
+-          i4++ )
++    for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
+     {
+         l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+-        for ( i3 = 0;
+-              (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES);
+-              i3++ )
++        for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
+         {
+             l2 = ((l3 && mfn_valid(l3[i3])) ?
+                   map_domain_page(mfn_x(l3[i3])) : NULL);
+@@ -410,18 +488,51 @@ int paging_log_dirty_op(struct domain *d
+             }
+             if ( l2 )
+                 unmap_domain_page(l2);
++
++            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
++            {
++                d->arch.paging.preempt.log_dirty.i4 = i4;
++                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
++                rv = -EAGAIN;
++                break;
++            }
+         }
+         if ( l3 )
+             unmap_domain_page(l3);
++
++        if ( !rv && i4 < LOGDIRTY_NODE_ENTRIES - 1 &&
++             hypercall_preempt_check() )
++        {
++            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
++            d->arch.paging.preempt.log_dirty.i3 = 0;
++            rv = -EAGAIN;
++        }
++        if ( rv )
++            break;
+     }
+     if ( l4 )
+         unmap_domain_page(l4);
+ 
+-    if ( pages < sc->pages )
+-        sc->pages = pages;
++    if ( !rv )
++        d->arch.paging.preempt.vcpu = NULL;
++    else
++    {
++        d->arch.paging.preempt.vcpu = current;
++        d->arch.paging.preempt.op = sc->op;
++        d->arch.paging.preempt.log_dirty.done = pages;
++    }
+ 
+     paging_unlock(d);
+ 
++    if ( rv )
++    {
++        /* Never leave the domain paused for other errors. */
++        ASSERT(rv == -EAGAIN);
++        return rv;
++    }
++
++    if ( pages < sc->pages )
++        sc->pages = pages;
+     if ( clean )
+     {
+         /* We need to further call clean_dirty_bitmap() functions of specific
+@@ -432,6 +543,7 @@ int paging_log_dirty_op(struct domain *d
+     return rv;
+ 
+  out:
++    d->arch.paging.preempt.vcpu = NULL;
+     paging_unlock(d);
+     domain_unpause(d);
+ 
+@@ -499,12 +611,6 @@ void paging_log_dirty_init(struct domain
+     d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+ }
+ 
+-/* This function fress log dirty bitmap resources. */
+-static void paging_log_dirty_teardown(struct domain*d)
+-{
+-    paging_free_log_dirty_bitmap(d);
+-}
+-
+ /************************************************/
+ /*           CODE FOR PAGING SUPPORT            */
+ /************************************************/
+@@ -548,6 +654,7 @@ void paging_vcpu_init(struct vcpu *v)
+ int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
+                   XEN_GUEST_HANDLE_PARAM(void) u_domctl)
+ {
++    bool_t resuming = 0;
+     int rc;
+ 
+     if ( unlikely(d == current->domain) )
+@@ -570,6 +677,20 @@ int paging_domctl(struct domain *d, xen_
+         return -EINVAL;
+     }
+ 
++    if ( d->arch.paging.preempt.vcpu )
++    {
++        if ( d->arch.paging.preempt.vcpu != current ||
++             d->arch.paging.preempt.op != sc->op )
++        {
++            printk(XENLOG_G_DEBUG
++                   "d%d:v%d: Paging op %#x on Dom%u with unfinished prior op %#x\n",
++                   current->domain->domain_id, current->vcpu_id,
++                   sc->op, d->domain_id, d->arch.paging.preempt.op);
++            return -EBUSY;
++        }
++        resuming = 1;
++    }
++
+     rc = xsm_shadow_control(XSM_HOOK, d, sc->op);
+     if ( rc )
+         return rc;
+@@ -595,13 +716,13 @@ int paging_domctl(struct domain *d, xen_
+ 
+     case XEN_DOMCTL_SHADOW_OP_OFF:
+         if ( paging_mode_log_dirty(d) )
+-            if ( (rc = paging_log_dirty_disable(d)) != 0 )
++            if ( (rc = paging_log_dirty_disable(d, resuming)) != 0 )
+                 return rc;
+         break;
+ 
+     case XEN_DOMCTL_SHADOW_OP_CLEAN:
+     case XEN_DOMCTL_SHADOW_OP_PEEK:
+-        return paging_log_dirty_op(d, sc);
++        return paging_log_dirty_op(d, sc, resuming);
+     }
+ 
+     /* Here, dispatch domctl to the appropriate paging code */
+@@ -612,18 +733,24 @@ int paging_domctl(struct domain *d, xen_
+ }
+ 
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d)
++int paging_teardown(struct domain *d)
+ {
++    int rc;
++
+     if ( hap_enabled(d) )
+         hap_teardown(d);
+     else
+         shadow_teardown(d);
+ 
+     /* clean up log dirty resources. */
+-    paging_log_dirty_teardown(d);
++    rc = paging_free_log_dirty_bitmap(d, 0);
++    if ( rc == -EAGAIN )
++        return rc;
+ 
+     /* Move populate-on-demand cache back to domain_list for destruction */
+     p2m_pod_empty_cache(d);
++
++    return rc;
+ }
+ 
+ /* Call once all of the references to the domain have gone away */
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -3706,8 +3706,7 @@ int shadow_domctl(struct domain *d, 
+         paging_unlock(d);
+         if ( preempted )
+             /* Not finished.  Set up to re-run the call. */
+-            rc = hypercall_create_continuation(
+-                __HYPERVISOR_domctl, "h", u_domctl);
++            rc = -EAGAIN;
+         else 
+             /* Finished.  Return the new allocation */
+             sc->mb = shadow_get_allocation(d);
+--- a/xen/common/domain.c
++++ b/xen/common/domain.c
+@@ -536,7 +536,6 @@ int domain_kill(struct domain *d)
+         rc = domain_relinquish_resources(d);
+         if ( rc != 0 )
+         {
+-            BUG_ON(rc != -EAGAIN);
+             break;
+         }
+         for_each_vcpu ( d, v )
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -186,6 +186,20 @@ struct paging_domain {
+     struct hap_domain       hap;
+     /* log dirty support */
+     struct log_dirty_domain log_dirty;
++
++    /* preemption handling */
++    struct {
++        struct vcpu *vcpu;
++        unsigned int op;
++        union {
++            struct {
++                unsigned long done:PADDR_BITS - PAGE_SHIFT;
++                unsigned long i4:PAGETABLE_ORDER;
++                unsigned long i3:PAGETABLE_ORDER;
++            } log_dirty;
++        };
++    } preempt;
++
+     /* alloc/free pages from the pool for paging-assistance structures
+      * (used by p2m and log-dirty code for their tries) */
+     struct page_info * (*alloc_page)(struct domain *d);
+--- a/xen/include/asm-x86/paging.h
++++ b/xen/include/asm-x86/paging.h
+@@ -133,9 +133,6 @@ struct paging_mode {
+ /*****************************************************************************
+  * Log dirty code */
+ 
+-/* free log dirty bitmap resource */
+-void paging_free_log_dirty_bitmap(struct domain *d);
+-
+ /* get the dirty bitmap for a specific range of pfns */
+ void paging_log_dirty_range(struct domain *d,
+                             unsigned long begin_pfn,
+@@ -145,9 +142,6 @@ void paging_log_dirty_range(struct domai
+ /* enable log dirty */
+ int paging_log_dirty_enable(struct domain *d, bool_t log_global);
+ 
+-/* disable log dirty */
+-int paging_log_dirty_disable(struct domain *d);
+-
+ /* log dirty initialization */
+ void paging_log_dirty_init(struct domain *d,
+                            int  (*enable_log_dirty)(struct domain *d,
+@@ -207,7 +201,7 @@ int paging_domctl(struct domain *d, xen_
+                   XEN_GUEST_HANDLE_PARAM(void) u_domctl);
+ 
+ /* Call when destroying a domain */
+-void paging_teardown(struct domain *d);
++int paging_teardown(struct domain *d);
+ 
+ /* Call once all of the references to the domain have gone away */
+ void paging_final_teardown(struct domain *d);


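The core of the XSA-97 fix above is a resumable nested loop: when the (potentially huge) log-dirty walk notices a pending preemption, it records how far it got in d->arch.paging.preempt, returns -EAGAIN, and the domctl layer re-issues the call as a hypercall continuation so the walk picks up where it stopped. The following is a minimal standalone sketch of that pattern, not Xen code: fake_preempt_check(), work() and struct progress are invented stand-ins for hypercall_preempt_check() and the per-domain preempt bookkeeping.

/*
 * Sketch of the resumable nested-loop pattern used in xsa97-hap-4.4.patch.
 * NOT Xen code: all names here are illustrative stand-ins.
 */
#include <errno.h>
#include <stdio.h>

#define OUTER 4
#define INNER 8

struct progress {
    int busy;      /* an operation is in flight and may be resumed */
    int i_outer;   /* outer index to resume from */
    int i_inner;   /* inner index to resume from */
};

/* Stand-in for hypercall_preempt_check(): fire every few work items. */
static int fake_preempt_check(void)
{
    static int calls;
    return (++calls % 5) == 0;
}

static void work(int o, int i)
{
    printf("processing (%d,%d)\n", o, i);
}

/* Returns 0 when complete, -EAGAIN when interrupted but resumable. */
static int long_operation(struct progress *p)
{
    int o = p->busy ? p->i_outer : 0;
    int i = p->busy ? p->i_inner : 0;

    for ( ; o < OUTER; o++, i = 0 )
    {
        for ( ; i < INNER; i++ )
        {
            work(o, i);

            if ( i < INNER - 1 && fake_preempt_check() )
            {
                /* Save position and ask the caller to retry later. */
                p->busy = 1;
                p->i_outer = o;
                p->i_inner = i + 1;
                return -EAGAIN;
            }
        }
    }

    p->busy = 0;               /* finished; clear the bookkeeping */
    return 0;
}

int main(void)
{
    struct progress p = { 0 };
    int rc, passes = 0;

    do {
        rc = long_operation(&p);
        passes++;
    } while ( rc == -EAGAIN );

    printf("completed in %d passes\n", passes);
    return 0;
}

In the real patch the saved indices live in the domain structure rather than in a caller-supplied object, and only the vcpu that started the operation may resume it with the same op code; that is what the -EBUSY checks in paging_log_dirty_op() and paging_domctl() enforce.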