orion pushed to openmpi (master). "Add upstream patch to fix race/hang on 32bit machines"

notifications at fedoraproject.org notifications at fedoraproject.org
Mon Mar 30 23:02:46 UTC 2015


From 3923dafb55c62593e2317716b54833b9c9585f77 Mon Sep 17 00:00:00 2001
From: Orion Poplawski <orion at cora.nwra.com>
Date: Mon, 30 Mar 2015 16:51:42 -0600
Subject: Add upstream patch to fix race/hang on 32bit machines


diff --git a/openmpi-vader.patch b/openmpi-vader.patch
new file mode 100644
index 0000000..ea5510e
--- /dev/null
+++ b/openmpi-vader.patch
@@ -0,0 +1,103 @@
+diff -up openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_endpoint.h.vader openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_endpoint.h
+--- openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_endpoint.h.vader	2015-03-24 19:21:54.000000000 -0600
++++ openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_endpoint.h	2015-03-30 16:35:41.411110838 -0600
+@@ -11,7 +11,7 @@
+  * Copyright (c) 2004-2005 The Regents of the University of California.
+  *                         All rights reserved.
+  * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+- * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
++ * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
+  *                         reserved.
+  * $COPYRIGHT$
+  *
+@@ -48,14 +48,16 @@ typedef struct mca_btl_base_endpoint_t {
+     /* per peer buffers */
+     struct {
+         unsigned char *buffer; /**< starting address of peer's fast box out */
+-        unsigned int start, seq;
+         uint32_t *startp;
++        unsigned int start;
++        uint16_t seq;
+     } fbox_in;
+ 
+     struct {
+         unsigned char *buffer; /**< starting address of peer's fast box in */
+-        unsigned int start, end, seq;
+         uint32_t *startp;      /**< pointer to location storing start offset */
++        unsigned int start, end;
++        uint16_t seq;
+     } fbox_out;
+ 
+     int32_t peer_smp_rank;  /**< my peer's SMP process rank.  Used for accessing
+diff -up openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_fbox.h.vader openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_fbox.h
+--- openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_fbox.h.vader	2015-03-24 19:21:54.000000000 -0600
++++ openmpi-v1.8.4-134-g9ad2aa8/ompi/mca/btl/vader/btl_vader_fbox.h	2015-03-30 16:35:41.412110833 -0600
+@@ -1,6 +1,6 @@
+ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+ /*
+- * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
++ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
+  *                         reserved.
+  * $COPYRIGHT$
+  *
+@@ -18,9 +18,16 @@
+ 
+ typedef union mca_btl_vader_fbox_hdr_t {
+     struct {
++        /* NTH: on 32-bit platforms loading/unloading the header may be completed
++         * in multiple instructions. To ensure that seq is never loaded before tag
++         * and the tag is never read before seq put them in the same 32-bits of the
++         * header. */
++        /** message tag */
+         uint16_t  tag;
+-        uint16_t  size;
+-        uint32_t  seq;
++        /** sequence number */
++        uint16_t  seq;
++        /** message size */
++        uint32_t  size;
+     } data;
+     uint64_t ival;
+ } mca_btl_vader_fbox_hdr_t;
+@@ -40,6 +47,13 @@ typedef union mca_btl_vader_fbox_hdr_t {
+ 
+ void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, mca_btl_base_endpoint_t *ep);
+ 
++static inline void mca_btl_vader_fbox_set_header (mca_btl_vader_fbox_hdr_t *hdr, uint16_t tag,
++                                                  uint16_t seq, uint32_t size)
++{
++    mca_btl_vader_fbox_hdr_t tmp = {.data = {.tag = tag, .seq = seq, .size = size}};
++    hdr->ival = tmp.ival;
++}
++
+ /* attempt to reserve a contiguous segment from the remote ep */
+ static inline unsigned char *mca_btl_vader_reserve_fbox (mca_btl_base_endpoint_t *ep, size_t size)
+ {
+@@ -88,12 +102,10 @@ static inline unsigned char *mca_btl_vad
+         /* if this is the end of the buffer and the fragment doesn't fit then mark the remaining buffer space to
+          * be skipped and check if the fragment can be written at the beginning of the buffer. */
+         if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < size && start <= end)) {
+-            mca_btl_vader_fbox_hdr_t tmp = {.data = {.size = buffer_free - sizeof (mca_btl_vader_fbox_hdr_t),
+-                                                     .seq = ep->fbox_out.seq++, .tag = 0xff}};
+-
+             BTL_VERBOSE(("message will not fit in remaining buffer space. skipping to beginning"));
+ 
+-            MCA_BTL_VADER_FBOX_HDR(dst)->ival = tmp.ival;
++            mca_btl_vader_fbox_set_header (MCA_BTL_VADER_FBOX_HDR(dst), 0xff, ep->fbox_out.seq++,
++                                           buffer_free - sizeof (mca_btl_vader_fbox_hdr_t));
+ 
+             end = MCA_BTL_VADER_FBOX_ALIGNMENT;
+             /* toggle the high bit */
+@@ -114,11 +126,7 @@ static inline unsigned char *mca_btl_vad
+                  (unsigned int) size, end, start, end, hbs, buffer_free));
+ 
+     /* write out part of the header now. the tag will be written when the data is available */
+-    {
+-        mca_btl_vader_fbox_hdr_t tmp = {.data = {.size = data_size, .tag = 0, .seq = ep->fbox_out.seq++}};
+-
+-        MCA_BTL_VADER_FBOX_HDR(dst)->ival = tmp.ival;
+-    }
++    mca_btl_vader_fbox_set_header (MCA_BTL_VADER_FBOX_HDR(dst), 0, ep->fbox_out.seq++, data_size);
+ 
+     end += size;
+ 
diff --git a/openmpi.spec b/openmpi.spec
index ab561be..c9cce67 100644
--- a/openmpi.spec
+++ b/openmpi.spec
@@ -40,6 +40,9 @@ Patch1:			openmpi-ltdl.patch
 # Fix typo in liboshmem name
 # https://github.com/open-mpi/ompi/pull/221
 Patch2:                 openmpi-oshmem.patch
+# Upstream patch to fix race/hang on 32-bit
+# https://github.com/open-mpi/ompi/pull/503
+Patch3:                 openmpi-vader.patch
 
 BuildRequires:		gcc-gfortran
 #sparc64 don't have valgrind
@@ -120,6 +123,7 @@ Contains development wrapper for compiling Java with openmpi.
 %patch0 -p1 -b .atomic
 %patch1 -p1 -b .ltdl
 %patch2 -p1 -b .oshmem
+%patch3 -p1 -b .vader
 # Make sure we don't use the local libltdl library
 rm -r opal/libltdl
 
@@ -244,6 +248,9 @@ make check
 
 
 %changelog
+* Mon Mar 30 2015 Orion Poplawski <orion at cora.nwra.com> 1.8.4-7.20150324gitg9ad2aa8
+- Add upstream patch to fix race/hang on 32bit machines
+
 * Fri Mar 27 2015 Orion Poplawski <orion at cora.nwra.com> 1.8.4-6.20150324gitg9ad2aa8
 - Update to latest 1.8.4 snapshot
 - Add upstream patch to fix atomics on 32bit
-- 
cgit v0.10.2


	http://pkgs.fedoraproject.org/cgit/openmpi.git/commit/?h=master&id=3923dafb55c62593e2317716b54833b9c9585f77


More information about the scm-commits mailing list