modules/enterprise/server/itests-2/src/test/java/org/rhq/enterprise/server/measurement/test/AvailabilityManagerTest.java | 229 ++++++++-- modules/enterprise/server/jar/src/main/java/org/rhq/enterprise/server/measurement/AvailabilityManagerBean.java | 62 ++ 2 files changed, 242 insertions(+), 49 deletions(-)
New commits: commit 3605ce3398557277d8ddf4deb3ffaa83337b7c58 Author: Jay Shaughnessy jshaughn@redhat.com Date: Tue Dec 18 10:06:59 2012 -0500
[Bug 884338 (not a fix) - Agent's availability report is ignored due to bogus stale resource error] I failed to reproduce this issue but: - Add some better logging and some attempted repair code for this situation - Add a couple more tests
diff --git a/modules/enterprise/server/itests-2/src/test/java/org/rhq/enterprise/server/measurement/test/AvailabilityManagerTest.java b/modules/enterprise/server/itests-2/src/test/java/org/rhq/enterprise/server/measurement/test/AvailabilityManagerTest.java index f180036..ff1193d 100644 --- a/modules/enterprise/server/itests-2/src/test/java/org/rhq/enterprise/server/measurement/test/AvailabilityManagerTest.java +++ b/modules/enterprise/server/itests-2/src/test/java/org/rhq/enterprise/server/measurement/test/AvailabilityManagerTest.java @@ -24,7 +24,6 @@ import java.util.Date; import java.util.List; import java.util.Random;
-import javax.persistence.EntityManager; import javax.persistence.Query; import javax.transaction.Status;
@@ -37,6 +36,7 @@ import org.rhq.core.domain.measurement.Availability; import org.rhq.core.domain.measurement.AvailabilityType; import org.rhq.core.domain.measurement.ResourceAvailability; import org.rhq.core.domain.resource.Agent; +import org.rhq.core.domain.resource.InventoryStatus; import org.rhq.core.domain.resource.Resource; import org.rhq.core.domain.resource.ResourceCategory; import org.rhq.core.domain.resource.ResourceType; @@ -117,7 +117,6 @@ public class AvailabilityManagerTest extends AbstractEJB3Test {
if (additionalResources != null) { getTransactionManager().begin(); - EntityManager em = getEntityManager();
for (Resource res : additionalResources) { Resource res2 = em.find(Resource.class, res.getId()); @@ -130,7 +129,6 @@ public class AvailabilityManagerTest extends AbstractEJB3Test {
if (theResourceType != null) { getTransactionManager().begin(); - EntityManager em = getEntityManager();
em.remove(em.find(ResourceType.class, theResourceType.getId())); theResourceType = null; @@ -159,7 +157,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { beginTx();
try { - setupResource(em); + setupResource(); commitAndClose();
Availability aThen = new Availability(theResource, then, UP); @@ -211,7 +209,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { beginTx();
try { - setupResource(em); + setupResource(); commitAndClose();
Availability aThen = new Availability(theResource, then, UP); @@ -278,7 +276,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { List<AvailabilityPoint> availPoints; Availability avail;
- setupResource(em); + setupResource(); // platform: UNKNOWN(0) --> commitAndClose();
@@ -430,7 +428,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { beginTx();
try { - setupResource(em); + setupResource(); commitAndClose();
@@ -460,7 +458,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { beginTx();
try { - setupResource(em); + setupResource(); commitAndClose();
@@ -490,7 +488,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { try { prepareForTestAgents();
- setupResource(em); + setupResource(); commitAndClose();
@@ -561,13 +559,13 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { try { prepareForTestAgents();
- setupResource(em); // setup theResource + setupResource(); // setup theResource
allResources.add(theResource);
// now create a bunch more resources for (int i = 0; i < 99; i++) { - allResources.add(setupAnotherResource(em, i, theResource)); + allResources.add(setupAnotherResource(i, theResource)); } em.flush();
@@ -736,7 +734,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { beginTx();
try { - setupResource(em); // inserts initial UNKNOWN Availability at epoch + setupResource(); // inserts initial UNKNOWN Availability at epoch commitAndClose();
@@ -823,7 +821,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { beginTx();
try { - setupResource(em); + setupResource(); commitAndClose();
@@ -884,7 +882,7 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { try { Availability avail;
- setupResource(em); + setupResource(); commitAndClose();
@@ -972,11 +970,11 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { Availability avail; AvailabilityReport report;
- setupResource(em); + setupResource(); commitAndClose();
- long allAvailCount = setUpAvailabilities(em); + long allAvailCount = setUpAvailabilities();
// we now have 1:00 UP, 1:20 DOWN, 1:40 UP Subject overlord = LookupUtil.getSubjectManager().getOverlord(); @@ -1050,11 +1048,10 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { Availability avail; AvailabilityReport report;
- setupResource(em); + setupResource(); commitAndClose();
- - long allAvailCount = setUpAvailabilities(em); + long allAvailCount = setUpAvailabilities();
// we now have 1:00 UP, 1:20 DOWN, 1:40 UP Subject overlord = LookupUtil.getSubjectManager().getOverlord(); @@ -1121,19 +1118,174 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { } }
+ /** + * This was an attempt to get the behavior of this bug but it didn't. This inserts duplicate avail records, which + * should be discarded. + */ + @Test(enabled = ENABLE_TESTS) + public void testBZ884338_1() throws Exception { + beginTx(); + + try { + Availability avail; + AvailabilityReport report; + + setupResource(); + commitAndClose(); + + long allAvailCount = setUpAvailabilities(); + assertTrue("should have >= 3 avail records", 3 <= allAvailCount); + List<Availability> avails = getResourceAvailabilities(theResource); + // there is always the UNKNOWN period starting at epoch, plus the three created in setup + assertEquals(avails.toString(), 4, avails.size()); + + // we now have 1:00 UP, 1:20 DOWN, 1:40 UP + avail = availabilityManager.getCurrentAvailabilityForResource(overlord, theResource.getId()); + assert avail.getAvailabilityType() == UP; + + // insert a duplicate current 1:40 UP and an out-of-order 1:20 DOWN, these should be ignored + Long currentStartTime = avail.getStartTime(); + long newStartTime = currentStartTime; + avail = new Availability(theResource, newStartTime, UP); + report = new AvailabilityReport(false, theAgent.getName()); + report.addAvailability(avail); + + newStartTime = (currentStartTime - (20 * 60 * 1000L)); + avail = new Availability(theResource, newStartTime, DOWN); + report = new AvailabilityReport(false, theAgent.getName()); + report.addAvailability(avail); + + Thread.sleep(1000); + availabilityManager.mergeAvailabilityReport(report); + + // the agent should have been updated, but no new rows in availability were added + Agent agent = LookupUtil.getAgentManager().getAgentByName(theAgent.getName()); + Date lastReport = new Date(agent.getLastAvailabilityReport()); + assert lastReport != null; + assertEquals(allAvailCount, countAvailabilitiesInDB().longValue()); + avails = getResourceAvailabilities(theResource); + assertEquals(avails.toString(), 4, avails.size()); + + // avail start times should be unchanged 0, 1:00 (UP), 1:20(DOWN), 1:40(UP) + avail = avails.get(0); // 0..1:00 + assertTrue(avail.toString(), Math.abs(avail.getStartTime() - 0L) < 1000L); + assertEquals(avail.toString(), AvailabilityType.UNKNOWN, avail.getAvailabilityType()); + assertTrue(avail.toString(), Math.abs(avail.getEndTime() - avails.get(1).getStartTime()) < 1000L); + avail = avails.get(1); // 1:00..1:20 + assertTrue(avail.toString(), Math.abs(currentStartTime - (avail.getStartTime() + (40 * 60 * 1000))) < 1000L); + assertEquals(avail.toString(), AvailabilityType.UP, avail.getAvailabilityType()); + assertTrue(avail.toString(), Math.abs(avail.getEndTime() - avails.get(2).getStartTime()) < 1000L); + avail = avails.get(2); // 1:20..1:40 + assertTrue(avail.toString(), Math.abs(currentStartTime - (avail.getStartTime() + (20 * 60 * 1000))) < 1000L); + assertEquals(avail.toString(), AvailabilityType.DOWN, avail.getAvailabilityType()); + assertTrue(avail.toString(), Math.abs(avail.getEndTime() - avails.get(3).getStartTime()) < 1000L); + avail = avails.get(3); // 1:40 + assertTrue(avail.toString(), Math.abs(currentStartTime - avail.getStartTime()) < 1000L); + assertEquals(avail.toString(), AvailabilityType.UP, avail.getAvailabilityType()); + assertEquals(avail.toString(), avail.getEndTime(), null); + + } catch (Exception e) { + e.printStackTrace(); + throw e; + } finally { + if (Status.STATUS_ACTIVE == getTransactionManager().getStatus()) { + getTransactionManager().rollback(); + } + } + } + + /** + * Still can't reproduce the bug but this test tries to validate the repair code we have in case we detect + * the problem. + */ + @Test(enabled = ENABLE_TESTS) + public void testBZ884338_2() throws Exception { + beginTx(); + + try { + Availability avail; + AvailabilityReport report; + + setupResource(); + + commitAndClose(); + + long allAvailCount = setUpAvailabilities(); + assertTrue("should have >= 3 avail records", 3 <= allAvailCount); + List<Availability> avails = getResourceAvailabilities(theResource); + // there is always the UNKNOWN period starting at epoch, plus the three created in setup + assertEquals(avails.toString(), 4, avails.size()); + + // we now have 1:00 UP, 1:20 DOWN, 1:40 UP + avail = availabilityManager.getCurrentAvailabilityForResource(overlord, theResource.getId()); + + beginTx(); + + avail = em.find(Availability.class, avail.getId()); + assert avail.getAvailabilityType() == UP; + + // mess things up by assigning an end time to the latest avail record. + Long currentStartTime = avail.getStartTime(); + + Long nonNullEndTime = currentStartTime + 1000L; + avail.setEndTime(nonNullEndTime); + avail = em.merge(avail); + + commitAndClose(); + + // try to insert new avail, this should trigger the repair code + long newStartTime = (currentStartTime + (5 * 60 * 1000L)); + avail = new Availability(theResource, newStartTime, DOWN); + report = new AvailabilityReport(false, theAgent.getName()); + report.addAvailability(avail); + + Thread.sleep(1000); + availabilityManager.mergeAvailabilityReport(report); + + avails = getResourceAvailabilities(theResource); + assertEquals(avails.toString(), 4, avails.size()); + + // avail start times should now be 0, 1:00 (UP), 1:10(DOWN), 1:40(UP) + avail = avails.get(0); // 0..1:00 + assertTrue(avail.toString(), Math.abs(avail.getStartTime() - 0L) < 1000L); + assertEquals(avail.toString(), AvailabilityType.UNKNOWN, avail.getAvailabilityType()); + assertTrue(avail.toString(), Math.abs(avail.getEndTime() - avails.get(1).getStartTime()) < 1000L); + avail = avails.get(1); // 1:00..1:20 + assertTrue(avail.toString(), Math.abs(currentStartTime - (avail.getStartTime() + (40 * 60 * 1000))) < 1000L); + assertEquals(avail.toString(), AvailabilityType.UP, avail.getAvailabilityType()); + assertTrue(avail.toString(), Math.abs(avail.getEndTime() - avails.get(2).getStartTime()) < 1000L); + avail = avails.get(2); // 1:20..1:40 + assertTrue(avail.toString(), Math.abs(currentStartTime - (avail.getStartTime() + (20 * 60 * 1000))) < 1000L); + assertEquals(avail.toString(), AvailabilityType.DOWN, avail.getAvailabilityType()); + assertTrue(avail.toString(), Math.abs(avail.getEndTime() - avails.get(3).getStartTime()) < 1000L); + avail = avails.get(3); // 1:40 + assertTrue(avail.toString(), Math.abs(currentStartTime - avail.getStartTime()) < 1000L); + assertEquals(avail.toString(), AvailabilityType.UP, avail.getAvailabilityType()); + assertEquals(avail.toString(), null, avail.getEndTime()); // THIS IS THE CHANGE + + } catch (Exception e) { + e.printStackTrace(); + throw e; + } finally { + if (Status.STATUS_ACTIVE == getTransactionManager().getStatus()) { + getTransactionManager().rollback(); + } + } + } + @Test(enabled = ENABLE_TESTS) public void testMergeReportPerformance() throws Exception { beginTx(); List<Resource> allResources = new ArrayList<Resource>();
try { - setupResource(em); // setup theResource + setupResource(); // setup theResource
allResources.add(theResource);
// now create a bunch more resources for (int i = 0; i < 100; i++) { - allResources.add(setupAnotherResource(em, i, theResource)); + allResources.add(setupAnotherResource(i, theResource)); } em.flush();
@@ -1260,8 +1412,6 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { /** * See how many rows we have in the availability table * - * @param em EntityManager to use - * * @return the rowcount * * @throws Exception @@ -1277,22 +1427,36 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { }
/** - * Just set up a resource where we can attach the availabilities to + * See how many rows we have in the availability table * - * @param em The EntityManager to use + * @return the rowcount + * + * @throws Exception + */ + private List<Availability> getResourceAvailabilities(Resource r) throws Exception { + AvailabilityCriteria c = new AvailabilityCriteria(); + c.addFilterResourceId(r.getId()); + c.addSortStartTime(PageOrdering.ASC); + return availabilityManager.findAvailabilityByCriteria(overlord, c); + } + + /** + * Just set up a resource where we can attach the availabilities to * * @return A Resource ready to use */ - private Resource setupResource(EntityManager em) { - theAgent = new Agent("testagent", "localhost", 1234, "", "randomToken"); + private Resource setupResource() { + String prefix = this.getClass().getSimpleName() + "_"; + theAgent = new Agent(prefix + "agent", "localhost", 1234, "", "randomToken"); em.persist(theAgent);
- theResourceType = new ResourceType("test-plat", "test-plugin", ResourceCategory.PLATFORM, null); + theResourceType = new ResourceType(prefix + "type", prefix + "plugin", ResourceCategory.PLATFORM, null); em.persist(theResourceType);
- theResource = new Resource("test-platform-key", "test-platform-name", theResourceType); + theResource = new Resource(prefix + "resourceKey", prefix + "resourceName", theResourceType); theResource.setUuid("" + new Random().nextInt()); theResource.setAgent(theAgent); + theResource.setInventoryStatus(InventoryStatus.COMMITTED); em.persist(theResource);
em.flush(); @@ -1303,12 +1467,11 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { * Set up another unique resource that will be related to <code>theAgent</code>. The resource will be of type <code> * theResourceType</code>. * - * @param em The EntityManager to use * @param uniqueNumber used to define a unique key for the resource * * @return A Resource ready to use */ - private Resource setupAnotherResource(EntityManager em, int uniqueNumber, Resource parentResource) { + private Resource setupAnotherResource(int uniqueNumber, Resource parentResource) { Resource newResource;
newResource = new Resource("test-platform-key-" + uniqueNumber, "test-platform-name-" + uniqueNumber, @@ -1326,13 +1489,11 @@ public class AvailabilityManagerTest extends AbstractEJB3Test { * Set up an availability scenario where we set up availability for one hour, split it in the middle and have 20min * up, 20min down, 20min up starting at 1:00am. * - * @param em An EntityManager to use - * * @return total number of availability records in the DB after we've added ours * * @throws Exception */ - private long setUpAvailabilities(EntityManager em) throws Exception { + private long setUpAvailabilities() throws Exception { Calendar cal = Calendar.getInstance(); cal.setTime(new Date()); cal.set(Calendar.HOUR, 1); diff --git a/modules/enterprise/server/jar/src/main/java/org/rhq/enterprise/server/measurement/AvailabilityManagerBean.java b/modules/enterprise/server/jar/src/main/java/org/rhq/enterprise/server/measurement/AvailabilityManagerBean.java index 05dd802..31f1040 100644 --- a/modules/enterprise/server/jar/src/main/java/org/rhq/enterprise/server/measurement/AvailabilityManagerBean.java +++ b/modules/enterprise/server/jar/src/main/java/org/rhq/enterprise/server/measurement/AvailabilityManagerBean.java @@ -46,6 +46,7 @@ import org.rhq.core.domain.discovery.AvailabilityReport; import org.rhq.core.domain.measurement.Availability; import org.rhq.core.domain.measurement.AvailabilityType; import org.rhq.core.domain.measurement.ResourceAvailability; +import org.rhq.core.domain.resource.InventoryStatus; import org.rhq.core.domain.resource.Resource; import org.rhq.core.domain.resource.composite.ResourceIdWithAvailabilityComposite; import org.rhq.core.domain.resource.group.composite.ResourceGroupComposite; @@ -556,9 +557,41 @@ public class AvailabilityManagerBean implements AvailabilityManagerLocal, Availa } catch (NoResultException nre) { // This should not happen unless the Resource in the report is stale, which can happen in certain // sync scenarios. A Resource is given its initial Availability/ResourceAvailability when it is - // persisted so it is guaranteed to have Availability, so, the Resource must not exist. - log.info("Skipping mergeAvailabilityReport() for stale resource [" + reported.getResource() - + "]. These messages should go away after the next agent synchronization with the server."); + // persisted so it is guaranteed to have Availability, so, the Resource must not exist. At least + // it must not exist in my utopian view of the world. Let's just make sure... + Resource attachedResource = (Resource) entityManager.find(Resource.class, reported.getResource() + .getId()); + if (null == attachedResource) { + // expected case + log.info("Skipping mergeAvailabilityReport() for stale resource [" + reported.getResource() + + "]. These messages should go away after the next agent synchronization with the server."); + + } else if (InventoryStatus.COMMITTED == attachedResource.getInventoryStatus()) { + // this should not happen, it means the resource exists but has no latest Availability + // record (i.e. sendTime == null). Try to correct the situation. + log.warn("Resource [" + reported.getResource() + + "] has no latest availability record (i.e. no endtime) - will attempt to repair.\n" + + report.toString(false)); + try { + List<Availability> attachedAvails = attachedResource.getAvailability(); + if (attachedAvails.isEmpty()) { + attachedResource.initCurrentAvailability(); + entityManager.merge(attachedResource); + + } else { + Availability attachedLastAvail = attachedAvails.get(attachedAvails.size() - 1); + attachedLastAvail.setEndTime(null); + entityManager.merge(attachedLastAvail); + } + + // ask the agent for a full report so as to ensure we are in sync with agent + askForFullReport = true; + + } catch (Throwable t) { + log.warn("Unable to repair latest availablity for Resource [" + reported.getResource() + + "]", t); + } + }
} catch (NonUniqueResultException nure) { // This condition should never happen. In my world of la-la land, I've done everything @@ -661,7 +694,7 @@ public class AvailabilityManagerBean implements AvailabilityManagerLocal, Availa public void updateAgentResourceAvailabilities(int agentId, AvailabilityType platformAvailType, AvailabilityType childAvailType) {
- platformAvailType = (null == platformAvailType) ? AvailabilityType.UNKNOWN : platformAvailType; + platformAvailType = (null == platformAvailType) ? AvailabilityType.DOWN : platformAvailType; childAvailType = (null == childAvailType) ? AvailabilityType.UNKNOWN : childAvailType;
// get the platform resource if not already at platformAvailType (since this is the one @@ -681,17 +714,8 @@ public class AvailabilityManagerBean implements AvailabilityManagerLocal, Availa query.setParameter("disabled", AvailabilityType.DISABLED); List<ResourceIdWithAvailabilityComposite> resourcesWithStatus = query.getResultList();
- // The above queries only return resources if they have at least one row in Availability. - // This may be a problem in the future, and may need to be fixed. - // If a resource has 0 rows of availability, then it is by definition "unknown". If, - // availabilityType is null, we don't have to do anything since the unknown state hasn't changed. - // If this method is told to set all agent resources to something of other than unknown (null) - // availability, then we may need to completely rethink the query we do above so it returns composite - // objects for all resources, even those that have 0 rows of availability. Remember though, that once - // we get an availability report from an agent, a resource will have at least 1 availability row. So, - // a resource should rarely have 0 avail rows; if it does, it normally gets one within a minute - // (since the agent sends avail reports every 60 seconds or so by default). So this problem might not - // be as bad as first thought. + // The above queries only return resources if they have at least one row in Availability. This should + // not be a problem since a new Resource gets an initial UNKNOWN Availability record at persist-time.
if (log.isDebugEnabled()) { log.debug("Agent #[" + agentId + "] is going to have [" + resourcesWithStatus.size() @@ -815,6 +839,14 @@ public class AvailabilityManagerBean implements AvailabilityManagerLocal, Availa }
return; + + } catch (NonUniqueResultException nure) { + // This should not happen but can happen if the startTime exactly matches an existing start time. In + // this case assume we have somehow been passed a duplicate report, and ignore the entry. + log.warn("Resource [" + toInsert.getResource() + + "] received a duplicate Availability. It is being ignored: " + toInsert); + + return; }
// If we are inserting the same availability type, the first one can just continue
rhq-commits@lists.fedorahosted.org