Use of org.voltcore.messaging.SiteFailureMessage in project voltdb by VoltDB.
Class MeshArbiter, method discoverGlobalFaultData_rcv.
/**
 * Receives site failure traffic from the mailbox until
 * {@code haveNecessaryFaultInfo} reports that enough has been collected and
 * {@code m_seeker.needForward} no longer asks for forwarding.
 *
 * <p>Three kinds of message are handled: site failure updates, forwarded site
 * failure messages, and fault reports from the fault distributor. When a
 * receive returns nothing, heartbeats are sent to the current survivors.
 *
 * <p>Concurrent failures can be detected by additional reports from the
 * FaultDistributor or by a mismatch in the set of failed hosts reported in a
 * message from another site. A fault report that may not be ignored is pushed
 * back to the front of the mailbox and this method gives up so the caller can
 * restart the process.
 *
 * <p>NOTE(review): the earlier comment stated that this site's own broadcast
 * arrives through its own mailbox, which guarantees at least one response;
 * that send is not visible in this method — confirm against the caller.
 *
 * @param hsIds site ids used to validate forwarded messages and passed to the
 *        ledger update and to {@code mayIgnore} (presumably the mesh
 *        membership before the fault — confirm against caller)
 * @return {@code false} if a non-ignorable concurrent fault was re-queued,
 *         {@code true} once enough fault information has been gathered
 */
private boolean discoverGlobalFaultData_rcv(Set<Long> hsIds) {
    final long receiveStartedAt = System.currentTimeMillis();
    long lastReportedAt = 0;
    boolean haveEnough = false;
    final int[] forwardStallCount = { FORWARD_STALL_COUNT };

    do {
        final VoltMessage received = m_mailbox.recvBlocking(receiveSubjects, 5);

        // Once fault resolution has taken longer than 10 seconds, start
        // logging, but no more often than once every 60 seconds.
        final long now = System.currentTimeMillis();
        if (now - receiveStartedAt > 10000 && now - lastReportedAt > 60000) {
            lastReportedAt = System.currentTimeMillis();
            haveNecessaryFaultInfo(m_seeker.getSurvivors(), true);
        }

        if (received == null) {
            // Nothing arrived: send a heartbeat to keep the dead host timeout
            // active. Needed because IV2 doesn't generate its own heartbeats.
            m_meshAide.sendHeartbeats(m_seeker.getSurvivors());
        } else if (received.getSubject() == Subject.SITE_FAILURE_UPDATE.getId()) {
            final SiteFailureMessage sfm = (SiteFailureMessage) received;

            // Skip senders that are not survivors or already failed, and
            // reports whose failed sites are all already known.
            final boolean skipUpdate =
                    !m_seeker.getSurvivors().contains(received.m_sourceHSId)
                    || m_failedSites.contains(received.m_sourceHSId)
                    || m_failedSites.containsAll(sfm.getFailedSites());
            if (skipUpdate) {
                // Jumps straight to the loop condition below.
                continue;
            }

            if (!sfm.m_decision.isEmpty()) {
                m_decidedSurvivors.put(sfm.m_sourceHSId, sfm);
            }
            updateFailedSitesLedger(hsIds, sfm);
            m_seeker.add(sfm);
            addForwardCandidate(new SiteFailureForwardMessage(sfm));
            m_recoveryLog.info("Agreement, Received " + sfm);
        } else if (received.getSubject() == Subject.SITE_FAILURE_FORWARD.getId()) {
            final SiteFailureForwardMessage fsfm = (SiteFailureForwardMessage) received;

            // The forward is registered as a candidate before it is vetted.
            addForwardCandidate(fsfm);

            final boolean skipForward =
                    !hsIds.contains(fsfm.m_sourceHSId)
                    || m_seeker.getSurvivors().contains(fsfm.m_reportingHSId)
                    || m_failedSites.contains(fsfm.m_reportingHSId)
                    || m_failedSites.containsAll(fsfm.getFailedSites());
            if (skipForward) {
                // Jumps straight to the loop condition below.
                continue;
            }

            m_seeker.add(fsfm);
            m_recoveryLog.info("Agreement, Received forward " + fsfm);
            // An accepted forward resets the stall counter.
            forwardStallCount[0] = FORWARD_STALL_COUNT;
        } else if (received.getSubject() == Subject.FAILURE.getId()) {
            // If the fault distributor reports a new fault, ignore it if it is
            // already known; otherwise re-deliver the message to ourselves and
            // abort so that the process can restart.
            final FaultMessage fm = (FaultMessage) received;
            final Discard verdict = mayIgnore(hsIds, fm);
            if (verdict == Discard.DoNot) {
                m_mailbox.deliverFront(received);
                m_recoveryLog.info("Agreement, Detected a concurrent failure from FaultDistributor, new failed site " + CoreUtils.hsIdToString(fm.failedSite));
                return false;
            }
            if (m_recoveryLog.isDebugEnabled()) {
                verdict.log(fm);
            }
        }

        // Once satisfied, stay satisfied: the check is not repeated.
        if (!haveEnough) {
            haveEnough = haveNecessaryFaultInfo(m_seeker.getSurvivors(), false);
        }

        if (haveEnough) {
            // Send each pending candidate to the sites returned by
            // forWhomSiteIsDead, then drop it from the candidate map.
            for (Iterator<Map.Entry<Long, SiteFailureForwardMessage>> pending =
                         m_forwardCandidates.entrySet().iterator(); pending.hasNext(); ) {
                final Map.Entry<Long, SiteFailureForwardMessage> candidate = pending.next();
                final SiteFailureForwardMessage forward = candidate.getValue();
                final Set<Long> unseenBy = m_seeker.forWhomSiteIsDead(candidate.getKey());
                if (!unseenBy.isEmpty()) {
                    m_mailbox.send(Longs.toArray(unseenBy), forward);
                    m_recoveryLog.info("Agreement, fowarding to " + CoreUtils.hsIdCollectionToString(unseenBy) + " " + forward);
                }
                pending.remove();
            }
        }
    } while (!haveEnough || m_seeker.needForward(forwardStallCount));

    return true;
}
Use of org.voltcore.messaging.SiteFailureMessage in project voltdb by VoltDB.
Class TestMeshArbiter, method testMixOfWitnessedAndNon.
/**
 * Runs two arbitration rounds for the failure of site 1.
 *
 * <p>Round one: the mailbox yields a site failure message sourced from site 2
 * followed by a {@code FaultMessage(0, 1)}. The test expects exactly one
 * {@code deliverFront}, exactly one {@code send}, and an empty decision.
 *
 * <p>Round two (after resetting the mailbox mock): the mailbox yields site
 * failure messages sourced from sites 0 and 3. The test expects no
 * {@code deliverFront}, two matching sends, and the decision {1 -> 11}.
 */
@Test
public void testMixOfWitnessedAndNon() throws Exception {
    // Template message: site 1 failed, survivors are 0, 2 and 3.
    Maker<SiteFailureMessage> siteOneFailed = a(SiteFailureMessage,
            with(sfmSurvivors, Longs.asList(0, 2, 3)),
            with(sfmFailures, sfmFailed(1)),
            with(sfmSafeTxns, sfmSafe(0, 10, 1, 11, 2, 22, 3, 33)));

    when(aide.getNewestSafeTransactionForInitiator(1L)).thenReturn(11L);

    // ---- round one ----
    SiteFailureMessage fromSiteTwo = make(siteOneFailed.but(with(sfmSource, 2L)));
    FaultMessage localFault = new FaultMessage(0, 1);
    when(mbox.recvBlocking(any(Subject[].class), eq(5L)))
            .thenReturn(fromSiteTwo)
            .thenReturn(localFault);

    FaultMessage initialFault = new FaultMessage(2, 1, ImmutableSet.of(0L, 2L, 3L));
    Map<Long, Long> firstDecision = arbiter.reconfigureOnFault(hsids, initialFault);

    verify(mbox, times(1)).deliverFront(any(VoltMessage.class));
    verify(mbox, times(1)).send(any(long[].class), any(VoltMessage.class));
    verify(mbox).send(
            any(long[].class),
            argThat(siteFailureIs(sfmFailed(1), sfmSurvived(0, 1, 2, 3))));
    assertEquals(firstDecision, ImmutableMap.<Long, Long>of());

    // ---- round two ----
    reset(mbox);

    SiteFailureMessage fromSiteZero = make(siteOneFailed.but(with(sfmSource, 0L)));
    SiteFailureMessage fromSiteThree = make(siteOneFailed.but(with(sfmSource, 3L)));
    when(mbox.recvBlocking(any(Subject[].class), eq(5L)))
            .thenReturn(fromSiteZero)
            .thenReturn(fromSiteThree);

    Map<Long, Long> secondDecision = arbiter.reconfigureOnFault(hsids, new FaultMessage(0, 1));

    verify(mbox, never()).deliverFront(any(VoltMessage.class));
    verify(mbox, times(2)).send(
            any(long[].class),
            argThat(siteFailureIs(sfmFailed(1), sfmSurvived(0, 2, 3))));
    assertEquals(secondDecision, ImmutableMap.<Long, Long>of(1L, 11L));
}
Use of org.voltcore.messaging.SiteFailureMessage in project voltdb by VoltDB.
Class MiniNode, method run.
/**
 * Node main loop. Starts the mini site, begins tracking every other site for
 * liveness, then repeatedly polls the receive queue while
 * {@code m_shouldContinue} is set.
 *
 * <p>Each iteration, under this object's monitor:
 * <ul>
 *   <li>a close message is reported to the mini site as a fault of the
 *       sender host's agreement site, and the sender is no longer tracked;</li>
 *   <li>any other message refreshes the sender in the dead tracker and is
 *       delivered to the mailbox; plain site failure messages (not forwards)
 *       are additionally reported to the mini site as fault messages;</li>
 *   <li>sites returned by the dead tracker's timeout check are reported as
 *       faults and no longer tracked.</li>
 * </ul>
 */
@Override
public void run() {
    m_miniSite.start();

    for (long candidate : m_HSIds) {
        // Don't track your own death
        if (candidate == m_HSId) {
            continue;
        }
        m_deadTracker.startTracking(candidate);
    }

    m_nodeState.set(NodeState.RUN);

    while (m_shouldContinue.get()) {
        // The poll happens outside the monitor; handling happens inside it.
        Message incoming = m_recvQ.poll();
        synchronized (this) {
            if (incoming != null && incoming.m_close) {
                // The sender closed: report its host's agreement site as failed.
                int closedHostId = CoreUtils.getHostIdFromHSId(incoming.m_src);
                long agreementHSId = CoreUtils.getHSIdFromHostAndSite(
                        closedHostId, HostMessenger.AGREEMENT_SITE_ID);
                m_miniSite.reportFault(agreementHSId);
                m_deadTracker.stopTracking(incoming.m_src);
            } else if (incoming != null) {
                m_deadTracker.updateHSId(incoming.m_src);

                // inject actual message into mailbox
                VoltMessage payload = incoming.m_msg;

                // snoop for SiteFailureMessage, inject into MiniSite's mailbox
                boolean isForward = payload instanceof SiteFailureForwardMessage;
                if (payload instanceof SiteFailureMessage && !isForward) {
                    SiteFailureMessage failureReport = (SiteFailureMessage) payload;
                    for (FaultMessage fault : failureReport.asFaultMessages()) {
                        m_miniSite.reportFault(fault);
                    }
                }
                m_mailbox.deliver(payload);
            }

            // Do dead host detection. Need to keep track of receive gaps from
            // the remaining set of live hosts.
            Set<Long> timedOut = m_deadTracker.checkTimeouts();
            for (long deadHSId : timedOut) {
                int deadHostId = CoreUtils.getHostIdFromHSId(deadHSId);
                long agreementHSId = CoreUtils.getHSIdFromHostAndSite(
                        deadHostId, HostMessenger.AGREEMENT_SITE_ID);
                m_miniSite.reportFault(agreementHSId);
                m_deadTracker.stopTracking(deadHSId);
            }
        }
    }
}
Use of org.voltcore.messaging.SiteFailureMessage in project voltdb by VoltDB.
Class TestMeshArbiter, method testOverlappingFailures.
/**
 * Arbitration is started for the failure of site 1 while the non-blocking
 * {@code recv} yields a further {@code FaultMessage(0, 2)}. The blocking
 * receives then yield site failure messages from sites 0 and 3 that list
 * both 1 and 2 as failed. The test expects two matching sends and a decision
 * covering both failed sites: {1 -> 11, 2 -> 22}.
 */
@Test
public void testOverlappingFailures() throws Exception {
    // Template message: sites 1 and 2 failed, survivors are 0 and 3.
    Maker<SiteFailureMessage> sitesOneAndTwoFailed = a(SiteFailureMessage,
            with(sfmSurvivors, Longs.asList(0, 3)),
            with(sfmFailures, sfmFailed(1, 2)),
            with(sfmSafeTxns, sfmSafe(0, 10, 1, 11, 2, 22, 3, 33)));

    when(aide.getNewestSafeTransactionForInitiator(1L)).thenReturn(11L);
    when(aide.getNewestSafeTransactionForInitiator(2L)).thenReturn(22L);

    // Non-blocking receive: one extra fault report, then nothing.
    when(mbox.recv(any(Subject[].class)))
            .thenReturn(new FaultMessage(0, 2))
            .thenReturn((VoltMessage) null);

    SiteFailureMessage fromSiteZero = make(sitesOneAndTwoFailed.but(with(sfmSource, 0L)));
    SiteFailureMessage fromSiteThree = make(sitesOneAndTwoFailed.but(with(sfmSource, 3L)));
    when(mbox.recvBlocking(any(Subject[].class), eq(5L)))
            .thenReturn(fromSiteZero)
            .thenReturn(fromSiteThree);

    Map<Long, Long> outcome = arbiter.reconfigureOnFault(hsids, new FaultMessage(0, 1));

    verify(mbox, times(2)).send(
            any(long[].class),
            argThat(siteFailureIs(sfmFailed(1, 2), sfmSurvived(0, 3))));
    assertEquals(outcome, ImmutableMap.<Long, Long>of(1L, 11L, 2L, 22L));
}
Use of org.voltcore.messaging.SiteFailureMessage in project voltdb by VoltDB.
Class TestMeshArbiter, method testPingsOnLongReceives.
/**
 * The blocking receive comes back empty three times before site failure
 * messages from sites 0, 2 and 3 arrive. The test expects heartbeats to have
 * been sent at least twice, the last of them to survivors 0, 2 and 3, along
 * with two matching sends and the decision {1 -> 11}.
 */
@Test
public void testPingsOnLongReceives() throws Exception {
    // Template message: site 1 failed, survivors are 0, 2 and 3.
    Maker<SiteFailureMessage> siteOneFailed = a(SiteFailureMessage,
            with(sfmSurvivors, Longs.asList(0, 2, 3)),
            with(sfmFailures, sfmFailed(1)),
            with(sfmSafeTxns, sfmSafe(0, 10, 1, 11, 2, 22, 3, 33)));

    when(aide.getNewestSafeTransactionForInitiator(1L)).thenReturn(11L);

    SiteFailureMessage fromSiteZero = make(siteOneFailed.but(with(sfmSource, 0L)));
    SiteFailureMessage fromSiteTwo = make(siteOneFailed.but(with(sfmSource, 2L)));
    SiteFailureMessage fromSiteThree = make(siteOneFailed.but(with(sfmSource, 3L)));

    // Three empty receives first, then one report per surviving site.
    when(mbox.recvBlocking(any(Subject[].class), eq(5L)))
            .thenReturn((VoltMessage) null)
            .thenReturn((VoltMessage) null)
            .thenReturn((VoltMessage) null)
            .thenReturn(fromSiteZero)
            .thenReturn(fromSiteTwo)
            .thenReturn(fromSiteThree);

    Map<Long, Long> outcome = arbiter.reconfigureOnFault(hsids, new FaultMessage(0, 1));

    verify(mbox, times(2)).send(
            any(long[].class),
            argThat(siteFailureIs(sfmFailed(1), sfmSurvived(0, 2, 3))));
    verify(aide, atLeast(2)).sendHeartbeats(destinationCaptor.capture());
    assertEquals(destinationCaptor.getValue(), sfmSurvived(0, 2, 3));
    assertEquals(outcome, ImmutableMap.<Long, Long>of(1L, 11L));
}
Aggregations