Example use of com.hazelcast.jet.core.JobStatus.STARTING in the hazelcast/hazelcast project: class TopologyChangeTest, method when_jobParticipantReceivesStaleInitOperation_then_jobRestarts.
@Test
public void when_jobParticipantReceivesStaleInitOperation_then_jobRestarts() {
    // Given: grow the cluster by one member and wait until every node observes the new size.
    HazelcastInstance extraInstance = createHazelcastInstance(config);
    for (HazelcastInstance member : instances) {
        assertClusterSizeEventually(NODE_COUNT + 1, member);
    }
    // Block the InitExecutionOp between the coordinator and instances[2], so that member
    // never receives the first (soon-to-be-stale) init operation.
    rejectOperationsBetween(instances[0], instances[2], JetInitDataSerializerHook.FACTORY_ID,
            singletonList(INIT_EXECUTION_OP));

    Vertex testVertex = new Vertex("test", new MockPS(TestProcessors.Identity::new, nodeCount + 1));
    Job job = instances[0].getJet().newJob(new DAG().vertex(testVertex));

    // Wait until the coordinator has a MasterContext and the job entered STARTING
    // with an execution id assigned.
    JetServiceBackend backend = getJetServiceBackend(instances[0]);
    assertTrueEventually(() ->
            assertFalse(backend.getJobCoordinationService().getMasterContexts().isEmpty()));
    MasterContext masterContext = backend.getJobCoordinationService().getMasterContext(job.getId());
    assertTrueEventually(() -> {
        assertEquals(STARTING, masterContext.jobStatus());
        assertNotEquals(0, masterContext.executionId());
    });

    // When: every data member except the blocked one has created the execution context...
    long staleExecutionId = masterContext.executionId();
    assertTrueEventually(() -> Arrays.stream(instances)
            .filter(hz -> !hz.getCluster().getLocalMember().isLiteMember())
            .filter(hz -> hz != instances[2])
            .map(JetTestSupport::getJetServiceBackend)
            .map(service -> service.getJobExecutionService().getExecutionContext(staleExecutionId))
            .forEach(Assert::assertNotNull));
    // ...then a member leaves (forcing a restart) and the packet filter is lifted,
    // letting the delayed init operation through.
    extraInstance.getLifecycleService().terminate();
    for (HazelcastInstance member : instances) {
        assertClusterSizeEventually(NODE_COUNT, member);
    }
    resetPacketFiltersFrom(instances[0]);

    // Then: the job completes on a fresh execution, not the stale one.
    job.join();
    assertNotEquals(staleExecutionId, masterContext.executionId());
}
Example use of com.hazelcast.jet.core.JobStatus.STARTING in the hazelcast/hazelcast project: class SplitBrainTest, method when_minorityMasterBecomesMajorityMaster_then_jobKeepsRunning.
@Test
public void when_minorityMasterBecomesMajorityMaster_then_jobKeepsRunning() {
    // Split a 3-node cluster into a 2-node and a 1-node sub-cluster; the job is
    // submitted through the member that ends up in the minority (index 2).
    final int firstSubClusterSize = 2;
    final int secondSubClusterSize = 1;
    final int clusterSize = firstSubClusterSize + secondSubClusterSize;
    NoOutputSourceP.executionStarted = new CountDownLatch(secondSubClusterSize * PARALLELISM);
    final Job[] jobHolder = new Job[1];

    // Before the split: submit the job via members[2] and wait until processors start.
    Consumer<HazelcastInstance[]> onBeforeSplit = members -> {
        DAG dag = new DAG().vertex(new Vertex("test", new MockPS(NoOutputSourceP::new, clusterSize)));
        jobHolder[0] = members[2].getJet().newJob(dag);
        assertOpenEventually(NoOutputSourceP.executionStarted);
    };

    // After the merge: retire the two original majority members one by one and
    // verify the former minority master now leads and the job keeps running.
    Consumer<HazelcastInstance[]> onAfterMerge = members -> {
        assertEquals(clusterSize, members.length);
        logger.info("Shutting down 1st instance");
        members[0].shutdown();
        logger.info("1st instance down, starting another instance");
        createHazelcastInstance(createConfig());
        logger.info("Shutting down 2nd instance");
        members[1].shutdown();
        assertTrue(((ClusterService) members[2].getCluster()).isMaster());
        assertJobStatusEventually(jobHolder[0], RUNNING, 10);
        // The job must not merely reach RUNNING but stay there.
        assertTrueAllTheTime(() -> assertEquals(RUNNING, jobHolder[0].getStatus()), 5);
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, onBeforeSplit, null, onAfterMerge);
}
Example use of com.hazelcast.jet.core.JobStatus.STARTING in the hazelcast/hazelcast project: class MasterJobContext, method onStartExecutionComplete.
/**
 * Handles completion of the execution phase: presumably called once the
 * responses to StartExecutionOp (or a failure) are in. Records the
 * member-reported job metrics and finalizes the job, first waiting for the
 * terminal snapshot when termination was requested with one.
 *
 * @param error     failure of the execution, or {@code null} on success
 * @param responses per-member responses; values that are {@link RawJobMetrics}
 *                  are collected into the job metrics
 */
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus status = mc.jobStatus();
    if (status != STARTING && status != RUNNING) {
        // The job already moved past its running phases (e.g. it was terminated
        // concurrently), so this completion no longer drives the job state.
        logCannotComplete(error);
        // Chain the original failure as the cause so it is not lost from stack
        // traces; a null cause (successful completion) is harmless here.
        error = new IllegalStateException("Job coordination failed", error);
    }
    // Collect the metrics that members attached to their responses.
    setJobMetrics(responses.stream()
            .filter(en -> en.getValue() instanceof RawJobMetrics)
            .map(en -> (RawJobMetrics) en.getValue())
            .collect(Collectors.toList()));
    if (error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot()) {
        Throwable finalError = error;
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        mc.snapshotContext().terminalSnapshotFuture()
                .whenCompleteAsync(withTryCatch(logger, (r, e) -> finalizeJob(finalError)));
    } else {
        if (error instanceof ExecutionNotFoundException) {
            // If the StartExecutionOperation didn't find the execution, it means that it was cancelled.
            if (requestedTerminationMode != null) {
                // This cancellation can be because the master cancelled it. If that's the case,
                // convert the exception to JobTerminateRequestedException.
                error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
            }
            // The cancellation can also happen if some participant left and
            // the target cancelled the execution locally in JobExecutionService.onMemberRemoved().
            // We keep this (and possibly other) exceptions as they are
            // and let the execution complete with failure.
        }
        finalizeJob(error);
    }
}
Aggregations