Use of com.hazelcast.jet.core.JobStatus.STARTING in project hazelcast-jet by hazelcast.
The class MasterContext, method invokeCompleteExecution.
private void invokeCompleteExecution(Throwable error) {
    JobStatus status = jobStatus();
    Throwable finalError;
    if (status == STARTING || status == RESTARTING || status == RUNNING) {
        logger.fine("Completing " + jobIdString());
        finalError = error;
    } else {
        if (error != null) {
            logger.severe("Cannot properly complete failed " + jobIdString() + ": status is " + status, error);
        } else {
            logger.severe("Cannot properly complete " + jobIdString() + ": status is " + status);
        }
        finalError = new IllegalStateException("Job coordination failed.");
    }
    Function<ExecutionPlan, Operation> operationCtor = plan -> new CompleteExecutionOperation(executionId, finalError);
    invoke(operationCtor, responses -> finalizeJob(error), null);
}
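The guard above treats only STARTING, RESTARTING and RUNNING as states from which an execution can be completed normally. As a rough client-side illustration of the same status, here is a minimal sketch (hypothetical class name ObserveStartingStatus, standalone hazelcast-jet JetInstance API assumed) that watches a freshly submitted job leave STARTING:

import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.JobStatus;
import com.hazelcast.jet.core.processor.Processors;

public class ObserveStartingStatus {
    public static void main(String[] args) throws InterruptedException {
        JetInstance jet = Jet.newJetInstance();

        DAG dag = new DAG();
        dag.newVertex("noop", Processors.noopP());   // trivial vertex, completes immediately

        Job job = jet.newJob(dag);

        // A freshly submitted job is typically observed in STARTING before it
        // reaches RUNNING (or COMPLETED for a trivial DAG like this one).
        JobStatus status = job.getStatus();
        while (status == JobStatus.STARTING) {
            Thread.sleep(100);
            status = job.getStatus();
        }
        System.out.println("Job left STARTING, current status: " + status);

        job.join();
        Jet.shutdownAll();
    }
}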
Use of com.hazelcast.jet.core.JobStatus.STARTING in project hazelcast-jet by hazelcast.
The class SplitBrainTest, method when_quorumIsLostOnMinority_then_jobRestartsUntilMerge.
@Test
public void when_quorumIsLostOnMinority_then_jobRestartsUntilMerge() {
    int firstSubClusterSize = 3;
    int secondSubClusterSize = 2;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    StuckProcessor.executionStarted = new CountDownLatch(clusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<JetInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(StuckProcessor::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        jobRef[0] = instances[0].newJob(dag, new JobConfig().setSplitBrainProtection(true));
        assertOpenEventually(StuckProcessor.executionStarted);
    };

    Future[] minorityJobFutureRef = new Future[1];

    BiConsumer<JetInstance[], JetInstance[]> onSplit = (firstSubCluster, secondSubCluster) -> {
        StuckProcessor.proceedLatch.countDown();

        assertTrueEventually(() -> assertEquals(clusterSize + firstSubClusterSize, MockPS.initCount.get()));

        long jobId = jobRef[0].getId();

        assertTrueEventually(() -> {
            JetService service = getJetService(firstSubCluster[0]);
            assertEquals(COMPLETED, service.getJobCoordinationService().getJobStatus(jobId));
        });

        JetService service2 = getJetService(secondSubCluster[0]);
        assertTrueEventually(() -> {
            assertEquals(STARTING, service2.getJobCoordinationService().getJobStatus(jobId));
        });

        MasterContext masterContext = service2.getJobCoordinationService().getMasterContext(jobId);
        assertNotNull(masterContext);
        minorityJobFutureRef[0] = masterContext.completionFuture();

        assertTrueAllTheTime(() -> {
            assertEquals(STARTING, service2.getJobCoordinationService().getJobStatus(jobId));
        }, 20);
    };

    Consumer<JetInstance[]> afterMerge = instances -> {
        assertTrueEventually(() -> {
            assertEquals(clusterSize + firstSubClusterSize, MockPS.initCount.get());
            assertEquals(clusterSize + firstSubClusterSize, MockPS.closeCount.get());
        });

        assertEquals(clusterSize, MockPS.receivedCloseErrors.size());
        MockPS.receivedCloseErrors.forEach(t -> assertTrue(t instanceof TopologyChangedException));

        try {
            minorityJobFutureRef[0].get();
            fail();
        } catch (CancellationException ignored) {
        } catch (Exception e) {
            throw new AssertionError(e);
        }
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, onSplit, afterMerge);
}
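The only public-API knob this test exercises is JobConfig.setSplitBrainProtection(true): with it enabled, the coordinator requires a majority of the cluster (as it was at submission time) before it starts or restarts the job, so a minority side stays in STARTING. A minimal sketch of enabling it outside the test harness (hypothetical class name SplitBrainProtectedJob, single embedded member assumed):

import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.processor.Processors;

public class SplitBrainProtectedJob {
    public static void main(String[] args) {
        JetInstance jet = Jet.newJetInstance();

        DAG dag = new DAG();
        dag.newVertex("noop", Processors.noopP());

        // Require a majority of the original cluster before the job may (re)start.
        JobConfig config = new JobConfig().setSplitBrainProtection(true);

        Job job = jet.newJob(dag, config);
        job.join();
        Jet.shutdownAll();
    }
}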
Use of com.hazelcast.jet.core.JobStatus.STARTING in project hazelcast-jet by hazelcast.
The class SplitBrainTest, method when_quorumIsLostOnBothSides_then_jobRestartsUntilMerge.
@Test
public void when_quorumIsLostOnBothSides_then_jobRestartsUntilMerge() {
    int firstSubClusterSize = 2;
    int secondSubClusterSize = 2;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    StuckProcessor.executionStarted = new CountDownLatch(clusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<JetInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(StuckProcessor::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        jobRef[0] = instances[0].newJob(dag, new JobConfig().setSplitBrainProtection(true));
        assertOpenEventually(StuckProcessor.executionStarted);
    };

    BiConsumer<JetInstance[], JetInstance[]> onSplit = (firstSubCluster, secondSubCluster) -> {
        StuckProcessor.proceedLatch.countDown();

        long jobId = jobRef[0].getId();

        assertTrueEventually(() -> {
            JetService service1 = getJetService(firstSubCluster[0]);
            JetService service2 = getJetService(secondSubCluster[0]);
            assertEquals(RESTARTING, service1.getJobCoordinationService().getJobStatus(jobId));
            assertEquals(STARTING, service2.getJobCoordinationService().getJobStatus(jobId));
        });

        assertTrueAllTheTime(() -> {
            JetService service1 = getJetService(firstSubCluster[0]);
            JetService service2 = getJetService(secondSubCluster[0]);
            assertEquals(RESTARTING, service1.getJobCoordinationService().getJobStatus(jobId));
            assertEquals(STARTING, service2.getJobCoordinationService().getJobStatus(jobId));
        }, 20);
    };

    Consumer<JetInstance[]> afterMerge = instances -> {
        assertTrueEventually(() -> {
            assertEquals(clusterSize * 2, MockPS.initCount.get());
            assertEquals(clusterSize * 2, MockPS.closeCount.get());
        });

        assertEquals(clusterSize, MockPS.receivedCloseErrors.size());
        MockPS.receivedCloseErrors.forEach(t -> assertTrue(t instanceof TopologyChangedException));
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, onSplit, afterMerge);
}
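Both split-brain tests watch the coordinator's job status with the test-support helpers assertTrueEventually and assertTrueAllTheTime. The following is only a rough, hypothetical illustration of that polling pattern, not the real JetTestSupport code:

import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

public final class PollUntil {
    private PollUntil() {
    }

    // Poll a condition until it holds or the timeout expires; roughly what
    // assertTrueEventually does in the Hazelcast test-support classes.
    public static void pollUntilTrue(Supplier<Boolean> condition, long timeoutSeconds) throws InterruptedException {
        long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(timeoutSeconds);
        while (System.nanoTime() < deadline) {
            if (condition.get()) {
                return;
            }
            Thread.sleep(100);
        }
        throw new AssertionError("Condition not satisfied within " + timeoutSeconds + "s");
    }
}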
Use of com.hazelcast.jet.core.JobStatus.STARTING in project hazelcast by hazelcast.
The class HazelcastBootstrap, method awaitJobsStarted.
private static void awaitJobsStarted() {
    List<Job> submittedJobs =
            ((BootstrappedJetProxy) HazelcastBootstrap.supplier.get().getJet()).submittedJobs();
    int submittedCount = submittedJobs.size();
    if (submittedCount == 0) {
        System.out.println("The JAR didn't submit any jobs.");
        return;
    }
    int previousCount = -1;
    while (true) {
        uncheckRun(() -> Thread.sleep(JOB_START_CHECK_INTERVAL_MILLIS));
        List<Job> startedJobs = submittedJobs.stream()
                .filter(job -> !STARTUP_STATUSES.contains(job.getStatus()))
                .collect(Collectors.toList());
        submittedJobs = submittedJobs.stream()
                .filter(job -> !startedJobs.contains(job))
                .collect(Collectors.toList());
        int remainingCount = submittedJobs.size();
        if (submittedJobs.isEmpty() && remainingCount == previousCount) {
            break;
        }
        if (remainingCount == previousCount) {
            continue;
        }
        for (Job job : startedJobs) {
            // Report each job that has moved out of the startup statuses.
            if (job.getName() != null) {
                System.out.println("Job '" + job.getName() + "' submitted at "
                        + toLocalDateTime(job.getSubmissionTime()) + " changed status to "
                        + job.getStatus() + " at " + toLocalDateTime(System.currentTimeMillis()) + ".");
            } else {
                System.out.println("Job '" + job.getIdString() + "' submitted at "
                        + toLocalDateTime(job.getSubmissionTime()) + " changed status to "
                        + job.getStatus() + " at " + toLocalDateTime(System.currentTimeMillis()) + ".");
            }
        }
        if (remainingCount == 1) {
            System.out.println("A job is still starting...");
        } else if (remainingCount > 1) {
            System.out.format("%,d jobs are still starting...%n", remainingCount);
        }
        previousCount = remainingCount;
    }
}
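STARTUP_STATUSES is a constant defined elsewhere in HazelcastBootstrap; the sketch below assumes it covers the statuses a job holds before it actually runs (NOT_RUNNING and STARTING) and applies the same wait pattern to a single job, using the Hazelcast 5.x API (hypothetical class name AwaitSingleJobStarted):

import java.util.EnumSet;
import java.util.Set;

import com.hazelcast.core.Hazelcast;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.core.JobStatus;
import com.hazelcast.jet.pipeline.Pipeline;
import com.hazelcast.jet.pipeline.Sinks;
import com.hazelcast.jet.pipeline.test.TestSources;

public class AwaitSingleJobStarted {
    // Assumption: treat NOT_RUNNING and STARTING as "still starting up",
    // mirroring what HazelcastBootstrap's STARTUP_STATUSES appears to cover.
    private static final Set<JobStatus> STARTUP = EnumSet.of(JobStatus.NOT_RUNNING, JobStatus.STARTING);

    public static void main(String[] args) throws InterruptedException {
        HazelcastInstance hz = Hazelcast.newHazelcastInstance();

        Pipeline p = Pipeline.create();
        p.readFrom(TestSources.items(1, 2, 3)).writeTo(Sinks.logger());

        Job job = hz.getJet().newJob(p);
        while (STARTUP.contains(job.getStatus())) {
            Thread.sleep(100);
        }
        System.out.println("Job is now " + job.getStatus());

        job.join();
        Hazelcast.shutdownAll();
    }
}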
Use of com.hazelcast.jet.core.JobStatus.STARTING in project hazelcast-jet by hazelcast.
The class TopologyChangeTest, method when_jobParticipantReceivesStaleInitOperation_then_jobRestarts.
@Test
public void when_jobParticipantReceivesStaleInitOperation_then_jobRestarts() {
    // Given
    JetInstance newInstance = createJetMember(config);
    for (JetInstance instance : instances) {
        assertClusterSizeEventually(NODE_COUNT + 1, instance.getHazelcastInstance());
    }
    rejectOperationsBetween(instances[0].getHazelcastInstance(), instances[2].getHazelcastInstance(),
            JetInitDataSerializerHook.FACTORY_ID, singletonList(INIT_EXECUTION_OP));

    DAG dag = new DAG().vertex(new Vertex("test", new MockPS(TestProcessors.Identity::new, NODE_COUNT + 1)));
    Job job = instances[0].newJob(dag);
    JetService jetService = getJetService(instances[0]);

    assertTrueEventually(() -> assertFalse(jetService.getJobCoordinationService().getMasterContexts().isEmpty()));

    MasterContext masterContext = jetService.getJobCoordinationService().getMasterContext(job.getId());
    assertTrueEventually(() -> {
        assertEquals(STARTING, masterContext.jobStatus());
        assertNotEquals(0, masterContext.getExecutionId());
    });

    // When
    long executionId = masterContext.getExecutionId();

    assertTrueEventually(() -> {
        Arrays.stream(instances)
              .filter(instance -> !instance.getHazelcastInstance().getCluster().getLocalMember().isLiteMember())
              .filter(instance -> instance != instances[2])
              .map(JetTestSupport::getJetService)
              .map(service -> service.getJobExecutionService().getExecutionContext(executionId))
              .forEach(Assert::assertNotNull);
    });

    newInstance.getHazelcastInstance().getLifecycleService().terminate();
    for (JetInstance instance : instances) {
        assertClusterSizeEventually(NODE_COUNT, instance.getHazelcastInstance());
    }
    resetPacketFiltersFrom(instances[0].getHazelcastInstance());

    // Then
    job.join();
    assertNotEquals(executionId, masterContext.getExecutionId());
}
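The restart the test forces with packet filters and member termination can also be triggered explicitly through the public API, in which case the job is typically seen in STARTING again before it returns to RUNNING. A minimal sketch, assuming a Jet version that exposes Job.restart() (hypothetical class names RestartAndWatchStatus and NeverCompletingP):

import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.core.AbstractProcessor;
import com.hazelcast.jet.core.DAG;
import com.hazelcast.jet.core.JobStatus;

public class RestartAndWatchStatus {

    // A processor that never finishes, so the job stays RUNNING until it is
    // restarted or cancelled (the role StuckProcessor plays in the tests above).
    public static final class NeverCompletingP extends AbstractProcessor {
        @Override
        public boolean complete() {
            return false;
        }
    }

    public static void main(String[] args) throws InterruptedException {
        JetInstance jet = Jet.newJetInstance();

        DAG dag = new DAG();
        dag.newVertex("stuck", NeverCompletingP::new);

        Job job = jet.newJob(dag);
        while (job.getStatus() != JobStatus.RUNNING) {
            Thread.sleep(100);
        }

        // Force a restart: the current execution is torn down and the job is
        // typically observed in STARTING again before it returns to RUNNING.
        job.restart();
        while (job.getStatus() != JobStatus.RUNNING) {
            System.out.println("status after restart(): " + job.getStatus());
            Thread.sleep(100);
        }

        job.cancel();
        Jet.shutdownAll();
    }
}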