Use of com.hazelcast.jet.core.JobStatus.RUNNING in project hazelcast by hazelcast.
The class MasterJobContext, method onStartExecutionComplete:
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus status = mc.jobStatus();
    if (status != STARTING && status != RUNNING) {
        logCannotComplete(error);
        error = new IllegalStateException("Job coordination failed");
    }

    setJobMetrics(responses.stream()
            .filter(en -> en.getValue() instanceof RawJobMetrics)
            .map(e1 -> (RawJobMetrics) e1.getValue())
            .collect(Collectors.toList()));

    if (error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot()) {
        Throwable finalError = error;
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        mc.snapshotContext().terminalSnapshotFuture()
          .whenCompleteAsync(withTryCatch(logger, (r, e) -> finalizeJob(finalError)));
    } else {
        if (error instanceof ExecutionNotFoundException) {
            // If the StartExecutionOperation didn't find the execution, it means that it was cancelled.
            if (requestedTerminationMode != null) {
                // This cancellation can be because the master cancelled it. If that's the case,
                // convert the exception to JobTerminateRequestedException.
                error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
            }
            // The cancellation can also happen if some participant left and the target cancelled
            // the execution locally in JobExecutionService.onMemberRemoved().
            // We keep this (and possibly other) exceptions as they are
            // and let the execution complete with failure.
        }
        finalizeJob(error);
    }
}
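For orientation, the public counterpart of the status check performed above is Job.getStatus(). A minimal sketch, assuming a HazelcastInstance named hz and a known job id (both placeholders, not from the code above), of checking from client code whether a job is currently executing:

import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.jet.Job;
import static com.hazelcast.jet.core.JobStatus.RUNNING;

// Illustrative sketch (hz and jobId are placeholders): look up a job by id on any
// member or client and report whether it is currently executing.
static boolean isRunning(HazelcastInstance hz, long jobId) {
    Job job = hz.getJet().getJob(jobId);
    return job != null && job.getStatus() == RUNNING;
}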
Use of com.hazelcast.jet.core.JobStatus.RUNNING in project hazelcast by hazelcast.
The class SplitBrainTest, method when_splitBrainProtectionDisabled_then_jobRunsTwiceAndAgainOnceAfterHeal:
@Test
public void when_splitBrainProtectionDisabled_then_jobRunsTwiceAndAgainOnceAfterHeal() {
    int firstSubClusterSize = 3;
    int secondSubClusterSize = 2;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    NoOutputSourceP.executionStarted = new CountDownLatch(secondSubClusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<HazelcastInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(NoOutputSourceP::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        jobRef[0] = instances[0].getJet().newJob(dag, new JobConfig().setSplitBrainProtection(false));
        assertTrueEventually(() -> assertEquals("initCount", clusterSize, MockPS.initCount.get()), 10);
        assertOpenEventually("executionStarted", NoOutputSourceP.executionStarted);
    };

    BiConsumer<HazelcastInstance[], HazelcastInstance[]> onSplit = (firstSubCluster, secondSubCluster) -> {
        Job jobRef1 = firstSubCluster[0].getJet().getJob(jobRef[0].getId());
        Job jobRef2 = secondSubCluster[0].getJet().getJob(jobRef[0].getId());
        assertNotNull("jobRef1", jobRef1);
        assertNotNull("jobRef2", jobRef2);
        assertTrueEventually(() -> assertEquals("job not running on subcluster 1", RUNNING, jobRef1.getStatus()));
        assertTrueEventually(() -> assertEquals("job not running on subcluster 2", RUNNING, jobRef2.getStatus()));
        // We need assert-eventually here because we might observe RUNNING state from an execution before the split.
        assertTrueEventually(() -> assertEquals("initCount", clusterSize * 2, MockPS.initCount.get()));
    };

    Consumer<HazelcastInstance[]> afterMerge = instances -> {
        // This assert will hold after the job scales up.
        assertTrueEventually(() -> assertEquals(clusterSize * 3, MockPS.initCount.get()), 20);
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, onSplit, afterMerge);
}
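assertTrueEventually, assertOpenEventually and assertJobStatusEventually are Hazelcast test-support helpers that retry an assertion until it passes or a timeout elapses. A simplified, illustrative stand-in for the job-status case (the helper name, polling interval and timeout below are assumptions, not the real implementation):

import com.hazelcast.jet.Job;
import com.hazelcast.jet.core.JobStatus;
import java.util.concurrent.TimeUnit;

// Simplified stand-in for the test-support helpers used above (not the actual Hazelcast
// implementation): poll the job status until it matches or the timeout expires.
static void awaitStatus(Job job, JobStatus expected, long timeoutSeconds) throws InterruptedException {
    long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(timeoutSeconds);
    while (job.getStatus() != expected) {
        if (System.nanoTime() > deadline) {
            throw new AssertionError("Job status is " + job.getStatus() + ", expected " + expected);
        }
        Thread.sleep(100);
    }
}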
Use of com.hazelcast.jet.core.JobStatus.RUNNING in project hazelcast by hazelcast.
The class SplitBrainTest, method when_minorityMasterBecomesMajorityMaster_then_jobKeepsRunning:
@Test
public void when_minorityMasterBecomesMajorityMaster_then_jobKeepsRunning() {
    int firstSubClusterSize = 2;
    int secondSubClusterSize = 1;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    NoOutputSourceP.executionStarted = new CountDownLatch(secondSubClusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<HazelcastInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(NoOutputSourceP::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        jobRef[0] = instances[2].getJet().newJob(dag);
        assertOpenEventually(NoOutputSourceP.executionStarted);
    };

    Consumer<HazelcastInstance[]> afterMerge = instances -> {
        assertEquals(clusterSize, instances.length);
        logger.info("Shutting down 1st instance");
        instances[0].shutdown();
        logger.info("1st instance down, starting another instance");
        createHazelcastInstance(createConfig());
        logger.info("Shutting down 2nd instance");
        instances[1].shutdown();
        assertTrue(((ClusterService) instances[2].getCluster()).isMaster());
        assertJobStatusEventually(jobRef[0], RUNNING, 10);
        assertTrueAllTheTime(() -> assertEquals(RUNNING, jobRef[0].getStatus()), 5);
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, null, afterMerge);
}
Use of com.hazelcast.jet.core.JobStatus.RUNNING in project hazelcast by hazelcast.
The class SnapshotLargeChunk_IntegrationTest, method test_snapshotRestoreLargeChunk:
@Test
public void test_snapshotRestoreLargeChunk() {
    HazelcastInstance instance = createHazelcastInstance();
    DAG dag = new DAG();
    dag.newVertex("src", LargeStateP::new).localParallelism(1);
    Job job = instance.getJet().newJob(dag, new JobConfig()
            .setProcessingGuarantee(ProcessingGuarantee.EXACTLY_ONCE)
            .setSnapshotIntervalMillis(DAYS.toMillis(1)));
    assertJobStatusEventually(job, RUNNING);
    job.restart();
    assertJobStatusEventually(job, RUNNING);

    // Assert that the snapshot exists and that the chunk was large enough.
    IMap<Object, Object> map = instance.getMap(JobRepository.snapshotDataMapName(job.getId(), 0));
    SnapshotValidationRecord validationRec = (SnapshotValidationRecord) map.get(SnapshotValidationRecord.KEY);
    assertEquals(1, validationRec.numChunks());
    IntSummaryStatistics stats = map.values().stream()
            .filter(v -> v instanceof byte[])
            .collect(summarizingInt(v -> ((byte[]) v).length));
    assertTrue("min=" + stats.getMin(), stats.getMin() > AsyncSnapshotWriterImpl.DEFAULT_CHUNK_SIZE);
}
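LargeStateP is a test processor whose source is not part of this excerpt. A hypothetical processor that emits one large value to the snapshot, so that the snapshot chunking exercised above comes into play, might look roughly like this (the class name, state size and snapshot key are assumptions):

import com.hazelcast.jet.core.AbstractProcessor;

// Hypothetical stand-in for a processor with large snapshot state (LargeStateP itself
// is not shown here). The 1 MB byte[] is an assumption chosen to exceed the default
// snapshot chunk size.
class LargeStateSketchP extends AbstractProcessor {
    private final byte[] state = new byte[1024 * 1024];

    @Override
    public boolean complete() {
        // Never complete: keep the job RUNNING so snapshots can be taken.
        return false;
    }

    @Override
    public boolean saveToSnapshot() {
        // Offer the whole state under a single key; called again until it returns true.
        return tryEmitToSnapshot("state", state);
    }
}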
Use of com.hazelcast.jet.core.JobStatus.RUNNING in project hazelcast by hazelcast.
The class WriteFilePTest, method stressTest:
private void stressTest(boolean graceful, boolean exactlyOnce) throws Exception {
    int numItems = 500;
    Pipeline p = Pipeline.create();
    p.readFrom(SourceBuilder
            .stream("src", procCtx -> tuple2(new int[1], procCtx.logger()))
            .fillBufferFn((ctx, buf) -> {
                if (ctx.f0()[0] < numItems) {
                    buf.add(ctx.f0()[0]++);
                    sleepMillis(5);
                }
            })
            .createSnapshotFn(ctx -> {
                ctx.f1().fine("src vertex saved to snapshot: " + ctx.f0()[0]);
                return ctx.f0()[0];
            })
            .restoreSnapshotFn((ctx, state) -> {
                ctx.f0()[0] = state.get(0);
                ctx.f1().fine("src vertex restored from snapshot: " + ctx.f0()[0]);
            })
            .build())
     .withoutTimestamps()
     .writeTo(Sinks.filesBuilder(directory.toString()).exactlyOnce(exactlyOnce).build())
     .setLocalParallelism(2);

    JobConfig config = new JobConfig()
            .setProcessingGuarantee(EXACTLY_ONCE)
            .setSnapshotIntervalMillis(50);
    JobProxy job = (JobProxy) instance().getJet().newJob(p, config);

    long endTime = System.nanoTime() + SECONDS.toNanos(60);
    do {
        assertJobStatusEventually(job, RUNNING);
        sleepMillis(100);
        job.restart(graceful);
        try {
            checkFileContents(0, numItems, exactlyOnce, true, false);
            // If the contents match, break the loop. Otherwise restart and try again.
            break;
        } catch (AssertionError ignored) {
        }
    } while (System.nanoTime() < endTime);

    waitForNextSnapshot(new JobRepository(instance()), job.getId(), 10, true);
    ditchJob(job, instances());
    // When the job is cancelled, there should be no temporary files.
    checkFileContents(0, numItems, exactlyOnce, false, false);
}