use of com.hazelcast.jet.core.JobStatus.COMPLETED in project hazelcast by hazelcast.
the class SplitBrainTest method when_quorumIsLostOnMinority_then_jobDoesNotRestartOnMinorityAndCancelledAfterMerge.
@Test
public void when_quorumIsLostOnMinority_then_jobDoesNotRestartOnMinorityAndCancelledAfterMerge() {
    int firstSubClusterSize = 3;
    int secondSubClusterSize = 2;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    NoOutputSourceP.executionStarted = new CountDownLatch(clusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<HazelcastInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(NoOutputSourceP::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        // Split-brain protection is opt-in per job.
        jobRef[0] = instances[0].getJet().newJob(dag, new JobConfig().setSplitBrainProtection(true));
        assertOpenEventually(NoOutputSourceP.executionStarted);
    };

    Future[] minorityJobFutureRef = new Future[1];

    BiConsumer<HazelcastInstance[], HazelcastInstance[]> onSplit = (firstSubCluster, secondSubCluster) -> {
        NoOutputSourceP.proceedLatch.countDown();
        assertTrueEventually(() -> assertEquals(clusterSize + firstSubClusterSize, MockPS.initCount.get()));

        long jobId = jobRef[0].getId();

        // The majority sub-cluster keeps quorum, restarts the job and runs it to completion.
        assertTrueEventually(() -> {
            JetServiceBackend service = getJetServiceBackend(firstSubCluster[0]);
            assertEquals(COMPLETED, service.getJobCoordinationService().getJobStatus(jobId).get());
        });

        // The minority sub-cluster tracks the job but must not restart it without quorum.
        JetServiceBackend service2 = getJetServiceBackend(secondSubCluster[0]);
        assertTrueEventually(() -> {
            MasterContext masterContext = service2.getJobCoordinationService().getMasterContext(jobId);
            assertNotNull(masterContext);
            minorityJobFutureRef[0] = masterContext.jobContext().jobCompletionFuture();
        });
        assertTrueAllTheTime(() -> {
            assertStatusNotRunningOrStarting(service2.getJobCoordinationService().getJobStatus(jobId).get());
        }, 20);
    };

    Consumer<HazelcastInstance[]> afterMerge = instances -> {
        assertTrueEventually(() -> {
            assertEquals(clusterSize + firstSubClusterSize, MockPS.initCount.get());
            assertEquals(clusterSize + firstSubClusterSize, MockPS.closeCount.get());
        });
        assertEquals(clusterSize, MockPS.receivedCloseErrors.size());
        MockPS.receivedCloseErrors.forEach(t -> assertTrue("received " + t, t instanceof CancellationException));

        // After the merge, the job tracked on the former minority side ends up cancelled.
        try {
            minorityJobFutureRef[0].get();
            fail();
        } catch (CancellationException expected) {
        } catch (Exception e) {
            throw new AssertionError(e);
        }
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, onSplit, afterMerge);
}
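The protection this test exercises is entirely a matter of job configuration. Below is a minimal sketch of submitting a job with split-brain protection enabled through the public API; the HazelcastInstance and Pipeline parameters are assumptions for illustration and are not part of the test above.

import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.pipeline.Pipeline;

class SplitBrainProtectedSubmit {
    static Job submitWithSplitBrainProtection(HazelcastInstance hz, Pipeline pipeline) {
        JobConfig config = new JobConfig()
                // Only allow the job to restart when a majority of the original cluster is present.
                .setSplitBrainProtection(true);
        return hz.getJet().newJob(pipeline, config);
    }
}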
use of com.hazelcast.jet.core.JobStatus.COMPLETED in project hazelcast by hazelcast.
the class MasterJobContext method onStartExecutionComplete.
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus status = mc.jobStatus();
    if (status != STARTING && status != RUNNING) {
        logCannotComplete(error);
        error = new IllegalStateException("Job coordination failed");
    }

    setJobMetrics(responses.stream()
            .filter(en -> en.getValue() instanceof RawJobMetrics)
            .map(e1 -> (RawJobMetrics) e1.getValue())
            .collect(Collectors.toList()));

    if (error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot()) {
        Throwable finalError = error;
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        mc.snapshotContext().terminalSnapshotFuture()
          .whenCompleteAsync(withTryCatch(logger, (r, e) -> finalizeJob(finalError)));
    } else {
        if (error instanceof ExecutionNotFoundException) {
            // If the StartExecutionOperation didn't find the execution, it means that it was cancelled.
            if (requestedTerminationMode != null) {
                // This cancellation can be because the master cancelled it. If that's the case,
                // convert the exception to JobTerminateRequestedException.
                error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
            }
            // The cancellation can also happen if some participant left and the target cancelled
            // the execution locally in JobExecutionService.onMemberRemoved(). We keep this (and
            // possibly other) exceptions as they are and let the execution complete with failure.
        }
        finalizeJob(error);
    }
}
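Whatever outcome this method settles on via finalizeJob() is visible to callers only through the public Job API. The sketch below is illustrative only, not Hazelcast code: it assumes a job handle obtained from newJob() and shows how a cancellation or failure decided by the coordinator typically surfaces on the client side.

import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletionException;

import com.hazelcast.jet.Job;
import com.hazelcast.jet.core.JobStatus;

class JobOutcomeProbe {
    static JobStatus awaitTerminalStatus(Job job) {
        try {
            job.join();                 // blocks until the coordinator finalizes the job
        } catch (CancellationException e) {
            // the job was cancelled, e.g. as a result of a termination request on the master
        } catch (CompletionException e) {
            // the job failed; the cause carries the error the coordinator finalized with
        }
        return job.getStatus();         // a terminal status such as COMPLETED or FAILED
    }
}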
use of com.hazelcast.jet.core.JobStatus.COMPLETED in project hazelcast by hazelcast.
the class SplitBrainTest method when_splitBrainProtectionIsDisabled_then_jobCompletesOnBothSides.
@Test
public void when_splitBrainProtectionIsDisabled_then_jobCompletesOnBothSides() {
    int firstSubClusterSize = 2;
    int secondSubClusterSize = 2;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    NoOutputSourceP.executionStarted = new CountDownLatch(clusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<HazelcastInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(NoOutputSourceP::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        jobRef[0] = instances[0].getJet().newJob(dag);
        assertOpenEventually(NoOutputSourceP.executionStarted);
    };

    BiConsumer<HazelcastInstance[], HazelcastInstance[]> onSplit = (firstSubCluster, secondSubCluster) -> {
        NoOutputSourceP.proceedLatch.countDown();
        long jobId = jobRef[0].getId();
        assertTrueEventually(() -> {
            JetServiceBackend service1 = getJetServiceBackend(firstSubCluster[0]);
            JetServiceBackend service2 = getJetServiceBackend(secondSubCluster[0]);
            assertEquals(COMPLETED, service1.getJobCoordinationService().getJobStatus(jobId).get());
            assertEquals(COMPLETED, service2.getJobCoordinationService().getJobStatus(jobId).get());
        });
    };

    Consumer<HazelcastInstance[]> afterMerge = instances -> {
        assertTrueEventually(() -> {
            assertEquals("init count", clusterSize * 2, MockPS.initCount.get());
            assertEquals("close count", clusterSize * 2, MockPS.closeCount.get());
        });
        assertEquals(clusterSize, MockPS.receivedCloseErrors.size());
        MockPS.receivedCloseErrors.forEach(t -> assertTrue("received " + t, t instanceof CancellationException));
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, onSplit, afterMerge);
}
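The contrast with the previous test comes down to one flag: JobConfig leaves split-brain protection disabled unless it is switched on explicitly, which is why the plain newJob(dag) call above lets both halves run the job to COMPLETED. A quick illustrative check, not taken from the test:

import com.hazelcast.jet.config.JobConfig;

class DefaultSplitBrainProtection {
    public static void main(String[] args) {
        // A default JobConfig leaves split-brain protection off.
        System.out.println(new JobConfig().isSplitBrainProtectionEnabled());   // false
    }
}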
use of com.hazelcast.jet.core.JobStatus.COMPLETED in project hazelcast by hazelcast.
the class WatermarkCoalescer_TerminalSnapshotTest method test.
@Test
public void test() throws Exception {
    /*
       This test covers the issue that after a terminal barrier is processed, no other work should
       be done by the ProcessorTasklet or CIES (except for emitting the DONE_ITEM). Also, if the
       at-least-once guarantee is used, the tasklet should not continue to drain the queue that had
       the barrier while waiting for the remaining barriers.

       Specifically, the issue was that in at-least-once mode the DONE_ITEM was processed after the
       terminal barrier while waiting for the barrier on other queues/edges. The DONE_ITEM could have
       caused a WM to be emitted after the barrier, which is OK for the at-least-once mode, but the
       terminal snapshot should behave as if exactly-once mode was used.

       This test ensures that we're waiting for a WM in the coalescer (by having a stream skew),
       then does a graceful restart in at-least-once mode and checks that the results are correct.
     */
    String key0 = generateKeyForPartition(instance, 0);
    String key1 = generateKeyForPartition(instance, 1);

    Pipeline p = Pipeline.create();
    p.readFrom(Sources.mapJournal(sourceMap, JournalInitialPosition.START_FROM_OLDEST))
     .withTimestamps(Map.Entry::getValue, 0)
     .setLocalParallelism(PARTITION_COUNT)
     .groupingKey(Map.Entry::getKey)
     .window(WindowDefinition.sliding(1, 1))
     .aggregate(AggregateOperations.counting())
     .setLocalParallelism(PARTITION_COUNT)
     .writeTo(SinkBuilder.sinkBuilder("throwing", ctx -> "")
                         .<KeyedWindowResult<String, Long>>receiveFn((w, kwr) -> {
                             if (kwr.result() != COUNT) {
                                 throw new RuntimeException("Received unexpected item " + kwr
                                         + ", expected count is " + COUNT);
                             }
                         }).build());

    Job job = instance.getJet().newJob(p, new JobConfig().setProcessingGuarantee(ProcessingGuarantee.AT_LEAST_ONCE));

    List<Future> futures = new ArrayList<>();
    futures.add(spawn(() -> {
        for (;;) {
            assertJobStatusEventually(job, JobStatus.RUNNING);
            System.out.println("============RESTARTING JOB=========");
            job.restart();
            Thread.sleep(2000);
        }
    }));

    // one producer is twice as fast as the other, to cause waiting for WM while doing snapshot
    futures.add(spawn(() -> producer(key0, 1)));
    futures.add(spawn(() -> producer(key1, 2)));

    sleepSeconds(20);

    for (Future f : futures) {
        f.cancel(true);
        // check that the future was cancelled and didn't fail with another error
        try {
            f.get();
            fail("Exception was expected");
        } catch (CancellationException expected) {
        }
    }

    // check that the job was not completed, failed or suspended
    JobStatus status = job.getStatus();
    assertTrue("job should not be completed, status=" + status,
            status != FAILED && status != COMPLETED && status != SUSPENDED);
}
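The two job settings this test leans on, the at-least-once processing guarantee and graceful restarts, are both plain public API. Below is a minimal sketch under the assumption of an existing HazelcastInstance and Pipeline (neither is defined in the test above).

import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.config.JobConfig;
import com.hazelcast.jet.config.ProcessingGuarantee;
import com.hazelcast.jet.pipeline.Pipeline;

class GracefulRestartSketch {
    static void runAndRestart(HazelcastInstance hz, Pipeline pipeline) {
        JobConfig config = new JobConfig()
                .setProcessingGuarantee(ProcessingGuarantee.AT_LEAST_ONCE)
                .setSnapshotIntervalMillis(1_000);   // periodic snapshots between restarts
        Job job = hz.getJet().newJob(pipeline, config);
        // restart() stops the current execution gracefully: with a processing guarantee set,
        // a terminal snapshot is taken first and the new execution resumes from it.
        job.restart();
    }
}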
use of com.hazelcast.jet.core.JobStatus.COMPLETED in project hazelcast-jet by hazelcast.
the class SplitBrainTest method when_splitBrainProtectionIsDisabled_then_jobCompletesOnBothSides.
@Test
public void when_splitBrainProtectionIsDisabled_then_jobCompletesOnBothSides() {
    int firstSubClusterSize = 2;
    int secondSubClusterSize = 2;
    int clusterSize = firstSubClusterSize + secondSubClusterSize;
    StuckProcessor.executionStarted = new CountDownLatch(clusterSize * PARALLELISM);
    Job[] jobRef = new Job[1];

    Consumer<JetInstance[]> beforeSplit = instances -> {
        MockPS processorSupplier = new MockPS(StuckProcessor::new, clusterSize);
        DAG dag = new DAG().vertex(new Vertex("test", processorSupplier));
        jobRef[0] = instances[0].newJob(dag);
        assertOpenEventually(StuckProcessor.executionStarted);
    };

    BiConsumer<JetInstance[], JetInstance[]> onSplit = (firstSubCluster, secondSubCluster) -> {
        StuckProcessor.proceedLatch.countDown();
        long jobId = jobRef[0].getId();
        assertTrueEventually(() -> {
            JetService service1 = getJetService(firstSubCluster[0]);
            JetService service2 = getJetService(secondSubCluster[0]);
            assertEquals(COMPLETED, service1.getJobCoordinationService().getJobStatus(jobId));
            assertEquals(COMPLETED, service2.getJobCoordinationService().getJobStatus(jobId));
        });
    };

    Consumer<JetInstance[]> afterMerge = instances -> {
        assertTrueEventually(() -> {
            assertEquals(clusterSize * 2, MockPS.initCount.get());
            assertEquals(clusterSize * 2, MockPS.closeCount.get());
        });
        assertEquals(clusterSize, MockPS.receivedCloseErrors.size());
        MockPS.receivedCloseErrors.forEach(t -> assertTrue(t instanceof TopologyChangedException));
    };

    testSplitBrain(firstSubClusterSize, secondSubClusterSize, beforeSplit, onSplit, afterMerge);
}
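This last variant comes from the standalone hazelcast-jet codebase, where jobs were submitted on a JetInstance rather than through HazelcastInstance.getJet(). For contrast, a minimal sketch of that older submission path; the DAG is assumed to exist and the class name is illustrative.

import com.hazelcast.jet.Jet;
import com.hazelcast.jet.JetInstance;
import com.hazelcast.jet.Job;
import com.hazelcast.jet.core.DAG;

class LegacyJetSubmit {
    static Job submit(DAG dag) {
        JetInstance jet = Jet.newJetInstance();   // standalone Jet member (pre-5.0 API)
        return jet.newJob(dag);
    }
}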