use of org.apache.flink.core.testutils.OneShotLatch in project flink by apache.
the class DispatcherCleanupITCase method createAndStartJobGraphStoreWithCleanupFailures.
/**
 * Builds and starts a {@link TestingJobGraphStore} whose global cleanup completes exceptionally
 * for the first {@code numberOfCleanupFailures} invocations and completes normally afterwards.
 *
 * @param numberOfCleanupFailures number of cleanup calls that should fail before one succeeds
 * @param throwable the error the failing cleanup futures are completed with
 * @param actualCleanupCallCount counter incremented on every cleanup invocation
 * @param successfulCleanupLatch latch triggered by each non-failing cleanup invocation
 * @return the started job graph store
 * @throws Exception if building or starting the store fails
 */
private JobGraphStore createAndStartJobGraphStoreWithCleanupFailures(int numberOfCleanupFailures, Throwable throwable, AtomicInteger actualCleanupCallCount, OneShotLatch successfulCleanupLatch) throws Exception {
    final AtomicInteger remainingFailures = new AtomicInteger(numberOfCleanupFailures);
    final JobGraphStore store =
            TestingJobGraphStore.newBuilder()
                    .setGlobalCleanupFunction(
                            (ignoredJobId, ignoredExecutor) -> {
                                actualCleanupCallCount.incrementAndGet();
                                // The budget is decremented on every call, failing or not.
                                final boolean shouldFail = remainingFailures.getAndDecrement() > 0;
                                if (!shouldFail) {
                                    successfulCleanupLatch.trigger();
                                    return FutureUtils.completedVoidFuture();
                                }
                                return FutureUtils.completedExceptionally(throwable);
                            })
                    .build();
    store.start(null);
    return store;
}
use of org.apache.flink.core.testutils.OneShotLatch in project flink by apache.
the class DispatcherCleanupITCase method testCleanupThroughRetries.
/**
 * Checks that job cleanup eventually succeeds when the job graph store's global cleanup fails
 * {@code numberOfErrors} times and the dispatcher's resource cleaner is created with a retry
 * strategy allowing the same number of retries.
 */
@Test
public void testCleanupThroughRetries() throws Exception {
    final JobGraph graph = createJobGraph();
    final JobID jobId = graph.getJobID();

    // Job graph store whose global cleanup fails numberOfErrors times before it succeeds.
    final int numberOfErrors = 5;
    final RuntimeException temporaryError = new RuntimeException("Expected RuntimeException: Unable to remove job graph.");
    final AtomicInteger globalCleanupCalls = new AtomicInteger();
    final OneShotLatch cleanupSucceeded = new OneShotLatch();
    haServices.setJobGraphStore(
            createAndStartJobGraphStoreWithCleanupFailures(
                    numberOfErrors, temporaryError, globalCleanupCalls, cleanupSucceeded));

    // Leader election service for the job master.
    final TestingLeaderElectionService electionService = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(jobId, electionService);

    // Dispatcher whose cleanup retry budget matches the number of injected failures.
    final JobManagerRunnerRegistry runnerRegistry = new DefaultJobManagerRunnerRegistry(2);
    final Dispatcher dispatcher =
            createTestingDispatcherBuilder()
                    .setResourceCleanerFactory(
                            new DispatcherResourceCleanerFactory(
                                    ForkJoinPool.commonPool(),
                                    TestingRetryStrategies.createWithNumberOfRetries(numberOfErrors),
                                    runnerRegistry,
                                    haServices.getJobGraphStore(),
                                    blobServer,
                                    haServices,
                                    UnregisteredMetricGroups.createUnregisteredJobManagerMetricGroup()))
                    .build();
    dispatcher.start();
    toTerminate.add(dispatcher);
    electionService.isLeader(UUID.randomUUID());

    // Submit the job and let it run to completion.
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    gateway.submitJob(graph, TIMEOUT).get();
    waitForJobToFinish(electionService, gateway, jobId);

    // Every injected failure plus the final successful attempt must have been observed.
    cleanupSucceeded.await();
    final int expectedCleanupCalls = numberOfErrors + 1;
    assertThat(globalCleanupCalls.get(), equalTo(expectedCleanupCalls));
    assertThat("The JobGraph should be removed from JobGraphStore.", haServices.getJobGraphStore().getJobIds(), IsEmptyCollection.empty());
    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().hasJobResultEntry(jobId),
            Deadline.fromNow(Duration.ofMinutes(5)),
            "The JobResultStore should have this job marked as clean.");
}
use of org.apache.flink.core.testutils.OneShotLatch in project flink by apache.
the class DispatcherCleanupITCase method testCleanupAfterLeadershipChange.
/**
 * Checks that a cleanup which failed under a first dispatcher is completed by a second
 * dispatcher that is started with the dirty job results recovered from the job result store.
 */
@Test
public void testCleanupAfterLeadershipChange() throws Exception {
    final JobGraph graph = createJobGraph();
    final JobID jobId = graph.getJobID();

    // Job graph store whose global cleanup fails exactly once.
    final AtomicInteger globalCleanupCalls = new AtomicInteger();
    final OneShotLatch cleanupSucceeded = new OneShotLatch();
    final RuntimeException temporaryError = new RuntimeException("Unable to remove job graph.");
    haServices.setJobGraphStore(
            createAndStartJobGraphStoreWithCleanupFailures(
                    1, temporaryError, globalCleanupCalls, cleanupSucceeded));

    // Leader election service for the job master.
    final TestingLeaderElectionService electionService = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(jobId, electionService);

    // First dispatcher: the expected cleanup error is intercepted by the fatal error handler,
    // anything else is forwarded to the shared testing handler.
    final CountDownLatch removalErrorSeen = new CountDownLatch(1);
    final Dispatcher dispatcher =
            createTestingDispatcherBuilder()
                    .setFatalErrorHandler(
                            error -> {
                                if (ExceptionUtils.findThrowable(error, temporaryError::equals).isPresent()) {
                                    removalErrorSeen.countDown();
                                    return;
                                }
                                testingFatalErrorHandlerResource.getFatalErrorHandler().onFatalError(error);
                            })
                    .build();
    dispatcher.start();
    toTerminate.add(dispatcher);
    electionService.isLeader(UUID.randomUUID());

    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    gateway.submitJob(graph, TIMEOUT).get();
    waitForJobToFinish(electionService, gateway, jobId);
    removalErrorSeen.await();

    // Revoke job master leadership.
    electionService.notLeader();
    // Stopping clears the election service's internal state so a new contender can register.
    electionService.stop();

    // After the failed attempt the job must still be stored and marked as dirty.
    assertThat(cleanupSucceeded.isTriggered(), CoreMatchers.is(false));
    assertThat("The JobGraph is still stored in the JobGraphStore.", haServices.getJobGraphStore().getJobIds(), CoreMatchers.is(Collections.singleton(jobId)));
    assertThat("The JobResultStore has this job marked as dirty.", haServices.getJobResultStore().getDirtyResults().stream().map(JobResult::getJobId).collect(Collectors.toSet()), CoreMatchers.is(Collections.singleton(jobId)));

    // Second dispatcher: started with the dirty results so it picks up the finished job.
    final Dispatcher secondDispatcher =
            createTestingDispatcherBuilder()
                    .setRecoveredDirtyJobs(haServices.getJobResultStore().getDirtyResults())
                    .build();
    secondDispatcher.start();
    toTerminate.add(secondDispatcher);
    electionService.isLeader(UUID.randomUUID());

    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().getDirtyResults().isEmpty(),
            Deadline.fromNow(TimeUtils.toDuration(TIMEOUT)));
    assertThat("The JobGraph is not stored in the JobGraphStore.", haServices.getJobGraphStore().getJobIds(), IsEmptyCollection.empty());
    assertTrue("The JobResultStore has the job listed as clean.", haServices.getJobResultStore().hasJobResultEntry(jobId));

    // One failed and one successful cleanup call are expected in total.
    cleanupSucceeded.await();
    assertThat(globalCleanupCalls.get(), equalTo(2));
}
use of org.apache.flink.core.testutils.OneShotLatch in project flink by apache.
the class RemoteInputChannelTest method testOnFailedPartitionRequestDoesNotBlockNetworkThreads.
/**
 * Regression test for FLINK-13249: a simulated network thread that calls
 * {@code onFailedPartitionRequest()} must be able to return while the test thread is still
 * parked inside {@code createPartitionRequestClient(...)}.
 */
@Test
public void testOnFailedPartitionRequestDoesNotBlockNetworkThreads() throws Exception {
    final long blockedWaitTimeoutMs = 30_000L;

    final PartitionProducerStateChecker alwaysRunningChecker =
            (jobId, intermediateDataSetId, resultPartitionId) ->
                    CompletableFuture.completedFuture(ExecutionState.RUNNING);
    final NettyShuffleEnvironment shuffleEnvironment = new NettyShuffleEnvironmentBuilder().build();
    final Task task =
            new TestTaskBuilder(shuffleEnvironment)
                    .setPartitionProducerStateChecker(alwaysRunningChecker)
                    .build();
    final SingleInputGate inputGate =
            new SingleInputGateBuilder().setPartitionProducerStateProvider(task).build();
    TestTaskBuilder.setTaskState(task, ExecutionState.RUNNING);

    final OneShotLatch ready = new OneShotLatch();
    final OneShotLatch blocker = new OneShotLatch();
    final AtomicBoolean timedOutOrInterrupted = new AtomicBoolean(false);

    final ConnectionManager blockingConnectionManager =
            new TestingConnectionManager() {
                @Override
                public PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) {
                    ready.trigger();
                    try {
                        // Park here, in a section that holds the SingleInputGate#requestLock,
                        // until the simulated network thread releases us or the wait times out.
                        blocker.await(blockedWaitTimeoutMs, TimeUnit.MILLISECONDS);
                    } catch (InterruptedException | TimeoutException e) {
                        timedOutOrInterrupted.set(true);
                    }
                    return new TestingPartitionRequestClient();
                }
            };

    final RemoteInputChannel remoteInputChannel =
            InputChannelBuilder.newBuilder()
                    .setConnectionManager(blockingConnectionManager)
                    .buildRemoteChannel(inputGate);
    inputGate.setInputChannels(remoteInputChannel);

    final Runnable networkThreadBody =
            () -> {
                try {
                    ready.await();
                    // This call must not block on SingleInputGate#requestLock as well.
                    remoteInputChannel.onFailedPartitionRequest();
                    // The blocker is only released if the call above returned.
                    blocker.trigger();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            };
    final Thread simulatedNetworkThread = new Thread(networkThreadBody);
    simulatedNetworkThread.start();

    // Entry point that leads into blockingConnectionManager#createPartitionRequestClient(...).
    inputGate.requestPartitions();
    simulatedNetworkThread.join();

    Assert.assertFalse("Test ended by timeout or interruption - this indicates that the network thread was blocked.", timedOutOrInterrupted.get());
}
use of org.apache.flink.core.testutils.OneShotLatch in project flink by apache.
the class TaskExecutorTest method testSlotAcceptance.
/**
 * Verifies slot acceptance handling: of two offered slots the job master accepts only the
 * first one, the second one is reported back to the resource manager as available, a task can
 * be submitted to the accepted slot only, and the returned slot can be requested again.
 */
@Test
public void testSlotAcceptance() throws Exception {
    // Resource manager hooks: registration latch and future for the slot reported as available.
    final InstanceID registrationId = new InstanceID();
    final OneShotLatch taskExecutorIsRegistered = new OneShotLatch();
    final CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> availableSlotFuture = new CompletableFuture<>();
    final TestingResourceManagerGateway resourceManagerGateway =
            createRmWithTmRegisterAndNotifySlotHooks(
                    registrationId, taskExecutorIsRegistered, availableSlotFuture);

    final AllocationID allocationId1 = new AllocationID();
    final AllocationID allocationId2 = new AllocationID();
    final SlotOffer acceptedOffer = new SlotOffer(allocationId1, 0, ResourceProfile.ANY);

    // Job master hooks: slot-offer latch, task-termination latch and the offer response.
    final OneShotLatch offerSlotsLatch = new OneShotLatch();
    final OneShotLatch taskInTerminalState = new OneShotLatch();
    final CompletableFuture<Collection<SlotOffer>> offerResultFuture = new CompletableFuture<>();
    final TestingJobMasterGateway jobMasterGateway =
            createJobMasterWithSlotOfferAndTaskTerminationHooks(
                    offerSlotsLatch, taskInTerminalState, offerResultFuture);

    rpc.registerGateway(resourceManagerGateway.getAddress(), resourceManagerGateway);
    rpc.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);

    final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(2);
    final TaskManagerServices taskManagerServices = createTaskManagerServicesWithTaskSlotTable(taskSlotTable);
    final TestingTaskExecutor taskManager = createTestingTaskExecutor(taskManagerServices);

    try {
        taskManager.start();
        taskManager.waitUntilStarted();
        final TaskExecutorGateway tmGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);

        // Wait until the task executor has registered at the resource manager.
        taskExecutorIsRegistered.await();

        // Request one slot per allocation id; the slot index equals the position in the array.
        final AllocationID[] allocationIds = {allocationId1, allocationId2};
        for (int slotIndex = 0; slotIndex < allocationIds.length; slotIndex++) {
            requestSlot(
                    tmGateway,
                    jobId,
                    allocationIds[slotIndex],
                    buildSlotID(slotIndex),
                    ResourceProfile.UNKNOWN,
                    jobMasterGateway.getAddress(),
                    resourceManagerGateway.getFencingToken());
        }

        // Notify the job leader so that slot offering starts.
        jobManagerLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());

        // Once the slots have been offered, accept only the first offer.
        offerSlotsLatch.await();
        offerResultFuture.complete(Collections.singletonList(acceptedOffer));

        // The second slot must be reported back to the resource manager as available.
        final Tuple3<InstanceID, SlotID, AllocationID> actualAvailableSlot = availableSlotFuture.get();
        final Tuple3<InstanceID, SlotID, AllocationID> expectedAvailableSlot = Tuple3.of(registrationId, buildSlotID(1), allocationId2);
        assertThat(actualAvailableSlot, equalTo(expectedAvailableSlot));

        // The accepted slot can be used for task submission.
        submit(allocationId1, jobMasterGateway, tmGateway, NoOpInvokable.class);

        // Wait for the submitted task to reach a terminal state.
        taskInTerminalState.await();

        // The rejected slot must NOT be usable for task submission.
        try {
            submit(allocationId2, jobMasterGateway, tmGateway, NoOpInvokable.class);
            fail("It should not be possible to submit task to acquired by JM slot with index 1 (allocationId2)");
        } catch (CompletionException e) {
            assertThat(e.getCause(), instanceOf(TaskSubmissionException.class));
        }

        // The rejected slot is free and can be requested again.
        requestSlot(
                tmGateway,
                jobId,
                allocationId2,
                buildSlotID(1),
                ResourceProfile.UNKNOWN,
                jobMasterGateway.getAddress(),
                resourceManagerGateway.getFencingToken());
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
Aggregations