Search in sources :

Example 11 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

the class JobMasterTest method testReleasingTaskExecutorIfNoMoreSlotsRegistered.

/**
 * Tests that the TaskExecutor is released if all of its slots have been freed.
 */
@Test
public void testReleasingTaskExecutorIfNoMoreSlotsRegistered() throws Exception {
    final JobGraph jobGraph = createSingleVertexJobWithRestartStrategy();
    final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withConfiguration(configuration).withHighAvailabilityServices(haServices).withHeartbeatServices(heartbeatServices).createJobMaster();
    final CompletableFuture<JobID> disconnectTaskExecutorFuture = new CompletableFuture<>();
    final CompletableFuture<AllocationID> freedSlotFuture = new CompletableFuture<>();
    final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setFreeSlotFunction((allocationID, throwable) -> {
        freedSlotFuture.complete(allocationID);
        return CompletableFuture.completedFuture(Acknowledge.get());
    }).setDisconnectJobManagerConsumer((jobID, throwable) -> disconnectTaskExecutorFuture.complete(jobID)).createTestingTaskExecutorGateway();
    final LocalUnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    try {
        jobMaster.start();
        final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
        final Collection<SlotOffer> slotOffers = registerSlotsAtJobMaster(1, jobMasterGateway, jobGraph.getJobID(), testingTaskExecutorGateway, taskManagerLocation);
        // check that we accepted the offered slot
        assertThat(slotOffers, hasSize(1));
        final AllocationID allocationId = slotOffers.iterator().next().getAllocationId();
        // now fail the allocation and check that we close the connection to the TaskExecutor
        jobMasterGateway.failSlot(taskManagerLocation.getResourceID(), allocationId, new FlinkException("Fail allocation test exception"));
        // we should free the slot and then disconnect from the TaskExecutor because we use no
        // longer slots from it
        assertThat(freedSlotFuture.get(), equalTo(allocationId));
        assertThat(disconnectTaskExecutorFuture.get(), equalTo(jobGraph.getJobID()));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Also used : TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) DefaultSchedulerFactory(org.apache.flink.runtime.scheduler.DefaultSchedulerFactory) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) SlotPoolService(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolService) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) RestartStrategyOptions(org.apache.flink.configuration.RestartStrategyOptions) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) PhysicalSlot(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlot) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) Duration(java.time.Duration) Map(java.util.Map) Matchers.nullValue(org.hamcrest.Matchers.nullValue) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) ClassRule(org.junit.ClassRule) SimpleSlotContext(org.apache.flink.runtime.instance.SimpleSlotContext) SlotPoolServiceFactory(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolServiceFactory) AfterClass(org.junit.AfterClass) BlockingQueue(java.util.concurrent.BlockingQueue) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) Category(org.junit.experimental.categories.Category) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) CountDownLatch(java.util.concurrent.CountDownLatch) TimeUtils(org.apache.flink.util.TimeUtils) Matchers.is(org.hamcrest.Matchers.is) Time(org.apache.flink.api.common.time.Time) InputSplitSource(org.apache.flink.core.io.InputSplitSource) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) FlinkException(org.apache.flink.util.FlinkException) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) AccessExecution(org.apache.flink.runtime.executiongraph.AccessExecution) JobStatus(org.apache.flink.api.common.JobStatus) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) DefaultInputSplitAssigner(org.apache.flink.api.common.io.DefaultInputSplitAssigner) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) BiConsumer(java.util.function.BiConsumer) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) DistributionPattern(org.apache.flink.runtime.jobgraph.DistributionPattern) Nullable(javax.annotation.Nullable) CheckpointProperties(org.apache.flink.runtime.checkpoint.CheckpointProperties) Before(org.junit.Before) InputSplitAssigner(org.apache.flink.core.io.InputSplitAssigner) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) InputSplit(org.apache.flink.core.io.InputSplit) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) Test(org.junit.Test) IOException(java.io.IOException) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) File(java.io.File) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) ArrayDeque(java.util.ArrayDeque) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) CheckpointRetentionPolicy(org.apache.flink.runtime.checkpoint.CheckpointRetentionPolicy) Deadline(org.apache.flink.api.common.time.Deadline) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) BiFunction(java.util.function.BiFunction) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TaskExecutorToJobManagerHeartbeatPayload(org.apache.flink.runtime.taskexecutor.TaskExecutorToJobManagerHeartbeatPayload) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) InstantiationUtil(org.apache.flink.util.InstantiationUtil) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) TestingSchedulerNGFactory(org.apache.flink.runtime.scheduler.TestingSchedulerNGFactory) Assert.fail(org.junit.Assert.fail) BlobServerOptions(org.apache.flink.configuration.BlobServerOptions) CompletedCheckpointStorageLocation(org.apache.flink.runtime.state.CompletedCheckpointStorageLocation) Collection(java.util.Collection) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) UUID(java.util.UUID) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) Collectors(java.util.stream.Collectors) SlotInfoWithUtilization(org.apache.flink.runtime.jobmaster.slotpool.SlotInfoWithUtilization) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) Objects(java.util.Objects) TestingUtils(org.apache.flink.testutils.TestingUtils) List(java.util.List) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) Matchers.equalTo(org.hamcrest.Matchers.equalTo) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Optional(java.util.Optional) Queue(java.util.Queue) Matchers.anyOf(org.hamcrest.Matchers.anyOf) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) IntStream(java.util.stream.IntStream) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) BeforeClass(org.junit.BeforeClass) AccessExecutionVertex(org.apache.flink.runtime.executiongraph.AccessExecutionVertex) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ResultPartitionType(org.apache.flink.runtime.io.network.partition.ResultPartitionType) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) RestartStrategies(org.apache.flink.api.common.restartstrategy.RestartStrategies) Function(java.util.function.Function) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) FailoverStrategyFactoryLoader(org.apache.flink.runtime.executiongraph.failover.flip1.FailoverStrategyFactoryLoader) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TestingJobMasterPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingJobMasterPartitionTracker) FailsWithAdaptiveScheduler(org.apache.flink.testutils.junit.FailsWithAdaptiveScheduler) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestingSlotPoolServiceBuilder(org.apache.flink.runtime.jobmaster.slotpool.TestingSlotPoolServiceBuilder) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nonnull(javax.annotation.Nonnull) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Matchers.empty(org.hamcrest.Matchers.empty) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingSchedulerNG(org.apache.flink.runtime.scheduler.TestingSchedulerNG) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) Matchers(org.hamcrest.Matchers) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) TimeUnit(java.util.concurrent.TimeUnit) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) ClosureCleaner(org.apache.flink.api.java.ClosureCleaner) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) RecipientUnreachableException(org.apache.flink.runtime.rpc.exceptions.RecipientUnreachableException) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) FlinkException(org.apache.flink.util.FlinkException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) CompletableFuture(java.util.concurrent.CompletableFuture) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 12 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

the class DispatcherTest method testRetrieveJobResultAfterSubmissionOfFailedJob.

@Test
public void testRetrieveJobResultAfterSubmissionOfFailedJob() throws Exception {
    dispatcher = createAndStartDispatcher(heartbeatServices, haServices, new ExpectedJobIdJobManagerRunnerFactory(jobId, createdJobManagerRunnerLatch));
    final DispatcherGateway dispatcherGateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    final JobID failedJobId = new JobID();
    final String failedJobName = "test";
    final CompletableFuture<Acknowledge> submitFuture = dispatcherGateway.submitFailedJob(failedJobId, failedJobName, new RuntimeException("Test exception."));
    submitFuture.get();
    final ArchivedExecutionGraph archivedExecutionGraph = dispatcherGateway.requestJob(failedJobId, TIMEOUT).get();
    Assertions.assertThat(archivedExecutionGraph.getJobID()).isEqualTo(failedJobId);
    Assertions.assertThat(archivedExecutionGraph.getJobName()).isEqualTo(failedJobName);
    Assertions.assertThat(archivedExecutionGraph.getState()).isEqualTo(JobStatus.FAILED);
    Assertions.assertThat(archivedExecutionGraph.getFailureInfo()).isNotNull().extracting(ErrorInfo::getException).extracting(e -> e.deserializeError(Thread.currentThread().getContextClassLoader())).satisfies(exception -> Assertions.assertThat(exception).isInstanceOf(RuntimeException.class).hasMessage("Test exception."));
}
Also used : Arrays(java.util.Arrays) JobSubmissionException(org.apache.flink.runtime.client.JobSubmissionException) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobManagerJobMetricGroupFactory(org.apache.flink.runtime.jobmaster.factories.JobManagerJobMetricGroupFactory) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) ResourceSpec(org.apache.flink.api.common.operators.ResourceSpec) DefaultJobMasterServiceProcessFactory(org.apache.flink.runtime.jobmaster.factories.DefaultJobMasterServiceProcessFactory) Duration(java.time.Duration) Is.is(org.hamcrest.core.Is.is) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) TestingJobManagerRunner(org.apache.flink.runtime.jobmaster.TestingJobManagerRunner) Path(java.nio.file.Path) BlockingQueue(java.util.concurrent.BlockingQueue) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) CheckpointMetadata(org.apache.flink.runtime.checkpoint.metadata.CheckpointMetadata) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) CountDownLatch(java.util.concurrent.CountDownLatch) TimeUtils(org.apache.flink.util.TimeUtils) Assert.assertFalse(org.junit.Assert.assertFalse) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) JobMasterServiceLeadershipRunner(org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) FlinkException(org.apache.flink.util.FlinkException) BlobServer(org.apache.flink.runtime.blob.BlobServer) JobStatus(org.apache.flink.api.common.JobStatus) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) RpcService(org.apache.flink.runtime.rpc.RpcService) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) PermanentBlobKey(org.apache.flink.runtime.blob.PermanentBlobKey) Before(org.junit.Before) JobManagerRunnerResult(org.apache.flink.runtime.jobmaster.JobManagerRunnerResult) CheckpointStorageLocation(org.apache.flink.runtime.state.CheckpointStorageLocation) Files(java.nio.file.Files) JobMasterServiceProcessFactory(org.apache.flink.runtime.jobmaster.factories.JobMasterServiceProcessFactory) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) ErrorInfo(org.apache.flink.runtime.executiongraph.ErrorInfo) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) IOException(java.io.IOException) File(java.io.File) ExecutionException(java.util.concurrent.ExecutionException) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) TestingJobMasterServiceFactory(org.apache.flink.runtime.jobmaster.factories.TestingJobMasterServiceFactory) JobID(org.apache.flink.api.common.JobID) Paths(java.nio.file.Paths) Assert(org.junit.Assert) ArrayDeque(java.util.ArrayDeque) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Assert.assertEquals(org.junit.Assert.assertEquals) Deadline(org.apache.flink.api.common.time.Deadline) NoSuchFileException(java.nio.file.NoSuchFileException) URISyntaxException(java.net.URISyntaxException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) Assert.assertThat(org.junit.Assert.assertThat) InstantiationUtil(org.apache.flink.util.InstantiationUtil) After(org.junit.After) JobMasterService(org.apache.flink.runtime.jobmaster.JobMasterService) Assertions(org.assertj.core.api.Assertions) Checkpoints(org.apache.flink.runtime.checkpoint.Checkpoints) Assert.fail(org.junit.Assert.fail) URI(java.net.URI) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) JobDetails(org.apache.flink.runtime.messages.webmonitor.JobDetails) CompletedCheckpointStorageLocation(org.apache.flink.runtime.state.CompletedCheckpointStorageLocation) Collection(java.util.Collection) FlinkJobTerminatedWithoutCancellationException(org.apache.flink.runtime.messages.FlinkJobTerminatedWithoutCancellationException) UUID(java.util.UUID) Preconditions(org.apache.flink.util.Preconditions) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) CheckpointMetadataOutputStream(org.apache.flink.runtime.state.CheckpointMetadataOutputStream) TestingCleanupRunnerFactory(org.apache.flink.runtime.dispatcher.cleanup.TestingCleanupRunnerFactory) FlinkJobNotFoundException(org.apache.flink.runtime.messages.FlinkJobNotFoundException) Matchers.equalTo(org.hamcrest.Matchers.equalTo) JobManagerSharedServices(org.apache.flink.runtime.jobmaster.JobManagerSharedServices) Queue(java.util.Queue) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) CheckpointStorageCoordinatorView(org.apache.flink.runtime.state.CheckpointStorageCoordinatorView) LeaderElectionService(org.apache.flink.runtime.leaderelection.LeaderElectionService) Assert.assertThrows(org.junit.Assert.assertThrows) FlinkMatchers(org.apache.flink.core.testutils.FlinkMatchers) TestingJobGraphStore(org.apache.flink.runtime.testutils.TestingJobGraphStore) CompletableFuture(java.util.concurrent.CompletableFuture) AtomicReference(java.util.concurrent.atomic.AtomicReference) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobResult(org.apache.flink.runtime.jobmaster.JobResult) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) Nonnull(javax.annotation.Nonnull) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) ArchivedExecutionGraphBuilder(org.apache.flink.runtime.rest.handler.legacy.utils.ArchivedExecutionGraphBuilder) Configuration(org.apache.flink.configuration.Configuration) Matchers(org.hamcrest.Matchers) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) TestingJobMasterService(org.apache.flink.runtime.jobmaster.TestingJobMasterService) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) TimeUnit(java.util.concurrent.TimeUnit) MultipleJobsDetails(org.apache.flink.runtime.messages.webmonitor.MultipleJobsDetails) JobManagerRunner(org.apache.flink.runtime.jobmaster.JobManagerRunner) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Collections(java.util.Collections) DuplicateJobSubmissionException(org.apache.flink.runtime.client.DuplicateJobSubmissionException) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ErrorInfo(org.apache.flink.runtime.executiongraph.ErrorInfo) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 13 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

the class TaskExecutorTest method runJobManagerHeartbeatTest.

private void runJobManagerHeartbeatTest(ResourceID jmResourceId, HeartbeatServices heartbeatServices, Consumer<TestingJobMasterGatewayBuilder> jobMasterGatewayBuilderConsumer, TriConsumer<ResourceID, TaskExecutorGateway, AllocationID> heartbeatAction) throws IOException, InterruptedException, ExecutionException, TimeoutException {
    final JobLeaderService jobLeaderService = new DefaultJobLeaderService(unresolvedTaskManagerLocation, RetryingRegistrationConfiguration.defaultConfiguration());
    final String jobMasterAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final CountDownLatch registrationAttempts = new CountDownLatch(2);
    final OneShotLatch slotOfferedLatch = new OneShotLatch();
    final CompletableFuture<ResourceID> disconnectTaskManagerFuture = new CompletableFuture<>();
    final TestingJobMasterGatewayBuilder testingJobMasterGatewayBuilder = new TestingJobMasterGatewayBuilder().setRegisterTaskManagerFunction((ignoredJobId, ignoredTaskManagerRegistrationInformation) -> {
        registrationAttempts.countDown();
        return CompletableFuture.completedFuture(new JMTMRegistrationSuccess(jmResourceId));
    }).setDisconnectTaskManagerFunction(resourceID -> {
        disconnectTaskManagerFuture.complete(resourceID);
        return CompletableFuture.completedFuture(Acknowledge.get());
    }).setOfferSlotsFunction((resourceID, slotOffers) -> {
        slotOfferedLatch.trigger();
        return CompletableFuture.completedFuture(slotOffers);
    });
    jobMasterGatewayBuilderConsumer.accept(testingJobMasterGatewayBuilder);
    final TestingJobMasterGateway jobMasterGateway = testingJobMasterGatewayBuilder.build();
    final TaskExecutorLocalStateStoresManager localStateStoresManager = createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1)).setJobLeaderService(jobLeaderService).setTaskStateManager(localStateStoresManager).build();
    final TestingTaskExecutor taskManager = createTestingTaskExecutor(taskManagerServices, heartbeatServices);
    final OneShotLatch slotReportReceived = new OneShotLatch();
    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    testingResourceManagerGateway.setSendSlotReportFunction(ignored -> {
        slotReportReceived.trigger();
        return CompletableFuture.completedFuture(Acknowledge.get());
    });
    final Queue<CompletableFuture<RegistrationResponse>> registrationResponses = new ArrayDeque<>();
    registrationResponses.add(CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), testingResourceManagerGateway.getOwnResourceId(), new ClusterInformation("foobar", 1234))));
    registrationResponses.add(new CompletableFuture<>());
    testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> registrationResponses.poll());
    rpc.registerGateway(jobMasterAddress, jobMasterGateway);
    rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
    try {
        taskManager.start();
        taskManager.waitUntilStarted();
        final TaskExecutorGateway taskExecutorGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);
        resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
        slotReportReceived.await();
        final AllocationID allocationId = new AllocationID();
        requestSlot(taskExecutorGateway, jobId, allocationId, buildSlotID(0), ResourceProfile.UNKNOWN, jobMasterAddress, testingResourceManagerGateway.getFencingToken());
        // now inform the task manager about the new job leader
        jobManagerLeaderRetriever.notifyListener(jobMasterAddress, jmLeaderId);
        // register task manager success will trigger monitoring heartbeat target between tm and
        // jm
        slotOfferedLatch.await();
        heartbeatAction.accept(unresolvedTaskManagerLocation.getResourceID(), taskExecutorGateway, allocationId);
        // the timeout should trigger disconnecting from the JobManager
        final ResourceID resourceID = disconnectTaskManagerFuture.get();
        assertThat(resourceID, equalTo(unresolvedTaskManagerLocation.getResourceID()));
        assertTrue("The TaskExecutor should try to reconnect to the JM", registrationAttempts.await(timeout.toMilliseconds(), TimeUnit.SECONDS));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
Also used : Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) TaskSlotUtils.createDefaultTimerService(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils.createDefaultTimerService) MemorySize(org.apache.flink.configuration.MemorySize) NetUtils(org.apache.flink.util.NetUtils) InetAddress(java.net.InetAddress) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) TaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionTracker) FunctionUtils(org.apache.flink.util.function.FunctionUtils) Matchers.nullValue(org.hamcrest.Matchers.nullValue) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) Builder(org.apache.flink.runtime.taskexecutor.TaskSubmissionTestEnvironment.Builder) Matchers.notNullValue(org.hamcrest.Matchers.notNullValue) Failure(org.apache.flink.runtime.registration.RegistrationResponse.Failure) NoOpTaskExecutorBlobService(org.apache.flink.runtime.blob.NoOpTaskExecutorBlobService) TestingTaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingTaskExecutorPartitionTracker) NoOpMetricRegistry(org.apache.flink.runtime.metrics.NoOpMetricRegistry) BlockingQueue(java.util.concurrent.BlockingQueue) Matchers.startsWith(org.hamcrest.Matchers.startsWith) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) TaskExecutorStateChangelogStoragesManager(org.apache.flink.runtime.state.TaskExecutorStateChangelogStoragesManager) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) CountDownLatch(java.util.concurrent.CountDownLatch) TimeUtils(org.apache.flink.util.TimeUtils) Matchers.contains(org.hamcrest.Matchers.contains) Matchers.is(org.hamcrest.Matchers.is) Matchers.containsString(org.hamcrest.Matchers.containsString) Time(org.apache.flink.api.common.time.Time) TaskExecutorRegistration(org.apache.flink.runtime.resourcemanager.TaskExecutorRegistration) TaskExecutorPartitionTrackerImpl(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionTrackerImpl) FlinkException(org.apache.flink.util.FlinkException) TaskSlotTableImpl(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTableImpl) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) Callable(java.util.concurrent.Callable) DEFAULT_RESOURCE_PROFILE(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils.DEFAULT_RESOURCE_PROFILE) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) TestingTaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TestingTaskSlotTable) ArrayList(java.util.ArrayList) TaskManagerOptions(org.apache.flink.configuration.TaskManagerOptions) TestingClassLoaderLease(org.apache.flink.runtime.execution.librarycache.TestingClassLoaderLease) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) TestName(org.junit.rules.TestName) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Matchers.hasSize(org.hamcrest.Matchers.hasSize) TriConsumer(org.apache.flink.util.function.TriConsumer) TestFileUtils(org.apache.flink.testutils.TestFileUtils) Before(org.junit.Before) RetryingRegistrationConfiguration(org.apache.flink.runtime.registration.RetryingRegistrationConfiguration) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) TriConsumerWithException(org.apache.flink.util.function.TriConsumerWithException) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) IOException(java.io.IOException) InstanceID(org.apache.flink.runtime.instance.InstanceID) File(java.io.File) ExecutionException(java.util.concurrent.ExecutionException) Executors(org.apache.flink.util.concurrent.Executors) ThreadSafeTaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.ThreadSafeTaskSlotTable) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) Task(org.apache.flink.runtime.taskmanager.Task) Assert.assertNull(org.junit.Assert.assertNull) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) ArrayDeque(java.util.ArrayDeque) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Assert.assertEquals(org.junit.Assert.assertEquals) CPUResource(org.apache.flink.api.common.resources.CPUResource) MultiShotLatch(org.apache.flink.core.testutils.MultiShotLatch) Deadline(org.apache.flink.api.common.time.Deadline) ClusterPartitionReport(org.apache.flink.runtime.taskexecutor.partition.ClusterPartitionReport) IntStream.range(java.util.stream.IntStream.range) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) ShuffleEnvironment(org.apache.flink.runtime.shuffle.ShuffleEnvironment) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) BlockingNoOpInvokable(org.apache.flink.runtime.testtasks.BlockingNoOpInvokable) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Lists(org.apache.flink.shaded.guava30.com.google.common.collect.Lists) Assert.assertThat(org.junit.Assert.assertThat) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) Assert.fail(org.junit.Assert.fail) TransientBlobKey(org.apache.flink.runtime.blob.TransientBlobKey) RegistrationTimeoutException(org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException) Collection(java.util.Collection) KvStateRegistry(org.apache.flink.runtime.query.KvStateRegistry) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) CompletionException(java.util.concurrent.CompletionException) UUID(java.util.UUID) NettyShuffleEnvironment(org.apache.flink.runtime.io.network.NettyShuffleEnvironment) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) TaskManagerException(org.apache.flink.runtime.taskexecutor.exceptions.TaskManagerException) Collectors(java.util.stream.Collectors) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) ExecutorUtils(org.apache.flink.util.ExecutorUtils) TaskSlotUtils.createTotalResourceProfile(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils.createTotalResourceProfile) List(java.util.List) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Queue(java.util.Queue) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) AllocatedSlotInfo(org.apache.flink.runtime.jobmaster.AllocatedSlotInfo) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) LeaderRetrievalListener(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener) TaskInvokable(org.apache.flink.runtime.jobgraph.tasks.TaskInvokable) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) CompletableFuture(java.util.concurrent.CompletableFuture) TaskDeploymentDescriptorBuilder(org.apache.flink.runtime.deployment.TaskDeploymentDescriptorBuilder) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) NettyShuffleEnvironmentOptions(org.apache.flink.configuration.NettyShuffleEnvironmentOptions) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) ExternalResourceInfoProvider(org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider) ConfigConstants(org.apache.flink.configuration.ConfigConstants) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nonnull(javax.annotation.Nonnull) TaskSlotUtils(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils) JMTMRegistrationRejection(org.apache.flink.runtime.jobmaster.JMTMRegistrationRejection) Matchers.empty(org.hamcrest.Matchers.empty) NettyShuffleEnvironmentBuilder(org.apache.flink.runtime.io.network.NettyShuffleEnvironmentBuilder) Assert.assertNotNull(org.junit.Assert.assertNotNull) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) Reference(org.apache.flink.util.Reference) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) AllocatedSlotReport(org.apache.flink.runtime.jobmaster.AllocatedSlotReport) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) Rule(org.junit.Rule) UnregisteredMetricGroups.createUnregisteredTaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups.createUnregisteredTaskManagerMetricGroup) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) RecipientUnreachableException(org.apache.flink.runtime.rpc.exceptions.RecipientUnreachableException) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) InstanceID(org.apache.flink.runtime.instance.InstanceID) Matchers.containsString(org.hamcrest.Matchers.containsString) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) UUID(java.util.UUID) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) CountDownLatch(java.util.concurrent.CountDownLatch) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) ArrayDeque(java.util.ArrayDeque) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway)

Example 14 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

the class AbstractDispatcherTest method setUp.

@Before
public void setUp() throws Exception {
    heartbeatServices = new HeartbeatServices(1000L, 10000L);
    haServices = new TestingHighAvailabilityServices();
    haServices.setCheckpointRecoveryFactory(new StandaloneCheckpointRecoveryFactory());
    haServices.setResourceManagerLeaderRetriever(new SettableLeaderRetrievalService());
    haServices.setJobGraphStore(new StandaloneJobGraphStore());
    haServices.setJobResultStore(new EmbeddedJobResultStore());
    configuration = new Configuration();
    blobServer = new BlobServer(configuration, temporaryFolder.newFolder(), new VoidBlobStore());
}
Also used : HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) StandaloneJobGraphStore(org.apache.flink.runtime.jobmanager.StandaloneJobGraphStore) VoidBlobStore(org.apache.flink.runtime.blob.VoidBlobStore) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) Configuration(org.apache.flink.configuration.Configuration) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) BlobServer(org.apache.flink.runtime.blob.BlobServer) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) Before(org.junit.Before)

Example 15 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

the class JobMasterTest method setupClass.

@BeforeClass
public static void setupClass() {
    rpcService = new TestingRpcService();
    fastHeartbeatServices = new HeartbeatServices(fastHeartbeatInterval, fastHeartbeatTimeout, -1);
    heartbeatServices = new HeartbeatServices(heartbeatInterval, heartbeatTimeout, 1);
}
Also used : HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) BeforeClass(org.junit.BeforeClass)

Aggregations

HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)19 Configuration (org.apache.flink.configuration.Configuration)16 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)14 Test (org.junit.Test)12 UUID (java.util.UUID)11 CompletableFuture (java.util.concurrent.CompletableFuture)11 JobID (org.apache.flink.api.common.JobID)11 FlinkException (org.apache.flink.util.FlinkException)11 IOException (java.io.IOException)10 Collection (java.util.Collection)10 Collections (java.util.Collections)10 ExecutionException (java.util.concurrent.ExecutionException)10 TimeUnit (java.util.concurrent.TimeUnit)10 TimeoutException (java.util.concurrent.TimeoutException)10 Time (org.apache.flink.api.common.time.Time)10 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)10 RpcUtils (org.apache.flink.runtime.rpc.RpcUtils)10 ExceptionUtils (org.apache.flink.util.ExceptionUtils)10 FutureUtils (org.apache.flink.util.concurrent.FutureUtils)10 Before (org.junit.Before)10