Search in sources :

Example 16 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

In class JobMasterTest, method testAllocatedSlotReportDoesNotContainStaleInformation:

/**
 * Verifies that every {@link AllocatedSlotReport} the {@link JobMaster} hands to the task
 * executor with a heartbeat reflects the current slot allocation instead of a stale snapshot.
 *
 * <p>The check is probabilistic: without the fix for FLINK-12863 the test only fails when it
 * is executed repeatedly.
 */
@Test
public void testAllocatedSlotReportDoesNotContainStaleInformation() throws Exception {
    // Completed exceptionally on the first violated assertion, or normally once verification
    // has been switched off and one more heartbeat has been observed.
    final CompletableFuture<Void> verificationResult = new CompletableFuture<>();
    final UnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    final AtomicBoolean stopVerification = new AtomicBoolean(false);
    // NOTE(review): presumably triggered by TestingSlotPoolFactory's slot pool once slot
    // offers arrive - confirm against that helper.
    final OneShotLatch slotOffersReceived = new OneShotLatch();

    final TestingTaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setHeartbeatJobManagerFunction(
                            (ignoredTaskManagerId, slotReport) -> {
                                try {
                                    if (!slotOffersReceived.isTriggered()) {
                                        // no offer seen yet, so the report must be empty
                                        assertThat(slotReport.getAllocatedSlotInfos(), empty());
                                    } else {
                                        // the single offered slot must show up in the report
                                        assertThat(slotReport.getAllocatedSlotInfos(), hasSize(1));
                                    }
                                } catch (AssertionError violation) {
                                    verificationResult.completeExceptionally(violation);
                                }
                                // Has no effect if a violation has already completed the future.
                                if (stopVerification.get()) {
                                    verificationResult.complete(null);
                                }
                                return FutureUtils.completedVoidFuture();
                            })
                    .createTestingTaskExecutorGateway();
    rpcService.registerGateway(taskExecutorGateway.getAddress(), taskExecutorGateway);

    final JobManagerSharedServices jobManagerSharedServices = new TestingJobManagerSharedServicesBuilder().build();
    final JobGraph jobGraph = JobGraphTestUtils.singleNoOpJobGraph();
    final HeartbeatServices rapidHeartbeats = new HeartbeatServices(5L, 1000L);
    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withHeartbeatServices(rapidHeartbeats)
                    .withSlotPoolServiceSchedulerFactory(
                            DefaultSlotPoolServiceSchedulerFactory.create(
                                    new TestingSlotPoolFactory(slotOffersReceived),
                                    new DefaultSchedulerFactory()))
                    .createJobMaster();
    jobMaster.start();

    try {
        final JobMasterGateway gateway = jobMaster.getSelfGateway(JobMasterGateway.class);

        // Registering the task manager makes the job master monitor it as a heartbeat target
        // and schedule heartbeat requests at the configured interval; block until the
        // registration has completed.
        gateway.registerTaskManager(
                        jobGraph.getJobID(),
                        TaskManagerRegistrationInformation.create(
                                taskExecutorGateway.getAddress(),
                                taskManagerLocation,
                                TestingUtils.zeroUUID()),
                        testingTimeout)
                .get();

        final SlotOffer offeredSlot = new SlotOffer(new AllocationID(), 0, ResourceProfile.ANY);
        final Collection<SlotOffer> acceptedOffers =
                gateway.offerSlots(
                                taskManagerLocation.getResourceID(),
                                Collections.singleton(offeredSlot),
                                testingTimeout)
                        .get();
        assertThat(acceptedOffers, containsInAnyOrder(offeredSlot));

        stopVerification.set(true);
        // fails here if any heartbeat observed an unexpected slot report
        verificationResult.get();
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
        jobManagerSharedServices.shutdown();
    }
}
Also used : TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) DefaultSchedulerFactory(org.apache.flink.runtime.scheduler.DefaultSchedulerFactory) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) SlotPoolService(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolService) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) RestartStrategyOptions(org.apache.flink.configuration.RestartStrategyOptions) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) PhysicalSlot(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlot) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) Duration(java.time.Duration) Map(java.util.Map) Matchers.nullValue(org.hamcrest.Matchers.nullValue) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) ClassRule(org.junit.ClassRule) SimpleSlotContext(org.apache.flink.runtime.instance.SimpleSlotContext) SlotPoolServiceFactory(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolServiceFactory) AfterClass(org.junit.AfterClass) BlockingQueue(java.util.concurrent.BlockingQueue) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) Category(org.junit.experimental.categories.Category) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) CountDownLatch(java.util.concurrent.CountDownLatch) TimeUtils(org.apache.flink.util.TimeUtils) Matchers.is(org.hamcrest.Matchers.is) 
Time(org.apache.flink.api.common.time.Time) InputSplitSource(org.apache.flink.core.io.InputSplitSource) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) FlinkException(org.apache.flink.util.FlinkException) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) AccessExecution(org.apache.flink.runtime.executiongraph.AccessExecution) JobStatus(org.apache.flink.api.common.JobStatus) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) DefaultInputSplitAssigner(org.apache.flink.api.common.io.DefaultInputSplitAssigner) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) BiConsumer(java.util.function.BiConsumer) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) DistributionPattern(org.apache.flink.runtime.jobgraph.DistributionPattern) Nullable(javax.annotation.Nullable) CheckpointProperties(org.apache.flink.runtime.checkpoint.CheckpointProperties) Before(org.junit.Before) InputSplitAssigner(org.apache.flink.core.io.InputSplitAssigner) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) InputSplit(org.apache.flink.core.io.InputSplit) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) Test(org.junit.Test) IOException(java.io.IOException) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) File(java.io.File) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) 
TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) ArrayDeque(java.util.ArrayDeque) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) CheckpointRetentionPolicy(org.apache.flink.runtime.checkpoint.CheckpointRetentionPolicy) Deadline(org.apache.flink.api.common.time.Deadline) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) BiFunction(java.util.function.BiFunction) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TaskExecutorToJobManagerHeartbeatPayload(org.apache.flink.runtime.taskexecutor.TaskExecutorToJobManagerHeartbeatPayload) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) InstantiationUtil(org.apache.flink.util.InstantiationUtil) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) TestingSchedulerNGFactory(org.apache.flink.runtime.scheduler.TestingSchedulerNGFactory) Assert.fail(org.junit.Assert.fail) BlobServerOptions(org.apache.flink.configuration.BlobServerOptions) CompletedCheckpointStorageLocation(org.apache.flink.runtime.state.CompletedCheckpointStorageLocation) Collection(java.util.Collection) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) UUID(java.util.UUID) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) 
Collectors(java.util.stream.Collectors) SlotInfoWithUtilization(org.apache.flink.runtime.jobmaster.slotpool.SlotInfoWithUtilization) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) Objects(java.util.Objects) TestingUtils(org.apache.flink.testutils.TestingUtils) List(java.util.List) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) Matchers.equalTo(org.hamcrest.Matchers.equalTo) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Optional(java.util.Optional) Queue(java.util.Queue) Matchers.anyOf(org.hamcrest.Matchers.anyOf) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) IntStream(java.util.stream.IntStream) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) BeforeClass(org.junit.BeforeClass) AccessExecutionVertex(org.apache.flink.runtime.executiongraph.AccessExecutionVertex) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ResultPartitionType(org.apache.flink.runtime.io.network.partition.ResultPartitionType) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) RestartStrategies(org.apache.flink.api.common.restartstrategy.RestartStrategies) Function(java.util.function.Function) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) FailoverStrategyFactoryLoader(org.apache.flink.runtime.executiongraph.failover.flip1.FailoverStrategyFactoryLoader) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TestingJobMasterPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingJobMasterPartitionTracker) FailsWithAdaptiveScheduler(org.apache.flink.testutils.junit.FailsWithAdaptiveScheduler) 
JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestingSlotPoolServiceBuilder(org.apache.flink.runtime.jobmaster.slotpool.TestingSlotPoolServiceBuilder) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nonnull(javax.annotation.Nonnull) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Matchers.empty(org.hamcrest.Matchers.empty) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingSchedulerNG(org.apache.flink.runtime.scheduler.TestingSchedulerNG) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) Matchers(org.hamcrest.Matchers) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) TimeUnit(java.util.concurrent.TimeUnit) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) ClosureCleaner(org.apache.flink.api.java.ClosureCleaner) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) RecipientUnreachableException(org.apache.flink.runtime.rpc.exceptions.RecipientUnreachableException) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) DefaultSchedulerFactory(org.apache.flink.runtime.scheduler.DefaultSchedulerFactory) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) 
JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) CompletableFuture(java.util.concurrent.CompletableFuture) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) Collection(java.util.Collection) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) Test(org.junit.Test)

Example 17 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

In class TaskExecutorBuilder, method build:

/**
 * Assembles a {@link TaskExecutor} from the values held by this builder.
 *
 * <p>Optional components that were left {@code null} are replaced by defaults: a no-op blob
 * service, a {@link TaskManagerConfiguration} derived from {@code configuration}, and
 * {@link TaskManagerServices} derived from {@code configuration}.
 *
 * @return the newly constructed task executor
 * @throws Exception if building a default component or the task executor itself fails
 */
public TaskExecutor build() throws Exception {
    // Runs unconditionally and before any configuration-derived default is created.
    TaskExecutorResourceUtils.adjustForLocalExecution(configuration);

    final TaskExecutorBlobService blobService =
            taskExecutorBlobService != null
                    ? taskExecutorBlobService
                    : NoOpTaskExecutorBlobService.INSTANCE;

    final TaskManagerConfiguration managerConfiguration =
            taskManagerConfiguration != null
                    ? taskManagerConfiguration
                    : TaskManagerConfiguration.fromConfiguration(
                            configuration,
                            taskExecutorResourceSpec,
                            rpcService.getAddress(),
                            workingDirectory.getTmpDirectory());

    final TaskManagerServices services;
    if (taskManagerServices != null) {
        services = taskManagerServices;
    } else {
        final TaskManagerServicesConfiguration servicesConfiguration =
                TaskManagerServicesConfiguration.fromConfiguration(
                        configuration,
                        resourceId,
                        rpcService.getAddress(),
                        true,
                        taskExecutorResourceSpec,
                        workingDirectory);
        services =
                TaskManagerServices.fromConfiguration(
                        servicesConfiguration,
                        VoidPermanentBlobService.INSTANCE,
                        UnregisteredMetricGroups.createUnregisteredTaskManagerMetricGroup(),
                        Executors.newDirectExecutorService(),
                        // NOTE(review): presumably the fatal error callback; errors are
                        // deliberately dropped here - confirm.
                        ignoredError -> {
                        },
                        workingDirectory);
    }

    return new TaskExecutor(
            rpcService,
            managerConfiguration,
            haServices,
            services,
            externalResourceInfoProvider,
            heartbeatServices,
            taskManagerMetricGroup,
            metricQueryServiceAddress,
            blobService,
            fatalErrorHandler,
            partitionTracker);
}
Also used : TaskExecutorBlobService(org.apache.flink.runtime.blob.TaskExecutorBlobService) VoidPermanentBlobService(org.apache.flink.runtime.blob.VoidPermanentBlobService) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) Configuration(org.apache.flink.configuration.Configuration) NoOpTaskExecutorBlobService(org.apache.flink.runtime.blob.NoOpTaskExecutorBlobService) TestingTaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingTaskExecutorPartitionTracker) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) MemorySize(org.apache.flink.configuration.MemorySize) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) Executors(org.apache.flink.util.concurrent.Executors) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) RpcService(org.apache.flink.runtime.rpc.RpcService) WorkingDirectory(org.apache.flink.runtime.entrypoint.WorkingDirectory) TaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionTracker) ExternalResourceInfoProvider(org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) BlobCacheService(org.apache.flink.runtime.blob.BlobCacheService) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) NoOpFatalErrorHandler(org.apache.flink.runtime.rest.util.NoOpFatalErrorHandler) Collections(java.util.Collections) Nullable(javax.annotation.Nullable) CPUResource(org.apache.flink.api.common.resources.CPUResource) TaskExecutorBlobService(org.apache.flink.runtime.blob.TaskExecutorBlobService) NoOpTaskExecutorBlobService(org.apache.flink.runtime.blob.NoOpTaskExecutorBlobService)

Example 18 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

In class MiniCluster, method setupDispatcherResourceManagerComponents:

/**
 * Creates the dispatcher/resource-manager components, adds them to
 * {@code dispatcherResourceManagerComponents} and wires up their shutdown handling.
 *
 * <p>For every registered component, completion of its shut-down future triggers
 * {@code stopApplication} with the reported status. Once the shut-down futures of all
 * components are done, {@link #closeAsync()} is invoked.
 *
 * @param configuration configuration passed on to the component creation
 * @param dispatcherResourceManagerComponentRpcServiceFactory RPC service factory passed on to
 *     the component creation
 * @param metricQueryServiceRetriever metric query service retriever passed on to the component
 *     creation
 * @throws Exception if creating the components fails
 */
@GuardedBy("lock")
private void setupDispatcherResourceManagerComponents(
        Configuration configuration,
        RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory,
        MetricQueryServiceRetriever metricQueryServiceRetriever)
        throws Exception {
    dispatcherResourceManagerComponents.addAll(
            createDispatcherResourceManagerComponents(
                    configuration,
                    dispatcherResourceManagerComponentRpcServiceFactory,
                    haServices,
                    blobServer,
                    heartbeatServices,
                    metricRegistry,
                    metricQueryServiceRetriever,
                    new ShutDownFatalErrorHandler()));

    final Collection<CompletableFuture<ApplicationStatus>> terminations =
            new ArrayList<>(dispatcherResourceManagerComponents.size());

    for (DispatcherResourceManagerComponent component : dispatcherResourceManagerComponents) {
        final CompletableFuture<ApplicationStatus> termination = component.getShutDownFuture();
        // Stop the application with whatever status the component reports; the resulting
        // future is handed to FutureUtils.assertNoException.
        FutureUtils.assertNoException(
                termination.thenCompose(status -> component.stopApplication(status, null)));
        terminations.add(termination);
    }

    // Close this instance once every component's shut-down future is done, whether or not
    // any of them failed.
    FutureUtils.completeAll(terminations).whenComplete((ignored, exception) -> closeAsync());
}
Also used : InetAddress(java.net.InetAddress) ClusterOverview(org.apache.flink.runtime.messages.webmonitor.ClusterOverview) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FunctionUtils(org.apache.flink.util.function.FunctionUtils) Duration(java.time.Duration) JobStatusMessage(org.apache.flink.runtime.client.JobStatusMessage) HighAvailabilityServicesUtils(org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils) GuardedBy(javax.annotation.concurrent.GuardedBy) ClientUtils(org.apache.flink.runtime.client.ClientUtils) Executors(java.util.concurrent.Executors) AccessExecutionGraph(org.apache.flink.runtime.executiongraph.AccessExecutionGraph) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ClusterEntrypointUtils(org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils) Time(org.apache.flink.api.common.time.Time) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) FlinkException(org.apache.flink.util.FlinkException) BlobServer(org.apache.flink.runtime.blob.BlobServer) CoordinationResponse(org.apache.flink.runtime.operators.coordination.CoordinationResponse) MetricRegistryImpl(org.apache.flink.runtime.metrics.MetricRegistryImpl) SavepointConfigOptions(org.apache.flink.runtime.jobgraph.SavepointConfigOptions) JobStatus(org.apache.flink.api.common.JobStatus) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) RpcService(org.apache.flink.runtime.rpc.RpcService) ResourceOverview(org.apache.flink.runtime.resourcemanager.ResourceOverview) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) FileOutputFormat(org.apache.flink.api.common.io.FileOutputFormat) Executor(java.util.concurrent.Executor) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) TaskExecutor(org.apache.flink.runtime.taskexecutor.TaskExecutor) IOException(java.io.IOException) 
VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) ConfigurationUtils(org.apache.flink.configuration.ConfigurationUtils) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) ProcessMetricGroup(org.apache.flink.runtime.metrics.groups.ProcessMetricGroup) ClusterOptions(org.apache.flink.configuration.ClusterOptions) EmbeddedHaServicesWithLeadershipControl(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServicesWithLeadershipControl) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) ExceptionUtils(org.apache.flink.util.ExceptionUtils) ReporterSetup(org.apache.flink.runtime.metrics.ReporterSetup) InstantiationUtil(org.apache.flink.util.InstantiationUtil) URI(java.net.URI) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) TaskManagerRunner(org.apache.flink.runtime.taskexecutor.TaskManagerRunner) TriggerSavepointMode(org.apache.flink.runtime.dispatcher.TriggerSavepointMode) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) LeaderRetriever(org.apache.flink.runtime.webmonitor.retriever.LeaderRetriever) ExecutorThreadFactory(org.apache.flink.util.concurrent.ExecutorThreadFactory) Collection(java.util.Collection) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) CompletionException(java.util.concurrent.CompletionException) MetricUtils(org.apache.flink.runtime.metrics.util.MetricUtils) UUID(java.util.UUID) InetSocketAddress(java.net.InetSocketAddress) Collectors(java.util.stream.Collectors) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ExecutorUtils(org.apache.flink.util.ExecutorUtils) JobExecutionResult(org.apache.flink.api.common.JobExecutionResult) List(java.util.List) SerializedValue(org.apache.flink.util.SerializedValue) 
CoordinationRequest(org.apache.flink.runtime.operators.coordination.CoordinationRequest) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Optional(java.util.Optional) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) IllegalConfigurationException(org.apache.flink.configuration.IllegalConfigurationException) HaLeadershipControl(org.apache.flink.runtime.highavailability.nonha.embedded.HaLeadershipControl) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) RpcMetricQueryServiceRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcMetricQueryServiceRetriever) CompletableFuture(java.util.concurrent.CompletableFuture) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) Function(java.util.function.Function) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) RestoreMode(org.apache.flink.runtime.jobgraph.RestoreMode) JobResult(org.apache.flink.runtime.jobmaster.JobResult) DefaultDispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DefaultDispatcherResourceManagerComponentFactory) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) ExternalResourceInfoProvider(org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) RpcGatewayRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nonnull(javax.annotation.Nonnull) JobSubmissionResult(org.apache.flink.api.common.JobSubmissionResult) ExecutorService(java.util.concurrent.ExecutorService) MetricQueryServiceRetriever(org.apache.flink.runtime.webmonitor.retriever.MetricQueryServiceRetriever) 
DispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) Preconditions.checkState(org.apache.flink.util.Preconditions.checkState) Logger(org.slf4j.Logger) AutoCloseableAsync(org.apache.flink.util.AutoCloseableAsync) ExponentialBackoffRetryStrategy(org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy) Configuration(org.apache.flink.configuration.Configuration) Reference(org.apache.flink.util.Reference) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) TimeUnit(java.util.concurrent.TimeUnit) RpcSystem(org.apache.flink.runtime.rpc.RpcSystem) WorkingDirectory(org.apache.flink.runtime.entrypoint.WorkingDirectory) DispatcherResourceManagerComponent(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) Internal(org.apache.flink.annotation.Internal) BlobCacheService(org.apache.flink.runtime.blob.BlobCacheService) BlobClient(org.apache.flink.runtime.blob.BlobClient) BlobUtils(org.apache.flink.runtime.blob.BlobUtils) StandaloneResourceManagerFactory(org.apache.flink.runtime.resourcemanager.StandaloneResourceManagerFactory) Collections(java.util.Collections) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions) CompletableFuture(java.util.concurrent.CompletableFuture) DispatcherResourceManagerComponent(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) ArrayList(java.util.ArrayList) GuardedBy(javax.annotation.concurrent.GuardedBy)

Example 19 with HeartbeatServices

use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.

In class ProcessFailureCancelingITCase, method testCancelingOnProcessFailure:

/**
 * Starts a dispatcher/resource-manager component in this JVM and a TaskManager in a separate
 * process, submits a program that blocks forever, kills the TaskManager process and then
 * expects the program to fail with a {@link ProgramInvocationException}.
 *
 * <p>The test is skipped when no java executable can be found.
 *
 * @throws Throwable any failure of the test itself, combined with the exception recorded by
 *     the program thread (if any) via {@code ExceptionUtils.firstOrSuppressed}
 */
@Test
public void testCancelingOnProcessFailure() throws Throwable {
    Assume.assumeTrue("---- Skipping Process Failure test : Could not find java executable ----", getJavaCommandPath() != null);
    TestProcess taskManagerProcess = null;
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();
    Configuration config = new Configuration();
    config.setString(JobManagerOptions.ADDRESS, "localhost");
    config.set(AkkaOptions.ASK_TIMEOUT_DURATION, Duration.ofSeconds(100));
    config.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString());
    config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().getAbsolutePath());
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    config.setInteger(RestOptions.PORT, 0);
    // The RPC service is started with port "0"; the port it actually reports is then written
    // into the configuration as the JobManager port.
    final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    final int jobManagerPort = rpcService.getPort();
    config.setInteger(JobManagerOptions.PORT, jobManagerPort);
    final DispatcherResourceManagerComponentFactory resourceManagerComponentFactory = DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(StandaloneResourceManagerFactory.getInstance());
    DispatcherResourceManagerComponent dispatcherResourceManagerComponent = null;
    final ScheduledExecutorService ioExecutor = TestingUtils.defaultExecutor();
    final HighAvailabilityServices haServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(config, ioExecutor, AddressResolution.NO_ADDRESS_RESOLUTION, RpcSystem.load(), NoOpFatalErrorHandler.INSTANCE);
    // Holds whatever the program thread threw, so the main thread can inspect it.
    final AtomicReference<Throwable> programException = new AtomicReference<>();
    try {
        dispatcherResourceManagerComponent = resourceManagerComponentFactory.create(config, ResourceID.generate(), ioExecutor, rpcService, haServices, blobServerResource.getBlobServer(), new HeartbeatServices(100L, 10000L, 2), NoOpMetricRegistry.INSTANCE, new MemoryExecutionGraphInfoStore(), VoidMetricQueryServiceRetriever.INSTANCE, fatalErrorHandler);
        TestProcessBuilder taskManagerProcessBuilder = new TestProcessBuilder(TaskExecutorProcessEntryPoint.class.getName());
        taskManagerProcessBuilder.addConfigAsMainClassArgs(config);
        taskManagerProcess = taskManagerProcessBuilder.start();
        // start the test program, which infinitely blocks
        Runnable programRunner = new Runnable() {

            @Override
            public void run() {
                try {
                    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", 1337, config);
                    env.setParallelism(2);
                    env.setRestartStrategy(RestartStrategies.noRestart());
                    env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {

                        @Override
                        public Long map(Long value) throws Exception {
                            synchronized (this) {
                                // Print the marker the test waits for, then block in wait().
                                System.out.println(TASK_DEPLOYED_MARKER);
                                wait();
                            }
                            return 0L;
                        }
                    }).output(new DiscardingOutputFormat<>());
                    env.execute();
                } catch (Throwable t) {
                    programException.set(t);
                }
            }
        };
        Thread programThread = new Thread(programRunner);
        programThread.start();
        waitUntilAtLeastOneTaskHasBeenDeployed(taskManagerProcess);
        // kill the TaskManager after the job started to run
        taskManagerProcess.destroy();
        taskManagerProcess = null;
        // the job should fail within a few seconds due to heartbeat timeouts
        // since the CI environment is often slow, we conservatively give it up to 2 minutes
        programThread.join(TIMEOUT.toMillis());
        assertFalse("The program did not cancel in time", programThread.isAlive());
        Throwable error = programException.get();
        assertNotNull("The program did not fail properly", error);
        assertTrue(error instanceof ProgramInvocationException);
        // all seems well :-)
    } catch (Exception | Error e) {
        // Dump the TaskManager output for diagnosis if the process is still around.
        if (taskManagerProcess != null) {
            printOutput("TaskManager OUT", taskManagerProcess.getProcessOutput().toString());
            printOutput("TaskManager ERR", taskManagerProcess.getErrorOutput().toString());
        }
        throw ExceptionUtils.firstOrSuppressed(e, programException.get());
    } finally {
        // Nested try/finally blocks make sure the RPC service and the HA services are always
        // released, even if an earlier cleanup step (most notably rethrowError()) throws.
        try {
            if (taskManagerProcess != null) {
                taskManagerProcess.destroy();
            }
            if (dispatcherResourceManagerComponent != null) {
                dispatcherResourceManagerComponent.stopApplication(ApplicationStatus.SUCCEEDED, null);
            }
            fatalErrorHandler.rethrowError();
        } finally {
            try {
                RpcUtils.terminateRpcService(rpcService, Time.seconds(100L));
            } finally {
                haServices.closeAndCleanupAllData();
            }
        }
    }
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) MapFunction(org.apache.flink.api.common.functions.MapFunction) TestProcess(org.apache.flink.test.util.TestProcessBuilder.TestProcess) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) DispatcherResourceManagerComponent(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AtomicReference(java.util.concurrent.atomic.AtomicReference) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) TestProcessBuilder(org.apache.flink.test.util.TestProcessBuilder) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) TimeoutException(java.util.concurrent.TimeoutException) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) RpcService(org.apache.flink.runtime.rpc.RpcService) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) DefaultDispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DefaultDispatcherResourceManagerComponentFactory) DispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory) Test(org.junit.Test)

Aggregations

HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)19 Configuration (org.apache.flink.configuration.Configuration)16 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)14 Test (org.junit.Test)12 UUID (java.util.UUID)11 CompletableFuture (java.util.concurrent.CompletableFuture)11 JobID (org.apache.flink.api.common.JobID)11 FlinkException (org.apache.flink.util.FlinkException)11 IOException (java.io.IOException)10 Collection (java.util.Collection)10 Collections (java.util.Collections)10 ExecutionException (java.util.concurrent.ExecutionException)10 TimeUnit (java.util.concurrent.TimeUnit)10 TimeoutException (java.util.concurrent.TimeoutException)10 Time (org.apache.flink.api.common.time.Time)10 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)10 RpcUtils (org.apache.flink.runtime.rpc.RpcUtils)10 ExceptionUtils (org.apache.flink.util.ExceptionUtils)10 FutureUtils (org.apache.flink.util.concurrent.FutureUtils)10 Before (org.junit.Before)10