Search in sources:

Example 81 with TaskManagerLocation

Use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in the Apache Flink project.

Class JobExceptionsHandler, method createJobExceptionsInfo.

/**
 * Assembles the job-exceptions response from the archived execution graph held by the given
 * {@code ExecutionGraphInfo}.
 *
 * <p>If the graph has no failure info, an empty {@code JobExceptionsInfoWithHistory} is returned.
 * Otherwise one entry is collected for every execution vertex that reports a failure, up to
 * {@code exceptionToReportMaxSize} entries; if a further failed vertex is encountered once that
 * limit has been reached, collection stops and the result is flagged as truncated.
 *
 * @param executionGraphInfo provides the archived execution graph and the exception history
 * @param exceptionToReportMaxSize maximum number of per-task entries to collect; also forwarded
 *     to {@code createJobExceptionHistory}
 * @return the response object containing the root cause, the per-task entries, the truncation
 *     flag and the exception history
 */
private static JobExceptionsInfoWithHistory createJobExceptionsInfo(ExecutionGraphInfo executionGraphInfo, int exceptionToReportMaxSize) {
    final ArchivedExecutionGraph archivedGraph = executionGraphInfo.getArchivedExecutionGraph();
    final ErrorInfo rootCause = archivedGraph.getFailureInfo();
    if (rootCause == null) {
        return new JobExceptionsInfoWithHistory();
    }

    final List<JobExceptionsInfo.ExecutionExceptionInfo> perTaskExceptions = new ArrayList<>();
    boolean limitExceeded = false;
    for (AccessExecutionVertex vertex : archivedGraph.getAllExecutionVertices()) {
        final Optional<ErrorInfo> vertexFailure = vertex.getFailureInfo();
        if (!vertexFailure.isPresent()) {
            continue;
        }
        // Only a failed vertex beyond the limit marks the result as truncated.
        if (perTaskExceptions.size() >= exceptionToReportMaxSize) {
            limitExceeded = true;
            break;
        }

        final TaskManagerLocation assignedLocation = vertex.getCurrentAssignedResourceLocation();
        final String renderedLocation = toString(assignedLocation);
        final long failedAt = vertex.getStateTimestamp(ExecutionState.FAILED);
        // A timestamp of 0 is reported as -1.
        final long reportedTimestamp;
        if (failedAt == 0) {
            reportedTimestamp = -1;
        } else {
            reportedTimestamp = failedAt;
        }

        perTaskExceptions.add(
                new JobExceptionsInfo.ExecutionExceptionInfo(
                        vertexFailure.get().getExceptionAsString(),
                        vertex.getTaskNameWithSubtaskIndex(),
                        renderedLocation,
                        reportedTimestamp));
    }

    return new JobExceptionsInfoWithHistory(
            rootCause.getExceptionAsString(),
            rootCause.getTimestamp(),
            perTaskExceptions,
            limitExceeded,
            createJobExceptionHistory(executionGraphInfo.getExceptionHistory(), exceptionToReportMaxSize));
}
Also used : TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) ErrorInfo(org.apache.flink.runtime.executiongraph.ErrorInfo) ArrayList(java.util.ArrayList) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) JobExceptionsInfo(org.apache.flink.runtime.rest.messages.JobExceptionsInfo) JobExceptionsInfoWithHistory(org.apache.flink.runtime.rest.messages.JobExceptionsInfoWithHistory) AccessExecutionVertex(org.apache.flink.runtime.executiongraph.AccessExecutionVertex)

Example 82 with TaskManagerLocation

Use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in the Apache Flink project.

Class SubtasksAllAccumulatorsHandler, method handleRequest.

/**
 * Builds the accumulator overview for all subtasks of the given job vertex.
 *
 * <p>For each task vertex this records the subtask index and attempt number of its current
 * execution attempt, the hostname of its assigned location (or {@code "(unassigned)"} when no
 * location is set), and that attempt's stringified user accumulators.
 *
 * @param request the incoming request; not read by this method
 * @param jobVertex the job vertex whose task vertices are inspected
 * @return the accumulator information for the job vertex and each of its subtasks
 */
@Override
protected SubtasksAllAccumulatorsInfo handleRequest(HandlerRequest<EmptyRequestBody> request, AccessExecutionJobVertex jobVertex) throws RestHandlerException {
    final JobVertexID vertexId = jobVertex.getJobVertexId();
    final int vertexParallelism = jobVertex.getParallelism();
    final List<SubtasksAllAccumulatorsInfo.SubtaskAccumulatorsInfo> perSubtaskInfos = new ArrayList<>();

    for (AccessExecutionVertex subtask : jobVertex.getTaskVertices()) {
        final TaskManagerLocation assignedLocation = subtask.getCurrentAssignedResourceLocation();
        final String host;
        if (assignedLocation != null) {
            host = assignedLocation.getHostname();
        } else {
            host = "(unassigned)";
        }

        final StringifiedAccumulatorResult[] stringified = subtask.getCurrentExecutionAttempt().getUserAccumulatorsStringified();
        final List<UserAccumulator> accumulators = new ArrayList<>(stringified.length);
        for (int i = 0; i < stringified.length; i++) {
            final StringifiedAccumulatorResult entry = stringified[i];
            accumulators.add(new UserAccumulator(entry.getName(), entry.getType(), entry.getValue()));
        }

        final int subtaskIndex = subtask.getCurrentExecutionAttempt().getParallelSubtaskIndex();
        final int attemptNumber = subtask.getCurrentExecutionAttempt().getAttemptNumber();
        perSubtaskInfos.add(
                new SubtasksAllAccumulatorsInfo.SubtaskAccumulatorsInfo(
                        subtaskIndex, attemptNumber, host, accumulators));
    }

    return new SubtasksAllAccumulatorsInfo(vertexId, vertexParallelism, perSubtaskInfos);
}
Also used : TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ArrayList(java.util.ArrayList) StringifiedAccumulatorResult(org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult) SubtasksAllAccumulatorsInfo(org.apache.flink.runtime.rest.messages.job.SubtasksAllAccumulatorsInfo) UserAccumulator(org.apache.flink.runtime.rest.messages.job.UserAccumulator) AccessExecutionVertex(org.apache.flink.runtime.executiongraph.AccessExecutionVertex)

Example 83 with TaskManagerLocation

Use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in the Apache Flink project.

Class JobMasterTest, method testJobMasterDisconnectsOldTaskExecutorIfNewSessionIsSeen.

/**
 * Registers two task executor gateways under the same unresolved task manager location but with
 * different session ids, and expects the first gateway's disconnect-job-manager callback to be
 * invoked after the second registration has succeeded.
 */
@Test
public void testJobMasterDisconnectsOldTaskExecutorIfNewSessionIsSeen() throws Exception {
    final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).createJobMaster();

    // Completed by the first gateway's disconnect callback.
    final CompletableFuture<Void> firstDisconnected = new CompletableFuture<>();
    final TestingTaskExecutorGateway firstGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setAddress("firstTaskExecutor")
                    .setDisconnectJobManagerConsumer((disconnectedJobId, cause) -> firstDisconnected.complete(null))
                    .createTestingTaskExecutorGateway();
    final TestingTaskExecutorGateway secondGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setAddress("secondTaskExecutor")
                    .createTestingTaskExecutorGateway();

    rpcService.registerGateway(firstGateway.getAddress(), firstGateway);
    rpcService.registerGateway(secondGateway.getAddress(), secondGateway);

    try {
        jobMaster.start();

        // Both registrations share this location; only the session id differs.
        final LocalUnresolvedTaskManagerLocation sharedLocation = new LocalUnresolvedTaskManagerLocation();

        final UUID firstSessionId = UUID.randomUUID();
        final TaskManagerRegistrationInformation firstRegistration =
                TaskManagerRegistrationInformation.create(firstGateway.getAddress(), sharedLocation, firstSessionId);
        final CompletableFuture<RegistrationResponse> firstResponse =
                jobMaster.registerTaskManager(jobGraph.getJobID(), firstRegistration, testingTimeout);
        assertThat(firstResponse.get(), instanceOf(JMTMRegistrationSuccess.class));

        final UUID secondSessionId = UUID.randomUUID();
        final TaskManagerRegistrationInformation secondRegistration =
                TaskManagerRegistrationInformation.create(secondGateway.getAddress(), sharedLocation, secondSessionId);
        final CompletableFuture<RegistrationResponse> secondResponse =
                jobMaster.registerTaskManager(jobGraph.getJobID(), secondRegistration, testingTimeout);
        assertThat(secondResponse.get(), instanceOf(JMTMRegistrationSuccess.class));

        // Blocks until the first task executor has been told to disconnect.
        firstDisconnected.get();
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Also used : TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) DefaultSchedulerFactory(org.apache.flink.runtime.scheduler.DefaultSchedulerFactory) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) Arrays(java.util.Arrays) Tuple3(org.apache.flink.api.java.tuple.Tuple3) SlotPoolService(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolService) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) RestartStrategyOptions(org.apache.flink.configuration.RestartStrategyOptions) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) PhysicalSlot(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlot) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) Duration(java.time.Duration) Map(java.util.Map) Matchers.nullValue(org.hamcrest.Matchers.nullValue) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) ClassRule(org.junit.ClassRule) SimpleSlotContext(org.apache.flink.runtime.instance.SimpleSlotContext) SlotPoolServiceFactory(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolServiceFactory) AfterClass(org.junit.AfterClass) BlockingQueue(java.util.concurrent.BlockingQueue) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) Category(org.junit.experimental.categories.Category) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) CountDownLatch(java.util.concurrent.CountDownLatch) TimeUtils(org.apache.flink.util.TimeUtils) Matchers.is(org.hamcrest.Matchers.is) 
Time(org.apache.flink.api.common.time.Time) InputSplitSource(org.apache.flink.core.io.InputSplitSource) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) FlinkException(org.apache.flink.util.FlinkException) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) AccessExecution(org.apache.flink.runtime.executiongraph.AccessExecution) JobStatus(org.apache.flink.api.common.JobStatus) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) DefaultInputSplitAssigner(org.apache.flink.api.common.io.DefaultInputSplitAssigner) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) BiConsumer(java.util.function.BiConsumer) Matchers.hasSize(org.hamcrest.Matchers.hasSize) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) DistributionPattern(org.apache.flink.runtime.jobgraph.DistributionPattern) Nullable(javax.annotation.Nullable) CheckpointProperties(org.apache.flink.runtime.checkpoint.CheckpointProperties) Before(org.junit.Before) InputSplitAssigner(org.apache.flink.core.io.InputSplitAssigner) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) InputSplit(org.apache.flink.core.io.InputSplit) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) Test(org.junit.Test) IOException(java.io.IOException) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) File(java.io.File) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) 
TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) ArrayDeque(java.util.ArrayDeque) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) CheckpointRetentionPolicy(org.apache.flink.runtime.checkpoint.CheckpointRetentionPolicy) Deadline(org.apache.flink.api.common.time.Deadline) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) BiFunction(java.util.function.BiFunction) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TaskExecutorToJobManagerHeartbeatPayload(org.apache.flink.runtime.taskexecutor.TaskExecutorToJobManagerHeartbeatPayload) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) InstantiationUtil(org.apache.flink.util.InstantiationUtil) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) TestingSchedulerNGFactory(org.apache.flink.runtime.scheduler.TestingSchedulerNGFactory) Assert.fail(org.junit.Assert.fail) BlobServerOptions(org.apache.flink.configuration.BlobServerOptions) CompletedCheckpointStorageLocation(org.apache.flink.runtime.state.CompletedCheckpointStorageLocation) Collection(java.util.Collection) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) UUID(java.util.UUID) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) 
Collectors(java.util.stream.Collectors) SlotInfoWithUtilization(org.apache.flink.runtime.jobmaster.slotpool.SlotInfoWithUtilization) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) Objects(java.util.Objects) TestingUtils(org.apache.flink.testutils.TestingUtils) List(java.util.List) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) Matchers.equalTo(org.hamcrest.Matchers.equalTo) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Optional(java.util.Optional) Queue(java.util.Queue) Matchers.anyOf(org.hamcrest.Matchers.anyOf) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) IntStream(java.util.stream.IntStream) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) BeforeClass(org.junit.BeforeClass) AccessExecutionVertex(org.apache.flink.runtime.executiongraph.AccessExecutionVertex) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ResultPartitionType(org.apache.flink.runtime.io.network.partition.ResultPartitionType) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) RestartStrategies(org.apache.flink.api.common.restartstrategy.RestartStrategies) Function(java.util.function.Function) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) FailoverStrategyFactoryLoader(org.apache.flink.runtime.executiongraph.failover.flip1.FailoverStrategyFactoryLoader) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TestingJobMasterPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingJobMasterPartitionTracker) FailsWithAdaptiveScheduler(org.apache.flink.testutils.junit.FailsWithAdaptiveScheduler) 
JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestingSlotPoolServiceBuilder(org.apache.flink.runtime.jobmaster.slotpool.TestingSlotPoolServiceBuilder) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nonnull(javax.annotation.Nonnull) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Matchers.empty(org.hamcrest.Matchers.empty) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingSchedulerNG(org.apache.flink.runtime.scheduler.TestingSchedulerNG) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) Matchers(org.hamcrest.Matchers) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) TimeUnit(java.util.concurrent.TimeUnit) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) ClosureCleaner(org.apache.flink.api.java.ClosureCleaner) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) RecipientUnreachableException(org.apache.flink.runtime.rpc.exceptions.RecipientUnreachableException) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) 
CompletableFuture(java.util.concurrent.CompletableFuture) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) Test(org.junit.Test)

Example 84 with TaskManagerLocation

Use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in the Apache Flink project.

Class SubtaskExecutionAttemptDetailsInfo, method create.

/**
 * Creates the details object for a single execution attempt.
 *
 * <p>Timing rules applied here: the start time is the {@code DEPLOYING} state timestamp, with 0
 * mapped to -1; the end time is the timestamp of the current state when that state is terminal,
 * otherwise -1; the duration is measured from the start time to the end time (or to the current
 * wall-clock time when no positive end time exists), and is -1 when the start time is not
 * positive.
 *
 * @param execution the execution attempt to describe
 * @param metricFetcher passed through to {@code MutableIOMetrics#addIOMetrics}; may be null
 * @param jobID id of the job, used in string form for the IO metrics lookup
 * @param jobVertexID id of the job vertex, used in string form for the IO metrics lookup
 * @return the populated details object
 */
public static SubtaskExecutionAttemptDetailsInfo create(AccessExecution execution, @Nullable MetricFetcher metricFetcher, JobID jobID, JobVertexID jobVertexID) {
    final ExecutionState currentState = execution.getState();
    final long currentTimeMillis = System.currentTimeMillis();

    final TaskManagerLocation assignedLocation = execution.getAssignedResourceLocation();
    final String host;
    final String taskManagerId;
    if (assignedLocation == null) {
        host = "(unassigned)";
        taskManagerId = "(unassigned)";
    } else {
        host = assignedLocation.getHostname();
        taskManagerId = assignedLocation.getResourceID().toString();
    }

    // A DEPLOYING timestamp of 0 is reported as -1.
    final long deployingTimestamp = execution.getStateTimestamp(ExecutionState.DEPLOYING);
    final long startTime = deployingTimestamp == 0 ? -1 : deployingTimestamp;

    final long endTime;
    if (currentState.isTerminal()) {
        endTime = execution.getStateTimestamp(currentState);
    } else {
        endTime = -1;
    }

    final long duration;
    if (startTime > 0) {
        // Without a positive end time, measure up to the current time.
        final long effectiveEnd = endTime > 0 ? endTime : currentTimeMillis;
        duration = effectiveEnd - startTime;
    } else {
        duration = -1;
    }

    final MutableIOMetrics collectedMetrics = new MutableIOMetrics();
    collectedMetrics.addIOMetrics(execution, metricFetcher, jobID.toString(), jobVertexID.toString());
    final IOMetricsInfo ioMetricsInfo =
            new IOMetricsInfo(
                    collectedMetrics.getNumBytesIn(),
                    collectedMetrics.isNumBytesInComplete(),
                    collectedMetrics.getNumBytesOut(),
                    collectedMetrics.isNumBytesOutComplete(),
                    collectedMetrics.getNumRecordsIn(),
                    collectedMetrics.isNumRecordsInComplete(),
                    collectedMetrics.getNumRecordsOut(),
                    collectedMetrics.isNumRecordsOutComplete());

    return new SubtaskExecutionAttemptDetailsInfo(
            execution.getParallelSubtaskIndex(),
            currentState,
            execution.getAttemptNumber(),
            host,
            startTime,
            endTime,
            duration,
            ioMetricsInfo,
            taskManagerId);
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) MutableIOMetrics(org.apache.flink.runtime.rest.handler.util.MutableIOMetrics) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) IOMetricsInfo(org.apache.flink.runtime.rest.messages.job.metrics.IOMetricsInfo)

Aggregations

TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation) 84, Test (org.junit.Test) 42, ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID) 25, JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID) 18, AccessExecutionVertex (org.apache.flink.runtime.executiongraph.AccessExecutionVertex) 15, SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot) 15, ArrayList (java.util.ArrayList) 14, JobID (org.apache.flink.api.common.JobID) 13, InetAddress (java.net.InetAddress) 12, ExecutionException (java.util.concurrent.ExecutionException) 12, AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID) 12, ExecutionState (org.apache.flink.runtime.execution.ExecutionState) 12, Instance (org.apache.flink.runtime.instance.Instance) 12, LocalTaskManagerLocation (org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) 11, JobGraph (org.apache.flink.runtime.jobgraph.JobGraph) 10, HashMap (java.util.HashMap) 9, ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) 9, Collection (java.util.Collection) 8, SchedulerTestUtils.getRandomInstance (org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance) 8, List (java.util.List) 7