Search in sources :

Example 36 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class MesosFlinkResourceManager method taskTerminated.

/**
 * Handles a Mesos task that has reached a terminal status.
 *
 * <p>The worker is first removed from the worker store. A worker found in
 * {@code workersBeingReturned} is logged as finished; any other worker is counted as a
 * failure, and the cluster is asked to stop once the failure count exceeds
 * {@code maxFailedTasks} (a negative limit disables that check).
 *
 * @param taskID the ID of the terminated Mesos task
 * @param status the terminal status reported for that task
 */
private void taskTerminated(Protos.TaskID taskID, Protos.TaskStatus status) {
    // released containers and failed containers both arrive through this callback
    final ResourceID resourceId = extractResourceID(taskID);

    boolean wasKnown;
    try {
        wasKnown = workerStore.removeWorker(taskID);
    } catch (Exception ex) {
        fatalError("unable to remove worker", ex);
        return;
    }
    if (!wasKnown) {
        LOG.info("Received a termination notice for an unrecognized worker: {}", resourceId);
        return;
    }

    // a worker we released ourselves: nothing failed, just re-check the worker count
    if (workersBeingReturned.remove(resourceId) != null) {
        LOG.info("Worker {} finished successfully with diagnostics: {}", resourceId, status.getMessage());
        triggerCheckWorkers();
        return;
    }

    // from here on the worker failed, either during launch/registration or while registered
    if (workersInLaunch.remove(resourceId) != null) {
        // replacement workers are requested by the check at the end of this method
        LOG.info("Mesos task {} failed, with a TaskManager in launch or registration. "
            + "State: {} Reason: {} ({})",
            resourceId, status.getState(), status.getReason(), status.getMessage());
    } else {
        LOG.info("Mesos task {} failed, with a registered TaskManager. "
            + "State: {} Reason: {} ({})",
            resourceId, status.getState(), status.getReason(), status.getMessage());
        // hand the failure to the generic logic, which notifies the JobManager, etc.
        notifyWorkerFailed(resourceId, "Mesos task " + resourceId + " failed.  State: " + status.getState());
    }

    // failure bookkeeping shared by both failure flavours
    failedTasksSoFar++;
    final String diagnostics = String.format(
        "Diagnostics for task %s in state %s : " + "reason=%s message=%s",
        resourceId, status.getState(), status.getReason(), status.getMessage());
    sendInfoMessage(diagnostics);
    LOG.info(diagnostics);
    LOG.info("Total number of failed tasks so far: {}", failedTasksSoFar);

    // a negative maxFailedTasks (-1) means an unlimited number of retries
    final boolean failureLimitExceeded = maxFailedTasks >= 0 && failedTasksSoFar > maxFailedTasks;
    if (failureLimitExceeded) {
        final String stopReason = "Stopping Mesos session because the number of failed tasks ("
            + failedTasksSoFar
            + ") exceeded the maximum failed tasks ("
            + maxFailedTasks
            + "). This number is controlled by the '"
            + ConfigConstants.MESOS_MAX_FAILED_TASKS
            + "' configuration setting. "
            + "By default its the number of requested tasks.";
        LOG.error(stopReason);
        self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, stopReason)), ActorRef.noSender());
        // the cluster is stopping, so no new workers are requested
        return;
    }

    // a failed container was among the finished ones: re-examine and request new workers
    triggerCheckWorkers();
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) StopCluster(org.apache.flink.runtime.clusterframework.messages.StopCluster) MesosWorkerStore(org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore) IllegalConfigurationException(org.apache.flink.configuration.IllegalConfigurationException)

Example 37 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class TaskManagerLogHandlerTest method testLogFetchingFailure.

/**
 * Checks the response of the log handler when fetching the TaskManager log fails.
 *
 * <p>The mocked TaskManager gateway hands out a log future that was completed exceptionally.
 * Whatever the handler writes to the mocked channel context is captured and compared with
 * the expected error text.
 */
@Test
public void testLogFetchingFailure() throws Exception {
    // ---- TaskManager: an instance whose log request always fails ----
    InstanceID instanceId = new InstanceID();
    ResourceID resourceId = new ResourceID(instanceId.toString());

    CompletableFuture<BlobKey> failedLogFuture = new FlinkCompletableFuture<>();
    failedLogFuture.completeExceptionally(new IOException("failure"));

    TaskManagerGateway tmGateway = mock(TaskManagerGateway.class);
    when(tmGateway.getAddress()).thenReturn("/tm/address");
    when(tmGateway.requestTaskManagerLog(any(Time.class))).thenReturn(failedLogFuture);

    Instance tmInstance = mock(Instance.class);
    when(tmInstance.getId()).thenReturn(instanceId);
    when(tmInstance.getTaskManagerID()).thenReturn(resourceId);
    when(tmInstance.getTaskManagerGateway()).thenReturn(tmGateway);

    // ---- JobManager: answers the lookups with canned replies ----
    ActorGateway jmGateway = mock(ActorGateway.class);
    Object registeredTaskManagers = new JobManagerMessages.RegisteredTaskManagers(
        JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(tmInstance)).asScala());
    when(jmGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(registeredTaskManagers));
    when(jmGateway.ask(isA(JobManagerMessages.getRequestBlobManagerPort().getClass()), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) 5));
    when(jmGateway.ask(isA(JobManagerMessages.RequestTaskManagerInstance.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) new JobManagerMessages.TaskManagerInstance(Option.apply(tmInstance))));
    when(jmGateway.path()).thenReturn("/jm/address");

    JobManagerRetriever jmRetriever = mock(JobManagerRetriever.class);
    when(jmRetriever.getJobManagerGatewayAndWebPort())
        .thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jmGateway, 0)));

    // ---- handler under test ----
    TaskManagerLogHandler logHandler = new TaskManagerLogHandler(
        jmRetriever,
        ExecutionContext$.MODULE$.fromExecutor(Executors.directExecutor()),
        Future$.MODULE$.successful("/jm/address"),
        AkkaUtils.getDefaultClientTimeout(),
        TaskManagerLogHandler.FileMode.LOG,
        new Configuration(),
        false);

    // ---- channel context that records what the handler writes ----
    final AtomicReference<String> writtenText = new AtomicReference<>();
    ChannelHandlerContext channelContext = mock(ChannelHandlerContext.class);
    when(channelContext.write(isA(ByteBuf.class))).thenAnswer(new Answer<Object>() {

        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            ByteBuf buffer = invocation.getArgumentAt(0, ByteBuf.class);
            writtenText.set(new String(buffer.array(), ConfigConstants.DEFAULT_CHARSET));
            return null;
        }
    });

    // ---- routed request addressing the TaskManager's log ----
    Map<String, String> routeParams = new HashMap<>();
    routeParams.put(TaskManagersHandler.TASK_MANAGER_ID_KEY, instanceId.toString());
    Routed routedRequest = mock(Routed.class);
    when(routedRequest.pathParams()).thenReturn(routeParams);
    when(routedRequest.request())
        .thenReturn(new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/taskmanagers/" + instanceId + "/log"));

    logHandler.respondAsLeader(channelContext, routedRequest, jmGateway);

    Assert.assertEquals("Fetching TaskManager log failed.", writtenText.get());
}
Also used : Configuration(org.apache.flink.configuration.Configuration) InstanceID(org.apache.flink.runtime.instance.InstanceID) Instance(org.apache.flink.runtime.instance.Instance) HashMap(java.util.HashMap) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) Time(org.apache.flink.api.common.time.Time) ChannelHandlerContext(io.netty.channel.ChannelHandlerContext) ByteBuf(io.netty.buffer.ByteBuf) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) BlobKey(org.apache.flink.runtime.blob.BlobKey) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) Routed(io.netty.handler.codec.http.router.Routed) DefaultFullHttpRequest(io.netty.handler.codec.http.DefaultFullHttpRequest) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) InvocationOnMock(org.mockito.invocation.InvocationOnMock) JobManagerRetriever(org.apache.flink.runtime.webmonitor.JobManagerRetriever) Test(org.junit.Test)

Example 38 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class MetricFetcherTest method testUpdate.

/**
 * Runs {@code MetricFetcher.update()} against mocked JobManager, TaskManager and metric
 * query service gateways, then asserts on the contents of the fetcher's {@code MetricStore}.
 *
 * <p>NOTE(review): the expected metric values presumably mirror what
 * {@code createRequestDumpAnswer} puts into the dump; that helper is not visible in this
 * snippet, so confirm there before changing any expected value.
 */
@Test
public void testUpdate() throws Exception {
    // ========= setup TaskManager =================================================================================
    JobID jobID = new JobID();
    InstanceID tmID = new InstanceID();
    // the resource ID reuses the string form of the instance ID
    ResourceID tmRID = new ResourceID(tmID.toString());
    TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
    when(taskManagerGateway.getAddress()).thenReturn("/tm/address");
    Instance taskManager = mock(Instance.class);
    when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);
    when(taskManager.getId()).thenReturn(tmID);
    when(taskManager.getTaskManagerID()).thenReturn(tmRID);
    // ========= setup JobManager ==================================================================================
    JobDetails details = mock(JobDetails.class);
    when(details.getJobId()).thenReturn(jobID);
    ActorGateway jobManagerGateway = mock(ActorGateway.class);
    // reply listing the single mocked TaskManager, converted to a Scala iterable
    Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());
    // job details requests are answered with two empty job arrays
    when(jobManagerGateway.ask(isA(RequestJobDetails.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MultipleJobsDetails(new JobDetails[0], new JobDetails[0])));
    when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
    when(jobManagerGateway.path()).thenReturn("/jm/address");
    JobManagerRetriever retriever = mock(JobManagerRetriever.class);
    when(retriever.getJobManagerGatewayAndWebPort()).thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));
    // ========= setup QueryServices ================================================================================
    // createRequestDumpAnswer is defined outside this snippet
    Object requestMetricsAnswer = createRequestDumpAnswer(tmID, jobID);
    final ActorRef jmQueryService = mock(ActorRef.class);
    final ActorRef tmQueryService = mock(ActorRef.class);
    ActorSystem actorSystem = mock(ActorSystem.class);
    // the actor system resolves the two query service paths to the mocked actor refs
    when(actorSystem.actorFor(eq("/jm/" + METRIC_QUERY_SERVICE_NAME))).thenReturn(jmQueryService);
    when(actorSystem.actorFor(eq("/tm/" + METRIC_QUERY_SERVICE_NAME + "_" + tmRID.getResourceIdString()))).thenReturn(tmQueryService);
    // the JobManager query service answers with an empty serialization result
    MetricFetcher.BasicGateway jmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(jmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MetricDumpSerialization.MetricSerializationResult(new byte[0], 0, 0, 0, 0)));
    // the TaskManager query service answers with the prepared dump
    MetricFetcher.BasicGateway tmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(tmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(requestMetricsAnswer));
    // Constructor interception: each anonymous object's equals() is true only for one specific
    // ActorRef instance, so the matching gateway mock is chosen by reference identity.
    // NOTE(review): this presumably relies on eq() delegating to the wanted object's equals() - confirm.
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {

        @Override
        public boolean equals(Object o) {
            return o == jmQueryService;
        }
    })).thenReturn(jmQueryServiceGateway);
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {

        @Override
        public boolean equals(Object o) {
            return o == tmQueryService;
        }
    })).thenReturn(tmQueryServiceGateway);
    // ========= start MetricFetcher testing =======================================================================
    ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(new CurrentThreadExecutor());
    MetricFetcher fetcher = new MetricFetcher(actorSystem, retriever, context);
    // verify that update fetches metrics and updates the store
    fetcher.update();
    MetricStore store = fetcher.getMetricStore();
    // the store is read while holding its monitor
    synchronized (store) {
        // histogram statistics recorded under the JobManager scope
        assertEquals("7", store.jobManager.metrics.get("abc.hist_min"));
        assertEquals("6", store.jobManager.metrics.get("abc.hist_max"));
        assertEquals("4.0", store.jobManager.metrics.get("abc.hist_mean"));
        assertEquals("0.5", store.jobManager.metrics.get("abc.hist_median"));
        assertEquals("5.0", store.jobManager.metrics.get("abc.hist_stddev"));
        assertEquals("0.75", store.jobManager.metrics.get("abc.hist_p75"));
        assertEquals("0.9", store.jobManager.metrics.get("abc.hist_p90"));
        assertEquals("0.95", store.jobManager.metrics.get("abc.hist_p95"));
        assertEquals("0.98", store.jobManager.metrics.get("abc.hist_p98"));
        assertEquals("0.99", store.jobManager.metrics.get("abc.hist_p99"));
        assertEquals("0.999", store.jobManager.metrics.get("abc.hist_p999"));
        // metrics looked up by TaskManager, job and task
        assertEquals("x", store.getTaskManagerMetricStore(tmID.toString()).metrics.get("abc.gauge"));
        assertEquals("5.0", store.getJobMetricStore(jobID.toString()).metrics.get("abc.jc"));
        assertEquals("2", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.abc.tc"));
        assertEquals("1", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.opname.abc.oc"));
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) InstanceID(org.apache.flink.runtime.instance.InstanceID) Instance(org.apache.flink.runtime.instance.Instance) ActorRef(akka.actor.ActorRef) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) MultipleJobsDetails(org.apache.flink.runtime.messages.webmonitor.MultipleJobsDetails) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) JobDetails(org.apache.flink.runtime.messages.webmonitor.JobDetails) MetricDumpSerialization(org.apache.flink.runtime.metrics.dump.MetricDumpSerialization) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionContextExecutor(scala.concurrent.ExecutionContextExecutor) FiniteDuration(scala.concurrent.duration.FiniteDuration) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobManagerRetriever(org.apache.flink.runtime.webmonitor.JobManagerRetriever) JobID(org.apache.flink.api.common.JobID) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 39 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class ArchivedJobGenerationUtils method generateArchivedJob.

/**
 * Builds the archived fixtures bottom-up (attempt, subtask, task, job) and stores them in
 * {@code originalAttempt}, {@code originalSubtask}, {@code originalTask} and
 * {@code originalJob}.
 *
 * @throws Exception declared by the signature; the local host lookup is one call that can fail
 */
private static void generateArchivedJob() throws Exception {
    // accumulator results referenced by both the attempt and the job
    StringifiedAccumulatorResult firstAccumulator = new StringifiedAccumulatorResult("name1", "type1", "value1");
    StringifiedAccumulatorResult secondAccumulator = new StringifiedAccumulatorResult("name2", "type2", "value2");

    // Attempt: a finished execution placed on a TaskManager at the local host
    TaskManagerLocation taskManagerLocation =
        new TaskManagerLocation(new ResourceID("hello"), InetAddress.getLocalHost(), 1234);
    originalAttempt = new ArchivedExecutionBuilder()
        .setStateTimestamps(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 })
        .setParallelSubtaskIndex(1)
        .setAttemptNumber(0)
        .setAssignedResourceLocation(taskManagerLocation)
        .setUserAccumulators(new StringifiedAccumulatorResult[] { firstAccumulator, secondAccumulator })
        .setState(ExecutionState.FINISHED)
        .setFailureCause("attemptException")
        .build();

    // Subtask: wraps the attempt and takes over its subtask index
    originalSubtask = new ArchivedExecutionVertexBuilder()
        .setSubtaskIndex(originalAttempt.getParallelSubtaskIndex())
        .setTaskNameWithSubtask("hello(1/1)")
        .setCurrentExecution(originalAttempt)
        .build();

    // Task: a job vertex with that single subtask
    originalTask = new ArchivedExecutionJobVertexBuilder()
        .setTaskVertices(new ArchivedExecutionVertex[] { originalSubtask })
        .build();

    // Job: an execution graph containing that single task
    Map<JobVertexID, ArchivedExecutionJobVertex> tasksByVertexId = new HashMap<>();
    tasksByVertexId.put(originalTask.getJobVertexId(), originalTask);
    originalJob = new ArchivedExecutionGraphBuilder()
        .setJobID(new JobID())
        .setTasks(tasksByVertexId)
        .setFailureCause("jobException")
        .setState(JobStatus.FINISHED)
        .setStateTimestamps(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 })
        .setArchivedUserAccumulators(new StringifiedAccumulatorResult[] { firstAccumulator, secondAccumulator })
        .build();
}
Also used : ArchivedExecutionJobVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionJobVertex) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) HashMap(java.util.HashMap) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) StringifiedAccumulatorResult(org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult) JobID(org.apache.flink.api.common.JobID)

Example 40 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class InputChannelDeploymentDescriptor method fromEdges.

// ------------------------------------------------------------------------
/**
 * Creates one input channel deployment descriptor per edge, i.e. per consumed partition.
 *
 * <p>For each edge the partition location is resolved as local (the producer's slot is on the
 * same TaskManager as the consumer), remote (a different TaskManager), or unknown (location
 * not resolvable and lazy deployment allowed).
 *
 * @param edges the edges whose source partitions are consumed
 * @param consumerSlot the slot of the consuming task; its TaskManager ID decides local vs. remote
 * @param allowLazyDeployment whether a partition whose location cannot be resolved may be
 *        described with an unknown location instead of failing
 * @return the descriptors, in the same order as {@code edges}
 * @throws ExecutionGraphException if a location cannot be resolved and lazy deployment is not allowed
 */
public static InputChannelDeploymentDescriptor[] fromEdges(ExecutionEdge[] edges, SimpleSlot consumerSlot, boolean allowLazyDeployment) throws ExecutionGraphException {
    final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
    final InputChannelDeploymentDescriptor[] descriptors = new InputChannelDeploymentDescriptor[edges.length];

    // every edge is connected to a different result partition
    int slotIndex = 0;
    for (ExecutionEdge edge : edges) {
        final IntermediateResultPartition partition = edge.getSource();
        final Execution producer = partition.getProducer().getCurrentExecutionAttempt();
        final ExecutionState producerState = producer.getState();
        final SimpleSlot producerSlot = producer.getAssignedResource();

        // the producing task has to be SCHEDULED, DEPLOYING, RUNNING or already FINISHED
        final boolean producerStateAccepted = producerState == ExecutionState.RUNNING
            || producerState == ExecutionState.FINISHED
            || producerState == ExecutionState.SCHEDULED
            || producerState == ExecutionState.DEPLOYING;

        final ResultPartitionLocation location;
        if (partition.isConsumable() && producerSlot != null && producerStateAccepted) {
            final TaskManagerLocation producerLocation = producerSlot.getTaskManagerLocation();
            if (producerLocation.getResourceID().equals(consumerTaskManager)) {
                // consumer and partition live on the same TaskManager => local
                location = ResultPartitionLocation.createLocal();
            } else {
                // different TaskManagers => remote
                location = ResultPartitionLocation.createRemote(
                    new ConnectionID(producerLocation, partition.getIntermediateResult().getConnectionIndex()));
            }
        } else if (allowLazyDeployment) {
            // the producing task might not have registered the partition yet
            location = ResultPartitionLocation.createUnknown();
        } else {
            // eager scheduling with an unresolvable input: the message depends on the producer state
            final boolean producerCanceledOrFailed = producerState == ExecutionState.CANCELING
                || producerState == ExecutionState.CANCELED
                || producerState == ExecutionState.FAILED;
            final String msg;
            if (producerCanceledOrFailed) {
                msg = "Trying to schedule a task whose inputs were canceled or failed. " + "The producer is in state " + producerState + ".";
            } else {
                msg = String.format("Trying to eagerly schedule a task whose inputs " + "are not ready (partition consumable? %s, producer state: %s, producer slot: %s).", partition.isConsumable(), producerState, producerSlot);
            }
            throw new ExecutionGraphException(msg);
        }

        final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), producer.getAttemptId());
        descriptors[slotIndex++] = new InputChannelDeploymentDescriptor(partitionId, location);
    }
    return descriptors;
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionGraphException(org.apache.flink.runtime.executiongraph.ExecutionGraphException) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) IntermediateResultPartition(org.apache.flink.runtime.executiongraph.IntermediateResultPartition) Execution(org.apache.flink.runtime.executiongraph.Execution) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)

Aggregations

ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID): 74, Test (org.junit.Test): 48, TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation): 25, Time (org.apache.flink.api.common.time.Time): 18, UUID (java.util.UUID): 16, JobID (org.apache.flink.api.common.JobID): 16, Configuration (org.apache.flink.configuration.Configuration): 14, AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID): 13, JavaTestKit (akka.testkit.JavaTestKit): 12, MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry): 12, InetAddress (java.net.InetAddress): 11, SlotID (org.apache.flink.runtime.clusterframework.types.SlotID): 10, HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices): 10, TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices): 10, SlotRequest (org.apache.flink.runtime.resourcemanager.SlotRequest): 10, IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager): 9, NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment): 9, ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway): 9, MemoryManager (org.apache.flink.runtime.memory.MemoryManager): 9, TestingSerialRpcService (org.apache.flink.runtime.rpc.TestingSerialRpcService): 9