Search in sources :

Example 31 with Instance

use of org.apache.flink.runtime.instance.Instance in project flink by apache.

the class MetricFetcher method fetchMetrics.

/**
 * Refreshes the metric store from the current JobManager.
 *
 * <p>If the retriever currently provides a JobManager gateway, this method
 * <ol>
 *   <li>asks the JobManager for its job details and drops stored job metrics whose job id is
 *       neither in the running nor in the finished list,</li>
 *   <li>queries the metric query service addressed next to the JobManager actor path,</li>
 *   <li>asks the JobManager for its registered TaskManagers, queries each one's metric query
 *       service, and drops stored TaskManager metrics of instances that are not in that list.</li>
 * </ol>
 *
 * <p>Both requests are asynchronous; their success callbacks are registered with {@code ctx}
 * and failures are handed to {@code logErrorOnFailure}. Exceptions thrown while issuing the
 * requests are logged at WARN level and not propagated. Nothing happens when no JobManager
 * gateway is available.
 */
private void fetchMetrics() {
    try {
        Option<scala.Tuple2<ActorGateway, Integer>> jobManagerGatewayAndWebPort = retriever.getJobManagerGatewayAndWebPort();
        if (jobManagerGatewayAndWebPort.isDefined()) {
            ActorGateway jobManager = jobManagerGatewayAndWebPort.get()._1();

            /*
             * Remove all metrics that belong to a job that is not running and no longer archived.
             */
            Future<Object> jobDetailsFuture = jobManager.ask(new RequestJobDetails(true, true), timeout);
            jobDetailsFuture.onSuccess(new OnSuccess<Object>() {
                @Override
                public void onSuccess(Object result) throws Throwable {
                    MultipleJobsDetails details = (MultipleJobsDetails) result;
                    // ids of every job the JobManager still knows about (running or finished)
                    List<String> toRetain = new ArrayList<>();
                    for (JobDetails job : details.getRunningJobs()) {
                        toRetain.add(job.getJobId().toString());
                    }
                    for (JobDetails job : details.getFinishedJobs()) {
                        toRetain.add(job.getJobId().toString());
                    }
                    synchronized (metrics) {
                        metrics.jobs.keySet().retainAll(toRetain);
                    }
                }
            }, ctx);
            logErrorOnFailure(jobDetailsFuture, "Fetching of JobDetails failed.");

            // The JobManager's query service is addressed by replacing the last segment of the
            // JobManager actor path with the query service name.
            String jobManagerPath = jobManager.path();
            String queryServicePath = jobManagerPath.substring(0, jobManagerPath.lastIndexOf('/') + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME;
            ActorRef jobManagerQueryService = actorSystem.actorFor(queryServicePath);
            queryMetrics(jobManagerQueryService);

            /*
             * We first request the list of all registered task managers from the job manager, and then
             * request the respective metric dump from each task manager.
             *
             * All stored metrics that do not belong to a registered task manager will be removed.
             */
            Future<Object> registeredTaskManagersFuture = jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout);
            registeredTaskManagersFuture.onSuccess(new OnSuccess<Object>() {
                @Override
                public void onSuccess(Object result) throws Throwable {
                    Iterable<Instance> taskManagers = ((JobManagerMessages.RegisteredTaskManagers) result).asJavaIterable();
                    List<String> activeTaskManagers = new ArrayList<>();
                    for (Instance taskManager : taskManagers) {
                        activeTaskManagers.add(taskManager.getId().toString());
                        // TaskManager query services carry the resource id as a name suffix
                        String taskManagerPath = taskManager.getTaskManagerGateway().getAddress();
                        String queryServicePath = taskManagerPath.substring(0, taskManagerPath.lastIndexOf('/') + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME + "_" + taskManager.getTaskManagerID().getResourceIdString();
                        ActorRef taskManagerQueryService = actorSystem.actorFor(queryServicePath);
                        queryMetrics(taskManagerQueryService);
                    }
                    synchronized (metrics) {
                        // remove all metrics belonging to unregistered task managers
                        metrics.taskManagers.keySet().retainAll(activeTaskManagers);
                    }
                }
            }, ctx);
            logErrorOnFailure(registeredTaskManagersFuture, "Fetching list of registered TaskManagers failed.");
        }
    } catch (Exception e) {
        LOG.warn("Exception while fetching metrics.", e);
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) ActorRef(akka.actor.ActorRef) ArrayList(java.util.ArrayList) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) MultipleJobsDetails(org.apache.flink.runtime.messages.webmonitor.MultipleJobsDetails) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) JobDetails(org.apache.flink.runtime.messages.webmonitor.JobDetails) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ArrayList(java.util.ArrayList) List(java.util.List)

Example 32 with Instance

use of org.apache.flink.runtime.instance.Instance in project flink by apache.

the class TaskManagerLogHandlerTest method testLogFetchingFailure.

/**
 * Verifies that the log handler writes the message "Fetching TaskManager log failed." to the
 * channel when the TaskManager's log request future completes exceptionally.
 */
@Test
public void testLogFetchingFailure() throws Exception {
    // ---- TaskManager: mocked instance whose log request always fails ----
    final InstanceID instanceId = new InstanceID();
    final String instanceIdString = instanceId.toString();
    final ResourceID resourceId = new ResourceID(instanceIdString);

    final CompletableFuture<BlobKey> failedLogRequest = new FlinkCompletableFuture<>();
    failedLogRequest.completeExceptionally(new IOException("failure"));

    final TaskManagerGateway tmGateway = mock(TaskManagerGateway.class);
    when(tmGateway.getAddress()).thenReturn("/tm/address");
    when(tmGateway.requestTaskManagerLog(any(Time.class))).thenReturn(failedLogRequest);

    final Instance tmInstance = mock(Instance.class);
    when(tmInstance.getId()).thenReturn(instanceId);
    when(tmInstance.getTaskManagerID()).thenReturn(resourceId);
    when(tmInstance.getTaskManagerGateway()).thenReturn(tmGateway);

    // ---- JobManager: mocked gateway answering the requests stubbed below ----
    final ActorGateway jmGateway = mock(ActorGateway.class);
    final Object registeredAnswer = new JobManagerMessages.RegisteredTaskManagers(
        JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(tmInstance)).asScala());
    when(jmGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(registeredAnswer));
    when(jmGateway.ask(isA(JobManagerMessages.getRequestBlobManagerPort().getClass()), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) 5));
    when(jmGateway.ask(isA(JobManagerMessages.RequestTaskManagerInstance.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) new JobManagerMessages.TaskManagerInstance(Option.apply(tmInstance))));
    when(jmGateway.path()).thenReturn("/jm/address");

    final JobManagerRetriever jmRetriever = mock(JobManagerRetriever.class);
    when(jmRetriever.getJobManagerGatewayAndWebPort())
        .thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jmGateway, 0)));

    // ---- Handler under test, running its callbacks on the calling thread ----
    final TaskManagerLogHandler logHandler = new TaskManagerLogHandler(
        jmRetriever,
        ExecutionContext$.MODULE$.fromExecutor(Executors.directExecutor()),
        Future$.MODULE$.successful("/jm/address"),
        AkkaUtils.getDefaultClientTimeout(),
        TaskManagerLogHandler.FileMode.LOG,
        new Configuration(),
        false);

    // ---- Channel context that records whatever buffer the handler writes ----
    final AtomicReference<String> writtenMessage = new AtomicReference<>();
    final ChannelHandlerContext channelContext = mock(ChannelHandlerContext.class);
    when(channelContext.write(isA(ByteBuf.class))).thenAnswer(new Answer<Object>() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            final ByteBuf written = invocation.getArgumentAt(0, ByteBuf.class);
            writtenMessage.set(new String(written.array(), ConfigConstants.DEFAULT_CHARSET));
            return null;
        }
    });

    // ---- Routed request for this TaskManager's log ----
    final Map<String, String> params = new HashMap<>();
    params.put(TaskManagersHandler.TASK_MANAGER_ID_KEY, instanceIdString);
    final Routed routedRequest = mock(Routed.class);
    when(routedRequest.pathParams()).thenReturn(params);
    when(routedRequest.request()).thenReturn(
        new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/taskmanagers/" + instanceIdString + "/log"));

    logHandler.respondAsLeader(channelContext, routedRequest, jmGateway);

    Assert.assertEquals("Fetching TaskManager log failed.", writtenMessage.get());
}
Also used : Configuration(org.apache.flink.configuration.Configuration) InstanceID(org.apache.flink.runtime.instance.InstanceID) Instance(org.apache.flink.runtime.instance.Instance) HashMap(java.util.HashMap) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) Time(org.apache.flink.api.common.time.Time) ChannelHandlerContext(io.netty.channel.ChannelHandlerContext) ByteBuf(io.netty.buffer.ByteBuf) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) BlobKey(org.apache.flink.runtime.blob.BlobKey) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) Routed(io.netty.handler.codec.http.router.Routed) DefaultFullHttpRequest(io.netty.handler.codec.http.DefaultFullHttpRequest) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) InvocationOnMock(org.mockito.invocation.InvocationOnMock) JobManagerRetriever(org.apache.flink.runtime.webmonitor.JobManagerRetriever) Test(org.junit.Test)

Example 33 with Instance

use of org.apache.flink.runtime.instance.Instance in project flink by apache.

the class MetricFetcherTest method testUpdate.

/**
 * Checks that {@code MetricFetcher.update()} fills the metric store from the mocked JobManager
 * and TaskManager metric query services.
 *
 * <p>The test wires a mocked JobManager gateway (reporting no jobs and one registered
 * TaskManager), a mocked actor system resolving the two query service paths, and intercepts
 * construction of {@code MetricFetcher.BasicGateway} so that each query service is answered by
 * a stubbed gateway.
 */
@Test
public void testUpdate() throws Exception {
    // ========= setup TaskManager =================================================================================
    JobID jobID = new JobID();
    InstanceID tmID = new InstanceID();
    ResourceID tmRID = new ResourceID(tmID.toString());
    TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
    when(taskManagerGateway.getAddress()).thenReturn("/tm/address");
    Instance taskManager = mock(Instance.class);
    when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);
    when(taskManager.getId()).thenReturn(tmID);
    when(taskManager.getTaskManagerID()).thenReturn(tmRID);
    // ========= setup JobManager ==================================================================================
    JobDetails details = mock(JobDetails.class);
    when(details.getJobId()).thenReturn(jobID);
    ActorGateway jobManagerGateway = mock(ActorGateway.class);
    Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());
    // job details request is answered with empty running and finished job arrays
    when(jobManagerGateway.ask(isA(RequestJobDetails.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MultipleJobsDetails(new JobDetails[0], new JobDetails[0])));
    when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
    when(jobManagerGateway.path()).thenReturn("/jm/address");
    JobManagerRetriever retriever = mock(JobManagerRetriever.class);
    when(retriever.getJobManagerGatewayAndWebPort()).thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));
    // ========= setup QueryServices ================================================================================
    // NOTE(review): createRequestDumpAnswer is defined outside this excerpt; the values asserted
    // at the end of this test presumably correspond to the dump it builds - confirm there.
    Object requestMetricsAnswer = createRequestDumpAnswer(tmID, jobID);
    final ActorRef jmQueryService = mock(ActorRef.class);
    final ActorRef tmQueryService = mock(ActorRef.class);
    ActorSystem actorSystem = mock(ActorSystem.class);
    // the stubbed paths are the mocked "/jm/address" and "/tm/address" with the last segment
    // replaced by the query service name (plus "_<resource id>" for the TaskManager)
    when(actorSystem.actorFor(eq("/jm/" + METRIC_QUERY_SERVICE_NAME))).thenReturn(jmQueryService);
    when(actorSystem.actorFor(eq("/tm/" + METRIC_QUERY_SERVICE_NAME + "_" + tmRID.getResourceIdString()))).thenReturn(tmQueryService);
    MetricFetcher.BasicGateway jmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    // the JobManager query service answers with an empty serialization result
    when(jmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MetricDumpSerialization.MetricSerializationResult(new byte[0], 0, 0, 0, 0)));
    MetricFetcher.BasicGateway tmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(tmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(requestMetricsAnswer));
    // Intercept "new MetricFetcher.BasicGateway(...)": the anonymous objects override equals() to
    // compare by identity against the mocked ActorRefs, presumably so that eq(...) selects the
    // stubbed gateway by which ActorRef is passed to the constructor.
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {

        @Override
        public boolean equals(Object o) {
            return o == jmQueryService;
        }
    })).thenReturn(jmQueryServiceGateway);
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {

        @Override
        public boolean equals(Object o) {
            return o == tmQueryService;
        }
    })).thenReturn(tmQueryServiceGateway);
    // ========= start MetricFetcher testing =======================================================================
    ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(new CurrentThreadExecutor());
    MetricFetcher fetcher = new MetricFetcher(actorSystem, retriever, context);
    // verify that update fetches metrics and updates the store
    fetcher.update();
    MetricStore store = fetcher.getMetricStore();
    // hold the store's monitor while reading its contents
    synchronized (store) {
        assertEquals("7", store.jobManager.metrics.get("abc.hist_min"));
        assertEquals("6", store.jobManager.metrics.get("abc.hist_max"));
        assertEquals("4.0", store.jobManager.metrics.get("abc.hist_mean"));
        assertEquals("0.5", store.jobManager.metrics.get("abc.hist_median"));
        assertEquals("5.0", store.jobManager.metrics.get("abc.hist_stddev"));
        assertEquals("0.75", store.jobManager.metrics.get("abc.hist_p75"));
        assertEquals("0.9", store.jobManager.metrics.get("abc.hist_p90"));
        assertEquals("0.95", store.jobManager.metrics.get("abc.hist_p95"));
        assertEquals("0.98", store.jobManager.metrics.get("abc.hist_p98"));
        assertEquals("0.99", store.jobManager.metrics.get("abc.hist_p99"));
        assertEquals("0.999", store.jobManager.metrics.get("abc.hist_p999"));
        assertEquals("x", store.getTaskManagerMetricStore(tmID.toString()).metrics.get("abc.gauge"));
        assertEquals("5.0", store.getJobMetricStore(jobID.toString()).metrics.get("abc.jc"));
        assertEquals("2", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.abc.tc"));
        assertEquals("1", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.opname.abc.oc"));
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) InstanceID(org.apache.flink.runtime.instance.InstanceID) Instance(org.apache.flink.runtime.instance.Instance) ActorRef(akka.actor.ActorRef) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) MultipleJobsDetails(org.apache.flink.runtime.messages.webmonitor.MultipleJobsDetails) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) JobDetails(org.apache.flink.runtime.messages.webmonitor.JobDetails) MetricDumpSerialization(org.apache.flink.runtime.metrics.dump.MetricDumpSerialization) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionContextExecutor(scala.concurrent.ExecutionContextExecutor) FiniteDuration(scala.concurrent.duration.FiniteDuration) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobManagerRetriever(org.apache.flink.runtime.webmonitor.JobManagerRetriever) JobID(org.apache.flink.api.common.JobID) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 34 with Instance

use of org.apache.flink.runtime.instance.Instance in project flink by apache.

the class Scheduler method getNewSlotForSharingGroup.

/**
 * Attempts to obtain a fresh slot for a vertex that belongs to a slot sharing group.
 *
 * <p>An instance is picked via {@code findInstance}; a shared slot is allocated on it, added to
 * the sharing group, and a simple sub-slot is allocated from that shared slot. Local instances
 * are tried according to {@code requestedLocations}, with non-local instances used as a
 * fallback if permitted.</p>
 *
 * <p>The method keeps retrying with the next candidate instance until a sub-slot is obtained
 * or no instance is available any more.</p>
 *
 * @param vertex The vertex to allocate the slot for.
 * @param requestedLocations The locations that are considered local. May be null or empty, if the
 *                           vertex has no location preferences.
 * @param groupAssignment The slot sharing group of the vertex. Mandatory parameter.
 * @param constraint The co-location constraint of the vertex. May be null.
 * @param localOnly Flag to indicate if non-local choices are acceptable.
 *
 * @return A sub-slot for the given vertex, or {@code null}, if no slot is available.
 */
protected SimpleSlot getNewSlotForSharingGroup(ExecutionVertex vertex, Iterable<TaskManagerLocation> requestedLocations, SlotSharingGroupAssignment groupAssignment, CoLocationConstraint constraint, boolean localOnly) {
    // a chosen instance may turn out to be unusable, so keep trying candidates
    for (;;) {
        final Pair<Instance, Locality> candidate = findInstance(requestedLocations, localOnly);
        if (candidate == null) {
            // no instance left to try
            return null;
        }

        final Instance instance = candidate.getLeft();
        final Locality slotLocality = candidate.getRight();

        try {
            final JobVertexID vertexGroupId = vertex.getJobvertexId();

            // take a shared slot from the chosen instance
            final SharedSlot shared = instance.allocateSharedSlot(vertex.getJobId(), groupAssignment);

            // an instance with remaining capacity goes back into the available set
            if (instance.hasResourcesAvailable()) {
                this.instancesWithAvailableResources.put(instance.getTaskManagerID(), instance);
            }

            if (shared == null) {
                // nothing allocated on this instance, try the next candidate
                continue;
            }

            // register the shared slot with the group and carve out the sub-slot
            final SimpleSlot subSlot;
            if (constraint == null) {
                subSlot = groupAssignment.addSharedSlotAndAllocateSubSlot(shared, slotLocality, vertexGroupId);
            } else {
                subSlot = groupAssignment.addSharedSlotAndAllocateSubSlot(shared, slotLocality, constraint);
            }

            if (subSlot != null) {
                return subSlot;
            }

            // the sub-slot could not be added and allocated, so give the shared slot back
            shared.releaseSlot();
        } catch (InstanceDiedException e) {
            // the instance died but this scheduler had not been told yet;
            // drop it from the set of available instances and retry
            removeInstance(instance);
        }
    }
}
Also used : SharedSlot(org.apache.flink.runtime.instance.SharedSlot) Instance(org.apache.flink.runtime.instance.Instance) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) InstanceDiedException(org.apache.flink.runtime.instance.InstanceDiedException)

Example 35 with Instance

use of org.apache.flink.runtime.instance.Instance in project flink by apache.

the class Scheduler method getFreeSlotForTask.

/**
 * Tries to allocate a simple slot for the given vertex on a suitable instance.
 *
 * <p>Candidate instances are obtained from {@code findInstance}; the method keeps trying
 * candidates until one yields a slot or no candidate is left.
 *
 * <p>NOTE: This method is not thread-safe, it needs to be synchronized by the caller.
 *
 * @param vertex The task to run.
 * @param requestedLocations The preferred locations, forwarded to {@code findInstance}.
 * @param localOnly Locality flag, forwarded to {@code findInstance}.
 * @return The slot to run the vertex in, with its locality set, or {@code null}, if no instance
 *         is available.
 */
protected SimpleSlot getFreeSlotForTask(ExecutionVertex vertex, Iterable<TaskManagerLocation> requestedLocations, boolean localOnly) {
    Pair<Instance, Locality> candidate;

    // a chosen instance may turn out to be unusable, so keep asking for the next candidate
    while ((candidate = findInstance(requestedLocations, localOnly)) != null) {
        final Instance instance = candidate.getLeft();
        final Locality slotLocality = candidate.getRight();

        try {
            final SimpleSlot allocated = instance.allocateSimpleSlot(vertex.getJobId());

            // an instance with remaining capacity goes back into the available set
            if (instance.hasResourcesAvailable()) {
                this.instancesWithAvailableResources.put(instance.getTaskManagerID(), instance);
            }

            if (allocated != null) {
                allocated.setLocality(slotLocality);
                return allocated;
            }
        } catch (InstanceDiedException e) {
            // the instance died but this scheduler had not been told yet;
            // drop it from the set of available instances and retry
            removeInstance(instance);
        }
    }

    // findInstance had nothing left to offer
    return null;
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) InstanceDiedException(org.apache.flink.runtime.instance.InstanceDiedException)

Aggregations

Instance (org.apache.flink.runtime.instance.Instance)63 Test (org.junit.Test)52 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)38 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)33 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)29 IOException (java.io.IOException)19 JobID (org.apache.flink.api.common.JobID)15 ExecutionException (java.util.concurrent.ExecutionException)14 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)14 SchedulerTestUtils.getRandomInstance (org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance)14 ExecutionGraphTestUtils.getInstance (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getInstance)12 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)12 SimpleActorGateway (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.SimpleActorGateway)11 ExecutionGraphTestUtils.getExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getExecutionVertex)11 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)11 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)10 FiniteDuration (scala.concurrent.duration.FiniteDuration)9 SuppressRestartsException (org.apache.flink.runtime.execution.SuppressRestartsException)8 BaseTestingActorGateway (org.apache.flink.runtime.instance.BaseTestingActorGateway)8 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)8