use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class MetricFetcher method fetchMetrics.
/**
 * Starts one asynchronous round of metric fetching.
 *
 * <p>Nothing happens when the retriever currently provides no JobManager gateway. Otherwise
 * this method
 * <ol>
 *   <li>requests the job details and, on success, drops the stored metrics of every job that
 *       appears neither in the running nor in the finished list of the answer,</li>
 *   <li>queries the metric query service addressed next to the JobManager path,</li>
 *   <li>requests the registered TaskManagers and, on success, queries the metric query service
 *       of each of them and drops the stored metrics of all other TaskManagers.</li>
 * </ol>
 *
 * <p>Both callbacks are registered with the execution context {@code ctx}. Exceptions thrown
 * while issuing the requests are logged as warnings and not propagated.
 */
private void fetchMetrics() {
    try {
        Option<scala.Tuple2<ActorGateway, Integer>> jobManagerGatewayAndWebPort = retriever.getJobManagerGatewayAndWebPort();
        if (jobManagerGatewayAndWebPort.isDefined()) {
            ActorGateway jobManager = jobManagerGatewayAndWebPort.get()._1();

            /*
             * Remove all metrics that belong to a job that is not running and no longer archived.
             */
            Future<Object> jobDetailsFuture = jobManager.ask(new RequestJobDetails(true, true), timeout);
            jobDetailsFuture.onSuccess(new OnSuccess<Object>() {
                @Override
                public void onSuccess(Object result) throws Throwable {
                    MultipleJobsDetails details = (MultipleJobsDetails) result;
                    // IDs of all jobs the JobManager still reports, running or finished
                    List<String> toRetain = new ArrayList<>();
                    for (JobDetails job : details.getRunningJobs()) {
                        toRetain.add(job.getJobId().toString());
                    }
                    for (JobDetails job : details.getFinishedJobs()) {
                        toRetain.add(job.getJobId().toString());
                    }
                    synchronized (metrics) {
                        metrics.jobs.keySet().retainAll(toRetain);
                    }
                }
            }, ctx);
            logErrorOnFailure(jobDetailsFuture, "Fetching of JobDetails failed.");

            // The query service path is the JobManager path with its last segment replaced
            // by the query service name.
            String jobManagerPath = jobManager.path();
            String queryServicePath = jobManagerPath.substring(0, jobManagerPath.lastIndexOf('/') + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME;
            ActorRef jobManagerQueryService = actorSystem.actorFor(queryServicePath);
            queryMetrics(jobManagerQueryService);

            /*
             * We first request the list of all registered task managers from the job manager, and then
             * request the respective metric dump from each task manager.
             *
             * All stored metrics that do not belong to a registered task manager will be removed.
             */
            Future<Object> registeredTaskManagersFuture = jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout);
            registeredTaskManagersFuture.onSuccess(new OnSuccess<Object>() {
                @Override
                public void onSuccess(Object result) throws Throwable {
                    Iterable<Instance> taskManagers = ((JobManagerMessages.RegisteredTaskManagers) result).asJavaIterable();
                    List<String> activeTaskManagers = new ArrayList<>();
                    for (Instance taskManager : taskManagers) {
                        activeTaskManagers.add(taskManager.getId().toString());

                        // Same path scheme as for the JobManager, with the resource ID of the
                        // TaskManager appended to the query service name.
                        String taskManagerPath = taskManager.getTaskManagerGateway().getAddress();
                        String queryServicePath = taskManagerPath.substring(0, taskManagerPath.lastIndexOf('/') + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME + "_" + taskManager.getTaskManagerID().getResourceIdString();
                        ActorRef taskManagerQueryService = actorSystem.actorFor(queryServicePath);
                        queryMetrics(taskManagerQueryService);
                    }
                    synchronized (metrics) {
                        // remove all metrics belonging to unregistered task managers
                        metrics.taskManagers.keySet().retainAll(activeTaskManagers);
                    }
                }
            }, ctx);
            logErrorOnFailure(registeredTaskManagersFuture, "Fetching list of registered TaskManagers failed.");
        }
    } catch (Exception e) {
        LOG.warn("Exception while fetching metrics.", e);
    }
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class TaskManagerLogHandlerTest method testLogFetchingFailure.
/**
 * Checks that the handler writes the message "Fetching TaskManager log failed." to the channel
 * when the TaskManager gateway answers the log request with an exceptionally completed future.
 */
@Test
public void testLogFetchingFailure() throws Exception {
    // ---- TaskManager: mocked instance whose log request fails ----
    final InstanceID instanceId = new InstanceID();
    final ResourceID resourceId = new ResourceID(instanceId.toString());

    final CompletableFuture<BlobKey> failedLogRequest = new FlinkCompletableFuture<>();
    failedLogRequest.completeExceptionally(new IOException("failure"));

    final TaskManagerGateway tmGateway = mock(TaskManagerGateway.class);
    when(tmGateway.getAddress()).thenReturn("/tm/address");
    when(tmGateway.requestTaskManagerLog(any(Time.class))).thenReturn(failedLogRequest);

    final Instance tmInstance = mock(Instance.class);
    when(tmInstance.getId()).thenReturn(instanceId);
    when(tmInstance.getTaskManagerID()).thenReturn(resourceId);
    when(tmInstance.getTaskManagerGateway()).thenReturn(tmGateway);

    // ---- JobManager: mocked gateway answering the messages stubbed below ----
    final Object registeredTmsReply = new JobManagerMessages.RegisteredTaskManagers(
        JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(tmInstance)).asScala());
    final Object tmInstanceReply = new JobManagerMessages.TaskManagerInstance(Option.apply(tmInstance));

    final ActorGateway jmGateway = mock(ActorGateway.class);
    when(jmGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(registeredTmsReply));
    when(jmGateway.ask(isA(JobManagerMessages.getRequestBlobManagerPort().getClass()), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) 5));
    when(jmGateway.ask(isA(JobManagerMessages.RequestTaskManagerInstance.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(tmInstanceReply));
    when(jmGateway.path()).thenReturn("/jm/address");

    final JobManagerRetriever jmRetriever = mock(JobManagerRetriever.class);
    when(jmRetriever.getJobManagerGatewayAndWebPort())
        .thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jmGateway, 0)));

    // ---- handler under test, running its callbacks on a direct executor ----
    final TaskManagerLogHandler logHandler = new TaskManagerLogHandler(
        jmRetriever,
        ExecutionContext$.MODULE$.fromExecutor(Executors.directExecutor()),
        Future$.MODULE$.successful("/jm/address"),
        AkkaUtils.getDefaultClientTimeout(),
        TaskManagerLogHandler.FileMode.LOG,
        new Configuration(),
        false);

    // ---- channel context that captures whatever ByteBuf is written to it ----
    final AtomicReference<String> writtenMessage = new AtomicReference<>();
    final ChannelHandlerContext channelContext = mock(ChannelHandlerContext.class);
    when(channelContext.write(isA(ByteBuf.class))).thenAnswer(new Answer<Object>() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            ByteBuf payload = invocation.getArgumentAt(0, ByteBuf.class);
            writtenMessage.set(new String(payload.array(), ConfigConstants.DEFAULT_CHARSET));
            return null;
        }
    });

    // ---- routed GET request for the log of the mocked TaskManager ----
    final Map<String, String> routeParams = new HashMap<>();
    routeParams.put(TaskManagersHandler.TASK_MANAGER_ID_KEY, instanceId.toString());

    final Routed routedRequest = mock(Routed.class);
    when(routedRequest.pathParams()).thenReturn(routeParams);
    when(routedRequest.request()).thenReturn(
        new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/taskmanagers/" + instanceId + "/log"));

    logHandler.respondAsLeader(channelContext, routedRequest, jmGateway);

    Assert.assertEquals("Fetching TaskManager log failed.", writtenMessage.get());
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class MetricFetcherTest method testUpdate.
/**
 * Checks that {@code MetricFetcher.update()} fills the metric store.
 *
 * <p>The JobManager gateway, the actor system and both query service gateways are mocked. The
 * TaskManager query service answers with the dump built by {@code createRequestDumpAnswer};
 * the asserted values below are expected to originate from that helper (not visible here).
 */
@Test
public void testUpdate() throws Exception {
    // ========= setup TaskManager =================================================================================
    JobID jobID = new JobID();
    InstanceID tmID = new InstanceID();
    ResourceID tmRID = new ResourceID(tmID.toString());
    TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
    when(taskManagerGateway.getAddress()).thenReturn("/tm/address");
    Instance taskManager = mock(Instance.class);
    when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);
    when(taskManager.getId()).thenReturn(tmID);
    when(taskManager.getTaskManagerID()).thenReturn(tmRID);

    // ========= setup JobManager ==================================================================================
    ActorGateway jobManagerGateway = mock(ActorGateway.class);
    Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());
    // the job details answer lists neither running nor finished jobs
    when(jobManagerGateway.ask(isA(RequestJobDetails.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MultipleJobsDetails(new JobDetails[0], new JobDetails[0])));
    when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
    when(jobManagerGateway.path()).thenReturn("/jm/address");
    JobManagerRetriever retriever = mock(JobManagerRetriever.class);
    when(retriever.getJobManagerGatewayAndWebPort()).thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));

    // ========= setup QueryServices ================================================================================
    Object requestMetricsAnswer = createRequestDumpAnswer(tmID, jobID);
    final ActorRef jmQueryService = mock(ActorRef.class);
    final ActorRef tmQueryService = mock(ActorRef.class);
    ActorSystem actorSystem = mock(ActorSystem.class);
    when(actorSystem.actorFor(eq("/jm/" + METRIC_QUERY_SERVICE_NAME))).thenReturn(jmQueryService);
    when(actorSystem.actorFor(eq("/tm/" + METRIC_QUERY_SERVICE_NAME + "_" + tmRID.getResourceIdString()))).thenReturn(tmQueryService);
    MetricFetcher.BasicGateway jmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(jmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MetricDumpSerialization.MetricSerializationResult(new byte[0], 0, 0, 0, 0)));
    MetricFetcher.BasicGateway tmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(tmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(requestMetricsAnswer));

    // The helper objects below match by reference identity, so that each constructor call
    // with a given ActorRef mock is answered with the corresponding gateway mock.
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {
        @Override
        public boolean equals(Object o) {
            return o == jmQueryService;
        }
    })).thenReturn(jmQueryServiceGateway);
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {
        @Override
        public boolean equals(Object o) {
            return o == tmQueryService;
        }
    })).thenReturn(tmQueryServiceGateway);

    // ========= start MetricFetcher testing =======================================================================
    ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(new CurrentThreadExecutor());
    MetricFetcher fetcher = new MetricFetcher(actorSystem, retriever, context);

    // verify that update fetches metrics and updates the store
    fetcher.update();
    MetricStore store = fetcher.getMetricStore();
    synchronized (store) {
        assertEquals("7", store.jobManager.metrics.get("abc.hist_min"));
        assertEquals("6", store.jobManager.metrics.get("abc.hist_max"));
        assertEquals("4.0", store.jobManager.metrics.get("abc.hist_mean"));
        assertEquals("0.5", store.jobManager.metrics.get("abc.hist_median"));
        assertEquals("5.0", store.jobManager.metrics.get("abc.hist_stddev"));
        assertEquals("0.75", store.jobManager.metrics.get("abc.hist_p75"));
        assertEquals("0.9", store.jobManager.metrics.get("abc.hist_p90"));
        assertEquals("0.95", store.jobManager.metrics.get("abc.hist_p95"));
        assertEquals("0.98", store.jobManager.metrics.get("abc.hist_p98"));
        assertEquals("0.99", store.jobManager.metrics.get("abc.hist_p99"));
        assertEquals("0.999", store.jobManager.metrics.get("abc.hist_p999"));
        assertEquals("x", store.getTaskManagerMetricStore(tmID.toString()).metrics.get("abc.gauge"));
        assertEquals("5.0", store.getJobMetricStore(jobID.toString()).metrics.get("abc.jc"));
        assertEquals("2", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.abc.tc"));
        assertEquals("1", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.opname.abc.oc"));
    }
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class Scheduler method getNewSlotForSharingGroup.
/**
 * Attempts to obtain a fresh slot for a vertex that belongs to a slot sharing group.
 *
 * <p>Candidate instances are taken from {@code findInstance} one after another. On each
 * candidate a shared slot is allocated, registered with the sharing group, and a sub-slot of it
 * is handed out for the vertex. A candidate that yields no shared slot, whose shared slot
 * cannot be added to the group, or that turns out to have died is skipped and the next
 * candidate is tried.</p>
 *
 * @param vertex The vertex to allocate the slot for.
 * @param requestedLocations The locations that are considered local. May be null or empty, if the
 *                           vertex has no location preferences.
 * @param groupAssignment The slot sharing group of the vertex. Mandatory parameter.
 * @param constraint The co-location constraint of the vertex. May be null.
 * @param localOnly Flag to indicate if non-local choices are acceptable.
 *
 * @return A sub-slot for the given vertex, or {@code null}, if no slot is available.
 */
protected SimpleSlot getNewSlotForSharingGroup(ExecutionVertex vertex, Iterable<TaskManagerLocation> requestedLocations, SlotSharingGroupAssignment groupAssignment, CoLocationConstraint constraint, boolean localOnly) {
    // a candidate may turn out to be unusable, so keep asking until one works or none is left
    for (;;) {
        final Pair<Instance, Locality> candidate = findInstance(requestedLocations, localOnly);
        if (candidate == null) {
            // no instance left to try
            return null;
        }

        final Instance instance = candidate.getLeft();
        final Locality slotLocality = candidate.getRight();

        try {
            final JobVertexID vertexGroupId = vertex.getJobvertexId();

            // take a shared slot from the candidate instance
            final SharedSlot allocated = instance.allocateSharedSlot(vertex.getJobId(), groupAssignment);

            // an instance that still has capacity goes back into the set of available resources
            if (instance.hasResourcesAvailable()) {
                this.instancesWithAvailableResources.put(instance.getTaskManagerID(), instance);
            }

            if (allocated == null) {
                // nothing allocated on this instance, try the next candidate
                continue;
            }

            // register the shared slot with the group and carve out the sub-slot; the
            // co-location constraint, when present, takes the place of the vertex group ID
            final SimpleSlot subSlot;
            if (constraint == null) {
                subSlot = groupAssignment.addSharedSlotAndAllocateSubSlot(allocated, slotLocality, vertexGroupId);
            } else {
                subSlot = groupAssignment.addSharedSlotAndAllocateSubSlot(allocated, slotLocality, constraint);
            }

            if (subSlot != null) {
                return subSlot;
            }

            // the sub-slot could not be created, so give the shared slot back
            allocated.releaseSlot();
        } catch (InstanceDiedException e) {
            // the instance died, but this scheduler had not been told yet;
            // drop it from the set of available instances
            removeInstance(instance);
        }
        // no slot from this candidate: loop and try the next one
    }
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class Scheduler method getFreeSlotForTask.
/**
 * Tries to allocate a simple slot for the given vertex on a suitable instance.
 *
 * <p>Candidate instances are taken from {@code findInstance} one after another until one of
 * them provides a slot or no candidate is left. Instances that turn out to have died are
 * removed and skipped.
 *
 * <p>NOTE: This method is not thread-safe, it needs to be synchronized by the caller.
 *
 * @param vertex The task to run.
 * @param requestedLocations The locations handed to {@code findInstance} when looking for a
 *                           candidate instance.
 * @param localOnly Flag handed to {@code findInstance} together with the requested locations.
 * @return The slot to run the vertex in, with its locality set, or {@code null}, if no
 *         instance is available.
 */
protected SimpleSlot getFreeSlotForTask(ExecutionVertex vertex, Iterable<TaskManagerLocation> requestedLocations, boolean localOnly) {
    // a candidate may turn out to be unusable, so keep asking until one works or none is left
    for (;;) {
        final Pair<Instance, Locality> candidate = findInstance(requestedLocations, localOnly);
        if (candidate == null) {
            return null;
        }

        final Instance instance = candidate.getLeft();
        final Locality slotLocality = candidate.getRight();

        try {
            final SimpleSlot allocated = instance.allocateSimpleSlot(vertex.getJobId());

            // an instance that still has capacity goes back into the set of available resources
            if (instance.hasResourcesAvailable()) {
                this.instancesWithAvailableResources.put(instance.getTaskManagerID(), instance);
            }

            if (allocated != null) {
                allocated.setLocality(slotLocality);
                return allocated;
            }
        } catch (InstanceDiedException e) {
            // the instance died, but this scheduler had not been told yet;
            // drop it from the set of available instances
            removeInstance(instance);
        }
        // no slot from this candidate: loop and try the next one
    }
}
Aggregations