Use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
The class MesosFlinkResourceManager, method taskTerminated.
/**
 * Invoked when a Mesos task reaches a terminal status.
 */
private void taskTerminated(Protos.TaskID taskID, Protos.TaskStatus status) {
    // this callback occurs for failed containers and for released containers alike
    final ResourceID id = extractResourceID(taskID);

    boolean existed;
    try {
        existed = workerStore.removeWorker(taskID);
    } catch (Exception ex) {
        fatalError("unable to remove worker", ex);
        return;
    }

    if (!existed) {
        LOG.info("Received a termination notice for an unrecognized worker: {}", id);
        return;
    }

    // check if this is a failed task or a released task
    if (workersBeingReturned.remove(id) != null) {
        // regular finished worker that we released
        LOG.info("Worker {} finished successfully with diagnostics: {}", id, status.getMessage());
    } else {
        // failed worker, either at startup, or running
        final MesosWorkerStore.Worker launched = workersInLaunch.remove(id);
        if (launched != null) {
            LOG.info("Mesos task {} failed, with a TaskManager in launch or registration. " +
                "State: {} Reason: {} ({})", id, status.getState(), status.getReason(), status.getMessage());
            // we will trigger re-acquiring new workers at the end
        } else {
            // failed registered worker
            LOG.info("Mesos task {} failed, with a registered TaskManager. " +
                "State: {} Reason: {} ({})", id, status.getState(), status.getReason(), status.getMessage());
            // notify the generic logic, which notifies the JobManager, etc.
            notifyWorkerFailed(id, "Mesos task " + id + " failed. State: " + status.getState());
        }

        // general failure logging
        failedTasksSoFar++;
        String diagMessage = String.format("Diagnostics for task %s in state %s : " +
            "reason=%s message=%s", id, status.getState(), status.getReason(), status.getMessage());
        sendInfoMessage(diagMessage);
        LOG.info(diagMessage);
        LOG.info("Total number of failed tasks so far: {}", failedTasksSoFar);
        // maxFailedTasks == -1 means an unlimited number of task failures is tolerated
        if (maxFailedTasks >= 0 && failedTasksSoFar > maxFailedTasks) {
            String msg = "Stopping Mesos session because the number of failed tasks ("
                + failedTasksSoFar + ") exceeded the maximum failed tasks ("
                + maxFailedTasks + "). This number is controlled by the '"
                + ConfigConstants.MESOS_MAX_FAILED_TASKS + "' configuration setting. "
                + "By default it is the number of requested tasks.";
            LOG.error(msg);
            self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)), ActorRef.noSender());

            // no need to do anything else
            return;
        }
    }

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
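The helper extractResourceID is not shown in this snippet. A minimal sketch of what such a mapping could look like, assuming ResourceID is a plain wrapper around a string (which matches how the tests below build ResourceIDs from InstanceID strings):

// Hypothetical sketch of the helper used above: a Mesos TaskID carries a
// string value, and ResourceID(String) is assumed to wrap that string one-to-one.
static ResourceID extractResourceID(Protos.TaskID taskID) {
    return new ResourceID(taskID.getValue());
}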
Use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
The class TaskManagerLogHandlerTest, method testLogFetchingFailure.
@Test
public void testLogFetchingFailure() throws Exception {
    // ========= setup TaskManager =================================================================================
    InstanceID tmID = new InstanceID();
    ResourceID tmRID = new ResourceID(tmID.toString());
    TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
    when(taskManagerGateway.getAddress()).thenReturn("/tm/address");

    Instance taskManager = mock(Instance.class);
    when(taskManager.getId()).thenReturn(tmID);
    when(taskManager.getTaskManagerID()).thenReturn(tmRID);
    when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);

    CompletableFuture<BlobKey> future = new FlinkCompletableFuture<>();
    future.completeExceptionally(new IOException("failure"));
    when(taskManagerGateway.requestTaskManagerLog(any(Time.class))).thenReturn(future);

    // ========= setup JobManager ==================================================================================
    ActorGateway jobManagerGateway = mock(ActorGateway.class);
    Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(
        JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());

    when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
    when(jobManagerGateway.ask(isA(JobManagerMessages.getRequestBlobManagerPort().getClass()), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) 5));
    when(jobManagerGateway.ask(isA(JobManagerMessages.RequestTaskManagerInstance.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) new JobManagerMessages.TaskManagerInstance(Option.apply(taskManager))));
    when(jobManagerGateway.path()).thenReturn("/jm/address");

    JobManagerRetriever retriever = mock(JobManagerRetriever.class);
    when(retriever.getJobManagerGatewayAndWebPort())
        .thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));

    TaskManagerLogHandler handler = new TaskManagerLogHandler(
        retriever,
        ExecutionContext$.MODULE$.fromExecutor(Executors.directExecutor()),
        Future$.MODULE$.successful("/jm/address"),
        AkkaUtils.getDefaultClientTimeout(),
        TaskManagerLogHandler.FileMode.LOG,
        new Configuration(),
        false);

    final AtomicReference<String> exception = new AtomicReference<>();

    ChannelHandlerContext ctx = mock(ChannelHandlerContext.class);
    when(ctx.write(isA(ByteBuf.class))).thenAnswer(new Answer<Object>() {
        @Override
        public Object answer(InvocationOnMock invocationOnMock) throws Throwable {
            ByteBuf data = invocationOnMock.getArgumentAt(0, ByteBuf.class);
            exception.set(new String(data.array(), ConfigConstants.DEFAULT_CHARSET));
            return null;
        }
    });

    Map<String, String> pathParams = new HashMap<>();
    pathParams.put(TaskManagersHandler.TASK_MANAGER_ID_KEY, tmID.toString());

    Routed routed = mock(Routed.class);
    when(routed.pathParams()).thenReturn(pathParams);
    when(routed.request()).thenReturn(new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/taskmanagers/" + tmID + "/log"));

    handler.respondAsLeader(ctx, routed, jobManagerGateway);

    Assert.assertEquals("Fetching TaskManager log failed.", exception.get());
}
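Note how the test ties the mocked TaskManager to a predictable ResourceID by building it from the InstanceID's string form. A short sketch of the two construction styles, where ResourceID.generate() is assumed to be a static factory producing a fresh random ID:

// Deterministic: reuse an existing identifier's string, as the test above does,
// so lookups keyed by TASK_MANAGER_ID_KEY resolve to the same TaskManager.
ResourceID deterministic = new ResourceID(tmID.toString());

// Random: assumed static factory for a fresh, unique ResourceID.
ResourceID random = ResourceID.generate();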
Use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
The class MetricFetcherTest, method testUpdate.
@Test
public void testUpdate() throws Exception {
    // ========= setup TaskManager =================================================================================
    JobID jobID = new JobID();
    InstanceID tmID = new InstanceID();
    ResourceID tmRID = new ResourceID(tmID.toString());
    TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
    when(taskManagerGateway.getAddress()).thenReturn("/tm/address");

    Instance taskManager = mock(Instance.class);
    when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);
    when(taskManager.getId()).thenReturn(tmID);
    when(taskManager.getTaskManagerID()).thenReturn(tmRID);

    // ========= setup JobManager ==================================================================================
    JobDetails details = mock(JobDetails.class);
    when(details.getJobId()).thenReturn(jobID);

    ActorGateway jobManagerGateway = mock(ActorGateway.class);
    Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(
        JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());

    when(jobManagerGateway.ask(isA(RequestJobDetails.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) new MultipleJobsDetails(new JobDetails[0], new JobDetails[0])));
    when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
    when(jobManagerGateway.path()).thenReturn("/jm/address");

    JobManagerRetriever retriever = mock(JobManagerRetriever.class);
    when(retriever.getJobManagerGatewayAndWebPort())
        .thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));

    // ========= setup QueryServices ===============================================================================
    Object requestMetricsAnswer = createRequestDumpAnswer(tmID, jobID);

    final ActorRef jmQueryService = mock(ActorRef.class);
    final ActorRef tmQueryService = mock(ActorRef.class);

    ActorSystem actorSystem = mock(ActorSystem.class);
    when(actorSystem.actorFor(eq("/jm/" + METRIC_QUERY_SERVICE_NAME))).thenReturn(jmQueryService);
    when(actorSystem.actorFor(eq("/tm/" + METRIC_QUERY_SERVICE_NAME + "_" + tmRID.getResourceIdString()))).thenReturn(tmQueryService);

    MetricFetcher.BasicGateway jmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(jmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful((Object) new MetricDumpSerialization.MetricSerializationResult(new byte[0], 0, 0, 0, 0)));

    MetricFetcher.BasicGateway tmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
    when(tmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class)))
        .thenReturn(Future$.MODULE$.successful(requestMetricsAnswer));

    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {
        @Override
        public boolean equals(Object o) {
            return o == jmQueryService;
        }
    })).thenReturn(jmQueryServiceGateway);
    whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {
        @Override
        public boolean equals(Object o) {
            return o == tmQueryService;
        }
    })).thenReturn(tmQueryServiceGateway);

    // ========= start MetricFetcher testing =======================================================================
    ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(new CurrentThreadExecutor());
    MetricFetcher fetcher = new MetricFetcher(actorSystem, retriever, context);

    // verify that update fetches metrics and updates the store
    fetcher.update();
    MetricStore store = fetcher.getMetricStore();
    synchronized (store) {
        assertEquals("7", store.jobManager.metrics.get("abc.hist_min"));
        assertEquals("6", store.jobManager.metrics.get("abc.hist_max"));
        assertEquals("4.0", store.jobManager.metrics.get("abc.hist_mean"));
        assertEquals("0.5", store.jobManager.metrics.get("abc.hist_median"));
        assertEquals("5.0", store.jobManager.metrics.get("abc.hist_stddev"));
        assertEquals("0.75", store.jobManager.metrics.get("abc.hist_p75"));
        assertEquals("0.9", store.jobManager.metrics.get("abc.hist_p90"));
        assertEquals("0.95", store.jobManager.metrics.get("abc.hist_p95"));
        assertEquals("0.98", store.jobManager.metrics.get("abc.hist_p98"));
        assertEquals("0.99", store.jobManager.metrics.get("abc.hist_p99"));
        assertEquals("0.999", store.jobManager.metrics.get("abc.hist_p999"));
        assertEquals("x", store.getTaskManagerMetricStore(tmID.toString()).metrics.get("abc.gauge"));
        assertEquals("5.0", store.getJobMetricStore(jobID.toString()).metrics.get("abc.jc"));
        assertEquals("2", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.abc.tc"));
        assertEquals("1", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.opname.abc.oc"));
    }
}
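The actorFor stubbing above works only because ResourceID round-trips its underlying string: getResourceIdString() returns what was passed to the constructor, and equality is assumed to be value-based. A minimal sketch of that assumption:

// Assumed value semantics: two ResourceIDs built from the same string are equal,
// and getResourceIdString() returns the original string unchanged.
ResourceID a = new ResourceID(tmID.toString());
ResourceID b = new ResourceID(tmID.toString());
assertEquals(a, b);
assertEquals(tmID.toString(), a.getResourceIdString());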
Use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
The class ArchivedJobGenerationUtils, method generateArchivedJob.
private static void generateArchivedJob() throws Exception {
    // Attempt
    StringifiedAccumulatorResult acc1 = new StringifiedAccumulatorResult("name1", "type1", "value1");
    StringifiedAccumulatorResult acc2 = new StringifiedAccumulatorResult("name2", "type2", "value2");
    TaskManagerLocation location = new TaskManagerLocation(new ResourceID("hello"), InetAddress.getLocalHost(), 1234);
    originalAttempt = new ArchivedExecutionBuilder()
        .setStateTimestamps(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 })
        .setParallelSubtaskIndex(1)
        .setAttemptNumber(0)
        .setAssignedResourceLocation(location)
        .setUserAccumulators(new StringifiedAccumulatorResult[] { acc1, acc2 })
        .setState(ExecutionState.FINISHED)
        .setFailureCause("attemptException")
        .build();
    // Subtask
    originalSubtask = new ArchivedExecutionVertexBuilder()
        .setSubtaskIndex(originalAttempt.getParallelSubtaskIndex())
        .setTaskNameWithSubtask("hello(1/1)")
        .setCurrentExecution(originalAttempt)
        .build();
    // Task
    originalTask = new ArchivedExecutionJobVertexBuilder()
        .setTaskVertices(new ArchivedExecutionVertex[] { originalSubtask })
        .build();
    // Job
    Map<JobVertexID, ArchivedExecutionJobVertex> tasks = new HashMap<>();
    tasks.put(originalTask.getJobVertexId(), originalTask);
    originalJob = new ArchivedExecutionGraphBuilder()
        .setJobID(new JobID())
        .setTasks(tasks)
        .setFailureCause("jobException")
        .setState(JobStatus.FINISHED)
        .setStateTimestamps(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 })
        .setArchivedUserAccumulators(new StringifiedAccumulatorResult[] { acc1, acc2 })
        .build();
}
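For reference, a minimal standalone use of the TaskManagerLocation constructor seen above, assuming the same (ResourceID, InetAddress, dataPort) signature; the ID "worker-1" and the port are illustrative values only:

// Illustrative values only; the constructor signature matches the call above.
TaskManagerLocation loc = new TaskManagerLocation(
    new ResourceID("worker-1"), InetAddress.getLoopbackAddress(), 4242);
ResourceID rid = loc.getResourceID(); // hands back the ID given to the constructor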
Use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
The class InputChannelDeploymentDescriptor, method fromEdges.
// ------------------------------------------------------------------------

/**
 * Creates an input channel deployment descriptor for each partition.
 */
public static InputChannelDeploymentDescriptor[] fromEdges(
        ExecutionEdge[] edges,
        SimpleSlot consumerSlot,
        boolean allowLazyDeployment) throws ExecutionGraphException {

    final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
    final InputChannelDeploymentDescriptor[] icdd = new InputChannelDeploymentDescriptor[edges.length];

    // Each edge is connected to a different result partition
    for (int i = 0; i < edges.length; i++) {
        final IntermediateResultPartition consumedPartition = edges[i].getSource();
        final Execution producer = consumedPartition.getProducer().getCurrentExecutionAttempt();

        final ExecutionState producerState = producer.getState();
        final SimpleSlot producerSlot = producer.getAssignedResource();

        final ResultPartitionLocation partitionLocation;

        // The producing task needs to be RUNNING or already FINISHED
        if (consumedPartition.isConsumable() && producerSlot != null &&
                (producerState == ExecutionState.RUNNING ||
                 producerState == ExecutionState.FINISHED ||
                 producerState == ExecutionState.SCHEDULED ||
                 producerState == ExecutionState.DEPLOYING)) {

            final TaskManagerLocation partitionTaskManagerLocation = producerSlot.getTaskManagerLocation();
            final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();

            if (partitionTaskManager.equals(consumerTaskManager)) {
                // Consuming task is deployed to the same TaskManager as the partition => local
                partitionLocation = ResultPartitionLocation.createLocal();
            } else {
                // Different instances => remote
                final ConnectionID connectionId = new ConnectionID(
                    partitionTaskManagerLocation,
                    consumedPartition.getIntermediateResult().getConnectionIndex());
                partitionLocation = ResultPartitionLocation.createRemote(connectionId);
            }
        } else if (allowLazyDeployment) {
            // The producing task might not have registered the partition yet
            partitionLocation = ResultPartitionLocation.createUnknown();
        } else if (producerState == ExecutionState.CANCELING ||
                producerState == ExecutionState.CANCELED ||
                producerState == ExecutionState.FAILED) {
            String msg = "Trying to schedule a task whose inputs were canceled or failed. " +
                "The producer is in state " + producerState + ".";
            throw new ExecutionGraphException(msg);
        } else {
            String msg = String.format("Trying to eagerly schedule a task whose inputs " +
                "are not ready (partition consumable? %s, producer state: %s, producer slot: %s).",
                consumedPartition.isConsumable(), producerState, producerSlot);
            throw new ExecutionGraphException(msg);
        }

        final ResultPartitionID consumedPartitionId = new ResultPartitionID(
            consumedPartition.getPartitionId(), producer.getAttemptId());

        icdd[i] = new InputChannelDeploymentDescriptor(consumedPartitionId, partitionLocation);
    }

    return icdd;
}
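The core of the method is the locality decision: ResourceID equality between the producer's and consumer's TaskManagers selects a local versus remote partition location. A distilled, hypothetical helper (the name locate is not part of Flink) showing just that comparison:

// Hypothetical helper distilling the locality check above.
static ResultPartitionLocation locate(
        ResourceID consumerTaskManager,
        TaskManagerLocation producerLocation,
        int connectionIndex) {
    if (producerLocation.getResourceID().equals(consumerTaskManager)) {
        // same TaskManager => local exchange
        return ResultPartitionLocation.createLocal();
    } else {
        // different TaskManagers => remote exchange via a ConnectionID
        return ResultPartitionLocation.createRemote(new ConnectionID(producerLocation, connectionIndex));
    }
}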