Use of org.apache.flink.runtime.instance.Instance in project flink by apache.
Class ExecutionVertexDeploymentTest, method testDeployFailedAsynchronously:
@Test
public void testDeployFailedAsynchronously() {
    try {
        final JobVertexID jid = new JobVertexID();
        final ExecutionJobVertex ejv = getExecutionVertex(jid);
        final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0], AkkaUtils.getDefaultTimeout());
        final Instance instance = getInstance(new ActorTaskManagerGateway(new SimpleFailingActorGateway(TestingUtils.directExecutionContext())));
        final SimpleSlot slot = instance.allocateSimpleSlot(ejv.getJobId());
        assertEquals(ExecutionState.CREATED, vertex.getExecutionState());
        vertex.deployToSlot(slot);
        // wait (up to ~1 second) for the asynchronous state transition to FAILED
        for (int i = 0; i < 100; i++) {
            if (vertex.getExecutionState() == ExecutionState.FAILED && vertex.getFailureCause() != null) {
                break;
            } else {
                Thread.sleep(10);
            }
        }
        assertEquals(ExecutionState.FAILED, vertex.getExecutionState());
        assertNotNull(vertex.getFailureCause());
        assertTrue(vertex.getFailureCause().getMessage().contains(ERROR_MESSAGE));
        assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
        assertTrue(vertex.getStateTimestamp(ExecutionState.DEPLOYING) > 0);
        assertTrue(vertex.getStateTimestamp(ExecutionState.FAILED) > 0);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
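The busy-wait loop in this test is a recurring pattern in these snippets. As a sketch, it could be extracted into a small helper; the waitUntil name and signature below are hypothetical and not part of Flink's test utilities:

// Hypothetical polling helper (not in Flink's test utilities): check the
// condition every 10 ms, giving up after maxAttempts iterations
// (100 attempts = roughly 1 second, matching the loop above).
static void waitUntil(java.util.function.BooleanSupplier condition, int maxAttempts) throws InterruptedException {
    for (int i = 0; i < maxAttempts; i++) {
        if (condition.getAsBoolean()) {
            return;
        }
        Thread.sleep(10);
    }
}

The loop above would then read waitUntil(() -> vertex.getExecutionState() == ExecutionState.FAILED && vertex.getFailureCause() != null, 100).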
Use of org.apache.flink.runtime.instance.Instance in project flink by apache.
Class ExecutionGraphTestUtils, method getInstance:
public static Instance getInstance(final TaskManagerGateway gateway, final int numberOfSlots) throws Exception {
    ResourceID resourceID = ResourceID.generate();
    // 4 CPU cores, 2 GB physical memory, 1 GB JVM heap, 512 MB managed memory
    HardwareDescription hardwareDescription = new HardwareDescription(4, 2L * 1024 * 1024 * 1024, 1024 * 1024 * 1024, 512 * 1024 * 1024);
    InetAddress address = InetAddress.getByName("127.0.0.1");
    TaskManagerLocation connection = new TaskManagerLocation(resourceID, address, 10001);
    return new Instance(gateway, connection, new InstanceID(), hardwareDescription, numberOfSlots);
}
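For illustration, a call site for this helper might look like the following sketch; the gateway construction mirrors the tests in this section, and the slot count of 2 is an arbitrary choice:

// Sketch: build a test Instance with 2 task slots and allocate a simple slot from it.
Instance instance = getInstance(
    new ActorTaskManagerGateway(new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext())),
    2);
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());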
Use of org.apache.flink.runtime.instance.Instance in project flink by apache.
Class ExecutionGraphDeploymentTest, method testBuildDeploymentDescriptor:
@Test
public void testBuildDeploymentDescriptor() {
    try {
        final JobID jobId = new JobID();
        final JobVertexID jid1 = new JobVertexID();
        final JobVertexID jid2 = new JobVertexID();
        final JobVertexID jid3 = new JobVertexID();
        final JobVertexID jid4 = new JobVertexID();
        JobVertex v1 = new JobVertex("v1", jid1);
        JobVertex v2 = new JobVertex("v2", jid2);
        JobVertex v3 = new JobVertex("v3", jid3);
        JobVertex v4 = new JobVertex("v4", jid4);
        v1.setParallelism(10);
        v2.setParallelism(10);
        v3.setParallelism(10);
        v4.setParallelism(10);
        v1.setInvokableClass(BatchTask.class);
        v2.setInvokableClass(BatchTask.class);
        v3.setInvokableClass(BatchTask.class);
        v4.setInvokableClass(BatchTask.class);
        v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
        v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
        v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
        ExecutionGraph eg = new ExecutionGraph(
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            jobId,
            "some job",
            new Configuration(),
            new SerializedValue<>(new ExecutionConfig()),
            AkkaUtils.getDefaultTimeout(),
            new NoRestartStrategy(),
            new Scheduler(TestingUtils.defaultExecutionContext()));
        List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);
        eg.attachJobGraph(ordered);
        ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
        ExecutionVertex vertex = ejv.getTaskVertices()[3];
        ExecutionGraphTestUtils.SimpleActorGateway instanceGateway = new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());
        final Instance instance = getInstance(new ActorTaskManagerGateway(instanceGateway));
        final SimpleSlot slot = instance.allocateSimpleSlot(jobId);
        assertEquals(ExecutionState.CREATED, vertex.getExecutionState());
        vertex.deployToSlot(slot);
        assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());
        TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
        assertNotNull(descr);
        JobInformation jobInformation = descr.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
        TaskInformation taskInformation = descr.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
        assertEquals(jobId, jobInformation.getJobId());
        assertEquals(jid2, taskInformation.getJobVertexId());
        assertEquals(3, descr.getSubtaskIndex());
        assertEquals(10, taskInformation.getNumberOfSubtasks());
        assertEquals(BatchTask.class.getName(), taskInformation.getInvokableClassName());
        assertEquals("v2", taskInformation.getTaskName());
        Collection<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
        Collection<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();
        assertEquals(2, producedPartitions.size());
        assertEquals(1, consumedPartitions.size());
        Iterator<ResultPartitionDeploymentDescriptor> iteratorProducedPartitions = producedPartitions.iterator();
        Iterator<InputGateDeploymentDescriptor> iteratorConsumedPartitions = consumedPartitions.iterator();
        assertEquals(10, iteratorProducedPartitions.next().getNumberOfSubpartitions());
        assertEquals(10, iteratorProducedPartitions.next().getNumberOfSubpartitions());
        assertEquals(10, iteratorConsumedPartitions.next().getInputChannelDeploymentDescriptors().length);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
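The asserted counts follow directly from the topology wired above: v2 produces one result for each of its consumers (v3 and v4, hence 2 produced partitions) and consumes the single data set of v1 through one input gate; with ALL_TO_ALL distribution and parallelism 10 everywhere, each produced partition has 10 subpartitions and the gate has 10 input channels. As a comment sketch:

// Topology behind the assertions (parallelism 10 everywhere, ALL_TO_ALL edges):
//
//   v1 --> v2 --> v3
//           \---> v4
//
// From v2's perspective: 2 produced result partitions (consumed by v3 and v4),
// 1 consumed input gate (from v1) with 10 channels, one per producing subtask of v1.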
Use of org.apache.flink.runtime.instance.Instance in project flink by apache.
Class TaskManagerLogHandler, method respondAsLeader:
/**
 * Response when running with leading JobManager.
 */
@Override
protected void respondAsLeader(final ChannelHandlerContext ctx, final Routed routed, final ActorGateway jobManager) {
    if (cache == null) {
        scala.concurrent.Future<Object> portFuture = jobManager.ask(JobManagerMessages.getRequestBlobManagerPort(), timeout);
        scala.concurrent.Future<BlobCache> cacheFuture = portFuture.map(new Mapper<Object, BlobCache>() {

            @Override
            public BlobCache checkedApply(Object result) throws IOException {
                Option<String> hostOption = jobManager.actor().path().address().host();
                String host = hostOption.isDefined() ? hostOption.get() : "localhost";
                int port = (int) result;
                return new BlobCache(new InetSocketAddress(host, port), config);
            }
        }, executor);
        cache = new FlinkFuture<>(cacheFuture);
    }
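    // The BlobCache above is built lazily on the first request: the handler asks the
    // JobManager for its blob server port and points the cache at the JobManager's host
    // ("localhost" is the fallback when the actor path carries no host).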
    final String taskManagerID = routed.pathParams().get(TaskManagersHandler.TASK_MANAGER_ID_KEY);
    final HttpRequest request = routed.request();
    // fetch TaskManager logs if no other process is currently doing it
    if (lastRequestPending.putIfAbsent(taskManagerID, true) == null) {
        try {
            InstanceID instanceID = new InstanceID(StringUtils.hexStringToByte(taskManagerID));
            scala.concurrent.Future<JobManagerMessages.TaskManagerInstance> scalaTaskManagerFuture = jobManager
                .ask(new JobManagerMessages.RequestTaskManagerInstance(instanceID), timeout)
                .mapTo(ClassTag$.MODULE$.<JobManagerMessages.TaskManagerInstance>apply(JobManagerMessages.TaskManagerInstance.class));
            Future<JobManagerMessages.TaskManagerInstance> taskManagerFuture = new FlinkFuture<>(scalaTaskManagerFuture);
            Future<BlobKey> blobKeyFuture = taskManagerFuture.thenCompose(new ApplyFunction<JobManagerMessages.TaskManagerInstance, Future<BlobKey>>() {

                @Override
                public Future<BlobKey> apply(JobManagerMessages.TaskManagerInstance value) {
                    Instance taskManager = value.instance().get();
                    if (serveLogFile) {
                        return taskManager.getTaskManagerGateway().requestTaskManagerLog(timeTimeout);
                    } else {
                        return taskManager.getTaskManagerGateway().requestTaskManagerStdout(timeTimeout);
                    }
                }
            });
            Future<String> logPathFuture = blobKeyFuture.thenCombine(cache, new BiFunction<BlobKey, BlobCache, Tuple2<BlobKey, BlobCache>>() {

                @Override
                public Tuple2<BlobKey, BlobCache> apply(BlobKey blobKey, BlobCache blobCache) {
                    return Tuple2.of(blobKey, blobCache);
                }
            }).thenComposeAsync(new ApplyFunction<Tuple2<BlobKey, BlobCache>, Future<String>>() {

                @Override
                public Future<String> apply(Tuple2<BlobKey, BlobCache> value) {
                    final BlobKey blobKey = value.f0;
                    final BlobCache blobCache = value.f1;
                    // delete previous log file, if it is different than the current one
                    HashMap<String, BlobKey> lastSubmittedFile = serveLogFile ? lastSubmittedLog : lastSubmittedStdout;
                    if (lastSubmittedFile.containsKey(taskManagerID)) {
                        if (!blobKey.equals(lastSubmittedFile.get(taskManagerID))) {
                            try {
                                blobCache.deleteGlobal(lastSubmittedFile.get(taskManagerID));
                            } catch (IOException e) {
                                return FlinkCompletableFuture.completedExceptionally(new Exception("Could not delete file for " + taskManagerID + '.', e));
                            }
                            lastSubmittedFile.put(taskManagerID, blobKey);
                        }
                    } else {
                        lastSubmittedFile.put(taskManagerID, blobKey);
                    }
                    try {
                        return FlinkCompletableFuture.completed(blobCache.getURL(blobKey).getFile());
                    } catch (IOException e) {
                        return FlinkCompletableFuture.completedExceptionally(new Exception("Could not retrieve blob for " + blobKey + '.', e));
                    }
                }
            }, executor);
            logPathFuture.exceptionally(new ApplyFunction<Throwable, Void>() {

                @Override
                public Void apply(Throwable failure) {
                    display(ctx, request, "Fetching TaskManager log failed.");
                    LOG.error("Fetching TaskManager log failed.", failure);
                    lastRequestPending.remove(taskManagerID);
                    return null;
                }
            });
            logPathFuture.thenAccept(new AcceptFunction<String>() {

                @Override
                public void accept(String filePath) {
                    File file = new File(filePath);
                    final RandomAccessFile raf;
                    try {
                        raf = new RandomAccessFile(file, "r");
                    } catch (FileNotFoundException e) {
                        display(ctx, request, "Displaying TaskManager log failed.");
                        LOG.error("Displaying TaskManager log failed.", e);
                        return;
                    }
                    long fileLength;
                    try {
                        fileLength = raf.length();
                    } catch (IOException ioe) {
                        display(ctx, request, "Displaying TaskManager log failed.");
                        LOG.error("Displaying TaskManager log failed.", ioe);
                        try {
                            raf.close();
                        } catch (IOException e) {
                            LOG.error("Could not close random access file.", e);
                        }
                        return;
                    }
                    final FileChannel fc = raf.getChannel();
                    HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK);
                    response.headers().set(CONTENT_TYPE, "text/plain");
                    if (HttpHeaders.isKeepAlive(request)) {
                        response.headers().set(CONNECTION, HttpHeaders.Values.KEEP_ALIVE);
                    }
                    HttpHeaders.setContentLength(response, fileLength);
                    // write the initial line and the header
                    ctx.write(response);
                    // write the content
                    ChannelFuture lastContentFuture;
                    final GenericFutureListener<io.netty.util.concurrent.Future<? super Void>> completionListener = new GenericFutureListener<io.netty.util.concurrent.Future<? super Void>>() {

                        @Override
                        public void operationComplete(io.netty.util.concurrent.Future<? super Void> future) throws Exception {
                            lastRequestPending.remove(taskManagerID);
                            fc.close();
                            raf.close();
                        }
                    };
                    if (ctx.pipeline().get(SslHandler.class) == null) {
                        ctx.write(new DefaultFileRegion(fc, 0, fileLength), ctx.newProgressivePromise()).addListener(completionListener);
                        lastContentFuture = ctx.writeAndFlush(LastHttpContent.EMPTY_LAST_CONTENT);
                    } else {
                        try {
                            lastContentFuture = ctx.writeAndFlush(new HttpChunkedInput(new ChunkedFile(raf, 0, fileLength, 8192)), ctx.newProgressivePromise()).addListener(completionListener);
                        } catch (IOException e) {
                            display(ctx, request, "Displaying TaskManager log failed.");
                            LOG.warn("Could not write http data.", e);
                            return;
                        }
                        // HttpChunkedInput will write the end marker (LastHttpContent) for us.
                    }
                    // close the connection, if no keep-alive is needed
                    if (!HttpHeaders.isKeepAlive(request)) {
                        lastContentFuture.addListener(ChannelFutureListener.CLOSE);
                    }
                }
            });
        } catch (Exception e) {
            display(ctx, request, "Error: " + e.getMessage());
            LOG.error("Fetching TaskManager log failed.", e);
            lastRequestPending.remove(taskManagerID);
        }
    } else {
        display(ctx, request, "loading...");
    }
}
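Condensed, the method chains four asynchronous steps; this is a summary of the code above, not additional behavior:

// 1. Lazily build a BlobCache against the JobManager's blob server (first request only).
// 2. Ask the JobManager for the Instance registered under the requested TaskManager ID.
// 3. Ask that TaskManager's gateway for its log (or stdout) file, yielding a BlobKey.
// 4. Resolve the BlobKey through the BlobCache to a local file and stream it over Netty,
//    using zero-copy DefaultFileRegion on plaintext connections and ChunkedFile under SSL.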
Use of org.apache.flink.runtime.instance.Instance in project flink by apache.
Class TaskManagersHandler, method handleJsonRequest:
@Override
public String handleJsonRequest(Map<String, String> pathParams, Map<String, String> queryParams, ActorGateway jobManager) throws Exception {
    try {
        if (jobManager != null) {
            // Whether the metrics of one task manager or of all task managers are requested,
            // we return them in an array; this avoids unnecessary code complexity.
            // If only one task manager is requested, we fetch only that one.
            final List<Instance> instances = new ArrayList<>();
            if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
                try {
                    InstanceID instanceID = new InstanceID(StringUtils.hexStringToByte(pathParams.get(TASK_MANAGER_ID_KEY)));
                    Future<Object> future = jobManager.ask(new JobManagerMessages.RequestTaskManagerInstance(instanceID), timeout);
                    TaskManagerInstance instance = (TaskManagerInstance) Await.result(future, timeout);
                    if (instance.instance().nonEmpty()) {
                        instances.add(instance.instance().get());
                    }
                } catch (IllegalArgumentException e) {
                    // the id string was invalid; keep the list empty
                }
            } else {
                Future<Object> future = jobManager.ask(JobManagerMessages.getRequestRegisteredTaskManagers(), timeout);
                RegisteredTaskManagers taskManagers = (RegisteredTaskManagers) Await.result(future, timeout);
                instances.addAll(taskManagers.asJavaCollection());
            }
            StringWriter writer = new StringWriter();
            JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer);
            gen.writeStartObject();
            gen.writeArrayFieldStart("taskmanagers");
            for (Instance instance : instances) {
                gen.writeStartObject();
                gen.writeStringField("id", instance.getId().toString());
                gen.writeStringField("path", instance.getTaskManagerGateway().getAddress());
                gen.writeNumberField("dataPort", instance.getTaskManagerLocation().dataPort());
                gen.writeNumberField("timeSinceLastHeartbeat", instance.getLastHeartBeat());
                gen.writeNumberField("slotsNumber", instance.getTotalNumberOfSlots());
                gen.writeNumberField("freeSlots", instance.getNumberOfAvailableSlots());
                gen.writeNumberField("cpuCores", instance.getResources().getNumberOfCPUCores());
                gen.writeNumberField("physicalMemory", instance.getResources().getSizeOfPhysicalMemory());
                gen.writeNumberField("freeMemory", instance.getResources().getSizeOfJvmHeap());
                gen.writeNumberField("managedMemory", instance.getResources().getSizeOfManagedMemory());
                // only send metrics when a single task manager is requested
                if (pathParams.containsKey(TASK_MANAGER_ID_KEY)) {
                    fetcher.update();
                    MetricStore.TaskManagerMetricStore metrics = fetcher.getMetricStore().getTaskManagerMetricStore(instance.getId().toString());
                    if (metrics != null) {
                        gen.writeObjectFieldStart("metrics");
                        long heapUsed = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Heap.Used", "0"));
                        long heapCommitted = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Heap.Committed", "0"));
                        long heapTotal = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Heap.Max", "0"));
                        gen.writeNumberField("heapCommitted", heapCommitted);
                        gen.writeNumberField("heapUsed", heapUsed);
                        gen.writeNumberField("heapMax", heapTotal);
                        long nonHeapUsed = Long.valueOf(metrics.getMetric("Status.JVM.Memory.NonHeap.Used", "0"));
                        long nonHeapCommitted = Long.valueOf(metrics.getMetric("Status.JVM.Memory.NonHeap.Committed", "0"));
                        long nonHeapTotal = Long.valueOf(metrics.getMetric("Status.JVM.Memory.NonHeap.Max", "0"));
                        gen.writeNumberField("nonHeapCommitted", nonHeapCommitted);
                        gen.writeNumberField("nonHeapUsed", nonHeapUsed);
                        gen.writeNumberField("nonHeapMax", nonHeapTotal);
                        gen.writeNumberField("totalCommitted", heapCommitted + nonHeapCommitted);
                        gen.writeNumberField("totalUsed", heapUsed + nonHeapUsed);
                        gen.writeNumberField("totalMax", heapTotal + nonHeapTotal);
                        long directCount = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Direct.Count", "0"));
                        long directUsed = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Direct.MemoryUsed", "0"));
                        long directMax = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Direct.TotalCapacity", "0"));
                        gen.writeNumberField("directCount", directCount);
                        gen.writeNumberField("directUsed", directUsed);
                        gen.writeNumberField("directMax", directMax);
                        long mappedCount = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Mapped.Count", "0"));
                        long mappedUsed = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Mapped.MemoryUsed", "0"));
                        long mappedMax = Long.valueOf(metrics.getMetric("Status.JVM.Memory.Mapped.TotalCapacity", "0"));
                        gen.writeNumberField("mappedCount", mappedCount);
                        gen.writeNumberField("mappedUsed", mappedUsed);
                        gen.writeNumberField("mappedMax", mappedMax);
                        long memorySegmentsAvailable = Long.valueOf(metrics.getMetric("Status.Network.AvailableMemorySegments", "0"));
                        long memorySegmentsTotal = Long.valueOf(metrics.getMetric("Status.Network.TotalMemorySegments", "0"));
                        gen.writeNumberField("memorySegmentsAvailable", memorySegmentsAvailable);
                        gen.writeNumberField("memorySegmentsTotal", memorySegmentsTotal);
                        gen.writeArrayFieldStart("garbageCollectors");
                        for (String gcName : metrics.garbageCollectorNames) {
                            String count = metrics.getMetric("Status.JVM.GarbageCollector." + gcName + ".Count", null);
                            String time = metrics.getMetric("Status.JVM.GarbageCollector." + gcName + ".Time", null);
                            if (count != null && time != null) {
                                gen.writeStartObject();
                                gen.writeStringField("name", gcName);
                                gen.writeNumberField("count", Long.valueOf(count));
                                gen.writeNumberField("time", Long.valueOf(time));
                                gen.writeEndObject();
                            }
                        }
                        gen.writeEndArray();
                        gen.writeEndObject();
                    }
                }
                gen.writeEndObject();
            }
            gen.writeEndArray();
            gen.writeEndObject();
            gen.close();
            return writer.toString();
        } else {
            throw new Exception("No connection to the leading JobManager.");
        }
    } catch (Exception e) {
        throw new RuntimeException("Failed to fetch list of all task managers: " + e.getMessage(), e);
    }
}
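Each metric in the block above follows the same pattern: read the value from the store with a default of "0", then write it as a JSON number. As a sketch, this could be factored into a helper; the writeMetricField name below is hypothetical and not part of the handler:

// Hypothetical helper illustrating the repeated read-then-write pattern above.
private static void writeMetricField(JsonGenerator gen, MetricStore.TaskManagerMetricStore metrics,
        String jsonField, String metricName) throws IOException {
    gen.writeNumberField(jsonField, Long.valueOf(metrics.getMetric(metricName, "0")));
}

For example, writeMetricField(gen, metrics, "heapUsed", "Status.JVM.Memory.Heap.Used") would replace the first pair of lines in the metrics block.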