use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.
the class JobExceptionsHandlerTest method compareExceptions.
private static void compareExceptions(AccessExecutionGraph originalJob, String json) throws IOException {
JsonNode result = ArchivedJobGenerationUtils.mapper.readTree(json);
Assert.assertEquals(originalJob.getFailureCauseAsString(), result.get("root-exception").asText());
ArrayNode exceptions = (ArrayNode) result.get("all-exceptions");
int x = 0;
for (AccessExecutionVertex expectedSubtask : originalJob.getAllExecutionVertices()) {
if (!expectedSubtask.getFailureCauseAsString().equals(ExceptionUtils.STRINGIFIED_NULL_EXCEPTION)) {
JsonNode exception = exceptions.get(x);
Assert.assertEquals(expectedSubtask.getFailureCauseAsString(), exception.get("exception").asText());
Assert.assertEquals(expectedSubtask.getTaskNameWithSubtaskIndex(), exception.get("task").asText());
TaskManagerLocation location = expectedSubtask.getCurrentAssignedResourceLocation();
String expectedLocationString = location.getFQDNHostname() + ':' + location.dataPort();
Assert.assertEquals(expectedLocationString, exception.get("location").asText());
}
x++;
}
Assert.assertEquals(x > JobExceptionsHandler.MAX_NUMBER_EXCEPTION_TO_REPORT, result.get("truncated").asBoolean());
}
use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.
the class JobVertexTaskManagersHandlerTest method compareVertexTaskManagers.
private static void compareVertexTaskManagers(AccessExecutionJobVertex originalTask, AccessExecutionVertex originalSubtask, String json) throws IOException {
JsonNode result = ArchivedJobGenerationUtils.mapper.readTree(json);
Assert.assertEquals(originalTask.getJobVertexId().toString(), result.get("id").asText());
Assert.assertEquals(originalTask.getName(), result.get("name").asText());
Assert.assertTrue(result.get("now").asLong() > 0);
ArrayNode taskmanagers = (ArrayNode) result.get("taskmanagers");
JsonNode taskManager = taskmanagers.get(0);
TaskManagerLocation location = originalSubtask.getCurrentAssignedResourceLocation();
String expectedLocationString = location.getHostname() + ':' + location.dataPort();
Assert.assertEquals(expectedLocationString, taskManager.get("host").asText());
Assert.assertEquals(ExecutionState.FINISHED.name(), taskManager.get("status").asText());
Assert.assertEquals(3, taskManager.get("start-time").asLong());
Assert.assertEquals(5, taskManager.get("end-time").asLong());
Assert.assertEquals(2, taskManager.get("duration").asLong());
JsonNode statusCounts = taskManager.get("status-counts");
Assert.assertEquals(0, statusCounts.get(ExecutionState.CREATED.name()).asInt());
Assert.assertEquals(0, statusCounts.get(ExecutionState.SCHEDULED.name()).asInt());
Assert.assertEquals(0, statusCounts.get(ExecutionState.DEPLOYING.name()).asInt());
Assert.assertEquals(0, statusCounts.get(ExecutionState.RUNNING.name()).asInt());
Assert.assertEquals(1, statusCounts.get(ExecutionState.FINISHED.name()).asInt());
Assert.assertEquals(0, statusCounts.get(ExecutionState.CANCELING.name()).asInt());
Assert.assertEquals(0, statusCounts.get(ExecutionState.CANCELED.name()).asInt());
Assert.assertEquals(0, statusCounts.get(ExecutionState.FAILED.name()).asInt());
long expectedNumBytesIn = 0;
long expectedNumBytesOut = 0;
long expectedNumRecordsIn = 0;
long expectedNumRecordsOut = 0;
for (AccessExecutionVertex vertex : originalTask.getTaskVertices()) {
IOMetrics ioMetrics = vertex.getCurrentExecutionAttempt().getIOMetrics();
expectedNumBytesIn += ioMetrics.getNumBytesInLocal() + ioMetrics.getNumBytesInRemote();
expectedNumBytesOut += ioMetrics.getNumBytesOut();
expectedNumRecordsIn += ioMetrics.getNumRecordsIn();
expectedNumRecordsOut += ioMetrics.getNumRecordsOut();
}
JsonNode metrics = taskManager.get("metrics");
Assert.assertEquals(expectedNumBytesIn, metrics.get("read-bytes").asLong());
Assert.assertEquals(expectedNumBytesOut, metrics.get("write-bytes").asLong());
Assert.assertEquals(expectedNumRecordsIn, metrics.get("read-records").asLong());
Assert.assertEquals(expectedNumRecordsOut, metrics.get("write-records").asLong());
}
use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.
the class JobMaster method offerSlots.
@RpcMethod
public Future<Iterable<SlotOffer>> offerSlots(final ResourceID taskManagerId, final Iterable<SlotOffer> slots, final UUID leaderId) throws Exception {
validateLeaderSessionId(leaderId);
Tuple2<TaskManagerLocation, TaskExecutorGateway> taskManager = registeredTaskManagers.get(taskManagerId);
if (taskManager == null) {
throw new Exception("Unknown TaskManager " + taskManagerId);
}
final JobID jid = jobGraph.getJobID();
final TaskManagerLocation taskManagerLocation = taskManager.f0;
final TaskExecutorGateway taskExecutorGateway = taskManager.f1;
final ArrayList<Tuple2<AllocatedSlot, SlotOffer>> slotsAndOffers = new ArrayList<>();
final RpcTaskManagerGateway rpcTaskManagerGateway = new RpcTaskManagerGateway(taskExecutorGateway, leaderId);
for (SlotOffer slotOffer : slots) {
final AllocatedSlot slot = new AllocatedSlot(slotOffer.getAllocationId(), jid, taskManagerLocation, slotOffer.getSlotIndex(), slotOffer.getResourceProfile(), rpcTaskManagerGateway);
slotsAndOffers.add(new Tuple2<>(slot, slotOffer));
}
return slotPoolGateway.offerSlots(slotsAndOffers);
}
use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.
the class Scheduler method scheduleTask.
/**
* Returns either a {@link SimpleSlot}, or a {@link Future}.
*/
private Object scheduleTask(ScheduledUnit task, boolean queueIfNoResource) throws NoResourceAvailableException {
if (task == null) {
throw new NullPointerException();
}
if (LOG.isDebugEnabled()) {
LOG.debug("Scheduling task " + task);
}
final ExecutionVertex vertex = task.getTaskToExecute().getVertex();
final Iterable<TaskManagerLocation> preferredLocations = vertex.getPreferredLocationsBasedOnInputs();
final boolean forceExternalLocation = false && preferredLocations != null && preferredLocations.iterator().hasNext();
synchronized (globalLock) {
SlotSharingGroup sharingUnit = task.getSlotSharingGroup();
if (sharingUnit != null) {
if (queueIfNoResource) {
throw new IllegalArgumentException("A task with a vertex sharing group was scheduled in a queued fashion.");
}
final SlotSharingGroupAssignment assignment = sharingUnit.getTaskAssignment();
final CoLocationConstraint constraint = task.getLocationConstraint();
// sanity check that we do not use an externally forced location and a co-location constraint together
if (constraint != null && forceExternalLocation) {
throw new IllegalArgumentException("The scheduling cannot be constrained simultaneously by a " + "co-location constraint and an external location constraint.");
}
// get a slot from the group, if the group has one for us (and can fulfill the constraint)
final SimpleSlot slotFromGroup;
if (constraint == null) {
slotFromGroup = assignment.getSlotForTask(vertex);
} else {
slotFromGroup = assignment.getSlotForTask(vertex, constraint);
}
SimpleSlot newSlot = null;
SimpleSlot toUse = null;
// the following needs to make sure any allocated slot is released in case of an error
try {
// any slot that is local, or where the assignment was unconstrained is good!
if (slotFromGroup != null && slotFromGroup.getLocality() != Locality.NON_LOCAL) {
// the location, because we are quite happy with the slot
if (constraint != null && !constraint.isAssigned()) {
constraint.lockLocation();
}
updateLocalityCounters(slotFromGroup, vertex);
return slotFromGroup;
}
// the group did not have a local slot for us. see if we can one (or a better one)
// our location preference is either determined by the location constraint, or by the
// vertex's preferred locations
final Iterable<TaskManagerLocation> locations;
final boolean localOnly;
if (constraint != null && constraint.isAssigned()) {
locations = Collections.singleton(constraint.getLocation());
localOnly = true;
} else {
locations = vertex.getPreferredLocationsBasedOnInputs();
localOnly = forceExternalLocation;
}
newSlot = getNewSlotForSharingGroup(vertex, locations, assignment, constraint, localOnly);
if (newSlot == null) {
if (slotFromGroup == null) {
if (constraint != null && constraint.isAssigned()) {
// nothing is available on the node where the co-location constraint forces us to
throw new NoResourceAvailableException("Could not allocate a slot on instance " + constraint.getLocation() + ", as required by the co-location constraint.");
} else if (forceExternalLocation) {
// could not satisfy the external location constraint
String hosts = getHostnamesFromInstances(preferredLocations);
throw new NoResourceAvailableException("Could not schedule task " + vertex + " to any of the required hosts: " + hosts);
} else {
// simply nothing is available
throw new NoResourceAvailableException(task, getNumberOfAvailableInstances(), getTotalNumberOfSlots(), getNumberOfAvailableSlots());
}
} else {
// got a non-local from the group, and no new one, so we use the non-local
// slot from the sharing group
toUse = slotFromGroup;
}
} else if (slotFromGroup == null || !slotFromGroup.isAlive() || newSlot.getLocality() == Locality.LOCAL) {
// then we use the new slot
if (slotFromGroup != null) {
slotFromGroup.releaseSlot();
}
toUse = newSlot;
} else {
// both are available and usable. neither is local. in that case, we may
// as well use the slot from the sharing group, to minimize the number of
// instances that the job occupies
newSlot.releaseSlot();
toUse = slotFromGroup;
}
// the location, because we are going to use that slot
if (constraint != null && !constraint.isAssigned()) {
constraint.lockLocation();
}
updateLocalityCounters(toUse, vertex);
} catch (NoResourceAvailableException e) {
throw e;
} catch (Throwable t) {
if (slotFromGroup != null) {
slotFromGroup.releaseSlot();
}
if (newSlot != null) {
newSlot.releaseSlot();
}
ExceptionUtils.rethrow(t, "An error occurred while allocating a slot in a sharing group");
}
return toUse;
} else {
// 2) === schedule without hints and sharing ===
SimpleSlot slot = getFreeSlotForTask(vertex, preferredLocations, forceExternalLocation);
if (slot != null) {
updateLocalityCounters(slot, vertex);
return slot;
} else {
// no resource available now, so queue the request
if (queueIfNoResource) {
CompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
this.taskQueue.add(new QueuedTask(task, future));
return future;
} else if (forceExternalLocation) {
String hosts = getHostnamesFromInstances(preferredLocations);
throw new NoResourceAvailableException("Could not schedule task " + vertex + " to any of the required hosts: " + hosts);
} else {
throw new NoResourceAvailableException(getNumberOfAvailableInstances(), getTotalNumberOfSlots(), getNumberOfAvailableSlots());
}
}
}
}
}
use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.
the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.
/**
* This test tests that the restarting time metric correctly displays restarting times.
*/
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
try {
// setup execution graph with mocked scheduling logic
int parallelism = 1;
JobVertex jobVertex = new JobVertex("TestVertex");
jobVertex.setParallelism(parallelism);
jobVertex.setInvokableClass(NoOpInvokable.class);
JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
Configuration config = new Configuration();
config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
Configuration jobConfig = new Configuration();
Time timeout = Time.seconds(10L);
MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
assertTrue(metricRegistry.getReporters().size() == 1);
MetricReporter reporter = metricRegistry.getReporters().get(0);
assertTrue(reporter instanceof TestingReporter);
TestingReporter testingReporter = (TestingReporter) reporter;
MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
Scheduler scheduler = mock(Scheduler.class);
ResourceID taskManagerId = ResourceID.generate();
TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
when(taskManagerLocation.getHostname()).thenReturn("localhost");
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
Instance instance = mock(Instance.class);
when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(instance.getTaskManagerID()).thenReturn(taskManagerId);
when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
Slot rootSlot = mock(Slot.class);
AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
SimpleSlot simpleSlot = mock(SimpleSlot.class);
when(simpleSlot.isAlive()).thenReturn(true);
when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
when(simpleSlot.getRoot()).thenReturn(rootSlot);
when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
future.complete(simpleSlot);
when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
when(rootSlot.getSlotNumber()).thenReturn(0);
when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
// get restarting time metric
Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
assertNotNull(metric);
assertTrue(metric instanceof Gauge);
@SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
// check that the restarting time is 0 since it's the initial start
assertTrue(0L == restartingTime.getValue());
executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
// start execution
executionGraph.scheduleForExecution();
assertTrue(0L == restartingTime.getValue());
List<ExecutionAttemptID> executionIDs = new ArrayList<>();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
// tell execution graph that the tasks are in state running --> job status switches to state running
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
assertTrue(0L == restartingTime.getValue());
// fail the job so that it goes into state restarting
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
// wait some time so that the restarting time gauge shows a value different from 0
Thread.sleep(50);
long previousRestartingTime = restartingTime.getValue();
// check that the restarting time is monotonically increasing
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// check that we have measured some restarting time
assertTrue(previousRestartingTime > 0);
// restart job
testingRestartStrategy.restartExecutionGraph();
executionIDs.clear();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
assertTrue(firstRestartingTimestamp != 0);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time does not increase after we've reached the running state
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// fail job again
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
Thread.sleep(50);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time is increasing again
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
assertTrue(previousRestartingTime > 0);
// now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
// for this to work, we have to use a SuppressRestartException
executionGraph.fail(new SuppressRestartsException(new Exception()));
assertEquals(JobStatus.FAILED, executionGraph.getState());
previousRestartingTime = restartingTime.getValue();
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
} finally {
executor.shutdownNow();
}
}
Aggregations