use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class SchedulerIsolatedTasksTest method testScheduleWithDyingInstances.
/**
 * Checks how the scheduler behaves when instances die while their slots are in use.
 *
 * <p>Three instances are registered and five slots are allocated. After the second instance is
 * marked dead, exactly the slots owned by it must be canceled. Once the remaining instances are
 * marked dead as well, a further allocation must fail with a
 * {@code NoResourceAvailableException} and the scheduler must report no available instances
 * or slots.
 *
 * @throws Exception on any unexpected error; it is propagated unchanged so that the test
 *         runner reports the original exception type and stack trace
 */
@Test
public void testScheduleWithDyingInstances() throws Exception {
    Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

    Instance i1 = getRandomInstance(2);
    Instance i2 = getRandomInstance(2);
    Instance i3 = getRandomInstance(1);

    scheduler.newInstanceAvailable(i1);
    scheduler.newInstanceAvailable(i2);
    scheduler.newInstanceAvailable(i3);

    // request five slots in total (the instances were created with 2 + 2 + 1)
    final int numSlotsToAllocate = 5;
    List<SimpleSlot> slots = new ArrayList<>();
    for (int i = 0; i < numSlotsToAllocate; i++) {
        slots.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
    }

    i2.markDead();

    // only the slots owned by the dead instance may be canceled
    for (SimpleSlot slot : slots) {
        if (slot.getOwner() == i2) {
            assertTrue(slot.isCanceled());
        } else {
            assertFalse(slot.isCanceled());
        }
        slot.releaseSlot();
    }

    // after releasing everything, only the slots of i1 and i3 count as available
    assertEquals(3, scheduler.getNumberOfAvailableSlots());

    i1.markDead();
    i3.markDead();

    // cannot get another slot, since all instances are dead
    try {
        scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
        fail("Scheduler served a slot from a dead instance");
    } catch (ExecutionException e) {
        // any other exception type propagates out of the test and fails it with its stack trace
        assertTrue(e.getCause() instanceof NoResourceAvailableException);
    }

    // now the latest, the scheduler should have noticed (through the lazy mechanisms)
    // that all instances have vanished
    assertEquals(0, scheduler.getNumberOfInstancesWithAvailableSlots());
    assertEquals(0, scheduler.getNumberOfAvailableSlots());
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class SchedulerIsolatedTasksTest method testAddAndRemoveInstance.
/**
 * Checks the scheduler's bookkeeping of available instances and slots while instances are
 * added and removed.
 *
 * <p>Three instances created with two slots each are registered one by one, and the reported
 * instance and slot counts are checked after every step. Registering an instance that is
 * already known, or one that has already died, must be rejected with an
 * {@link IllegalArgumentException}. After all instances have been reported dead, the counts
 * must be zero and none of the instances may be alive.
 *
 * @throws Exception on any unexpected error; it is propagated unchanged so that the test
 *         runner reports the original exception type and stack trace
 */
@Test
public void testAddAndRemoveInstance() throws Exception {
    Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

    Instance i1 = getRandomInstance(2);
    Instance i2 = getRandomInstance(2);
    Instance i3 = getRandomInstance(2);

    assertEquals(0, scheduler.getNumberOfAvailableInstances());
    assertEquals(0, scheduler.getNumberOfAvailableSlots());

    scheduler.newInstanceAvailable(i1);
    assertEquals(1, scheduler.getNumberOfAvailableInstances());
    assertEquals(2, scheduler.getNumberOfAvailableSlots());

    scheduler.newInstanceAvailable(i2);
    assertEquals(2, scheduler.getNumberOfAvailableInstances());
    assertEquals(4, scheduler.getNumberOfAvailableSlots());

    scheduler.newInstanceAvailable(i3);
    assertEquals(3, scheduler.getNumberOfAvailableInstances());
    assertEquals(6, scheduler.getNumberOfAvailableSlots());

    // cannot add available instance again
    try {
        scheduler.newInstanceAvailable(i2);
        fail("Scheduler accepted instance twice");
    } catch (IllegalArgumentException expected) {
        // expected: the instance is already registered
    }

    // some instances die
    assertEquals(3, scheduler.getNumberOfAvailableInstances());
    assertEquals(6, scheduler.getNumberOfAvailableSlots());

    scheduler.instanceDied(i2);
    assertEquals(2, scheduler.getNumberOfAvailableInstances());
    assertEquals(4, scheduler.getNumberOfAvailableSlots());

    // try to add a dead instance
    try {
        scheduler.newInstanceAvailable(i2);
        fail("Scheduler accepted dead instance");
    } catch (IllegalArgumentException expected) {
        // expected: a dead instance must not be accepted
    }

    scheduler.instanceDied(i1);
    assertEquals(1, scheduler.getNumberOfAvailableInstances());
    assertEquals(2, scheduler.getNumberOfAvailableSlots());

    scheduler.instanceDied(i3);
    assertEquals(0, scheduler.getNumberOfAvailableInstances());
    assertEquals(0, scheduler.getNumberOfAvailableSlots());

    assertFalse(i1.isAlive());
    assertFalse(i2.isAlive());
    assertFalse(i3.isAlive());
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class Scheduler method handleNewSlot.
/**
 * Takes the next entry from {@code newlyAvailableInstances} and, while holding
 * {@code globalLock}, either serves the task at the head of {@code taskQueue} with a slot
 * from that instance or, if no task is waiting, records the instance in
 * {@code instancesWithAvailableResources}.
 *
 * <p>If the instance turns out to be dead while allocating, it is removed from the scheduler.
 */
private void handleNewSlot() {
    synchronized (globalLock) {
        final Instance freedInstance = this.newlyAvailableInstances.poll();
        if (freedInstance == null) {
            // nothing to pick up
            return;
        }
        if (!freedInstance.hasResourcesAvailable()) {
            // someone else took it
            return;
        }

        final QueuedTask waiting = taskQueue.peek();
        if (waiting == null) {
            // no task is waiting: remember the instance as one that has free resources
            this.instancesWithAvailableResources.put(freedInstance.getTaskManagerID(), freedInstance);
            return;
        }

        final ScheduledUnit unit = waiting.getTask();
        final ExecutionVertex vertex = unit.getTaskToExecute().getVertex();
        try {
            final SimpleSlot allocated = freedInstance.allocateSimpleSlot(vertex.getJobId());
            if (allocated == null) {
                // no slot obtained; the task stays at the head of the queue
                return;
            }

            // success, remove from the task queue and notify the future
            taskQueue.poll();
            if (waiting.getFuture() == null) {
                return;
            }
            try {
                waiting.getFuture().complete(allocated);
            } catch (Throwable t) {
                LOG.error("Error calling allocation future for task " + vertex.getSimpleName(), t);
                unit.getTaskToExecute().fail(t);
            }
        } catch (InstanceDiedException e) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Instance " + freedInstance + " was marked dead asynchronously.");
            }
            removeInstance(freedInstance);
        }
    }
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.
/**
 * Tests that the restarting time metric of the {@code ExecutionGraph} reports restarting times
 * correctly.
 *
 * <p>The test drives an execution graph with a mocked scheduler through the job states
 * RUNNING, RESTARTING, RUNNING, RESTARTING and FAILED. It checks that the gauge registered
 * under {@code ExecutionGraph.RESTARTING_TIME_METRIC_NAME} is 0 before the first restart, does
 * not decrease and is greater than 0 while the job is restarting, and no longer changes once
 * the job is running again or has failed.
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        // configure a single metrics reporter named "test" that is backed by TestingReporter
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        // the registry must have created exactly the one configured reporter
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        // NOTE(review): this Instance mock does not appear to be referenced again in this
        // method - confirm whether it is still needed
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        // mocked slot that reports itself alive and accepts any execution
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        // every allocation request to the mocked scheduler returns the same, already completed future
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        // task submission to the mocked gateway is acknowledged immediately
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        // restart strategy on which the test triggers the restart explicitly (see restartExecutionGraph() below)
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        assertTrue(0L == restartingTime.getValue());
        // collect the attempt ids of the current executions so that their state can be updated
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        // the attempt ids are re-read from the graph after the restart before reporting RUNNING again
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // the second switch to RESTARTING must have recorded a different timestamp than the first
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        // in the FAILED state the reported restarting time must stay constant
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        // always stop the executor that was handed to the execution graph
        executor.shutdownNow();
    }
}
use of org.apache.flink.runtime.instance.Instance in project flink by apache.
the class ExecutionGraphRestartTest method testCancelWhileFailing.
/**
 * Checks that cancelling a job whose status is FAILING moves it to CANCELLING and, once the
 * job vertices are reported as final, to CANCELED.
 *
 * <p>The execution graph is a spy whose {@code jobVertexInFinalState()} is stubbed out at
 * first, so the job status stays at FAILING after the instance has been marked dead.
 */
@Test
public void testCancelWhileFailing() throws Exception {
    // The restart and its delay are controlled manually by this test
    final RestartStrategy manualRestart = new InfiniteDelayRestartStrategy();
    final Tuple2<ExecutionGraph, Instance> graphAndInstance = createSpyExecutionGraph(manualRestart);
    final ExecutionGraph graph = graphAndInstance.f0;
    final Instance taskManager = graphAndInstance.f1;

    doNothing().when(graph).jobVertexInFinalState();

    // Kill the instance...
    taskManager.markDead();

    final Deadline deadline = TestingUtils.TESTING_DURATION().fromNow();

    // ...and poll until every vertex is FAILED or CANCELED, or the deadline runs out.
    // Since jobVertexInFinalState is stubbed to do nothing, the job status itself
    // cannot be used as the wait condition.
    boolean allVerticesDone = false;
    while (deadline.hasTimeLeft() && !allVerticesDone) {
        allVerticesDone = true;
        for (ExecutionVertex vertex : graph.getAllExecutionVertices()) {
            final ExecutionState state = vertex.getExecutionState();
            final boolean done = state == ExecutionState.FAILED || state == ExecutionState.CANCELED;
            if (!done) {
                // at least one vertex is still pending: back off, then re-check all vertices
                allVerticesDone = false;
                Thread.sleep(100);
                break;
            }
        }
    }

    // Still in failing
    assertEquals(JobStatus.FAILING, graph.getState());

    // The cancel call needs to change the state to CANCELLING
    graph.cancel();
    assertEquals(JobStatus.CANCELLING, graph.getState());

    // Restore the real method on the spy and finalize the job state
    doCallRealMethod().when(graph).jobVertexInFinalState();
    graph.jobVertexInFinalState();
    assertEquals(JobStatus.CANCELED, graph.getState());
}
Aggregations