Usage of org.apache.flink.runtime.execution.SuppressRestartsException in the Apache Flink project.
Class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.
/**
* Tests that the restarting time gauge of the {@link ExecutionGraph} reports plausible values
* over the life cycle of a job.
*
* <p>The test walks the execution graph through the following phases and checks the gauge in each:
* <ol>
* <li>before scheduling, after scheduling and while RUNNING: the gauge reads 0</li>
* <li>after all tasks failed (job is RESTARTING): the gauge never decreases and becomes positive</li>
* <li>after the restart, back in RUNNING: the gauge stays constant</li>
* <li>after a second failure (RESTARTING again, with a new state timestamp): the gauge never
* decreases and is positive</li>
* <li>after failing with a {@link SuppressRestartsException} (job is FAILED): the gauge stays constant</li>
* </ol>
*/
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
// single-threaded executor; it is handed to the ExecutionGraph below and shut down in the finally block
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
try {
// setup execution graph with mocked scheduling logic
// the job consists of a single no-op vertex with parallelism 1
int parallelism = 1;
JobVertex jobVertex = new JobVertex("TestVertex");
jobVertex.setParallelism(parallelism);
jobVertex.setInvokableClass(NoOpInvokable.class);
JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
// configure a metric reporter named "test" that is backed by the TestingReporter class
Configuration config = new Configuration();
config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
Configuration jobConfig = new Configuration();
Time timeout = Time.seconds(10L);
MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
// exactly one reporter must have been created, and it must be the TestingReporter configured above
assertTrue(metricRegistry.getReporters().size() == 1);
MetricReporter reporter = metricRegistry.getReporters().get(0);
assertTrue(reporter instanceof TestingReporter);
TestingReporter testingReporter = (TestingReporter) reporter;
MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
// mock the scheduler, the task manager and its gateway
Scheduler scheduler = mock(Scheduler.class);
ResourceID taskManagerId = ResourceID.generate();
TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
when(taskManagerLocation.getHostname()).thenReturn("localhost");
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
Instance instance = mock(Instance.class);
when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(instance.getTaskManagerID()).thenReturn(taskManagerId);
when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
// mock a slot that always reports itself as alive and accepts any execution
Slot rootSlot = mock(Slot.class);
AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
SimpleSlot simpleSlot = mock(SimpleSlot.class);
when(simpleSlot.isAlive()).thenReturn(true);
when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
when(simpleSlot.getRoot()).thenReturn(rootSlot);
when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
// every slot request is answered with the same, already completed future
FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
future.complete(simpleSlot);
when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
when(rootSlot.getSlotNumber()).thenReturn(0);
// every task submission is acknowledged immediately
when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
// the restart itself is triggered explicitly further below via restartExecutionGraph()
TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
// note: the same executor instance is passed for both executor arguments
ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
// get restarting time metric
Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
assertNotNull(metric);
assertTrue(metric instanceof Gauge);
@SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
// check that the restarting time is 0 since it's the initial start
assertTrue(0L == restartingTime.getValue());
executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
// start execution
executionGraph.scheduleForExecution();
// scheduling alone must not produce any restarting time
assertTrue(0L == restartingTime.getValue());
// remember the current attempt ids so that state updates can be addressed to them
List<ExecutionAttemptID> executionIDs = new ArrayList<>();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
// tell execution graph that the tasks are in state running --> job status switches to state running
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
// still no restart has happened, so the gauge must read 0
assertTrue(0L == restartingTime.getValue());
// fail the job so that it goes into state restarting
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
// remember when RESTARTING was entered; compared against the second restart further below
long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
// wait some time so that the restarting time gauge shows a value different from 0
Thread.sleep(50);
long previousRestartingTime = restartingTime.getValue();
// check that the restarting time is monotonically increasing
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// check that we have measured some restarting time
assertTrue(previousRestartingTime > 0);
// restart job
testingRestartStrategy.restartExecutionGraph();
// collect the attempt ids again; the previously collected ones are discarded
executionIDs.clear();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
// report the new attempts as running --> job status switches back to state running
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
// the RESTARTING state must have recorded a non-zero timestamp
assertTrue(firstRestartingTimestamp != 0);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time does not increase after we've reached the running state
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// fail job again
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
// entering RESTARTING a second time must have updated the state timestamp
long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
// wait again so that the gauge can show a value different from 0
Thread.sleep(50);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time is increasing again
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
assertTrue(previousRestartingTime > 0);
// now let's fail the job while it is in restarting and see whether the restarting time then stops increasing
// for this to work, we have to use a SuppressRestartsException
executionGraph.fail(new SuppressRestartsException(new Exception()));
assertEquals(JobStatus.FAILED, executionGraph.getState());
previousRestartingTime = restartingTime.getValue();
// the job has failed terminally, so the gauge must not change any more
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
} finally {
// always stop the executor, even if one of the assertions above failed
executor.shutdownNow();
}
}
Usage of org.apache.flink.runtime.execution.SuppressRestartsException in the Apache Flink project.
Class ExecutionFailureHandlerTest, method testUnrecoverableErrorCheck.
/**
 * Verifies how {@code ExecutionFailureHandler.isUnrecoverableError} classifies failures:
 * a plain exception is recoverable, whereas a {@link SuppressRestartsException} is
 * unrecoverable both when passed directly and when it is the cause of another exception.
 */
@Test
public void testUnrecoverableErrorCheck() {
    // fixtures: one ordinary failure and two failures that carry a SuppressRestartsException
    final Exception plainFailure = new Exception();
    final SuppressRestartsException directSuppression = new SuppressRestartsException(new Exception());
    final Exception wrappedSuppression = new Exception(new SuppressRestartsException(new Exception()));

    // an ordinary exception must not be reported as unrecoverable
    assertFalse(ExecutionFailureHandler.isUnrecoverableError(plainFailure));

    // the suppressing exception itself is unrecoverable
    assertTrue(ExecutionFailureHandler.isUnrecoverableError(directSuppression));

    // a suppressing exception hidden in the cause chain is unrecoverable as well
    assertTrue(ExecutionFailureHandler.isUnrecoverableError(wrappedSuppression));
}
Usage of org.apache.flink.runtime.execution.SuppressRestartsException in the Apache Flink project.
Class ExecutionGraph, method tryRestartOrFail.
/**
 * Attempts to restart the job and, if a restart is not possible, moves the job to
 * {@code FAILED} instead.
 *
 * <p>A restart is only attempted when the recorded failure cause is not a
 * {@link SuppressRestartsException} and the restart strategy reports that it can restart.
 * The method does nothing unless the job is currently {@code FAILING} or {@code RESTARTING}.
 *
 * @return {@code true} if the job was either handed to the restart strategy or switched to
 *         {@code FAILED}; {@code false} if the job was in a different state or the state
 *         transition did not succeed (e.g. because of a concurrent status change)
 */
private boolean tryRestartOrFail() {
    // work on a snapshot of the state; the transitions below are made relative to it
    final JobStatus observedState = state;

    if (observedState != JobStatus.FAILING && observedState != JobStatus.RESTARTING) {
        // this operation is only allowed in the state FAILING or RESTARTING
        return false;
    }

    synchronized (progressLock) {
        // the failure cause is only attached to the message on debug level
        if (LOG.isDebugEnabled()) {
            LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
        } else {
            LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
        }

        final boolean causePermitsRestart = !(failureCause instanceof SuppressRestartsException);
        final boolean strategyPermitsRestart = restartStrategy.canRestart();

        if (causePermitsRestart && strategyPermitsRestart) {
            if (!transitionState(observedState, JobStatus.RESTARTING)) {
                // we must have changed the state concurrently, thus we cannot complete this operation
                return false;
            }

            LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());
            restartStrategy.restart(this);
            return true;
        }

        // no restart possible: the job has to fail terminally
        if (!transitionState(observedState, JobStatus.FAILED, failureCause)) {
            // we must have changed the state concurrently, thus we cannot complete this operation
            return false;
        }

        // collect every reason that prevented the restart for the log message
        final List<String> obstacles = new ArrayList<>(2);
        if (!causePermitsRestart) {
            obstacles.add("a type of SuppressRestartsException was thrown");
        }
        if (!strategyPermitsRestart) {
            obstacles.add("the restart strategy prevented it");
        }

        LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(), StringUtils.join(obstacles, " and "), failureCause);
        postRunCleanup();
        return true;
    }
}
Usage of org.apache.flink.runtime.execution.SuppressRestartsException in the Apache Flink project.
Class KvStateLocationRegistry, method notifyKvStateRegistered.
/**
 * Records a newly registered KvState instance in this registry.
 *
 * <p>If no location exists yet for the registration name, one is created from the max
 * parallelism of the given job vertex. If a location exists but belongs to a different job
 * vertex, the registration is rejected: the registering vertex (if known) is failed with a
 * {@link SuppressRestartsException} wrapping the error, and the error is thrown to the caller.
 *
 * @param jobVertexId JobVertexID the KvState instance belongs to
 * @param keyGroupRange Key group range the KvState instance belongs to
 * @param registrationName Name under which the KvState has been registered
 * @param kvStateId ID of the registered KvState instance
 * @param kvStateServerAddress Server address where to find the KvState instance
 *
 * @throws IllegalArgumentException If no location exists yet for the name and the JobVertexID
 *                                  does not belong to the job
 * @throws IllegalStateException If state has been registered with the same name by another
 *                               operator
 * @throws IndexOutOfBoundsException If the key group index is out of bounds (presumably raised
 *                                   by {@code KvStateLocation#registerKvState}; not visible here)
 */
public void notifyKvStateRegistered(JobVertexID jobVertexId, KeyGroupRange keyGroupRange, String registrationName, KvStateID kvStateId, KvStateServerAddress kvStateServerAddress) {
    KvStateLocation knownLocation = lookupTable.get(registrationName);

    if (knownLocation == null) {
        // first registration under this name: the location has to be created
        final ExecutionJobVertex owner = jobVertices.get(jobVertexId);
        if (owner == null) {
            throw new IllegalArgumentException("Unknown JobVertexID " + jobVertexId);
        }

        knownLocation = new KvStateLocation(jobId, jobVertexId, owner.getMaxParallelism(), registrationName);
        lookupTable.put(registrationName, knownLocation);
    }

    if (knownLocation.getJobVertexId().equals(jobVertexId)) {
        // regular case: the name belongs to the registering vertex
        knownLocation.registerKvState(keyGroupRange, kvStateId, kvStateServerAddress);
        return;
    }

    // the name is already taken by a different vertex
    final IllegalStateException nameClash = new IllegalStateException("Registration name clash. KvState with name '" + registrationName + "' has already been registered by another operator (" + knownLocation.getJobVertexId() + ").");

    // fail the offending vertex without allowing a restart, then report the clash to the caller
    final ExecutionJobVertex offender = jobVertices.get(jobVertexId);
    if (offender != null) {
        offender.fail(new SuppressRestartsException(nameClash));
    }

    throw nameClash;
}
Usage of org.apache.flink.runtime.execution.SuppressRestartsException in the Apache Flink project.
Class ExecutionGraphRestartTest, method testNoRestartOnSuppressException.
/**
 * Verifies that a failure caused by a {@link SuppressRestartsException} leads to the job
 * state {@code FAILED} without any restart, even though a {@link FixedDelayRestartStrategy}
 * with one allowed restart attempt is configured.
 */
@Test
public void testNoRestartOnSuppressException() throws Exception {
    // only the execution graph of the returned tuple is needed here
    ExecutionGraph eg = createSpyExecutionGraph(new FixedDelayRestartStrategy(1, 1000)).f0;

    // Fail the first vertex with an exception that forbids restarts
    ExecutionVertex firstVertex = eg.getAllExecutionVertices().iterator().next();
    firstVertex.fail(new SuppressRestartsException(new Exception("Test Exception")));

    assertEquals(JobStatus.FAILING, eg.getState());

    // complete the cancellation of all current attempts
    for (ExecutionVertex executionVertex : eg.getAllExecutionVertices()) {
        executionVertex.getCurrentExecutionAttempt().cancelingComplete();
    }

    // Poll until the job has reached FAILED or the two minute deadline has passed
    Deadline failureDeadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    while (deadline(failureDeadline) && eg.getState() != JobStatus.FAILED) {
        Thread.sleep(100);
    }

    assertEquals(JobStatus.FAILED, eg.getState());

    // No restart must have been triggered on the graph ...
    verify(eg, never()).restart();

    // ... and the restart strategy must not have counted an attempt
    RestartStrategy usedStrategy = eg.getRestartStrategy();
    assertTrue(usedStrategy instanceof FixedDelayRestartStrategy);
    assertEquals(0, ((FixedDelayRestartStrategy) usedStrategy).getCurrentRestartAttempt());
}

/** Returns whether the given deadline still has time left. */
private static boolean deadline(Deadline candidate) {
    return candidate.hasTimeLeft();
}
Aggregations