Search in sources:

Example 1 with SuppressRestartsException

use of org.apache.flink.runtime.execution.SuppressRestartsException in project flink by apache.

In class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric:

/**
 * Tests that the restarting time metric correctly displays restarting times.
 *
 * <p>The test drives an {@code ExecutionGraph} (backed by mocked scheduling and slot objects)
 * through the sequence RUNNING -> RESTARTING -> RUNNING -> RESTARTING -> FAILED and checks the
 * gauge registered under {@code ExecutionGraph.RESTARTING_TIME_METRIC_NAME} at each step:
 * <ul>
 *   <li>it reads 0 before the first failure,</li>
 *   <li>it is non-decreasing and greater than 0 while the job is RESTARTING,</li>
 *   <li>it stays constant once the job is RUNNING again or has reached FAILED.</li>
 * </ul>
 *
 * @throws JobException declared by the test signature; the throwing call is not identifiable from this method alone
 * @throws IOException declared by the test signature; the throwing call is not identifiable from this method alone
 * @throws InterruptedException if one of the {@code Thread.sleep} calls is interrupted
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    // The same executor is passed twice to the ExecutionGraph constructor below and is
    // shut down in the finally block regardless of the test outcome.
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        // Configure a single metrics reporter named "test" whose class is TestingReporter,
        // so that the metrics registered by the execution graph can be looked up by name.
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        // sanity check: exactly one reporter was created and it is the TestingReporter
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        // Everything below up to the ExecutionGraph construction is Mockito stubbing of the
        // scheduler, the task manager and the slot objects.
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        // NOTE(review): this Instance mock is stubbed but not referenced again in this method.
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        // The slot the mocked scheduler hands out: reports itself alive, points at the mocked
        // task manager and accepts any Execution as its executed vertex.
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        // Slot allocation returns an already-completed future holding the mocked slot.
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        // Task submission is acknowledged immediately.
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        // The restart is triggered explicitly further down via restartExecutionGraph();
        // presumably the strategy itself does not restart on its own (class not shown here).
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        // the cast is unchecked; the test reads the gauge values as longs below
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        // scheduling alone must not produce any restarting time
        assertTrue(0L == restartingTime.getValue());
        // collect the attempt IDs of the current executions so their state can be updated
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        // still no restart has happened, so the gauge must still read 0
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        // remembered to compare against the timestamp of the second RESTARTING phase
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        // (non-strict: consecutive reads may return the same value)
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        // the attempt IDs are re-read from the graph after the restart
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // report the new attempts as running --> job status switches back to running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        // the first RESTARTING phase must have recorded a non-zero timestamp
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        // the second RESTARTING phase must carry a different timestamp than the first one
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        // again give the gauge some time to move away from 0
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        // once FAILED, repeated reads of the gauge must return the same value
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        executor.shutdownNow();
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) 
JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) Slot(org.apache.flink.runtime.instance.Slot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)

Example 2 with SuppressRestartsException

use of org.apache.flink.runtime.execution.SuppressRestartsException in project flink by apache.

In class ExecutionFailureHandlerTest, method testUnrecoverableErrorCheck:

/**
 * Checks how {@code ExecutionFailureHandler.isUnrecoverableError} classifies three kinds of
 * failures: a plain exception, a {@code SuppressRestartsException} passed directly, and a
 * {@code SuppressRestartsException} that is the cause of another exception.
 */
@Test
public void testUnrecoverableErrorCheck() {
    // a plain exception must not be reported as unrecoverable
    final Exception ordinaryFailure = new Exception();
    assertFalse(ExecutionFailureHandler.isUnrecoverableError(ordinaryFailure));

    // a SuppressRestartsException at the top level is unrecoverable
    final SuppressRestartsException topLevelSuppression = new SuppressRestartsException(new Exception());
    assertTrue(ExecutionFailureHandler.isUnrecoverableError(topLevelSuppression));

    // a SuppressRestartsException wrapped as the cause of another exception is unrecoverable too
    final Exception wrappedSuppression = new Exception(new SuppressRestartsException(new Exception()));
    assertTrue(ExecutionFailureHandler.isUnrecoverableError(wrappedSuppression));
}
Also used : SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) Test(org.junit.Test)

Example 3 with SuppressRestartsException

use of org.apache.flink.runtime.execution.SuppressRestartsException in project flink by apache.

In class ExecutionGraph, method tryRestartOrFail:

/**
 * Attempts to restart the job and, if a restart is not possible, attempts to move the job to
 * {@code FAILED} instead. The method only acts when the job status observed on entry is
 * {@code FAILING} or {@code RESTARTING}.
 *
 * <p>A restart is attempted only if the failure cause is not itself a
 * {@code SuppressRestartsException} and the restart strategy reports that it can restart.
 *
 * @return {@code true} if the job was either handed to the restart strategy or moved to
 *         {@code FAILED}; {@code false} if the observed status did not permit the operation or
 *         the required state transition did not succeed (a concurrent job status change)
 */
private boolean tryRestartOrFail() {
    // read the status once; the state transitions below are made relative to this value
    final JobStatus observedState = state;
    final boolean operationPermitted =
        observedState == JobStatus.FAILING || observedState == JobStatus.RESTARTING;

    if (!operationPermitted) {
        // this operation is only allowed in the state FAILING or RESTARTING
        return false;
    }

    synchronized (progressLock) {
        // at debug level the failure cause is passed along as an additional log argument
        if (LOG.isDebugEnabled()) {
            LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
        } else {
            LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
        }

        // both conditions are evaluated up front, before any state transition is attempted
        final boolean causePermitsRestart = !(failureCause instanceof SuppressRestartsException);
        final boolean strategyPermitsRestart = restartStrategy.canRestart();

        if (causePermitsRestart && strategyPermitsRestart) {
            if (!transitionState(observedState, JobStatus.RESTARTING)) {
                // we must have changed the state concurrently, thus we cannot complete this operation
                return false;
            }
            LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());
            restartStrategy.restart(this);
            return true;
        }

        if (!transitionState(observedState, JobStatus.FAILED, failureCause)) {
            // we must have changed the state concurrently, thus we cannot complete this operation
            return false;
        }

        // collect every reason that prevented the restart for the log message
        final List<String> restartBlockers = new ArrayList<>(2);
        if (!causePermitsRestart) {
            restartBlockers.add("a type of SuppressRestartsException was thrown");
        }
        if (!strategyPermitsRestart) {
            restartBlockers.add("the restart strategy prevented it");
        }
        LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(), StringUtils.join(restartBlockers, " and "), failureCause);
        postRunCleanup();
        return true;
    }
}
Also used : JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) List(java.util.List) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ArrayList(java.util.ArrayList)

Example 4 with SuppressRestartsException

use of org.apache.flink.runtime.execution.SuppressRestartsException in project flink by apache.

In class KvStateLocationRegistry, method notifyKvStateRegistered:

/**
 * Records a newly registered KvState instance in this registry.
 *
 * <p>If no location exists yet for {@code registrationName}, one is created from the job
 * vertex's maximum parallelism and stored in the lookup table. If a location already exists
 * but belongs to a different job vertex, the registering vertex (when known) is failed with a
 * {@code SuppressRestartsException} wrapping the clash, and the clash is thrown to the caller.
 *
 * @param jobVertexId JobVertexID the KvState instance belongs to
 * @param keyGroupRange Key group range the KvState instance belongs to
 * @param registrationName Name under which the KvState has been registered
 * @param kvStateId ID of the registered KvState instance
 * @param kvStateServerAddress Server address where to find the KvState instance
 *
 * @throws IllegalArgumentException If no location exists for the name yet and the JobVertexID
 * does not belong to the job
 * @throws IllegalStateException If state has been registered with the same name by another
 * operator
 * @throws IndexOutOfBoundsException If key group index is out of bounds (presumably raised by
 * {@code KvStateLocation#registerKvState}; not visible in this method)
 */
public void notifyKvStateRegistered(JobVertexID jobVertexId, KeyGroupRange keyGroupRange, String registrationName, KvStateID kvStateId, KvStateServerAddress kvStateServerAddress) {
    KvStateLocation location = lookupTable.get(registrationName);

    if (location == null) {
        // first registration under this name: the owning vertex must be part of the job
        final ExecutionJobVertex owningVertex = jobVertices.get(jobVertexId);
        if (owningVertex == null) {
            throw new IllegalArgumentException("Unknown JobVertexID " + jobVertexId);
        }
        final int maxParallelism = owningVertex.getMaxParallelism();
        location = new KvStateLocation(jobId, jobVertexId, maxParallelism, registrationName);
        lookupTable.put(registrationName, location);
    }

    if (location.getJobVertexId().equals(jobVertexId)) {
        // the name belongs to this vertex: record the instance and finish
        location.registerKvState(keyGroupRange, kvStateId, kvStateServerAddress);
        return;
    }

    // the name is already owned by a different vertex
    final IllegalStateException nameClash = new IllegalStateException("Registration name clash. KvState with name '" + registrationName + "' has already been registered by another operator (" + location.getJobVertexId() + ").");
    final ExecutionJobVertex registeringVertex = jobVertices.get(jobVertexId);
    if (registeringVertex != null) {
        // wrapped in SuppressRestartsException before being handed to the vertex
        registeringVertex.fail(new SuppressRestartsException(nameClash));
    }
    throw nameClash;
}
Also used : SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex)

Example 5 with SuppressRestartsException

use of org.apache.flink.runtime.execution.SuppressRestartsException in project flink by apache.

In class ExecutionGraphRestartTest, method testNoRestartOnSuppressException:

/**
 * Verifies that failing a vertex with a {@code SuppressRestartsException} drives the job to
 * {@code FAILED} without {@code restart()} ever being invoked on the execution graph and
 * without the {@code FixedDelayRestartStrategy} counting a restart attempt.
 */
@Test
public void testNoRestartOnSuppressException() throws Exception {
    // only the execution graph (first tuple field) is needed by this test
    final ExecutionGraph graph = createSpyExecutionGraph(new FixedDelayRestartStrategy(1, 1000)).f0;

    // fail the first vertex with an exception that suppresses restarts
    final ExecutionVertex firstVertex = graph.getAllExecutionVertices().iterator().next();
    firstVertex.fail(new SuppressRestartsException(new Exception("Test Exception")));
    assertEquals(JobStatus.FAILING, graph.getState());

    // report the cancellation of every current attempt as complete
    for (ExecutionVertex each : graph.getAllExecutionVertices()) {
        each.getCurrentExecutionAttempt().cancelingComplete();
    }

    // poll, for at most two minutes, until the job has reached FAILED
    final Deadline failedBy = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    while (failedBy.hasTimeLeft() && graph.getState() != JobStatus.FAILED) {
        Thread.sleep(100);
    }
    assertEquals(JobStatus.FAILED, graph.getState());

    // restart() must never have been called on the spied graph
    verify(graph, never()).restart();

    // the strategy must still be the fixed-delay one and must not have counted an attempt
    final RestartStrategy strategyInUse = graph.getRestartStrategy();
    assertTrue(strategyInUse instanceof FixedDelayRestartStrategy);
    final FixedDelayRestartStrategy fixedDelayStrategy = (FixedDelayRestartStrategy) strategyInUse;
    assertEquals(0, fixedDelayStrategy.getCurrentRestartAttempt());
}
Also used : SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) Instance(org.apache.flink.runtime.instance.Instance) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) FailureRateRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FailureRateRestartStrategy) InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) RestartStrategy(org.apache.flink.runtime.executiongraph.restart.RestartStrategy) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) IOException(java.io.IOException) Test(org.junit.Test)

Aggregations

SuppressRestartsException (org.apache.flink.runtime.execution.SuppressRestartsException)10 Test (org.junit.Test)7 IOException (java.io.IOException)4 InfiniteDelayRestartStrategy (org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy)3 Instance (org.apache.flink.runtime.instance.Instance)3 ArrayList (java.util.ArrayList)2 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)2 Configuration (org.apache.flink.configuration.Configuration)2 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)2 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)2 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)2 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)2 Deadline (scala.concurrent.duration.Deadline)2 Field (java.lang.reflect.Field)1 List (java.util.List)1 CopyOnWriteArrayList (java.util.concurrent.CopyOnWriteArrayList)1 ExecutionException (java.util.concurrent.ExecutionException)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 JobID (org.apache.flink.api.common.JobID)1 Time (org.apache.flink.api.common.time.Time)1