Search in sources :

Example 6 with JobStatus

use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.

the class ExecutionGraph method restart.

/**
 * Resets this execution graph and schedules it for execution again.
 *
 * <p>The reset only happens while the job is in state {@code RESTARTING}. If the job is found in
 * {@code CANCELED}, {@code FAILED} or {@code SUSPENDED}, the restart is abandoned with an info
 * log message. Any {@link Throwable} raised along the way is logged as a warning and handed to
 * {@code fail(Throwable)}.
 */
public void restart() {
    try {
        synchronized (progressLock) {
            final JobStatus statusAtEntry = state;
            if (statusAtEntry != JobStatus.RESTARTING) {
                // Some states mean the restart became obsolete; every other state is a bug.
                if (statusAtEntry == JobStatus.CANCELED) {
                    LOG.info("Canceled job during restart. Aborting restart.");
                    return;
                }
                if (statusAtEntry == JobStatus.FAILED) {
                    LOG.info("Failed job during restart. Aborting restart.");
                    return;
                }
                if (statusAtEntry == JobStatus.SUSPENDED) {
                    LOG.info("Suspended job during restart. Aborting restart.");
                    return;
                }
                throw new IllegalStateException("Can only restart job from state restarting.");
            }

            if (slotProvider == null) {
                throw new IllegalStateException("The execution graph has not been scheduled before - slotProvider is null.");
            }

            this.currentExecutions.clear();

            // A co-location group may be shared by several job vertices; reset each group once.
            final Collection<CoLocationGroup> alreadyResetGroups = new HashSet<>();
            for (ExecutionJobVertex jobVertex : this.verticesInCreationOrder) {
                final CoLocationGroup group = jobVertex.getCoLocationGroup();
                final boolean needsReset = group != null && !alreadyResetGroups.contains(group);
                if (needsReset) {
                    group.resetConstraints();
                    alreadyResetGroups.add(group);
                }
                jobVertex.resetForNewExecution();
            }

            // Wipe every state timestamp except the RESTARTING one, so that the time at which
            // the job was restarted is preserved. This is needed for the restarting time gauge.
            final int restartingIndex = JobStatus.RESTARTING.ordinal();
            for (int idx = 0; idx < stateTimestamps.length; idx++) {
                if (idx == restartingIndex) {
                    continue;
                }
                stateTimestamps[idx] = 0;
            }

            numFinishedJobVertices = 0;
            transitionState(JobStatus.RESTARTING, JobStatus.CREATED);

            // If there is a checkpoint coordinator, reload the latest checkpointed state into
            // the executions.
            if (checkpointCoordinator != null) {
                checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false);
            }
        }

        // Scheduling is triggered after the progress lock has been released.
        scheduleForExecution();
    } catch (Throwable t) {
        LOG.warn("Failed to restart the job.", t);
        fail(t);
    }
}
Also used : JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) CoLocationGroup(org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) HashSet(java.util.HashSet)

Example 7 with JobStatus

use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.

the class ExecutionGraph method tryRestartOrFail.

/**
 * Attempts to restart the job, or to move it to {@code FAILED} when a restart is not possible
 * (because the failure cause is a {@code SuppressRestartsException} or because the restart
 * strategy reports that it cannot restart).
 *
 * <p>The operation is only attempted when the state observed on entry is {@code FAILING} or
 * {@code RESTARTING}.
 *
 * @return true if the operation could be executed; false if the observed state does not permit
 *         it or a concurrent job status change made the state transition fail
 */
private boolean tryRestartOrFail() {
    final JobStatus observedState = state;
    final boolean operationPermitted =
            observedState == JobStatus.FAILING || observedState == JobStatus.RESTARTING;
    if (!operationPermitted) {
        // this operation is only allowed in the state FAILING or RESTARTING
        return false;
    }

    synchronized (progressLock) {
        if (LOG.isDebugEnabled()) {
            // the debug variant additionally passes the failure cause to the logger
            LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
        } else {
            LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
        }

        final boolean causeAllowsRestart = !(failureCause instanceof SuppressRestartsException);
        final boolean strategyAllowsRestart = restartStrategy.canRestart();

        if (causeAllowsRestart && strategyAllowsRestart) {
            if (!transitionState(observedState, JobStatus.RESTARTING)) {
                // the state was changed concurrently, so this operation cannot complete
                return false;
            }
            LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());
            restartStrategy.restart(this);
            return true;
        }

        if (!transitionState(observedState, JobStatus.FAILED, failureCause)) {
            // the state was changed concurrently, so this operation cannot complete
            return false;
        }

        // Collect the reasons for refusing the restart for the log message.
        final List<String> refusalReasons = new ArrayList<>(2);
        if (!causeAllowsRestart) {
            refusalReasons.add("a type of SuppressRestartsException was thrown");
        }
        if (!strategyAllowsRestart) {
            refusalReasons.add("the restart strategy prevented it");
        }
        LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(), StringUtils.join(refusalReasons, " and "), failureCause);
        postRunCleanup();
        return true;
    }
}
Also used : JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) List(java.util.List) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ArrayList(java.util.ArrayList)

Example 8 with JobStatus

use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.

the class CompletedCheckpointTest method testCleanUpOnShutdown.

/**
 * Tests that the garbage collection properties are respected when shutting down.
 *
 * <p>For each of the job statuses FINISHED, CANCELED, FAILED and SUSPENDED, one checkpoint is
 * expected to keep its task state and external file on discard, and a second checkpoint is
 * expected to discard its task state exactly once.
 */
@Test
public void testCleanUpOnShutdown() throws Exception {
    final File externalFile = tmpFolder.newFile();
    final String externalPath = externalFile.getAbsolutePath();

    final TaskState taskState = mock(TaskState.class);
    final Map<JobVertexID, TaskState> taskStates = new HashMap<>();
    taskStates.put(new JobVertexID(), taskState);

    final JobStatus[] terminalStates = { JobStatus.FINISHED, JobStatus.CANCELED, JobStatus.FAILED, JobStatus.SUSPENDED };

    for (JobStatus terminalState : terminalStates) {
        // start every iteration with a clean interaction history on the mock
        Mockito.reset(taskState);

        // Keep
        final CheckpointProperties keepProps =
                new CheckpointProperties(false, true, false, false, false, false, false);
        final CompletedCheckpoint keptCheckpoint = new CompletedCheckpoint(
                new JobID(),
                0,
                0,
                1,
                new HashMap<>(taskStates),
                keepProps,
                new FileStateHandle(new Path(externalFile.toURI()), externalFile.length()),
                externalPath);
        keptCheckpoint.discard(terminalState);
        verify(taskState, times(0)).discardState();
        assertEquals(true, externalFile.exists());

        // Discard
        final CheckpointProperties discardProps =
                new CheckpointProperties(false, false, true, true, true, true, true);
        final CompletedCheckpoint discardedCheckpoint = new CompletedCheckpoint(
                new JobID(),
                0,
                0,
                1,
                new HashMap<>(taskStates),
                discardProps);
        discardedCheckpoint.discard(terminalState);
        verify(taskState, times(1)).discardState();
    }
}
Also used : Path(org.apache.flink.core.fs.Path) HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) FileStateHandle(org.apache.flink.runtime.state.filesystem.FileStateHandle) JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 9 with JobStatus

use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.

the class WebMonitorMessagesTest method testJobDetailsMessage.

/**
 * Builds two {@code JobDetails} messages from the same randomly generated values and hands them
 * to {@code GenericMessageTester.testMessageInstances}.
 *
 * <p>Exceptions are deliberately not caught here: letting them propagate makes the test runner
 * report the original exception type, message and stack trace, rather than a bare failure that
 * carries only {@code e.getMessage()} (which may be {@code null}).
 *
 * @throws Exception if building or checking the messages fails unexpectedly
 */
@Test
public void testJobDetailsMessage() throws Exception {
    final Random rnd = new Random();

    // One random vertex count per execution state, plus the overall total.
    int[] numVerticesPerState = new int[ExecutionState.values().length];
    int numTotal = 0;
    for (int i = 0; i < numVerticesPerState.length; i++) {
        int count = rnd.nextInt(55);
        numVerticesPerState[i] = count;
        numTotal += count;
    }

    long time = rnd.nextLong();
    // -1 is used as the end time in half of the runs; otherwise a random offset from 'time'.
    long endTime = rnd.nextBoolean() ? -1L : time + rnd.nextInt();
    // Without an end time, 'lastModified' gets its own random offset; otherwise it equals it.
    long lastModified = endTime == -1 ? time + rnd.nextInt() : endTime;

    String name = GenericMessageTester.randomString(rnd);
    JobID jid = GenericMessageTester.randomJobId(rnd);
    JobStatus status = GenericMessageTester.randomJobStatus(rnd);

    // Two separate instances constructed from identical arguments.
    JobDetails msg1 = new JobDetails(jid, name, time, endTime, status, lastModified, numVerticesPerState, numTotal);
    JobDetails msg2 = new JobDetails(jid, name, time, endTime, status, lastModified, numVerticesPerState, numTotal);
    GenericMessageTester.testMessageInstances(msg1, msg2);
}
Also used : JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) Random(java.util.Random) JobID(org.apache.flink.api.common.JobID) JobDetails(org.apache.flink.runtime.messages.webmonitor.JobDetails) RequestJobDetails(org.apache.flink.runtime.messages.webmonitor.RequestJobDetails) Test(org.junit.Test)

Example 10 with JobStatus

use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.

the class JobManagerActorTestUtils method waitForJobStatus.

/**
 * Waits for the expected {@link JobStatus}.
 *
 * <p>Repeatedly queries the JobManager via {@code requestJobStatus} until the job reports the
 * expected status, reports a different globally terminal status, or the timeout expires. While
 * the job is not found, the loop pauses for at most 100 ms between queries.
 *
 * @param jobId             Job ID of the job to wait for
 * @param expectedJobStatus Expected job status
 * @param jobManager        Job manager actor to ask
 * @param timeout           Timeout after which the operation fails
 * @throws IllegalStateException If the job reaches a globally terminal state other than the
 *                               expected one, if an unexpected response is received, or if the
 *                               expected status is not observed before the deadline.
 * @throws Exception If querying the job manager fails or the wait is interrupted.
 */
public static void waitForJobStatus(JobID jobId, JobStatus expectedJobStatus, ActorGateway jobManager, FiniteDuration timeout) throws Exception {
    checkNotNull(jobId, "Job ID");
    checkNotNull(expectedJobStatus, "Expected job status");
    checkNotNull(jobManager, "Job manager");
    checkNotNull(timeout, "Timeout");

    final Deadline deadline = timeout.fromNow();

    while (deadline.hasTimeLeft()) {
        // Request the job status
        JobStatusResponse response = requestJobStatus(jobId, jobManager, deadline.timeLeft());

        if (response instanceof CurrentJobStatus) {
            // Found the job
            JobStatus jobStatus = ((CurrentJobStatus) response).status();

            if (jobStatus == expectedJobStatus) {
                // OK, that's what we were waiting for
                return;
            } else if (jobStatus.isGloballyTerminalState()) {
                throw new IllegalStateException("Job is in terminal state " + jobStatus + ", " + "but was waiting for " + expectedJobStatus + ".");
            }
            // Neither expected nor terminal: query again right away.
        } else if (response instanceof JobNotFound) {
            // Did not find the job... retry after a short pause.
            // The deadline may have expired since the loop condition was checked, in which case
            // the remaining time is negative. Thread.sleep rejects negative values with an
            // IllegalArgumentException, so clamp to zero and let the loop condition end the wait.
            long remainingMillis = deadline.timeLeft().toMillis();
            Thread.sleep(Math.max(0L, Math.min(100L, remainingMillis)));
        } else {
            throw new IllegalStateException("Unexpected response.");
        }
    }

    throw new IllegalStateException("Job not found within deadline.");
}
Also used : JobStatusResponse(org.apache.flink.runtime.messages.JobManagerMessages.JobStatusResponse) RequestJobStatus(org.apache.flink.runtime.messages.JobManagerMessages.RequestJobStatus) JobManagerMessages.getRequestJobStatus(org.apache.flink.runtime.messages.JobManagerMessages.getRequestJobStatus) JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) CurrentJobStatus(org.apache.flink.runtime.messages.JobManagerMessages.CurrentJobStatus) Deadline(scala.concurrent.duration.Deadline) JobNotFound(org.apache.flink.runtime.messages.JobManagerMessages.JobNotFound)

Aggregations

JobStatus (org.apache.flink.runtime.jobgraph.JobStatus)12 JobID (org.apache.flink.api.common.JobID)3 AccessExecutionJobVertex (org.apache.flink.runtime.executiongraph.AccessExecutionJobVertex)3 AccessExecutionVertex (org.apache.flink.runtime.executiongraph.AccessExecutionVertex)3 JobDetails (org.apache.flink.runtime.messages.webmonitor.JobDetails)3 Test (org.junit.Test)3 ArrayList (java.util.ArrayList)2 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)2 SuppressRestartsException (org.apache.flink.runtime.execution.SuppressRestartsException)2 RequestJobDetails (org.apache.flink.runtime.messages.webmonitor.RequestJobDetails)2 Deadline (scala.concurrent.duration.Deadline)2 ActorSystem (akka.actor.ActorSystem)1 JsonGenerator (com.fasterxml.jackson.core.JsonGenerator)1 JsonNode (com.fasterxml.jackson.databind.JsonNode)1 ArrayNode (com.fasterxml.jackson.databind.node.ArrayNode)1 File (java.io.File)1 IOException (java.io.IOException)1 StringWriter (java.io.StringWriter)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1