use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.
the class ExecutionGraph method restart.
/**
 * Resets the execution graph and schedules it for execution again.
 *
 * <p>Under {@code progressLock} the method first inspects the current job status:
 * if the job is already CANCELED, FAILED or SUSPENDED the restart is abandoned with
 * an info log; any status other than RESTARTING is treated as an illegal state.
 * It then clears the current executions, resets co-location constraints and job
 * vertices, wipes all state timestamps except the RESTARTING one, moves the job to
 * CREATED and, if a checkpoint coordinator is present, restores the latest
 * checkpointed state. Scheduling happens after the lock has been released.
 *
 * <p>Nothing is propagated to the caller: every {@link Throwable} raised along the
 * way (including the illegal-state checks) is logged and handed to {@code fail}.
 */
public void restart() {
    try {
        synchronized (progressLock) {
            final JobStatus statusOnEntry = state;

            // A concurrent transition into one of these states wins over the restart.
            if (statusOnEntry == JobStatus.CANCELED) {
                LOG.info("Canceled job during restart. Aborting restart.");
                return;
            }
            if (statusOnEntry == JobStatus.FAILED) {
                LOG.info("Failed job during restart. Aborting restart.");
                return;
            }
            if (statusOnEntry == JobStatus.SUSPENDED) {
                LOG.info("Suspended job during restart. Aborting restart.");
                return;
            }
            if (statusOnEntry != JobStatus.RESTARTING) {
                throw new IllegalStateException("Can only restart job from state restarting.");
            }

            if (slotProvider == null) {
                throw new IllegalStateException("The execution graph has not been scheduled before - slotProvider is null.");
            }

            this.currentExecutions.clear();

            // Each co-location group is reset once, even when several vertices share it.
            final Collection<CoLocationGroup> alreadyReset = new HashSet<>();
            for (ExecutionJobVertex jobVertex : this.verticesInCreationOrder) {
                final CoLocationGroup group = jobVertex.getCoLocationGroup();
                if (group != null && !alreadyReset.contains(group)) {
                    group.resetConstraints();
                    alreadyReset.add(group);
                }
                jobVertex.resetForNewExecution();
            }

            // Keep the RESTARTING timestamp so the moment of the restart stays
            // available (needed for the restarting time gauge); zero everything else.
            final int restartingSlot = JobStatus.RESTARTING.ordinal();
            for (int idx = 0; idx < stateTimestamps.length; idx++) {
                if (idx == restartingSlot) {
                    continue;
                }
                stateTimestamps[idx] = 0;
            }

            numFinishedJobVertices = 0;
            transitionState(JobStatus.RESTARTING, JobStatus.CREATED);

            // Reload checkpointed state into the executions when checkpointing is in use.
            if (checkpointCoordinator != null) {
                checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false);
            }
        }

        scheduleForExecution();
    } catch (Throwable t) {
        LOG.warn("Failed to restart the job.", t);
        fail(t);
    }
}
use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.
the class ExecutionGraph method tryRestartOrFail.
/**
 * Attempts to restart the job and, when a restart is not possible (for example because
 * no more restarts are allowed), attempts to move the job to FAILED instead. The
 * operation is only carried out when the job status observed on entry is FAILING or
 * RESTARTING.
 *
 * @return true if either the restart or the fail transition was performed; false if the
 *         job was in a different state or the status changed concurrently
 */
private boolean tryRestartOrFail() {
    final JobStatus observedState = state;

    // this operation is only allowed in the state FAILING or RESTARTING
    if (observedState != JobStatus.FAILING && observedState != JobStatus.RESTARTING) {
        return false;
    }

    synchronized (progressLock) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
        } else {
            LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
        }

        final boolean causePermitsRestart = !(failureCause instanceof SuppressRestartsException);
        final boolean strategyPermitsRestart = restartStrategy.canRestart();

        if (causePermitsRestart && strategyPermitsRestart) {
            if (!transitionState(observedState, JobStatus.RESTARTING)) {
                // we must have changed the state concurrently, thus we cannot complete this operation
                return false;
            }
            LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());
            restartStrategy.restart(this);
            return true;
        }

        if (!transitionState(observedState, JobStatus.FAILED, failureCause)) {
            // we must have changed the state concurrently, thus we cannot complete this operation
            return false;
        }

        // Collect every reason that blocked the restart for the log message.
        final List<String> blockers = new ArrayList<>(2);
        if (!causePermitsRestart) {
            blockers.add("a type of SuppressRestartsException was thrown");
        }
        if (!strategyPermitsRestart) {
            blockers.add("the restart strategy prevented it");
        }
        LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(), StringUtils.join(blockers, " and "), failureCause);
        postRunCleanup();
        return true;
    }
}
use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.
the class CompletedCheckpointTest method testCleanUpOnShutdown.
/**
 * Tests that the garbage collection properties are respected when shutting down.
 *
 * <p>For each of FINISHED, CANCELED, FAILED and SUSPENDED the test discards two
 * checkpoints: one built with the "keep" properties, after which the task state must
 * not have been discarded and the external file must still exist, and one built with
 * the "discard" properties, after which the task state must have been discarded
 * exactly once.
 */
@Test
public void testCleanUpOnShutdown() throws Exception {
    final File externalFile = tmpFolder.newFile();
    final String externalPath = externalFile.getAbsolutePath();

    final TaskState mockedState = mock(TaskState.class);
    final Map<JobVertexID, TaskState> statesByVertex = new HashMap<>();
    statesByVertex.put(new JobVertexID(), mockedState);

    final JobStatus[] statusesUnderTest = { JobStatus.FINISHED, JobStatus.CANCELED, JobStatus.FAILED, JobStatus.SUSPENDED };

    for (int i = 0; i < statusesUnderTest.length; i++) {
        final JobStatus status = statusesUnderTest[i];

        // Start every iteration with a clean interaction history on the mock.
        Mockito.reset(mockedState);

        // Keep
        final CheckpointProperties keepProps = new CheckpointProperties(false, true, false, false, false, false, false);
        final CompletedCheckpoint keptCheckpoint = new CompletedCheckpoint(
                new JobID(), 0, 0, 1,
                new HashMap<>(statesByVertex),
                keepProps,
                new FileStateHandle(new Path(externalFile.toURI()), externalFile.length()),
                externalPath);
        keptCheckpoint.discard(status);
        verify(mockedState, times(0)).discardState();
        assertEquals(true, externalFile.exists());

        // Discard
        final CheckpointProperties discardProps = new CheckpointProperties(false, false, true, true, true, true, true);
        final CompletedCheckpoint discardedCheckpoint = new CompletedCheckpoint(
                new JobID(), 0, 0, 1,
                new HashMap<>(statesByVertex),
                discardProps);
        discardedCheckpoint.discard(status);
        verify(mockedState, times(1)).discardState();
    }
}
use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.
the class WebMonitorMessagesTest method testJobDetailsMessage.
/**
 * Builds two {@code JobDetails} messages from the same randomly generated field values
 * and passes them to {@code GenericMessageTester.testMessageInstances}. Any exception
 * is printed and turned into a test failure.
 */
@Test
public void testJobDetailsMessage() {
    try {
        final Random random = new Random();

        // One random vertex count per execution state, plus their running total.
        final int[] verticesPerState = new int[ExecutionState.values().length];
        int totalVertices = 0;
        for (int idx = 0; idx < verticesPerState.length; idx++) {
            verticesPerState[idx] = random.nextInt(55);
            totalVertices += verticesPerState[idx];
        }

        final long startTime = random.nextLong();

        // -1 marks "no end time"; otherwise the end is a random offset from the start.
        final long endTime;
        if (random.nextBoolean()) {
            endTime = -1L;
        } else {
            endTime = startTime + random.nextInt();
        }

        final long lastModified;
        if (endTime == -1) {
            lastModified = startTime + random.nextInt();
        } else {
            lastModified = endTime;
        }

        final String jobName = GenericMessageTester.randomString(random);
        final JobID jobId = GenericMessageTester.randomJobId(random);
        final JobStatus jobStatus = GenericMessageTester.randomJobStatus(random);

        final JobDetails first = new JobDetails(jobId, jobName, startTime, endTime, jobStatus, lastModified, verticesPerState, totalVertices);
        final JobDetails second = new JobDetails(jobId, jobName, startTime, endTime, jobStatus, lastModified, verticesPerState, totalVertices);

        GenericMessageTester.testMessageInstances(first, second);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
use of org.apache.flink.runtime.jobgraph.JobStatus in project flink by apache.
the class JobManagerActorTestUtils method waitForJobStatus.
/**
 * Waits for the expected {@link JobStatus}.
 *
 * <p>Repeatedly queries the JobManager via {@link RequestJobStatus} messages. While the
 * job is unknown to the JobManager, the method pauses for at most 100 ms between
 * queries.
 *
 * @param jobId Job ID of the job to wait for
 * @param expectedJobStatus Expected job status
 * @param jobManager Job manager actor to ask
 * @param timeout Timeout after which the operation fails
 * @throws IllegalStateException If the job reaches a globally terminal state other than
 *         the expected one, if an unexpected response is received, or if the deadline
 *         expires before the expected status is observed.
 * @throws Exception If requesting the job status fails or the waiting thread is
 *         interrupted while sleeping.
 */
public static void waitForJobStatus(JobID jobId, JobStatus expectedJobStatus, ActorGateway jobManager, FiniteDuration timeout) throws Exception {
    checkNotNull(jobId, "Job ID");
    checkNotNull(expectedJobStatus, "Expected job status");
    checkNotNull(jobManager, "Job manager");
    checkNotNull(timeout, "Timeout");

    final Deadline deadline = timeout.fromNow();

    // Most recent status reported for the job; stays null while the job was never found.
    JobStatus lastSeenStatus = null;

    while (deadline.hasTimeLeft()) {
        // Request the job status
        JobStatusResponse response = requestJobStatus(jobId, jobManager, deadline.timeLeft());

        if (response instanceof CurrentJobStatus) {
            // Found the job
            JobStatus jobStatus = ((CurrentJobStatus) response).status();

            if (jobStatus == expectedJobStatus) {
                // OK, that's what we were waiting for
                return;
            } else if (jobStatus.isGloballyTerminalState()) {
                throw new IllegalStateException("Job is in terminal state " + jobStatus + ", " + "but was waiting for " + expectedJobStatus + ".");
            }

            lastSeenStatus = jobStatus;
        } else if (response instanceof JobNotFound) {
            // Did not find the job... retry.
            // The deadline can expire while the request is in flight, which makes the
            // remaining time negative; Thread.sleep rejects negative values, so clamp.
            long sleepMillis = Math.max(0L, Math.min(100L, deadline.timeLeft().toMillis()));
            Thread.sleep(sleepMillis);
        } else {
            throw new IllegalStateException("Unexpected response.");
        }
    }

    if (lastSeenStatus == null) {
        throw new IllegalStateException("Job not found within deadline.");
    }

    // The job was found but never reached the expected status; say so instead of
    // claiming it was not found.
    throw new IllegalStateException("Job did not reach status " + expectedJobStatus + " within deadline. Last observed status was " + lastSeenStatus + ".");
}
Aggregations