Use of java.util.concurrent.TimeoutException in the Apache Flink project.
Class TaskExecutor, method offerSlotsToJobManager.
// ------------------------------------------------------------------------
// Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers the slots that are currently allocated for the given job to the job's leader.
 *
 * <p>Every allocated slot is first marked active. A slot that cannot be marked active
 * (either because {@code markSlotActive} returns {@code false} or because the slot is
 * unknown) is reported to the job master via {@code failSlot} and is NOT part of the offer.
 * The remaining slots are offered in a single call; the response is handled on the main
 * thread executor:
 * <ul>
 *   <li>slots missing from the accepted set are freed,</li>
 *   <li>on a {@link TimeoutException} the whole offer is retried,</li>
 *   <li>on any other failure all offered slots are freed.</li>
 * </ul>
 *
 * @param jobId the job whose allocated slots should be offered
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);

    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
    } else {
        if (taskSlotTable.hasAllocatedSlots(jobId)) {
            log.info("Offer reserved slots to the leader of job {}.", jobId);

            final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
            final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
            final UUID leaderId = jobManagerConnection.getLeaderId();
            final Collection<SlotOffer> reservedSlots = new HashSet<>(2);

            while (reservedSlotsIterator.hasNext()) {
                final SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();

                try {
                    if (!taskSlotTable.markSlotActive(offer.getAllocationId())) {
                        // the slot is either free or releasing at the moment
                        final String message = "Could not mark slot " + offer.getAllocationId()
                            + " of job " + jobId + " active.";
                        log.debug(message);
                        jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message));
                        // a slot that was just reported as failed must not be offered as well
                        continue;
                    }
                } catch (SlotNotFoundException e) {
                    final String message = "Could not mark slot " + offer.getAllocationId()
                        + " of job " + jobId + " active.";
                    log.debug(message, e);
                    // keep the original exception as cause so the job master sees why it failed
                    jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, e));
                    continue;
                }

                reservedSlots.add(offer);
            }

            Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(
                getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());

            acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {
                @Override
                public void accept(Iterable<SlotOffer> acceptedSlots) {
                    // check if the response is still valid
                    if (isJobManagerConnectionValid(jobId, leaderId)) {
                        // drop the accepted slots from the offered set; what remains was rejected
                        for (SlotOffer acceptedSlot : acceptedSlots) {
                            reservedSlots.remove(acceptedSlot);
                        }

                        final Exception e = new Exception("The slot was rejected by the JobManager.");

                        for (SlotOffer rejectedSlot : reservedSlots) {
                            freeSlot(rejectedSlot.getAllocationId(), e);
                        }
                    } else {
                        // discard the response since there is a new leader for the job
                        log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
                    }
                }
            }, getMainThreadExecutor());

            acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {
                @Override
                public Void apply(Throwable throwable) {
                    if (throwable instanceof TimeoutException) {
                        // We ran into a timeout. Try again.
                        offerSlotsToJobManager(jobId);
                    } else {
                        // We encountered an exception. Free the slots and return them to the RM.
                        for (SlotOffer reservedSlot : reservedSlots) {
                            freeSlot(reservedSlot.getAllocationId(), throwable);
                        }
                    }

                    return null;
                }
            }, getMainThreadExecutor());
        } else {
            log.debug("There are no unassigned slots for the job {}.", jobId);
        }
    }
}
Use of java.util.concurrent.TimeoutException in the Apache Flink project.
Class TestingListener, method waitForNewLeader.
/**
 * Blocks until an address different from the previously returned one is available, an
 * exception has been recorded, or the timeout elapses.
 *
 * <p>The wait condition is evaluated while holding {@code lock}, so a notification issued
 * under the same monitor cannot slip in between the check and the call to
 * {@code lock.wait(...)}. NOTE(review): this relies on the notifying side updating the
 * fields while holding {@code lock} - confirm against the notification callbacks.
 *
 * <p>An interrupt does not abort the wait; the condition is simply re-checked. The
 * thread's interrupt status is restored before the method returns or throws.
 *
 * @param timeout maximum time to wait, in milliseconds
 * @return the new leader address
 * @throws TimeoutException if no new address was observed within {@code timeout}
 * @throws Exception the recorded exception, if one is present
 */
public String waitForNewLeader(long timeout) throws Exception {
    final long start = System.currentTimeMillis();
    boolean interrupted = false;

    try {
        synchronized (lock) {
            long curTimeout;

            while (exception == null
                    && (address == null || address.equals(oldAddress))
                    && (curTimeout = timeout - System.currentTimeMillis() + start) > 0) {
                try {
                    lock.wait(curTimeout);
                } catch (InterruptedException e) {
                    // we got interrupted so check again for the condition;
                    // remember the interrupt so it can be restored on exit
                    interrupted = true;
                }
            }

            if (exception != null) {
                throw exception;
            } else if (address == null || address.equals(oldAddress)) {
                throw new TimeoutException("Listener was not notified about a leader within " + timeout + "ms");
            }

            oldAddress = address;
            return address;
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
Use of java.util.concurrent.TimeoutException in the Apache Flink project.
Class JobClient, method submitJobDetached.
/**
 * Submits a job in detached mode. The method sends the JobGraph to the
 * JobManager and waits for the answer whether the job could be started or not.
 *
 * @param jobManagerGateway Gateway to the JobManager which will execute the jobs
 * @param config The cluster wide configuration.
 * @param jobGraph The job
 * @param timeout Timeout in which the JobManager must have responded.
 * @param classLoader Class loader used to deserialize the failure cause when the
 *                    JobManager answers with a job failure.
 * @throws JobExecutionException if the JAR upload fails, the JobManager does not answer
 *                               within the timeout, the submission message cannot be
 *                               delivered, or the JobManager answers with a failure,
 *                               a mismatching job ID or an unexpected message.
 */
public static void submitJobDetached(ActorGateway jobManagerGateway, Configuration config, JobGraph jobGraph, FiniteDuration timeout, ClassLoader classLoader) throws JobExecutionException {
    checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
    checkNotNull(jobGraph, "The jobGraph must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");

    LOG.info("Checking and uploading JAR files");
    try {
        jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
    } catch (IOException e) {
        throw new JobSubmissionException(jobGraph.getJobID(), "Could not upload the program's JAR files to the JobManager.", e);
    }

    Object result;
    try {
        // DETACHED: only receive the Acknowledge for the job submission message
        Future<Object> future = jobManagerGateway.ask(
            new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED),
            timeout);

        result = Await.result(future, timeout);
    } catch (TimeoutException e) {
        throw new JobTimeoutException(jobGraph.getJobID(), "JobManager did not respond within " + timeout.toString(), e);
    } catch (Throwable t) {
        // chain the throwable itself: t.getCause() may be null and would drop t's stack trace
        throw new JobSubmissionException(jobGraph.getJobID(), "Failed to send job to JobManager: " + t.getMessage(), t);
    }

    if (result instanceof JobManagerMessages.JobSubmitSuccess) {
        JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();

        // validate response
        if (!respondedID.equals(jobGraph.getJobID())) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobManager responded for wrong Job. This Job: " + jobGraph.getJobID() + ", response: " + respondedID);
        }
    } else if (result instanceof JobManagerMessages.JobResultFailure) {
        try {
            SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
            throw t.deserializeError(classLoader);
        } catch (JobExecutionException e) {
            // already the right type, pass it on unchanged
            throw e;
        } catch (Throwable t) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobSubmission failed: " + t.getMessage(), t);
        }
    } else {
        throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
    }
}
Use of java.util.concurrent.TimeoutException in the Apache Flink project.
Class JobClient, method awaitJobResult.
/**
 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to.
 *
 * <p>While the result future is not completed, the method waits on it with the context's
 * timeout. Each time that wait times out, the JobClientActor is pinged; if the ping fails
 * and the result is still missing, the wait is aborted. Once the result has been fetched
 * (successfully or not), the client actor is sent a PoisonPill.
 *
 * @param listeningContext The listening context of the job execution
 * @return The result of the execution
 * @throws JobExecutionException if anything goes wrong while monitoring the job, including
 *                               a JobRetrievalException when the JobManager reports that
 *                               the job was not found
 */
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
    final JobID jobID = listeningContext.getJobID();
    final ActorRef jobClientActor = listeningContext.getJobClientActor();
    final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
    final FiniteDuration askTimeout = listeningContext.getTimeout();
    // retrieves class loader if necessary
    final ClassLoader classLoader = listeningContext.getClassLoader();

    // ping the JobClientActor from time to time to check if it is still running
    while (!jobSubmissionFuture.isCompleted()) {
        try {
            Await.ready(jobSubmissionFuture, askTimeout);
        } catch (InterruptedException e) {
            // restore the interrupt status so callers further up can still observe it
            Thread.currentThread().interrupt();
            throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.", e);
        } catch (TimeoutException e) {
            try {
                // Ping the Actor to see if it is alive
                Await.result(
                    Patterns.ask(jobClientActor, new Identify(true), Timeout.durationToTimeout(askTimeout)),
                    askTimeout);
                // we got a reply, continue waiting for the job result
            } catch (Exception eInner) {
                // thus the health check failed
                if (!jobSubmissionFuture.isCompleted()) {
                    throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
                }
            }
        }
    }

    final Object answer;
    try {
        // we have already awaited the result, zero time to wait here
        answer = Await.result(jobSubmissionFuture, Duration.Zero());
    } catch (Throwable throwable) {
        throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
    } finally {
        // failsafe shutdown of the client actor
        jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
    }

    // second block handles the actual response
    if (answer instanceof JobManagerMessages.JobResultSuccess) {
        LOG.info("Job execution complete");

        SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
        if (result != null) {
            try {
                return result.toJobExecutionResult(classLoader);
            } catch (Throwable t) {
                // keep the deserialization failure as cause instead of discarding it
                throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.", t);
            }
        } else {
            throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
        }
    } else if (answer instanceof JobManagerMessages.JobResultFailure) {
        LOG.info("Job execution failed");

        SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
        if (serThrowable != null) {
            Throwable cause = serThrowable.deserializeError(classLoader);
            if (cause instanceof JobExecutionException) {
                throw (JobExecutionException) cause;
            } else {
                throw new JobExecutionException(jobID, "Job execution failed", cause);
            }
        } else {
            throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
        }
    } else if (answer instanceof JobManagerMessages.JobNotFound) {
        throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
    } else {
        throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
    }
}
Use of java.util.concurrent.TimeoutException in the Apache Flink project.
Class Execution, method deployToSlot.
/**
 * Deploys this execution attempt to the given slot.
 *
 * <p>The execution must currently be in state SCHEDULED or CREATED and is moved to
 * DEPLOYING before the task is submitted through the slot's TaskManager gateway. A failed
 * submission is reported asynchronously via {@code markFailed}; any throwable raised while
 * preparing the deployment is passed to {@code markFailed} and then rethrown.
 *
 * @param slot the slot to deploy to; must not be null
 * @throws JobException if the slot is no longer alive or the execution cannot be assigned to it
 * @throws IllegalStateException if the execution is not in a deployable state or the state
 *                               transition to DEPLOYING fails
 */
public void deployToSlot(final SimpleSlot slot) throws JobException {
    checkNotNull(slot);

    // Cheap early check only; the more general check is the timeout of the deployment call.
    if (!slot.isAlive()) {
        throw new JobException("Target slot (TaskManager) for deployment is no longer alive.");
    }

    // Exactly one deployment call may happen, and only from the correct state.
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    final ExecutionState stateAtEntry = this.state;
    final boolean deployable = stateAtEntry == SCHEDULED || stateAtEntry == CREATED;

    if (!deployable) {
        // vertex may have been cancelled, or it was already scheduled
        throw new IllegalStateException("The vertex must be in CREATED or SCHEDULED state to be deployed. Found state " + stateAtEntry);
    }
    if (!transitionState(stateAtEntry, DEPLOYING)) {
        // this should actually not happen and indicates a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
    }

    try {
        // we hold the DEPLOYING state, so we are allowed to deploy
        final boolean vertexAssigned = slot.setExecutedVertex(this);
        if (!vertexAssigned) {
            throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
        }
        this.assignedResource = slot;

        // double check against a concurrent fail/cancel: if the state moved on, give the slot back
        if (this.state != DEPLOYING) {
            slot.releaseSlot();
            return;
        }

        if (LOG.isInfoEnabled()) {
            final String deployMessage = String.format(
                "Deploying %s (attempt #%d) to %s",
                vertex.getSimpleName(),
                attemptNumber,
                getAssignedResourceLocation().getHostname());
            LOG.info(deployMessage);
        }

        final TaskDeploymentDescriptor descriptor =
            vertex.createDeploymentDescriptor(attemptId, slot, taskState, attemptNumber);

        // register this execution at the execution graph, to receive call backs
        vertex.getExecutionGraph().registerExecution(this);

        final TaskManagerGateway gateway = slot.getTaskManagerGateway();
        final Future<Acknowledge> submitResultFuture = gateway.submitTask(descriptor, timeout);

        submitResultFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {
            @Override
            public Void apply(Throwable failure) {
                if (!(failure instanceof TimeoutException)) {
                    markFailed(failure);
                    return null;
                }

                // a timeout gets a more descriptive wrapper naming the task and its location
                final String taskname = vertex.getTaskNameWithSubtaskIndex() + " (" + attemptId + ')';
                markFailed(new Exception("Cannot deploy task " + taskname + " - TaskManager (" + getAssignedResourceLocation() + ") not responding after a timeout of " + timeout, failure));
                return null;
            }
        }, executor);
    } catch (Throwable t) {
        markFailed(t);
        ExceptionUtils.rethrow(t);
    }
}
Aggregations