Search in sources :

Example 6 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class TaskExecutor method offerSlotsToJobManager.

// ------------------------------------------------------------------------
//  Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers the slots that are currently allocated for the given job to the leader of that job.
 *
 * <p>Every allocated slot is first marked active. A slot that cannot be marked active (the
 * slot table reports {@code false} or throws {@link SlotNotFoundException}) is reported to
 * the job manager through {@code failSlot} and is left out of the offer. Once the job
 * manager answers, the slots it did not accept are freed. A timed out offer is retried; any
 * other failure frees all offered slots.
 *
 * <p>Both response callbacks run on the executor returned by {@code getMainThreadExecutor()}.
 *
 * @param jobId id of the job whose allocated slots are offered
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
        return;
    }
    if (!taskSlotTable.hasAllocatedSlots(jobId)) {
        log.debug("There are no unassigned slots for the job {}.", jobId);
        return;
    }
    log.info("Offer reserved slots to the leader of job {}.", jobId);
    final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
    final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
    final UUID leaderId = jobManagerConnection.getLeaderId();
    final Collection<SlotOffer> reservedSlots = new HashSet<>(2);
    while (reservedSlotsIterator.hasNext()) {
        final SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();
        try {
            if (!taskSlotTable.markSlotActive(offer.getAllocationId())) {
                // the slot is either free or releasing at the moment
                final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
                log.debug(message);
                jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message));
                // a slot that was just reported as failed must not be offered as well
                continue;
            }
        } catch (SlotNotFoundException e) {
            final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
            log.debug(message, e);
            // keep the original exception as cause so the receiver can see why the slot failed
            jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, e));
            continue;
        }
        reservedSlots.add(offer);
    }
    Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());
    acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {

        @Override
        public void accept(Iterable<SlotOffer> acceptedSlots) {
            // check if the response is still valid
            if (isJobManagerConnectionValid(jobId, leaderId)) {
                // drop the accepted slots from the set; whatever remains was rejected
                for (SlotOffer acceptedSlot : acceptedSlots) {
                    reservedSlots.remove(acceptedSlot);
                }
                final Exception e = new Exception("The slot was rejected by the JobManager.");
                for (SlotOffer rejectedSlot : reservedSlots) {
                    freeSlot(rejectedSlot.getAllocationId(), e);
                }
            } else {
                // discard the response since there is a new leader for the job
                log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
            }
        }
    }, getMainThreadExecutor());
    acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {

        @Override
        public Void apply(Throwable throwable) {
            if (throwable instanceof TimeoutException) {
                // We ran into a timeout. Try again.
                offerSlotsToJobManager(jobId);
            } else {
                // We encountered an exception. Free the slots and return them to the RM.
                for (SlotOffer reservedSlot : reservedSlots) {
                    freeSlot(reservedSlot.getAllocationId(), throwable);
                }
            }
            return null;
        }
    }, getMainThreadExecutor());
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskSlot(org.apache.flink.runtime.taskexecutor.slot.TaskSlot) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) UUID(java.util.UUID) HashSet(java.util.HashSet) TimeoutException(java.util.concurrent.TimeoutException)

Example 7 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class TestingListener method waitForNewLeader.

/**
 * Waits until a leader address different from the previously returned one is available, an
 * exception has been recorded, or the timeout has elapsed.
 *
 * <p>The wait condition is evaluated while holding {@code lock}, so a notification issued
 * on {@code lock} between the check and the call to {@code wait} cannot be missed.
 * NOTE(review): this assumes the notifying side updates {@code address}/{@code exception}
 * and calls {@code notify}/{@code notifyAll} while holding {@code lock} - confirm against
 * the listener callbacks.
 *
 * @param timeout maximum time to wait, in milliseconds
 * @return the new leader address; it is also remembered as the old address for the next call
 * @throws TimeoutException if no new leader address was seen within the timeout
 * @throws Exception the recorded exception, if one is present after waiting
 */
public String waitForNewLeader(long timeout) throws Exception {
    final long start = System.currentTimeMillis();
    boolean interrupted = false;
    try {
        synchronized (lock) {
            long curTimeout;
            while (exception == null && (address == null || address.equals(oldAddress)) && (curTimeout = timeout - System.currentTimeMillis() + start) > 0) {
                try {
                    lock.wait(curTimeout);
                } catch (InterruptedException e) {
                    // keep waiting for the condition, but remember the interrupt so that it
                    // can be restored for the caller once we are done
                    interrupted = true;
                }
            }
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
    if (exception != null) {
        throw exception;
    } else if (address == null || address.equals(oldAddress)) {
        throw new TimeoutException("Listener was not notified about a leader within " + timeout + "ms");
    }
    oldAddress = address;
    return address;
}
Also used : TimeoutException(java.util.concurrent.TimeoutException)

Example 8 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class JobClient method submitJobDetached.

/**
 * Submits a job in detached mode. The method sends the JobGraph to the
 * JobManager and waits for the answer whether the job could be started or not.
 *
 * @param jobManagerGateway Gateway to the JobManager which will execute the jobs
 * @param config The cluster wide configuration.
 * @param jobGraph The job
 * @param timeout  Timeout in which the JobManager must have responded.
 * @param classLoader Class loader used to deserialize a failure cause sent back by the JobManager.
 * @throws JobExecutionException if the JAR upload fails, the JobManager does not answer within
 *         the timeout, the submission is rejected, or the answer is of an unexpected type
 */
public static void submitJobDetached(ActorGateway jobManagerGateway, Configuration config, JobGraph jobGraph, FiniteDuration timeout, ClassLoader classLoader) throws JobExecutionException {
    checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
    checkNotNull(jobGraph, "The jobGraph must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    LOG.info("Checking and uploading JAR files");
    try {
        jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
    } catch (IOException e) {
        throw new JobSubmissionException(jobGraph.getJobID(), "Could not upload the program's JAR files to the JobManager.", e);
    }
    Object result;
    try {
        // DETACHED: only receive the Acknowledge for the job submission message
        Future<Object> future = jobManagerGateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), timeout);
        result = Await.result(future, timeout);
    } catch (TimeoutException e) {
        throw new JobTimeoutException(jobGraph.getJobID(), "JobManager did not respond within " + timeout.toString(), e);
    } catch (Throwable t) {
        // chain the caught throwable itself; its own cause is frequently null, which would
        // drop the stack trace of the actual failure
        throw new JobSubmissionException(jobGraph.getJobID(), "Failed to send job to JobManager: " + t.getMessage(), t);
    }
    if (result instanceof JobManagerMessages.JobSubmitSuccess) {
        JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();
        // validate response
        if (!respondedID.equals(jobGraph.getJobID())) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobManager responded for wrong Job. This Job: " + jobGraph.getJobID() + ", response: " + respondedID);
        }
    } else if (result instanceof JobManagerMessages.JobResultFailure) {
        try {
            SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
            throw t.deserializeError(classLoader);
        } catch (JobExecutionException e) {
            // already the right type, pass it on unchanged
            throw e;
        } catch (Throwable t) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobSubmission failed: " + t.getMessage(), t);
        }
    } else {
        throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
    }
}
Also used : JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) IOException(java.io.IOException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable)

Example 9 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class JobClient method awaitJobResult.

/**
 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to.
 *
 * <p>While the result future is not completed, the method waits on it with the context's
 * timeout. After each timeout it pings the JobClientActor and fails if the actor does not
 * answer and the result is still missing. The JobClientActor is sent a PoisonPill once the
 * result has been read, whether or not reading it succeeded.
 *
 * @param listeningContext The listening context of the job execution
 * @return The result of the execution
 * @throws JobExecutionException if anything goes wrong while monitoring the job
 */
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
    final JobID jobID = listeningContext.getJobID();
    final ActorRef jobClientActor = listeningContext.getJobClientActor();
    final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
    final FiniteDuration askTimeout = listeningContext.getTimeout();
    // retrieves class loader if necessary
    final ClassLoader classLoader = listeningContext.getClassLoader();
    // ping the JobClientActor from time to time to check if it is still running
    while (!jobSubmissionFuture.isCompleted()) {
        try {
            Await.ready(jobSubmissionFuture, askTimeout);
        } catch (InterruptedException e) {
            // restore the interrupt flag for the caller and keep the interruption as cause
            Thread.currentThread().interrupt();
            throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.", e);
        } catch (TimeoutException e) {
            try {
                // Ping the Actor to see if it is alive
                Await.result(Patterns.ask(jobClientActor, new Identify(true), Timeout.durationToTimeout(askTimeout)), askTimeout);
                // we got a reply, continue waiting for the job result
            } catch (Exception eInner) {
                // thus the health check failed
                if (!jobSubmissionFuture.isCompleted()) {
                    throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
                }
            }
        }
    }
    final Object answer;
    try {
        // we have already awaited the result, zero time to wait here
        answer = Await.result(jobSubmissionFuture, Duration.Zero());
    } catch (Throwable throwable) {
        throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
    } finally {
        // failsafe shutdown of the client actor
        jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
    }
    // second block handles the actual response
    if (answer instanceof JobManagerMessages.JobResultSuccess) {
        LOG.info("Job execution complete");
        SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
        if (result != null) {
            try {
                return result.toJobExecutionResult(classLoader);
            } catch (Throwable t) {
                // keep the deserialization failure as cause instead of discarding it
                throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.", t);
            }
        } else {
            throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
        }
    } else if (answer instanceof JobManagerMessages.JobResultFailure) {
        LOG.info("Job execution failed");
        SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
        if (serThrowable != null) {
            Throwable cause = serThrowable.deserializeError(classLoader);
            if (cause instanceof JobExecutionException) {
                throw (JobExecutionException) cause;
            } else {
                throw new JobExecutionException(jobID, "Job execution failed", cause);
            }
        } else {
            throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
        }
    } else if (answer instanceof JobManagerMessages.JobNotFound) {
        throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
    } else {
        throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) Identify(akka.actor.Identify) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) FlinkUserCodeClassLoader(org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoader) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable)

Example 10 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class Execution method deployToSlot.

/**
 * Deploys this execution to the given slot by submitting a task deployment descriptor to the
 * slot's TaskManager gateway.
 *
 * <p>The execution must be in state SCHEDULED or CREATED and is moved to DEPLOYING before the
 * slot is touched. Any throwable raised after that transition marks the execution as failed
 * and is rethrown. A failure of the asynchronous submit call marks the execution as failed as
 * well; a timeout is wrapped in an exception that names the task and the timeout.
 *
 * @param slot the slot to deploy to, must not be null
 * @throws JobException if the slot is no longer alive or this execution cannot be assigned to it
 * @throws IllegalStateException if the execution is in neither CREATED nor SCHEDULED state, or
 *         the transition to DEPLOYING fails
 */
public void deployToSlot(final SimpleSlot slot) throws JobException {
    checkNotNull(slot);
    // Early liveness probe; the timeout of the deployment call is the more general check.
    if (!slot.isAlive()) {
        throw new JobException("Target slot (TaskManager) for deployment is no longer alive.");
    }

    // Exactly one deployment call may proceed, and only from the correct state.
    // Note: going from CREATED straight to DEPLOYING exists for testing purposes only.
    final ExecutionState stateBeforeDeploy = this.state;
    final boolean deployable = stateBeforeDeploy == SCHEDULED || stateBeforeDeploy == CREATED;
    if (!deployable) {
        // the vertex may have been cancelled, or it was already scheduled
        throw new IllegalStateException("The vertex must be in CREATED or SCHEDULED state to be deployed. Found state " + stateBeforeDeploy);
    }
    if (!transitionState(stateBeforeDeploy, DEPLOYING)) {
        // should not happen; points to a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
    }

    try {
        // we hold the DEPLOYING state, so claim the slot for this execution
        final boolean vertexAssigned = slot.setExecutedVertex(this);
        if (!vertexAssigned) {
            throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
        }
        this.assignedResource = slot;

        // double check for a concurrent fail/cancel: if the state moved on, give the slot back
        if (this.state != DEPLOYING) {
            slot.releaseSlot();
            return;
        }

        if (LOG.isInfoEnabled()) {
            final String deployMessage = String.format("Deploying %s (attempt #%d) to %s", vertex.getSimpleName(), attemptNumber, getAssignedResourceLocation().getHostname());
            LOG.info(deployMessage);
        }

        final TaskDeploymentDescriptor descriptor = vertex.createDeploymentDescriptor(attemptId, slot, taskState, attemptNumber);

        // register at the execution graph so that this execution receives call backs
        vertex.getExecutionGraph().registerExecution(this);

        final TaskManagerGateway gateway = slot.getTaskManagerGateway();
        final Future<Acknowledge> submitFuture = gateway.submitTask(descriptor, timeout);
        submitFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {

            @Override
            public Void apply(Throwable cause) {
                if (!(cause instanceof TimeoutException)) {
                    markFailed(cause);
                    return null;
                }
                // timeout: report which task and which TaskManager did not respond
                final String taskname = vertex.getTaskNameWithSubtaskIndex() + " (" + attemptId + ')';
                markFailed(new Exception("Cannot deploy task " + taskname + " - TaskManager (" + getAssignedResourceLocation() + ") not responding after a timeout of " + timeout, cause));
                return null;
            }
        }, executor);
    } catch (Throwable t) {
        markFailed(t);
        ExceptionUtils.rethrow(t);
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) TimeoutException(java.util.concurrent.TimeoutException) JobException(org.apache.flink.runtime.JobException) JobException(org.apache.flink.runtime.JobException) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

TimeoutException (java.util.concurrent.TimeoutException)717 ExecutionException (java.util.concurrent.ExecutionException)229 IOException (java.io.IOException)167 Test (org.junit.Test)131 CountDownLatch (java.util.concurrent.CountDownLatch)73 ArrayList (java.util.ArrayList)67 ExecutorService (java.util.concurrent.ExecutorService)62 Future (java.util.concurrent.Future)45 CancellationException (java.util.concurrent.CancellationException)44 Test (org.testng.annotations.Test)44 File (java.io.File)34 List (java.util.List)34 Map (java.util.Map)32 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)32 HashMap (java.util.HashMap)26 TimeUnit (java.util.concurrent.TimeUnit)26 AtomicReference (java.util.concurrent.atomic.AtomicReference)23 RejectedExecutionException (java.util.concurrent.RejectedExecutionException)21 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)21 URI (java.net.URI)20