Search in sources :

Example 81 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class HttpTestClient method sendRequest.

/**
 * Sends a request to the server.
 *
 * <p>Starts a new connection attempt to the configured host and port, waits up to
 * {@code timeout} for that attempt to complete, and then writes and flushes the
 * request on the connected channel. The write itself is not awaited.
 *
 * <pre>
 * HttpRequest request = new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/overview");
 * request.headers().set(HttpHeaders.Names.HOST, host);
 * request.headers().set(HttpHeaders.Names.CONNECTION, HttpHeaders.Values.CLOSE);
 *
 * sendRequest(request, timeout);
 * </pre>
 *
 * @param request The {@link HttpRequest} to send to the server
 * @param timeout Maximum time to wait for the connection attempt to complete
 * @throws InterruptedException If the thread is interrupted while waiting for the connection
 * @throws TimeoutException If the connection attempt does not complete within the timeout
 * @throws IllegalStateException If the connection attempt completes without success
 */
public void sendRequest(HttpRequest request, FiniteDuration timeout) throws InterruptedException, TimeoutException {
    LOG.debug("Writing {}.", request);
    // Make the connection attempt.
    ChannelFuture connect = bootstrap.connect(host, port);
    if (!connect.await(timeout.toMillis(), TimeUnit.MILLISECONDS)) {
        throw new TimeoutException("Connection failed");
    }
    // await() only tells us that the attempt completed in time, not that it succeeded.
    // Writing to a channel that never connected would silently drop the request.
    if (!connect.isSuccess()) {
        throw new IllegalStateException("Connection to " + host + ":" + port + " failed", connect.cause());
    }
    connect.channel().writeAndFlush(request);
}
Also used : ChannelFuture(io.netty.channel.ChannelFuture) NioSocketChannel(io.netty.channel.socket.nio.NioSocketChannel) SocketChannel(io.netty.channel.socket.SocketChannel) Channel(io.netty.channel.Channel) TimeoutException(java.util.concurrent.TimeoutException)

Example 82 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class RescalingITCase method testSavepointRescalingPartitionedOperatorState.

/**
 * Tests rescaling of partitioned operator state. More specifically, the mechanism is exercised
 * with {@link ListCheckpointed} as well, since it subsumes
 * {@link org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
 *
 * <p>The test submits a job, triggers a savepoint, cancels the job, resubmits it from the
 * savepoint with a different parallelism, and finally compares the counters recorded at
 * snapshot time against those recorded at restore time.
 *
 * @param scaleOut         if true the job goes from {@code numSlots / 2} to {@code numSlots}
 *                         parallel instances, otherwise the other way round
 * @param checkpointMethod selects which source implementation / state redistribution is tested
 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut, OperatorCheckpointMethod checkpointMethod) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;
    final FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    final Deadline deadline = timeout.fromNow();
    final int counterSize = Math.max(parallelism, parallelism2);

    final boolean broadcast = checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST;
    final boolean usesCheckpointedFunction = broadcast || checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION;

    // reset the verification counters of the source implementation under test
    if (usesCheckpointedFunction) {
        PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];
    } else {
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE = new int[counterSize];
    }

    // both are inspected by the cleanup in the finally block
    JobID jobID = null;
    ActorGateway jobManager = null;

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        final JobGraph initialJobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, checkpointMethod);
        jobID = initialJobGraph.getJobID();
        cluster.submitJobDetached(initialJobGraph);

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        // keep triggering savepoints until one succeeds or the deadline has passed
        final FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
        Object savepointResponse = null;
        boolean savepointTaken = false;
        while (!savepointTaken && deadline.hasTimeLeft()) {
            final Future<Object> savepointPathFuture = jobManager.ask(
                new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()),
                deadline.timeLeft());
            savepointResponse = Await.result(savepointPathFuture, waitingTime);
            savepointTaken = savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess;
            if (!savepointTaken) {
                System.out.println(savepointResponse);
            }
        }
        assertTrue(savepointTaken);
        final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        // register for the removal notification before asking for cancellation
        final Future<Object> jobRemovedFuture = jobManager.ask(
            new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID),
            deadline.timeLeft());
        final Future<Object> cancellationResponseFuture = jobManager.ask(
            new JobManagerMessages.CancelJob(jobID),
            deadline.timeLeft());

        final Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        // restart from the savepoint with the second parallelism
        final JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, checkpointMethod);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        jobID = scaledJobGraph.getJobID();
        cluster.submitJobAndWait(scaledJobGraph, false);

        // pick the counters of the source implementation that was used
        final int[] snapshotCounters;
        final int[] restoreCounters;
        if (usesCheckpointedFunction) {
            snapshotCounters = PartitionedStateSource.CHECK_CORRECT_SNAPSHOT;
            restoreCounters = PartitionedStateSource.CHECK_CORRECT_RESTORE;
        } else {
            snapshotCounters = PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT;
            restoreCounters = PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE;
        }

        int sumExp = 0;
        for (int count : snapshotCounters) {
            sumExp += count;
        }
        int sumAct = 0;
        for (int count : restoreCounters) {
            sumAct += count;
        }
        if (broadcast) {
            // in broadcast mode the expected sum is scaled by the new parallelism
            sumExp *= parallelism2;
        }
        assertEquals(sumExp, sumAct);

        jobID = null;
    } finally {
        // clear any left overs from a possibly failed job
        if (jobID != null && jobManager != null) {
            final Future<Object> removalFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
            try {
                Await.ready(removalFuture, timeout);
            } catch (TimeoutException | InterruptedException e) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
Also used : Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException)

Example 83 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class JobClient method submitJobDetached.

/**
 * Submits a job in detached mode. The method sends the JobGraph to the
 * JobManager and waits for the answer whether the job could be started or not.
 *
 * <p>Before the job is submitted, the user JAR files of the job graph are uploaded
 * via {@code jobGraph.uploadUserJars(...)}.
 *
 * @param jobManagerGateway Gateway to the JobManager which will execute the jobs
 * @param config The cluster wide configuration.
 * @param jobGraph The job
 * @param timeout  Timeout in which the JobManager must have responded.
 * @param classLoader Class loader used to deserialize a failure cause returned by the JobManager
 * @throws JobExecutionException If the JAR upload fails, the JobManager does not answer within
 *         the timeout, the submission message cannot be delivered, or the JobManager reports a
 *         failure or an unexpected response
 */
public static void submitJobDetached(ActorGateway jobManagerGateway, Configuration config, JobGraph jobGraph, FiniteDuration timeout, ClassLoader classLoader) throws JobExecutionException {
    checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
    checkNotNull(jobGraph, "The jobGraph must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    LOG.info("Checking and uploading JAR files");
    try {
        jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
    } catch (IOException e) {
        throw new JobSubmissionException(jobGraph.getJobID(), "Could not upload the program's JAR files to the JobManager.", e);
    }
    Object result;
    try {
        // DETACHED: only receive the Acknowledge for the job submission message
        Future<Object> future = jobManagerGateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), timeout);
        result = Await.result(future, timeout);
    } catch (TimeoutException e) {
        throw new JobTimeoutException(jobGraph.getJobID(), "JobManager did not respond within " + timeout.toString(), e);
    } catch (Throwable t) {
        if (t instanceof InterruptedException) {
            // keep the interruption visible to the caller after translating the exception
            Thread.currentThread().interrupt();
        }
        // attach the throwable itself: its own cause is frequently null, which would
        // discard the type and stack trace of the actual failure
        throw new JobSubmissionException(jobGraph.getJobID(), "Failed to send job to JobManager: " + t.getMessage(), t);
    }
    if (result instanceof JobManagerMessages.JobSubmitSuccess) {
        JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();
        // validate response
        if (!respondedID.equals(jobGraph.getJobID())) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobManager responded for wrong Job. This Job: " + jobGraph.getJobID() + ", response: " + respondedID);
        }
    } else if (result instanceof JobManagerMessages.JobResultFailure) {
        try {
            SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
            // throwing inside the try block lets the handlers below decide whether the
            // deserialized error is passed on as is or wrapped
            throw t.deserializeError(classLoader);
        } catch (JobExecutionException e) {
            throw e;
        } catch (Throwable t) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobSubmission failed: " + t.getMessage(), t);
        }
    } else {
        throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
    }
}
Also used : JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) IOException(java.io.IOException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable)

Example 84 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class JobClient method awaitJobResult.

/**
 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to.
 *
 * <p>While waiting, the JobClientActor is pinged whenever a wait of one ask-timeout elapses
 * without a result. If the ping fails and the result is still missing, the wait is aborted.
 * Once the result future has completed, the client actor is sent a {@code PoisonPill}.
 *
 * @param listeningContext The listening context of the job execution
 * @return The result of the execution
 * @throws JobExecutionException if anything goes wrong while monitoring the job
 */
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
    final JobID jobID = listeningContext.getJobID();
    final ActorRef jobClientActor = listeningContext.getJobClientActor();
    final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
    final FiniteDuration askTimeout = listeningContext.getTimeout();
    // retrieves class loader if necessary
    final ClassLoader classLoader = listeningContext.getClassLoader();
    // ping the JobClientActor from time to time to check if it is still running
    while (!jobSubmissionFuture.isCompleted()) {
        try {
            Await.ready(jobSubmissionFuture, askTimeout);
        } catch (InterruptedException e) {
            // restore the interrupt flag so that callers can still observe the interruption
            Thread.currentThread().interrupt();
            throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.", e);
        } catch (TimeoutException e) {
            try {
                // Ping the Actor to see if it is alive
                Await.result(Patterns.ask(jobClientActor, new Identify(true), Timeout.durationToTimeout(askTimeout)), askTimeout);
                // we got a reply, continue waiting for the job result
            } catch (Exception eInner) {
                // the health check failed; this only matters if the result has not arrived in the meantime
                if (!jobSubmissionFuture.isCompleted()) {
                    if (eInner instanceof InterruptedException) {
                        Thread.currentThread().interrupt();
                    }
                    throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
                }
            }
        }
    }
    final Object answer;
    try {
        // we have already awaited the result, zero time to wait here
        answer = Await.result(jobSubmissionFuture, Duration.Zero());
    } catch (Throwable throwable) {
        throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
    } finally {
        // failsafe shutdown of the client actor
        jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
    }
    // second block handles the actual response
    if (answer instanceof JobManagerMessages.JobResultSuccess) {
        LOG.info("Job execution complete");
        SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
        if (result != null) {
            try {
                return result.toJobExecutionResult(classLoader);
            } catch (Throwable t) {
                // keep the deserialization failure as cause, otherwise it cannot be diagnosed
                throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.", t);
            }
        } else {
            throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
        }
    } else if (answer instanceof JobManagerMessages.JobResultFailure) {
        LOG.info("Job execution failed");
        SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
        if (serThrowable != null) {
            Throwable cause = serThrowable.deserializeError(classLoader);
            if (cause instanceof JobExecutionException) {
                throw (JobExecutionException) cause;
            } else {
                throw new JobExecutionException(jobID, "Job execution failed", cause);
            }
        } else {
            throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
        }
    } else if (answer instanceof JobManagerMessages.JobNotFound) {
        throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
    } else {
        throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) Identify(akka.actor.Identify) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) FlinkUserCodeClassLoader(org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoader) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable)

Example 85 with TimeoutException

use of java.util.concurrent.TimeoutException in project flink by apache.

the class Execution method deployToSlot.

/**
 * Deploys this execution to the given slot.
 *
 * <p>The execution must be in state SCHEDULED or CREATED and is moved to DEPLOYING before the
 * task is submitted to the slot's TaskManager gateway. The submission is asynchronous: a failed
 * submission is reported via {@code markFailed} from a callback run on {@code executor}.
 *
 * @param slot The slot to deploy this execution to
 * @throws JobException If the slot is no longer alive or the execution cannot be assigned to it
 * @throws IllegalStateException If the execution is not in a deployable state, or the state
 *         transition to DEPLOYING fails
 */
public void deployToSlot(final SimpleSlot slot) throws JobException {
    checkNotNull(slot);

    // Cheap up-front liveness check.
    // The more general check is the timeout of the deployment call
    if (!slot.isAlive()) {
        throw new JobException("Target slot (TaskManager) for deployment is no longer alive.");
    }

    // make sure exactly one deployment call happens from the correct state
    // note: the transition from CREATED to DEPLOYING is for testing purposes only
    final ExecutionState stateBefore = this.state;
    if (stateBefore != SCHEDULED && stateBefore != CREATED) {
        // vertex may have been cancelled, or it was already scheduled
        throw new IllegalStateException("The vertex must be in CREATED or SCHEDULED state to be deployed. Found state " + stateBefore);
    }
    if (!transitionState(stateBefore, DEPLOYING)) {
        // this should actually not happen and indicates a race somewhere else
        throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
    }

    try {
        // good, we are allowed to deploy
        if (!slot.setExecutedVertex(this)) {
            throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
        }
        this.assignedResource = slot;

        // race double check, did we fail/cancel and do we need to release the slot?
        if (this.state != DEPLOYING) {
            slot.releaseSlot();
            return;
        }

        if (LOG.isInfoEnabled()) {
            LOG.info(String.format("Deploying %s (attempt #%d) to %s", vertex.getSimpleName(), attemptNumber, getAssignedResourceLocation().getHostname()));
        }

        final TaskDeploymentDescriptor descriptor = vertex.createDeploymentDescriptor(attemptId, slot, taskState, attemptNumber);

        // register this execution at the execution graph, to receive call backs
        vertex.getExecutionGraph().registerExecution(this);

        final Future<Acknowledge> submitResultFuture = slot.getTaskManagerGateway().submitTask(descriptor, timeout);

        submitResultFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {

            @Override
            public Void apply(Throwable failure) {
                if (!(failure instanceof TimeoutException)) {
                    markFailed(failure);
                    return null;
                }
                // a timeout gets a more descriptive message naming the task and its location
                String taskname = vertex.getTaskNameWithSubtaskIndex() + " (" + attemptId + ')';
                markFailed(new Exception("Cannot deploy task " + taskname + " - TaskManager (" + getAssignedResourceLocation() + ") not responding after a timeout of " + timeout, failure));
                return null;
            }
        }, executor);
    } catch (Throwable t) {
        // any failure during deployment fails the execution before it is propagated
        markFailed(t);
        ExceptionUtils.rethrow(t);
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) TimeoutException(java.util.concurrent.TimeoutException) JobException(org.apache.flink.runtime.JobException) JobException(org.apache.flink.runtime.JobException) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

TimeoutException (java.util.concurrent.TimeoutException)788 ExecutionException (java.util.concurrent.ExecutionException)249 IOException (java.io.IOException)184 Test (org.junit.Test)149 ArrayList (java.util.ArrayList)75 CountDownLatch (java.util.concurrent.CountDownLatch)73 ExecutorService (java.util.concurrent.ExecutorService)71 Future (java.util.concurrent.Future)54 CancellationException (java.util.concurrent.CancellationException)44 Test (org.testng.annotations.Test)44 List (java.util.List)39 HashMap (java.util.HashMap)38 Map (java.util.Map)38 File (java.io.File)36 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)36 TimeUnit (java.util.concurrent.TimeUnit)34 AtomicReference (java.util.concurrent.atomic.AtomicReference)26 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)22 URI (java.net.URI)21 RejectedExecutionException (java.util.concurrent.RejectedExecutionException)21