Example 51 with JobID

Use of org.apache.flink.api.common.JobID in project flink by apache.

The following snippet is from the class TaskManagerJobGroupTest, method testCreateQueryServiceMetricInfo.

@Test
public void testCreateQueryServiceMetricInfo() {
    JobID jid = new JobID();
    MetricRegistry registry = new MetricRegistry(MetricRegistryConfiguration.defaultMetricRegistryConfiguration());
    TaskManagerMetricGroup tm = new TaskManagerMetricGroup(registry, "host", "id");
    TaskManagerJobMetricGroup job = new TaskManagerJobMetricGroup(registry, tm, jid, "jobname");
    QueryScopeInfo.JobQueryScopeInfo info = job.createQueryServiceMetricInfo(new DummyCharacterFilter());
    assertEquals("", info.scope);
    assertEquals(jid.toString(), info.jobID);
}
Also used : QueryScopeInfo(org.apache.flink.runtime.metrics.dump.QueryScopeInfo) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) DummyCharacterFilter(org.apache.flink.runtime.metrics.util.DummyCharacterFilter) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
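
The assertion on info.jobID compares the JobID's hex string form. As a hedged aside (not part of the test above), that comparison is safe because a JobID round-trips through its string representation; a minimal sketch:

import org.apache.flink.api.common.JobID;

public class JobIdRoundTrip {
    public static void main(String[] args) {
        // a fresh, random JobID
        JobID jid = new JobID();
        // JobID.fromHexString parses the hex form produced by toString()
        JobID parsed = JobID.fromHexString(jid.toString());
        // the round trip preserves equality
        System.out.println(jid.equals(parsed)); // prints: true
    }
}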

Example 52 with JobID

Use of org.apache.flink.api.common.JobID in project flink by apache.

The following snippet is from the class RescalingITCase, method testSavepointRescalingPartitionedOperatorState.

/**
	 * Tests rescaling of partitioned operator state. More specifically, we test the mechanism with {@link ListCheckpointed}
	 * as it subsumes {@link org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
	 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut, OperatorCheckpointMethod checkpointMethod) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;
    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();
    JobID jobID = null;
    ActorGateway jobManager = null;
    int counterSize = Math.max(parallelism, parallelism2);
    if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION || checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST) {
        PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];
    } else {
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE = new int[counterSize];
    }
    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, checkpointMethod);
        jobID = jobGraph.getJobID();
        cluster.submitJobDetached(jobGraph);
        Object savepointResponse = null;
        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();
        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);
            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
            System.out.println(savepointResponse);
        }
        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);
        final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();
        Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());
        Future<Object> cancellationResponseFuture = jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());
        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
        Await.ready(jobRemovedFuture, deadline.timeLeft());
        // job successfully removed
        jobID = null;
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, checkpointMethod);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        jobID = scaledJobGraph.getJobID();
        cluster.submitJobAndWait(scaledJobGraph, false);
        int sumExp = 0;
        int sumAct = 0;
        if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION) {
            for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
                sumExp += c;
            }
            for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
                sumAct += c;
            }
        } else if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST) {
            for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
                sumExp += c;
            }
            for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
                sumAct += c;
            }
            sumExp *= parallelism2;
        } else {
            for (int c : PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT) {
                sumExp += c;
            }
            for (int c : PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE) {
                sumAct += c;
            }
        }
        assertEquals(sumExp, sumAct);
        jobID = null;
    } finally {
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
Also used : Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException)
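
The ListCheckpointed mechanism named in the javadoc snapshots a list of elements per subtask and redistributes those elements across subtasks on rescaling. A hedged, minimal sketch of a source that participates in this mechanism (the class name and counting logic are illustrative, not taken from the test above):

import java.util.Collections;
import java.util.List;

import org.apache.flink.streaming.api.checkpoint.ListCheckpointed;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

public class CountingSource extends RichParallelSourceFunction<Long> implements ListCheckpointed<Long> {

    private volatile boolean running = true;
    private long count;

    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while (running) {
            // emit under the checkpoint lock so snapshots see a consistent count
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(count++);
            }
        }
    }

    @Override
    public void cancel() {
        running = false;
    }

    @Override
    public List<Long> snapshotState(long checkpointId, long timestamp) {
        // each subtask contributes its elements to a global list
        return Collections.singletonList(count);
    }

    @Override
    public void restoreState(List<Long> state) {
        // after rescaling, a subtask may receive zero, one, or several elements
        count = 0;
        for (long c : state) {
            count += c;
        }
    }
}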

Example 53 with JobID

Use of org.apache.flink.api.common.JobID in project flink by apache.

The following snippet is from the class SavepointITCase, method testCanRestoreWithModifiedStatelessOperators.

/**
	 * FLINK-5985
	 *
	 * This test ensures we can restore from a savepoint under modifications to the job graph that only concern
	 * stateless operators.
	 */
@Test
public void testCanRestoreWithModifiedStatelessOperators() throws Exception {
    // Config
    int numTaskManagers = 2;
    int numSlotsPerTaskManager = 2;
    int parallelism = 2;
    // Test deadline
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File tmpDir = CommonTestUtils.createTempDirectory();
    final File savepointDir = new File(tmpDir, "savepoints");
    TestingCluster flink = null;
    String savepointPath;
    try {
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointDir.toURI().toString());
        LOG.info("Flink configuration: " + config + ".");
        // Start Flink
        flink = new TestingCluster(config);
        LOG.info("Starting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        final StatefulCounter statefulCounter = new StatefulCounter();
        StatefulCounter.resetForTest(parallelism);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        env.addSource(new InfiniteTestSource()).shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return 4 * value;
            }
        }).shuffle().map(statefulCounter).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return 2 * value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph originalJobGraph = env.getStreamGraph().getJobGraph();
        JobSubmissionResult submissionResult = flink.submitJobDetached(originalJobGraph);
        JobID jobID = submissionResult.getJobID();
        // wait for the Tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
        savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();
    } finally {
        // guard against an early failure before the cluster was created
        if (flink != null) {
            flink.shutdown();
            flink.awaitTermination();
        }
    }
    try {
        LOG.info("Restarting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Gather all task deployment descriptors
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        // generate a modified job graph that adds a stateless op
        env.addSource(new InfiniteTestSource()).shuffle().map(new StatefulCounter()).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph modifiedJobGraph = env.getStreamGraph().getJobGraph();
        // Set the savepoint path
        modifiedJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        LOG.info("Resubmitting job " + modifiedJobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
        // Submit the job
        flink.submitJobDetached(modifiedJobGraph);
        // Await state is restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
    } finally {
        flink.shutdown();
        flink.awaitTermination();
    }
}
Also used : RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) Configuration(org.apache.flink.configuration.Configuration) MapFunction(org.apache.flink.api.common.functions.MapFunction) RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) FileNotFoundException(java.io.FileNotFoundException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) JobSubmissionResult(org.apache.flink.api.common.JobSubmissionResult) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
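
The restore above succeeds because the stateful operator carries the same uid("statefulCounter") in both graphs, so its savepoint state can be matched even though the surrounding stateless operators changed. When a modified graph instead drops a stateful operator, the restore settings can be told to tolerate unmatched state. A hedged sketch of that variant (not used in the test above):

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;

public class RestoreSettingsSketch {

    // Attaches savepoint restore settings to a job graph; the boolean flag
    // lets the restore proceed even if some savepoint state has no matching
    // operator in the new graph (the restore would fail otherwise).
    public static void configureRestore(JobGraph jobGraph, String savepointPath) {
        jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, true));
    }
}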

Example 54 with JobID

Use of org.apache.flink.api.common.JobID in project flink by apache.

The following snippet is from the class KeyedOneInputStreamOperatorTestHarness, method snapshotLegacy.

/**
	 * Takes a snapshot using an in-memory checkpoint stream factory and returns the resulting state handle.
	 */
@Override
public StreamStateHandle snapshotLegacy(long checkpointId, long timestamp) throws Exception {
    // simply use an in-memory handle
    MemoryStateBackend backend = new MemoryStateBackend();
    CheckpointStreamFactory streamFactory = backend.createStreamFactory(new JobID(), "test_op");
    CheckpointStreamFactory.CheckpointStateOutputStream outStream = streamFactory.createCheckpointStateOutputStream(checkpointId, timestamp);
    if (operator instanceof StreamCheckpointedOperator) {
        ((StreamCheckpointedOperator) operator).snapshotState(outStream, checkpointId, timestamp);
    }
    if (keyedStateBackend != null) {
        RunnableFuture<KeyGroupsStateHandle> keyedSnapshotRunnable = keyedStateBackend.snapshot(checkpointId, timestamp, streamFactory, CheckpointOptions.forFullCheckpoint());
        if (!keyedSnapshotRunnable.isDone()) {
            Thread runner = new Thread(keyedSnapshotRunnable);
            runner.start();
        }
        outStream.write(1);
        ObjectOutputStream oos = new ObjectOutputStream(outStream);
        oos.writeObject(keyedSnapshotRunnable.get());
        oos.flush();
    } else {
        outStream.write(0);
    }
    return outStream.closeAndGetHandle();
}
Also used : CheckpointStreamFactory(org.apache.flink.runtime.state.CheckpointStreamFactory) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) StreamCheckpointedOperator(org.apache.flink.streaming.api.operators.StreamCheckpointedOperator) ObjectOutputStream(java.io.ObjectOutputStream) JobID(org.apache.flink.api.common.JobID) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle)
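
The Thread spawned above only matters because the backend snapshot may be asynchronous; a RunnableFuture can just as well be run on the calling thread before get(). A self-contained sketch of that pattern with a plain FutureTask (standard Java, not Flink-specific):

import java.util.concurrent.Callable;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RunnableFuture;

public class RunnableFutureDemo {
    public static void main(String[] args) throws Exception {
        RunnableFuture<String> snapshot = new FutureTask<>(new Callable<String>() {
            @Override
            public String call() {
                return "materialized-state";
            }
        });
        if (!snapshot.isDone()) {
            // run on the calling thread instead of spawning a new one
            snapshot.run();
        }
        System.out.println(snapshot.get()); // prints: materialized-state
    }
}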

Example 55 with JobID

Use of org.apache.flink.api.common.JobID in project flink by apache.

The following snippet is from the class JobClient, method submitJobDetached.

/**
	 * Submits a job in detached mode. The method sends the JobGraph to the
	 * JobManager and waits for the answer whether the job could be started or not.
	 *
	 * @param jobManagerGateway Gateway to the JobManager which will execute the jobs
	 * @param config The cluster wide configuration.
	 * @param jobGraph The job
	 * @param timeout  Timeout in which the JobManager must have responded.
	 */
public static void submitJobDetached(ActorGateway jobManagerGateway, Configuration config, JobGraph jobGraph, FiniteDuration timeout, ClassLoader classLoader) throws JobExecutionException {
    checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
    checkNotNull(jobGraph, "The jobGraph must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    LOG.info("Checking and uploading JAR files");
    try {
        jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
    } catch (IOException e) {
        throw new JobSubmissionException(jobGraph.getJobID(), "Could not upload the program's JAR files to the JobManager.", e);
    }
    Object result;
    try {
        // only receive the Acknowledge for the job submission message
        Future<Object> future = jobManagerGateway.ask(
                new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), timeout);
        result = Await.result(future, timeout);
    } catch (TimeoutException e) {
        throw new JobTimeoutException(jobGraph.getJobID(), "JobManager did not respond within " + timeout.toString(), e);
    } catch (Throwable t) {
        throw new JobSubmissionException(jobGraph.getJobID(), "Failed to send job to JobManager: " + t.getMessage(), t.getCause());
    }
    if (result instanceof JobManagerMessages.JobSubmitSuccess) {
        JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();
        // validate response
        if (!respondedID.equals(jobGraph.getJobID())) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobManager responded for wrong Job. This Job: " + jobGraph.getJobID() + ", response: " + respondedID);
        }
    } else if (result instanceof JobManagerMessages.JobResultFailure) {
        try {
            SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
            throw t.deserializeError(classLoader);
        } catch (JobExecutionException e) {
            throw e;
        } catch (Throwable t) {
            throw new JobExecutionException(jobGraph.getJobID(), "JobSubmission failed: " + t.getMessage(), t);
        }
    } else {
        throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
    }
}
Also used : JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) IOException(java.io.IOException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException)
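
A hedged usage sketch of the method above; the caller is assumed to already hold a leader ActorGateway (for example from a test cluster) and a built JobGraph:

import java.util.concurrent.TimeUnit;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.client.JobClient;
import org.apache.flink.runtime.client.JobExecutionException;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;

import scala.concurrent.duration.FiniteDuration;

public class DetachedSubmission {

    // Fire-and-forget submission: returns once the JobManager acknowledges the
    // job, or throws JobExecutionException if the submission failed.
    public static void submit(ActorGateway jobManager, Configuration config, JobGraph jobGraph)
            throws JobExecutionException {
        FiniteDuration timeout = new FiniteDuration(1, TimeUnit.MINUTES);
        JobClient.submitJobDetached(jobManager, config, jobGraph, timeout,
                DetachedSubmission.class.getClassLoader());
    }
}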

Aggregations

JobID (org.apache.flink.api.common.JobID): 335
Test (org.junit.Test): 274
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 88
IOException (java.io.IOException): 74
Configuration (org.apache.flink.configuration.Configuration): 72
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 61
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 48
ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 47
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 44
ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 42
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 38
ArrayList (java.util.ArrayList): 37
MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry): 32
KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange): 31
HashMap (java.util.HashMap): 29
AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID): 29
FiniteDuration (scala.concurrent.duration.FiniteDuration): 28
IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID): 24
File (java.io.File): 23
UUID (java.util.UUID): 23