
Example 71 with Deadline

Use of org.apache.flink.api.common.time.Deadline in project flink by apache.

From the class AbstractOperatorRestoreTestBase, method testMigrationAndRestore.

@Test
public void testMigrationAndRestore() throws Throwable {
    ClusterClient<?> clusterClient = cluster.getClusterClient();
    final Deadline deadline = Deadline.now().plus(TEST_TIMEOUT);
    // submit job with old version savepoint and create a migrated savepoint in the new version
    String savepointPath = migrateJob(clusterClient, deadline);
    // restore from migrated new version savepoint
    restoreJob(clusterClient, deadline, savepointPath);
}
Also used: Deadline (org.apache.flink.api.common.time.Deadline), Test (org.junit.Test)
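For readers new to this utility: Deadline is Flink's small helper for tracking an absolute timeout. Below is a minimal sketch of the calls the examples on this page rely on; the timeout value is an arbitrary stand-in for constants such as TEST_TIMEOUT.

import java.time.Duration;

import org.apache.flink.api.common.time.Deadline;

public class DeadlineBasicsSketch {
    public static void main(String[] args) throws InterruptedException {
        // arbitrary stand-in for a test timeout constant such as TEST_TIMEOUT
        Duration timeout = Duration.ofMillis(200);
        // the two factory styles seen on this page are equivalent
        Deadline viaPlus = Deadline.now().plus(timeout);
        Deadline viaFromNow = Deadline.fromNow(timeout);
        // timeLeft() is what the tests feed into blocking waits such as latch.await(...)
        System.out.println("remaining ms: " + viaPlus.timeLeft().toMillis());
        Thread.sleep(250);
        // once the budget is spent, isOverdue() reports true
        System.out.println("overdue: " + viaFromNow.isOverdue());
    }
}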

Example 72 with Deadline

Use of org.apache.flink.api.common.time.Deadline in project flink by apache.

From the class UnalignedCheckpointStressITCase, method runStressTest.

@Test
public void runStressTest() throws Exception {
    Deadline deadline = Deadline.fromNow(Duration.ofMillis(TEST_DURATION));
    Optional<File> externalizedCheckpoint = Optional.empty();
    while (deadline.hasTimeLeft()) {
        externalizedCheckpoint = Optional.of(runAndTakeExternalCheckpoint(externalizedCheckpoint));
        cleanDirectoryExcept(externalizedCheckpoint.get());
    }
}
Also used: Deadline (org.apache.flink.api.common.time.Deadline), File (java.io.File), Test (org.junit.Test)
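The stress test above is a pure "keep going while there is time left" loop, threading the latest externalized checkpoint from one round into the next. Here is a stripped-down sketch of that shape, with a hypothetical runOneRound standing in for runAndTakeExternalCheckpoint.

import java.time.Duration;
import java.util.Optional;

import org.apache.flink.api.common.time.Deadline;

public class DeadlineDrivenRoundsSketch {
    public static void main(String[] args) throws InterruptedException {
        Deadline deadline = Deadline.fromNow(Duration.ofMillis(200));
        Optional<String> carried = Optional.empty();
        int rounds = 0;
        while (deadline.hasTimeLeft()) {
            // each round consumes the previous result and produces a new one,
            // mirroring how the stress test resumes from the last externalized checkpoint
            carried = Optional.of(runOneRound(carried));
            rounds++;
            Thread.sleep(20);
        }
        System.out.println("completed " + rounds + " rounds, last result: " + carried.orElse("none"));
    }

    // hypothetical stand-in for runAndTakeExternalCheckpoint(...)
    private static String runOneRound(Optional<String> previous) {
        return previous.map(p -> p + "'").orElse("checkpoint");
    }
}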

Example 73 with Deadline

Use of org.apache.flink.api.common.time.Deadline in project flink by apache.

From the class RescalingITCase, method testSavepointRescalingKeyedState.

/**
 * Tests that a job with purely keyed state can be restarted from a savepoint with a different
 * parallelism.
 */
public void testSavepointRescalingKeyedState(boolean scaleOut, boolean deriveMaxParallelism) throws Exception {
    final int numberKeys = 42;
    final int numberElements = 1000;
    final int numberElements2 = 500;
    final int parallelism = scaleOut ? numSlots / 2 : numSlots;
    final int parallelism2 = scaleOut ? numSlots : numSlots / 2;
    final int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        JobGraph jobGraph = createJobGraphWithKeyedState(parallelism, maxParallelism, numberKeys, numberElements, false, 100);
        final JobID jobID = jobGraph.getJobID();
        client.submitJob(jobGraph).get();
        // wait until the sources have emitted numberElements for each key and completed a
        // checkpoint
        assertTrue(SubtaskIndexFlatMapper.workCompletedLatch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupIndex), numberElements * key));
        }
        assertEquals(expectedResult, actualResult);
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL);
        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        int restoreMaxParallelism = deriveMaxParallelism ? JobVertex.MAX_PARALLELISM_DEFAULT : maxParallelism;
        JobGraph scaledJobGraph = createJobGraphWithKeyedState(parallelism2, restoreMaxParallelism, numberKeys, numberElements2, true, 100);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism2, keyGroupIndex), key * (numberElements + numberElements2)));
        }
        assertEquals(expectedResult2, actualResult2);
    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
    }
}
Also used: JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Deadline (org.apache.flink.api.common.time.Deadline), Duration (java.time.Duration), JobID (org.apache.flink.api.common.JobID), HashSet (java.util.HashSet)
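The expected-result loops above lean on Flink's key-group arithmetic: a key is hashed to one of maxParallelism key groups, and that key group is then mapped to the subtask index that owns it at the current parallelism, which is why the expectation can be recomputed for both the original and the rescaled parallelism. A small sketch of just that computation (the key range and parallelism values are arbitrary):

import org.apache.flink.runtime.state.KeyGroupRangeAssignment;

public class KeyGroupMathSketch {
    public static void main(String[] args) {
        int maxParallelism = 13;
        int parallelismBefore = 2;
        int parallelismAfter = 4;
        for (int key = 0; key < 5; key++) {
            // stable across rescaling: depends only on the key and maxParallelism
            int keyGroup = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            // changes with parallelism: which subtask owns that key group
            int subtaskBefore = KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                    maxParallelism, parallelismBefore, keyGroup);
            int subtaskAfter = KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(
                    maxParallelism, parallelismAfter, keyGroup);
            System.out.printf("key %d -> key group %d -> subtask %d at p=%d, subtask %d at p=%d%n",
                    key, keyGroup, subtaskBefore, parallelismBefore, subtaskAfter, parallelismAfter);
        }
    }
}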

Example 74 with Deadline

Use of org.apache.flink.api.common.time.Deadline in project flink by apache.

From the class RescalingITCase, method testSavepointRescalingWithKeyedAndNonPartitionedState.

/**
 * Tests that a job with non-partitioned state can be restarted from a savepoint with a
 * different parallelism if the operators with non-partitioned state are not rescaled.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
    int numberKeys = 42;
    int numberElements = 1000;
    int numberElements2 = 500;
    int parallelism = numSlots / 2;
    int parallelism2 = numSlots;
    int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        JobGraph jobGraph = createJobGraphWithKeyedAndNonPartitionedOperatorState(parallelism, maxParallelism, parallelism, numberKeys, numberElements, false, 100);
        final JobID jobID = jobGraph.getJobID();
        // make sure the job does not finish before we take the savepoint
        StateSourceBase.canFinishLatch = new CountDownLatch(1);
        client.submitJob(jobGraph).get();
        // wait until the sources have emitted numberElements for each key and completed a
        // checkpoint
        assertTrue(SubtaskIndexFlatMapper.workCompletedLatch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupIndex), numberElements * key));
        }
        assertEquals(expectedResult, actualResult);
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL);
        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // we took a savepoint, the job can finish now
        StateSourceBase.canFinishLatch.countDown();
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        JobGraph scaledJobGraph = createJobGraphWithKeyedAndNonPartitionedOperatorState(parallelism2, maxParallelism, parallelism, numberKeys, numberElements + numberElements2, true, 100);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism2, keyGroupIndex), key * (numberElements + numberElements2)));
        }
        assertEquals(expectedResult2, actualResult2);
    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
    }
}
Also used: JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), Tuple2 (org.apache.flink.api.java.tuple.Tuple2), Deadline (org.apache.flink.api.common.time.Deadline), Duration (java.time.Duration), CountDownLatch (java.util.concurrent.CountDownLatch), JobID (org.apache.flink.api.common.JobID), HashSet (java.util.HashSet), Test (org.junit.Test)
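One detail worth noting: both RescalingITCase examples wait for cancellation with a bare while (!getRunningJobs(client).isEmpty()) Thread.sleep(50); loop, which ignores the deadline that the rest of the test respects. A deadline-aware variant is easy to sketch; the helper name and the BooleanSupplier-based condition below are illustrative, not Flink API.

import java.util.function.BooleanSupplier;

import org.apache.flink.api.common.time.Deadline;

public final class DeadlineAwareWait {
    /** Illustrative helper: poll until the condition holds or the deadline expires. */
    public static void waitForCondition(BooleanSupplier condition, Deadline deadline, long pollMillis)
            throws InterruptedException {
        while (!condition.getAsBoolean()) {
            if (!deadline.hasTimeLeft()) {
                throw new IllegalStateException("Condition not met before the deadline expired");
            }
            Thread.sleep(pollMillis);
        }
    }
}

In the tests above, this would replace the sleep loop with something like waitForCondition(() -> getRunningJobs(client).isEmpty(), deadline, 50).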

Example 75 with Deadline

Use of org.apache.flink.api.common.time.Deadline in project flink by apache.

From the class RescalingITCase, method testSavepointRescalingNonPartitionedStateCausesException.

/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);
        // make sure the job does not finish before we take the savepoint
        StateSourceBase.canFinishLatch = new CountDownLatch(1);
        final JobID jobID = jobGraph.getJobID();
        client.submitJob(jobGraph).get();
        // wait until the operator is started
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        // wait until the operator handles some data
        StateSourceBase.workStartedLatch.await();
        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL);
        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // we took a savepoint, the job can finish now
        StateSourceBase.canFinishLatch.countDown();
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        // job successfully removed
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof IllegalStateException) {
        // we expect an IllegalStateException wrapped
        // in a JobExecutionException, because the job containing non-partitioned state
        // is being rescaled
        } else {
            throw exception;
        }
    }
}
Also used: JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), JobExecutionException (org.apache.flink.runtime.client.JobExecutionException), Deadline (org.apache.flink.api.common.time.Deadline), Duration (java.time.Duration), CountDownLatch (java.util.concurrent.CountDownLatch), JobID (org.apache.flink.api.common.JobID), Test (org.junit.Test)
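The catch block above inspects getCause() directly to confirm that the rescale failure is an IllegalStateException wrapped in a JobExecutionException. Flink's ExceptionUtils can search the whole cause chain for a given type; here is a sketch of the same check phrased that way, offered as an alternative rather than what the test actually does.

import java.util.Optional;

import org.apache.flink.util.ExceptionUtils;

public class CauseChainCheckSketch {
    public static void main(String[] args) {
        Throwable wrapped = new RuntimeException("job failed",
                new IllegalStateException("non-partitioned state cannot be rescaled"));
        // findThrowable walks the cause chain looking for the requested type
        Optional<IllegalStateException> cause =
                ExceptionUtils.findThrowable(wrapped, IllegalStateException.class);
        if (!cause.isPresent()) {
            throw new AssertionError("expected an IllegalStateException in the cause chain");
        }
        System.out.println("found expected cause: " + cause.get().getMessage());
    }
}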

Aggregations

Deadline (org.apache.flink.api.common.time.Deadline): 75
Test (org.junit.Test): 34
JobID (org.apache.flink.api.common.JobID): 29
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 26
Duration (java.time.Duration): 19
Configuration (org.apache.flink.configuration.Configuration): 15
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 14
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 13
IOException (java.io.IOException): 12
ExecutionException (java.util.concurrent.ExecutionException): 12
KeySelector (org.apache.flink.api.java.functions.KeySelector): 12
AtomicLong (java.util.concurrent.atomic.AtomicLong): 11
MiniCluster (org.apache.flink.runtime.minicluster.MiniCluster): 10
File (java.io.File): 9
TimeUnit (java.util.concurrent.TimeUnit): 9
JobStatus (org.apache.flink.api.common.JobStatus): 9
List (java.util.List): 8
Test (org.junit.jupiter.api.Test): 8
CompletableFuture (java.util.concurrent.CompletableFuture): 7
CountDownLatch (java.util.concurrent.CountDownLatch): 7