Search in sources:

Example 1 with StandaloneSessionClusterEntrypoint

use of org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint in project flink by apache.

In class AbstractTaskManagerProcessFailureRecoveryTest, method testTaskManagerProcessFailure:

/**
 * Checks that a program keeps running to completion when one of the TaskManager processes it
 * runs on is killed.
 *
 * <p>Sequence of the test:
 *
 * <ol>
 *   <li>start a {@link StandaloneSessionClusterEntrypoint} in this JVM (ZooKeeper HA mode),
 *   <li>launch two TaskManager processes and trigger the program in a separate thread,
 *   <li>wait until the tasks have written their "ready" marker files,
 *   <li>launch a third TaskManager process and destroy the first one,
 *   <li>write the "proceed" marker file and wait for the program to finish.
 * </ol>
 *
 * <p>The test is skipped (returns silently) when no java executable can be located. On any
 * failure the logs of all started TaskManager processes are printed before the test fails.
 *
 * @throws Exception declared for the cluster and process helpers used by the test
 */
@Test
public void testTaskManagerProcessFailure() throws Exception {
    // upper bound (2 minutes) for all tasks to announce readiness through marker files
    final int readyTimeoutMillis = 120000;
    // upper bound (5 minutes) for the program to finish once it is allowed to proceed
    final long completionTimeoutMillis = 300000;
    TestProcess taskManagerProcess1 = null;
    TestProcess taskManagerProcess2 = null;
    TestProcess taskManagerProcess3 = null;
    Configuration config = new Configuration();
    config.setString(JobManagerOptions.ADDRESS, "localhost");
    config.setString(RestOptions.BIND_PORT, "0");
    config.setLong(HeartbeatManagerOptions.HEARTBEAT_INTERVAL, 200L);
    config.setLong(HeartbeatManagerOptions.HEARTBEAT_TIMEOUT, 10000L);
    config.set(HeartbeatManagerOptions.HEARTBEAT_RPC_FAILURE_THRESHOLD, 1);
    config.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString());
    config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().getAbsolutePath());
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    config.setString(JobManagerOptions.EXECUTION_FAILOVER_STRATEGY, "full");
    config.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofSeconds(30L));
    try (final StandaloneSessionClusterEntrypoint clusterEntrypoint = new StandaloneSessionClusterEntrypoint(config)) {
        // run this test only if the java command is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }
        clusterEntrypoint.startCluster();
        // coordination between the processes goes through a directory
        final File coordinateTempDir = temporaryFolder.newFolder();
        TestProcessBuilder taskManagerProcessBuilder = new TestProcessBuilder(TaskExecutorProcessEntryPoint.class.getName());
        taskManagerProcessBuilder.addConfigAsMainClassArgs(config);
        // start the first two TaskManager processes
        taskManagerProcess1 = taskManagerProcessBuilder.start();
        taskManagerProcess2 = taskManagerProcessBuilder.start();
        // The program sets a marker file in each of its parallel tasks once they are ready, so
        // that this coordinating code is aware of it. The program then consumes elements very
        // slowly until the proceed marker file (created further below by this test driver) is
        // present.
        final AtomicReference<Throwable> errorRef = new AtomicReference<>();
        // trigger program execution in a separate thread; any failure is handed back via errorRef
        Thread programTrigger = new Thread(() -> {
            try {
                testTaskManagerFailure(config, coordinateTempDir);
            } catch (Throwable t) {
                t.printStackTrace();
                errorRef.set(t);
            }
        }, "Program Trigger");
        // start the test program
        programTrigger.start();
        // wait for the ready markers of all parallel tasks, for at most readyTimeoutMillis
        if (!waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, readyTimeoutMillis)) {
            // check if the program failed for some reason
            if (errorRef.get() != null) {
                Throwable error = errorRef.get();
                error.printStackTrace();
                fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
            } else {
                // no error occurred, simply a timeout
                fail("The tasks were not started within time (" + readyTimeoutMillis + "msecs)");
            }
        }
        // start the third TaskManager
        taskManagerProcess3 = taskManagerProcessBuilder.start();
        // kill one of the previous TaskManagers, triggering a failure and recovery
        taskManagerProcess1.destroy();
        waitForShutdown("TaskManager 1", taskManagerProcess1);
        // create the marker file which signals the program's tasks that they can complete
        touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        // wait for at most 5 minutes for the program to complete
        programTrigger.join(completionTimeoutMillis);
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef.get() != null) {
            Throwable error = errorRef.get();
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    // all seems well :-)
    } catch (Exception e) {
        e.printStackTrace();
        printProcessLog("TaskManager 1", taskManagerProcess1);
        printProcessLog("TaskManager 2", taskManagerProcess2);
        printProcessLog("TaskManager 3", taskManagerProcess3);
        // fail the test with the same message as before, but keep the original exception as cause
        throw new AssertionError(e.getMessage(), e);
    } catch (Error e) {
        // also reached by assertion failures raised inside the try block (fail / assertFalse)
        e.printStackTrace();
        printProcessLog("TaskManager 1", taskManagerProcess1);
        printProcessLog("TaskManager 2", taskManagerProcess2);
        printProcessLog("TaskManager 3", taskManagerProcess3);
        throw e;
    } finally {
        // request termination of every process that was started, then wait for each of them
        if (taskManagerProcess1 != null) {
            taskManagerProcess1.destroy();
        }
        if (taskManagerProcess2 != null) {
            taskManagerProcess2.destroy();
        }
        if (taskManagerProcess3 != null) {
            taskManagerProcess3.destroy();
        }
        waitForShutdown("TaskManager 1", taskManagerProcess1);
        waitForShutdown("TaskManager 2", taskManagerProcess2);
        waitForShutdown("TaskManager 3", taskManagerProcess3);
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) AtomicReference(java.util.concurrent.atomic.AtomicReference) TestProcess(org.apache.flink.test.util.TestProcessBuilder.TestProcess) TestProcessBuilder(org.apache.flink.test.util.TestProcessBuilder) IOException(java.io.IOException) StandaloneSessionClusterEntrypoint(org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) File(java.io.File) Test(org.junit.Test)

Example 2 with StandaloneSessionClusterEntrypoint

use of org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint in project flink by apache.

In class LocalRecoveryITCase, method submitJob:

/**
 * Builds a small streaming job and submits it asynchronously to the given session cluster.
 *
 * <p>The job is created against a remote environment pointing at {@code localhost} and the
 * cluster's REST port. It reads from a {@code LocalRecoverySource}, keys the stream by the
 * element itself and discards everything in a {@code DiscardingSink}. Checkpointing is enabled
 * with an interval argument of 100 in {@code EXACTLY_ONCE} mode.
 *
 * @param parallelism parallelism to set on the execution environment
 * @param clusterEntrypoint running cluster whose REST port is used for submission
 * @return the client handle of the asynchronously submitted job
 * @throws Exception if creating or submitting the job fails
 */
private JobClient submitJob(int parallelism, StandaloneSessionClusterEntrypoint clusterEntrypoint) throws Exception {
    final int restPort = clusterEntrypoint.getRestPort();
    final StreamExecutionEnvironment remoteEnv =
            StreamExecutionEnvironment.createRemoteEnvironment("localhost", restPort, new Configuration());

    remoteEnv.setParallelism(parallelism);
    remoteEnv.enableCheckpointing(100, CheckpointingMode.EXACTLY_ONCE);

    // source -> keyBy(identity) -> discarding sink
    remoteEnv
            .addSource(new LocalRecoverySource())
            .keyBy(x -> x)
            .addSink(new DiscardingSink<>());

    return remoteEnv.executeAsync();
}
Also used : Deadline(org.apache.flink.api.common.time.Deadline) ClusterOptions(org.apache.flink.configuration.ClusterOptions) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) CheckpointingMode(org.apache.flink.streaming.api.CheckpointingMode) CheckpointingStatistics(org.apache.flink.runtime.rest.messages.checkpoints.CheckpointingStatistics) FunctionSnapshotContext(org.apache.flink.runtime.state.FunctionSnapshotContext) EmptyRequestBody(org.apache.flink.runtime.rest.messages.EmptyRequestBody) ArrayList(java.util.ArrayList) ListState(org.apache.flink.api.common.state.ListState) TestLoggerExtension(org.apache.flink.util.TestLoggerExtension) TaskManagerOptions(org.apache.flink.configuration.TaskManagerOptions) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) Duration(java.time.Duration) RichParallelSourceFunction(org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) RestOptions(org.apache.flink.configuration.RestOptions) Nonnull(javax.annotation.Nonnull) HeartbeatManagerOptions(org.apache.flink.configuration.HeartbeatManagerOptions) StandaloneSessionClusterEntrypoint(org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint) Iterator(java.util.Iterator) CheckpointedFunction(org.apache.flink.streaming.api.checkpoint.CheckpointedFunction) DiscardingSink(org.apache.flink.streaming.api.functions.sink.DiscardingSink) FunctionInitializationContext(org.apache.flink.runtime.state.FunctionInitializationContext) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) CheckpointingStatisticsHeaders(org.apache.flink.runtime.rest.messages.checkpoints.CheckpointingStatisticsHeaders) IOException(java.io.IOException) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) TestProcessBuilder(org.apache.flink.test.util.TestProcessBuilder) Preconditions(org.apache.flink.util.Preconditions) 
JobClient(org.apache.flink.core.execution.JobClient) File(java.io.File) CheckpointingOptions(org.apache.flink.configuration.CheckpointingOptions) Test(org.junit.jupiter.api.Test) TimeUnit(java.util.concurrent.TimeUnit) Executors(org.apache.flink.util.concurrent.Executors) List(java.util.List) JobID(org.apache.flink.api.common.JobID) JobMessageParameters(org.apache.flink.runtime.rest.messages.JobMessageParameters) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) TempDir(org.junit.jupiter.api.io.TempDir) Optional(java.util.Optional) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) StreamingRuntimeContext(org.apache.flink.streaming.api.operators.StreamingRuntimeContext) Collections(java.util.Collections) RestClient(org.apache.flink.runtime.rest.RestClient) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) JobClient(org.apache.flink.core.execution.JobClient)

Example 3 with StandaloneSessionClusterEntrypoint

use of org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint in project flink by apache.

In class LocalRecoveryITCase, method testRecoverLocallyFromProcessCrashWithWorkingDirectory:

/**
 * Runs a job on externally started TaskManager processes with local recovery enabled and a
 * configured process working directory base, restarts TaskManager processes after a completed
 * checkpoint, and expects the job to produce its execution result within the timeout.
 *
 * <p>If the test does not reach its end successfully, the log output of every TaskManager
 * process is printed. All TaskManager processes are terminated in either case.
 *
 * @throws Exception if the cluster, the processes or the job fail, or a wait times out
 */
@Test
public void testRecoverLocallyFromProcessCrashWithWorkingDirectory() throws Exception {
    final int parallelism = 3;
    // used both for the checkpoint deadline and for awaiting the job result
    final long timeoutSeconds = 45L;

    final Configuration clusterConfig = new Configuration();
    clusterConfig.set(JobManagerOptions.ADDRESS, "localhost");
    clusterConfig.set(JobManagerOptions.PORT, 0);
    clusterConfig.set(RestOptions.BIND_PORT, "0");
    clusterConfig.set(HeartbeatManagerOptions.HEARTBEAT_TIMEOUT, 10000L);
    clusterConfig.set(HeartbeatManagerOptions.HEARTBEAT_INTERVAL, 1000L);
    clusterConfig.set(HeartbeatManagerOptions.HEARTBEAT_RPC_FAILURE_THRESHOLD, 1);
    clusterConfig.set(ClusterOptions.PROCESS_WORKING_DIR_BASE, tmpDirectory.getAbsolutePath());
    clusterConfig.set(CheckpointingOptions.LOCAL_RECOVERY, true);
    clusterConfig.set(TaskManagerOptions.SLOT_TIMEOUT, Duration.ofSeconds(30L));

    // stays true unless the try block runs all the way to its last statement
    boolean failed = true;
    Collection<TaskManagerProcess> processes = Collections.emptyList();
    try (final StandaloneSessionClusterEntrypoint entrypoint = new StandaloneSessionClusterEntrypoint(clusterConfig)) {
        entrypoint.startCluster();

        // the TaskManagers get a copy of the config carrying the RPC port the cluster reports
        final Configuration taskManagerConfig = new Configuration(clusterConfig);
        taskManagerConfig.set(JobManagerOptions.PORT, entrypoint.getRpcPort());
        processes = startTaskManagerProcesses(parallelism, taskManagerConfig);

        final JobClient client = submitJob(parallelism, entrypoint);
        final Duration checkpointWait = Duration.ofSeconds(timeoutSeconds);
        waitUntilCheckpointCompleted(
                clusterConfig,
                entrypoint.getRestPort(),
                client.getJobID(),
                Deadline.fromNow(checkpointWait));

        // NOTE(review): second argument is presumably the number of processes to restart — confirm
        restartTaskManagerProcesses(processes, parallelism - 1);

        client.getJobExecutionResult().get(timeoutSeconds, TimeUnit.SECONDS);
        failed = false;
    } finally {
        if (failed) {
            for (TaskManagerProcess process : processes) {
                printLogOutput(process);
            }
        }
        for (TaskManagerProcess process : processes) {
            process.terminate();
        }
    }
}
Also used : StandaloneSessionClusterEntrypoint(org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint) Configuration(org.apache.flink.configuration.Configuration) JobClient(org.apache.flink.core.execution.JobClient) StandaloneSessionClusterEntrypoint(org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) Test(org.junit.jupiter.api.Test)

Aggregations

Configuration (org.apache.flink.configuration.Configuration)3 StandaloneSessionClusterEntrypoint (org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint)3 TaskExecutorProcessEntryPoint (org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint)3 File (java.io.File)2 IOException (java.io.IOException)2 JobClient (org.apache.flink.core.execution.JobClient)2 Test (org.junit.jupiter.api.Test)2 Duration (java.time.Duration)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 Iterator (java.util.Iterator)1 List (java.util.List)1 Optional (java.util.Optional)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 Nonnull (javax.annotation.Nonnull)1 JobID (org.apache.flink.api.common.JobID)1 ListState (org.apache.flink.api.common.state.ListState)1 ListStateDescriptor (org.apache.flink.api.common.state.ListStateDescriptor)1