Search in sources :

Example 51 with Configuration

use of org.apache.flink.configuration.Configuration in project flink by apache.

the class FastFailuresITCase method testThis.

/**
 * Runs a checkpointed streaming job whose source throws from {@code open()} until the shared
 * {@code FAILURES_SO_FAR} counter exceeds {@code NUM_FAILURES}. The job is configured with a
 * fixed-delay restart strategy (210 attempts, zero delay), and the test fails if
 * {@code env.execute()} throws.
 *
 * <p>The mini cluster is stopped in a {@code finally} block so that it does not outlive the
 * test, whether the job succeeds or not.
 */
@Test
public void testThis() {
    Configuration config = new Configuration();
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2);
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
    LocalFlinkMiniCluster cluster = new LocalFlinkMiniCluster(config, false);
    cluster.start();
    try {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
        env.getConfig().disableSysoutLogging();
        env.setParallelism(4);
        env.enableCheckpointing(1000);
        // Restart immediately, up to 210 times
        env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(210, 0));
        DataStream<Tuple2<Integer, Integer>> input = env.addSource(new RichSourceFunction<Tuple2<Integer, Integer>>() {

            @Override
            public void open(Configuration parameters) {
                // Fail on open until the shared counter has passed NUM_FAILURES
                if (FAILURES_SO_FAR.incrementAndGet() <= NUM_FAILURES) {
                    throw new RuntimeException("fail");
                }
            }

            @Override
            public void run(SourceContext<Tuple2<Integer, Integer>> ctx) {
                // Emits nothing and returns immediately
            }

            @Override
            public void cancel() {
            }
        });
        input.keyBy(0).map(new MapFunction<Tuple2<Integer, Integer>, Integer>() {

            @Override
            public Integer map(Tuple2<Integer, Integer> value) {
                return value.f0;
            }
        }).addSink(new SinkFunction<Integer>() {

            @Override
            public void invoke(Integer value) {
            }
        });
        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
    } finally {
        // Release the cluster's resources; previously the cluster was left running
        cluster.stop();
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) MapFunction(org.apache.flink.api.common.functions.MapFunction) LocalFlinkMiniCluster(org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Tuple2(org.apache.flink.api.java.tuple.Tuple2) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test)

Example 52 with Configuration

use of org.apache.flink.configuration.Configuration in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointedStreamingSumProgram.

/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (Parallelism) count until sequenceEnd. The sink (1) sums up all counts and
 * returns it to the main thread via a static variable. We wait until some checkpoints are
 * completed and sanity check that the sources recover with an updated state to make sure that
 * this test actually tests something.
 *
 * <p>Flow of the test: two JobManager processes and one TaskManager are started, the job is
 * submitted in detached mode to the current leader, the leading JobManager process is
 * destroyed once {@code CompletedCheckpointsLatch} has been released, and the test then waits
 * for the job to be RUNNING again under the newly announced leader before checking the final
 * sum.
 *
 * <p>On any failure the static test state is re-initialised before the throwable is rethrown,
 * because the method is annotated with {@code @RetryOnFailure}.
 *
 * @throws Exception any failure of the test is rethrown after the process logs were printed
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    // NOTE(review): if Parallelism is an int, this product is computed in int arithmetic and
    // only widened to long afterwards; confirm it cannot overflow if the constants change.
    final long expectedSum = Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
    // The local environment is only used to build the job graph; the job itself is submitted
    // to the JobManager processes started below.
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    // Parallel sources feeding a single (parallelism 1) sink
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    // One TaskManager is started below; give it one slot per parallel subtask
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
    // Everything below is declared outside the try so that the finally block can clean up
    // whatever was actually created.
    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        // Single deadline shared by all waiting calls in this test
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss? Compare the Akka URL of process 0 with the announced leader address;
        // if it does not match, process 1 is taken to be the leader.
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Block until the latch is released (presumably after some checkpoints completed;
        // the latch is counted down outside this method)
        CompletedCheckpointsLatch.await();
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // No re-submission here: the job is expected to reach RUNNING again on its own
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Wait to finish
        FinalCountLatch.await();
        assertEquals(expectedSum, (long) FinalCount.get());
        // Every parallel subtask must have recorded a non-zero recovered state
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        // Clean up everything that was created; null means start-up did not get that far
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) AtomicLongArray(java.util.concurrent.atomic.AtomicLongArray) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)

Example 53 with Configuration

use of org.apache.flink.configuration.Configuration in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testJobPersistencyWhenJobManagerShutdown.

// ---------------------------------------------------------------------------------------------
/**
 * Verifies that the persisted data of an HA job survives a JobManager shutdown.
 *
 * <p>A blocking job is submitted in detached mode, the test waits until it is RUNNING, the
 * cluster is shut down, and the recovery state is verified afterwards.
 */
@Test
public void testJobPersistencyWhenJobManagerShutdown() throws Exception {
    final Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(
            ZooKeeper.getConnectString(),
            FileStateBackendBasePath.getPath());

    // A single JobManager and a single TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 1);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    final TestingCluster cluster = new TestingCluster(haConfig, false, false);
    try {
        final Deadline timeout = TestTimeOut.fromNow();

        // Bring up the JobManager and TaskManager
        cluster.start(true);

        final JobGraph blockingJob = createBlockingJobGraph();

        // Guard against shut down races: should the TM go down before the JM, the job
        // could otherwise be failed, which would lead to state removal. Restarting
        // (practically) forever prevents that.
        final ExecutionConfig executionConfig = new ExecutionConfig();
        executionConfig.setRestartStrategy(
                RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 100));
        blockingJob.setExecutionConfig(executionConfig);

        final ActorGateway leader = cluster.getLeaderGateway(timeout.timeLeft());

        // Fire-and-forget submission, then wait until the job is running
        leader.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        JobManagerActorTestUtils.waitForJobStatus(
                blockingJob.getJobID(), JobStatus.RUNNING, leader, timeout.timeLeft());
    } finally {
        cluster.shutdown();
    }

    // After the shutdown, the persisted job data must still be present in ZooKeeper
    verifyRecoveryState(haConfig);
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test)

Example 54 with Configuration

use of org.apache.flink.configuration.Configuration in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testSubmitJobToNonLeader.

/**
 * Tests that a job submitted to a non-leading JobManager is handled.
 *
 * <p>The job is sent to the non-leader, the leader is then polled until the job is RUNNING,
 * and the non-leader is polled until it reports the job as not found. Finally the job is
 * cancelled via the leader and the recovery state is checked to be clean.
 */
@Test
public void testSubmitJobToNonLeader() throws Exception {
    final Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(
            ZooKeeper.getConnectString(),
            FileStateBackendBasePath.getPath());

    // Two JobManagers (one leader, one standby) and one TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    final TestingCluster cluster = new TestingCluster(haConfig, false, false);
    try {
        final Deadline timeout = TestTimeOut.fromNow();

        // Bring up the JobManagers and the TaskManager
        cluster.start(true);

        final JobGraph blockingJob = createBlockingJobGraph();
        final List<ActorRef> jobManagerRefs = cluster.getJobManagersAsJava();
        final ActorGateway leader = cluster.getLeaderGateway(timeout.timeLeft());

        // Of the two JobManagers, pick the one that is not the current leader
        final boolean firstIsLeader = jobManagerRefs.get(0).equals(leader.actor());
        final ActorGateway follower =
                new AkkaActorGateway(jobManagerRefs.get(firstIsLeader ? 1 : 0), null);

        log.info("Leading job manager: " + leader);
        log.info("Non-leading job manager: " + follower);

        // Hand the job to the non-leader
        follower.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        log.info("Submitted job graph to " + follower);

        // Wait for the job to start. The **leading** JM is asked here although the job was
        // submitted to the non-leading JM. This is the behaviour under test.
        JobManagerActorTestUtils.waitForJobStatus(
                blockingJob.getJobID(), JobStatus.RUNNING, leader, timeout.timeLeft());

        log.info("Wait that the non-leader removes the submitted job.");

        // The **non-leading** JM must eventually drop the job graph from its local state.
        boolean removedFromFollower = false;
        while (timeout.hasTimeLeft()) {
            final JobStatusResponse response = JobManagerActorTestUtils.requestJobStatus(
                    blockingJob.getJobID(), follower, timeout.timeLeft());
            if (response instanceof JobManagerMessages.JobNotFound) {
                removedFromFollower = true;
                break;
            }
            // Still known to the non-leader: log its status and poll again shortly
            log.info(((JobManagerMessages.CurrentJobStatus) response).status().toString());
            Thread.sleep(100);
        }
        if (!removedFromFollower) {
            fail("Non-leading JM was still holding reference to the job graph.");
        }

        // Register for the removal notification first, then cancel, then wait for it
        final Future<Object> removal = leader.ask(
                new TestingJobManagerMessages.NotifyWhenJobRemoved(blockingJob.getJobID()),
                timeout.timeLeft());
        leader.tell(new JobManagerMessages.CancelJob(blockingJob.getJobID()));
        Await.ready(removal, timeout.timeLeft());
    } finally {
        cluster.shutdown();
    }

    // Verify that everything is clean
    verifyCleanRecoveryState(haConfig);
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobStatusResponse(org.apache.flink.runtime.messages.JobManagerMessages.JobStatusResponse) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test)

Example 55 with Configuration

use of org.apache.flink.configuration.Configuration in project flink by apache.

the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerFailure.

/**
 * Test program with JobManager failure.
 *
 * <p>Sums the sequence {@code 1..100000} on a remote environment configured for ZooKeeper
 * high availability. Each mapper touches a "ready" marker file in the coordination directory
 * and progresses slowly (100 ms sleep per element) until the "proceed" marker file exists.
 * The final flat map checks the sum and touches a "finish" marker file.
 *
 * @param zkQuorum ZooKeeper quorum to connect to
 * @param coordinateDir Coordination directory holding the marker files
 * @throws Exception if executing the job fails
 */
public void testJobManagerFailure(String zkQuorum, final File coordinateDir) throws Exception {
    final Configuration haConfig = new Configuration();
    haConfig.setString(HighAvailabilityOptions.HA_MODE, "ZOOKEEPER");
    haConfig.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zkQuorum);

    final ExecutionEnvironment batchEnv =
            ExecutionEnvironment.createRemoteEnvironment("leader", 1, haConfig);
    batchEnv.setParallelism(PARALLELISM);
    batchEnv.setNumberOfExecutionRetries(1);
    batchEnv.getConfig().setExecutionMode(executionMode);
    batchEnv.getConfig().disableSysoutLogging();

    final long elementCount = 100000L;
    // Sum of 1..elementCount
    final long expectedTotal = elementCount * (elementCount + 1L) / 2L;

    // Stage 1: identity map that announces readiness and throttles itself until told to proceed
    final DataSet<Long> throttled = batchEnv.generateSequence(1, elementCount).rebalance().map(new RichMapFunction<Long, Long>() {

        private final File proceedMarker = new File(coordinateDir, PROCEED_MARKER_FILE);

        private boolean readyMarkerWritten = false;

        private boolean stillWaitingForProceed = true;

        @Override
        public Long map(Long value) throws Exception {
            // Touch this subtask's ready marker once, on the first element
            if (!readyMarkerWritten) {
                final int subtask = getRuntimeContext().getIndexOfThisSubtask();
                final File readyMarker = new File(coordinateDir, READY_MARKER_FILE_PREFIX + subtask);
                AbstractTaskManagerProcessFailureRecoveryTest.touchFile(readyMarker);
                readyMarkerWritten = true;
            }
            if (!stillWaitingForProceed) {
                return value;
            }
            if (proceedFileExists()) {
                // From now on run at full speed
                stillWaitingForProceed = false;
            } else {
                // Not allowed to proceed yet: make slow progress only
                Thread.sleep(100);
            }
            return value;
        }

        private boolean proceedFileExists() {
            return proceedMarker.exists();
        }
    });

    // Stage 2: sum up all elements
    final DataSet<Long> total = throttled.reduce(new ReduceFunction<Long>() {

        @Override
        public Long reduce(Long value1, Long value2) {
            return value1 + value2;
        }
    });

    // Stage 3: check the sum and signal completion through a finish marker
    final DataSet<Long> result = total.flatMap(new RichFlatMapFunction<Long, Long>() {

        @Override
        public void flatMap(Long value, Collector<Long> out) throws Exception {
            assertEquals(expectedTotal, (long) value);
            final int subtask = getRuntimeContext().getIndexOfThisSubtask();
            final File finishMarker = new File(coordinateDir, FINISH_MARKER_FILE_PREFIX + subtask);
            AbstractTaskManagerProcessFailureRecoveryTest.touchFile(finishMarker);
        }
    });

    result.output(new DiscardingOutputFormat<Long>());
    batchEnv.execute();
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) IOException(java.io.IOException) File(java.io.File)

Aggregations

Configuration (org.apache.flink.configuration.Configuration)630 Test (org.junit.Test)452 IOException (java.io.IOException)137 FileInputSplit (org.apache.flink.core.fs.FileInputSplit)93 File (java.io.File)92 JobID (org.apache.flink.api.common.JobID)74 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)68 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)49 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)46 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)45 Path (org.apache.flink.core.fs.Path)44 ActorRef (akka.actor.ActorRef)43 ArrayList (java.util.ArrayList)43 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)39 FiniteDuration (scala.concurrent.duration.FiniteDuration)38 LocalFlinkMiniCluster (org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster)36 BeforeClass (org.junit.BeforeClass)35 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)33 MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)33 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)32