Search in sources :

Example 11 with Time

use of org.apache.flink.api.common.time.Time in project flink by apache.

the class JobLeaderIdServiceTest method jobTimeoutAfterLostLeadership.

/**
	 * Tests that a timeout get cancelled once a job leader has been found. Furthermore, it tests
	 * that a new timeout is registered after the jobmanager has lost leadership.
	 */
@Test(timeout = 10000)
public void jobTimeoutAfterLostLeadership() throws Exception {
    final JobID jobId = new JobID();
    final String address = "foobar";
    final UUID leaderId = UUID.randomUUID();
    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
    ScheduledFuture<?> timeout1 = mock(ScheduledFuture.class);
    ScheduledFuture<?> timeout2 = mock(ScheduledFuture.class);
    final Queue<ScheduledFuture<?>> timeoutQueue = new ArrayDeque<>(Arrays.asList(timeout1, timeout2));
    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    final AtomicReference<Runnable> lastRunnable = new AtomicReference<>();
    doAnswer(new Answer() {

        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            lastRunnable.set((Runnable) invocation.getArguments()[0]);
            return timeoutQueue.poll();
        }
    }).when(scheduledExecutor).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
    final AtomicReference<UUID> lastTimeoutId = new AtomicReference<>();
    doAnswer(new Answer() {

        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            lastTimeoutId.set((UUID) invocation.getArguments()[1]);
            return null;
        }
    }).when(jobLeaderIdActions).notifyJobTimeout(eq(jobId), any(UUID.class));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);
    Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    // notify the leader id service about the new leader
    leaderRetrievalService.notifyListener(address, leaderId);
    assertEquals(leaderId, leaderIdFuture.get());
    assertTrue(jobLeaderIdService.containsJob(jobId));
    // check that the first timeout got cancelled
    verify(timeout1, times(1)).cancel(anyBoolean());
    verify(scheduledExecutor, times(1)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    // initial timeout runnable which should no longer have an effect
    Runnable runnable = lastRunnable.get();
    assertNotNull(runnable);
    runnable.run();
    verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), any(UUID.class));
    // the timeout should no longer be valid
    assertFalse(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
    // lose leadership
    leaderRetrievalService.notifyListener("", null);
    verify(scheduledExecutor, times(2)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    // the second runnable should be the new timeout
    runnable = lastRunnable.get();
    assertNotNull(runnable);
    runnable.run();
    verify(jobLeaderIdActions, times(2)).notifyJobTimeout(eq(jobId), any(UUID.class));
    // the new timeout should be valid
    assertTrue(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
}
Also used : TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) AtomicReference(java.util.concurrent.atomic.AtomicReference) Time(org.apache.flink.api.common.time.Time) ArrayDeque(java.util.ArrayDeque) ScheduledFuture(java.util.concurrent.ScheduledFuture) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Answer(org.mockito.stubbing.Answer) Mockito.doAnswer(org.mockito.Mockito.doAnswer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) TimeUnit(java.util.concurrent.TimeUnit) UUID(java.util.UUID) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 12 with Time

use of org.apache.flink.api.common.time.Time in project flink by apache.

the class JobLeaderIdServiceTest method testInitialJobTimeout.

/**
	 * Tests that the initial job registration registers a timeout which will call
	 * {@link JobLeaderIdActions#notifyJobTimeout(JobID, UUID)} when executed.
	 */
@Test
public void testInitialJobTimeout() throws Exception {
    final JobID jobId = new JobID();
    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);
    assertTrue(jobLeaderIdService.containsJob(jobId));
    ArgumentCaptor<Runnable> runnableArgumentCaptor = ArgumentCaptor.forClass(Runnable.class);
    verify(scheduledExecutor).schedule(runnableArgumentCaptor.capture(), anyLong(), any(TimeUnit.class));
    Runnable timeoutRunnable = runnableArgumentCaptor.getValue();
    timeoutRunnable.run();
    ArgumentCaptor<UUID> timeoutIdArgumentCaptor = ArgumentCaptor.forClass(UUID.class);
    verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), timeoutIdArgumentCaptor.capture());
    assertTrue(jobLeaderIdService.isValidTimeout(jobId, timeoutIdArgumentCaptor.getValue()));
}
Also used : TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) TimeUnit(java.util.concurrent.TimeUnit) Time(org.apache.flink.api.common.time.Time) UUID(java.util.UUID) JobID(org.apache.flink.api.common.JobID) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) Test(org.junit.Test)

Example 13 with Time

use of org.apache.flink.api.common.time.Time in project flink by apache.

the class JobLeaderIdServiceTest method testRemovingJob.

/**
	 * Tests that removing a job completes the job leader id future exceptionally
	 */
@Test(timeout = 10000)
public void testRemovingJob() throws Exception {
    final JobID jobId = new JobID();
    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);
    Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    // remove the job before we could find a leader
    jobLeaderIdService.removeJob(jobId);
    assertFalse(jobLeaderIdService.containsJob(jobId));
    try {
        leaderIdFuture.get();
        fail("The leader id future should be completed exceptionally.");
    } catch (ExecutionException ignored) {
    // expected exception
    }
}
Also used : TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) Time(org.apache.flink.api.common.time.Time) UUID(java.util.UUID) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) Test(org.junit.Test)

Example 14 with Time

use of org.apache.flink.api.common.time.Time in project flink by apache.

the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.

/**
	 * Makes sure that all components are shut down when the TaskManager
	 * actor is shut down.
	 */
@Test
public void testComponentsStartupShutdown() {
    final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
    final Time timeout = Time.seconds(100);
    final int BUFFER_SIZE = 32 * 1024;
    Configuration config = new Configuration();
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(config);
        final ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        FlinkResourceManager.startResourceManagerActors(config, actorSystem, LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager), StandaloneResourceManager.class);
        final int numberOfSlots = 1;
        // create the components for the TaskManager manually
        final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(numberOfSlots, TMP_DIR, timeout, null, Time.milliseconds(500), Time.seconds(30), Time.seconds(10), // cleanup interval
        1000000, config, // exit-jvm-on-fatal-error
        false);
        final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
        ResourceID taskManagerId = ResourceID.generate();
        final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
        final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
        final IOManager ioManager = new IOManagerAsync(TMP_DIR);
        final NetworkEnvironment network = new NetworkEnvironment(new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()), new LocalConnectionManager(), new ResultPartitionManager(), new TaskEventDispatcher(), new KvStateRegistry(), null, netConf.ioMode(), netConf.partitionRequestInitialBackoff(), netConf.partitionRequestMaxBackoff(), netConf.networkBuffersPerChannel(), netConf.extraNetworkBuffersPerGate());
        network.start();
        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);
        // create the task manager
        final Props tmProps = Props.create(TaskManager.class, tmConfig, taskManagerId, connectionInfo, memManager, ioManager, network, numberOfSlots, leaderRetrievalService, new MetricRegistry(metricRegistryConfiguration));
        final ActorRef taskManager = actorSystem.actorOf(tmProps);
        new JavaTestKit(actorSystem) {

            {
                // wait for the TaskManager to be registered
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };
        // shut down all actors and the actor system
        // Kill the Task down the JobManager
        taskManager.tell(Kill.getInstance(), ActorRef.noSender());
        jobManager.tell(Kill.getInstance(), ActorRef.noSender());
        // shut down the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
        actorSystem = null;
        // now that the TaskManager is shut down, the components should be shut down as well
        assertTrue(network.isShutdown());
        assertTrue(ioManager.isProperlyShutDown());
        assertTrue(memManager.isShutdown());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) KvStateRegistry(org.apache.flink.runtime.query.KvStateRegistry) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) ActorRef(akka.actor.ActorRef) Time(org.apache.flink.api.common.time.Time) JobManager(org.apache.flink.runtime.jobmanager.JobManager) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Props(akka.actor.Props) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FiniteDuration(scala.concurrent.duration.FiniteDuration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) NetworkBufferPool(org.apache.flink.runtime.io.network.buffer.NetworkBufferPool) LocalConnectionManager(org.apache.flink.runtime.io.network.LocalConnectionManager) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) TaskEventDispatcher(org.apache.flink.runtime.io.network.TaskEventDispatcher) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 15 with Time

use of org.apache.flink.api.common.time.Time in project flink by apache.

the class MiniCluster method start.

/**
	 * Starts the mini cluster, based on the configured properties.
	 * 
	 * @throws Exception This method passes on any exception that occurs during the startup of
	 *                   the mini cluster.
	 */
public void start() throws Exception {
    synchronized (lock) {
        checkState(!running, "FlinkMiniCluster is already running");
        LOG.info("Starting Flink Mini Cluster");
        LOG.debug("Using configuration {}", config);
        final Configuration configuration = new UnmodifiableConfiguration(config.generateConfiguration());
        final Time rpcTimeout = config.getRpcTimeout();
        final int numJobManagers = config.getNumJobManagers();
        final int numTaskManagers = config.getNumTaskManagers();
        final int numResourceManagers = config.getNumResourceManagers();
        final boolean singleRpc = config.getUseSingleRpcSystem();
        try {
            LOG.info("Starting Metrics Registry");
            metricRegistry = createMetricRegistry(configuration);
            RpcService[] jobManagerRpcServices = new RpcService[numJobManagers];
            RpcService[] taskManagerRpcServices = new RpcService[numTaskManagers];
            RpcService[] resourceManagerRpcServices = new RpcService[numResourceManagers];
            // bring up all the RPC services
            LOG.info("Starting RPC Service(s)");
            // we always need the 'commonRpcService' for auxiliary calls
            commonRpcService = createRpcService(configuration, rpcTimeout, false, null);
            if (singleRpc) {
                // set that same RPC service for all JobManagers and TaskManagers
                for (int i = 0; i < numJobManagers; i++) {
                    jobManagerRpcServices[i] = commonRpcService;
                }
                for (int i = 0; i < numTaskManagers; i++) {
                    taskManagerRpcServices[i] = commonRpcService;
                }
                for (int i = 0; i < numResourceManagers; i++) {
                    resourceManagerRpcServices[i] = commonRpcService;
                }
                this.resourceManagerRpcServices = null;
                this.jobManagerRpcServices = null;
                this.taskManagerRpcServices = null;
            } else {
                // start a new service per component, possibly with custom bind addresses
                final String jobManagerBindAddress = config.getJobManagerBindAddress();
                final String taskManagerBindAddress = config.getTaskManagerBindAddress();
                final String resourceManagerBindAddress = config.getResourceManagerBindAddress();
                for (int i = 0; i < numJobManagers; i++) {
                    jobManagerRpcServices[i] = createRpcService(configuration, rpcTimeout, true, jobManagerBindAddress);
                }
                for (int i = 0; i < numTaskManagers; i++) {
                    taskManagerRpcServices[i] = createRpcService(configuration, rpcTimeout, true, taskManagerBindAddress);
                }
                for (int i = 0; i < numResourceManagers; i++) {
                    resourceManagerRpcServices[i] = createRpcService(configuration, rpcTimeout, true, resourceManagerBindAddress);
                }
                this.jobManagerRpcServices = jobManagerRpcServices;
                this.taskManagerRpcServices = taskManagerRpcServices;
                this.resourceManagerRpcServices = resourceManagerRpcServices;
            }
            // create the high-availability services
            LOG.info("Starting high-availability services");
            haServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(configuration);
            heartbeatServices = HeartbeatServices.fromConfiguration(configuration);
            // bring up the ResourceManager(s)
            LOG.info("Starting {} ResourceManger(s)", numResourceManagers);
            resourceManagerRunners = startResourceManagers(configuration, haServices, metricRegistry, numResourceManagers, resourceManagerRpcServices);
            // bring up the TaskManager(s) for the mini cluster
            LOG.info("Starting {} TaskManger(s)", numTaskManagers);
            taskManagerRunners = startTaskManagers(configuration, haServices, metricRegistry, numTaskManagers, taskManagerRpcServices);
            // bring up the dispatcher that launches JobManagers when jobs submitted
            LOG.info("Starting job dispatcher(s) for {} JobManger(s)", numJobManagers);
            jobDispatcher = new MiniClusterJobDispatcher(configuration, haServices, heartbeatServices, metricRegistry, numJobManagers, jobManagerRpcServices);
        } catch (Exception e) {
            // cleanup everything
            try {
                shutdownInternally();
            } catch (Exception ee) {
                e.addSuppressed(ee);
            }
            throw e;
        }
        // now officially mark this as running
        running = true;
        LOG.info("Flink Mini Cluster started successfully");
    }
}
Also used : UnmodifiableConfiguration(org.apache.flink.configuration.UnmodifiableConfiguration) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) RpcService(org.apache.flink.runtime.rpc.RpcService) AkkaRpcService(org.apache.flink.runtime.rpc.akka.AkkaRpcService) UnmodifiableConfiguration(org.apache.flink.configuration.UnmodifiableConfiguration) Time(org.apache.flink.api.common.time.Time) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException)

Aggregations

Time (org.apache.flink.api.common.time.Time)16 Test (org.junit.Test)9 JobID (org.apache.flink.api.common.JobID)6 UUID (java.util.UUID)4 ScheduledExecutor (org.apache.flink.runtime.concurrent.ScheduledExecutor)4 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)4 TestingLeaderRetrievalService (org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 Configuration (org.apache.flink.configuration.Configuration)3 FlinkCompletableFuture (org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture)3 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)3 ActorSystem (akka.actor.ActorSystem)2 List (java.util.List)2 TimeUnit (java.util.concurrent.TimeUnit)2 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)2 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)2 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)2 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)2 Slot (org.apache.flink.runtime.instance.Slot)2