Search in sources:

Example 1 with MemoryExecutionGraphInfoStore

use of org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore in project flink by apache.

In the class SessionClusterEntrypoint, method createSerializableExecutionGraphStore.

/**
 * Creates the {@link ExecutionGraphInfoStore} selected by {@link JobManagerOptions#JOB_STORE_TYPE}.
 *
 * <p>The expiration time and maximum capacity are read from the configuration up front and passed
 * to whichever store implementation is chosen. The file-based variant additionally reads the
 * cache size and uses the first configured temp directory.
 *
 * @param configuration configuration the job store options are read from
 * @param scheduledExecutor executor handed to the created store
 * @return a file-based or memory-based store, depending on the configured job store type
 * @throws IOException declared by the overridden method
 * @throws IllegalArgumentException if the configured job store type is neither File nor Memory
 */
@Override
protected ExecutionGraphInfoStore createSerializableExecutionGraphStore(Configuration configuration, ScheduledExecutor scheduledExecutor) throws IOException {
    final JobManagerOptions.JobStoreType storeType = configuration.get(JobManagerOptions.JOB_STORE_TYPE);
    final long expirationSeconds = configuration.getLong(JobManagerOptions.JOB_STORE_EXPIRATION_TIME);
    final Time retention = Time.seconds(expirationSeconds);
    final int capacity = configuration.getInteger(JobManagerOptions.JOB_STORE_MAX_CAPACITY);

    switch (storeType) {
        case File:
            // Only the first configured temp directory is used as the storage location.
            final File storageDir = new File(ConfigurationUtils.parseTempDirectories(configuration)[0]);
            final long cacheSizeBytes = configuration.getLong(JobManagerOptions.JOB_STORE_CACHE_SIZE);
            return new FileExecutionGraphInfoStore(
                    storageDir,
                    retention,
                    capacity,
                    cacheSizeBytes,
                    scheduledExecutor,
                    Ticker.systemTicker());
        case Memory:
            return new MemoryExecutionGraphInfoStore(
                    retention,
                    capacity,
                    scheduledExecutor,
                    Ticker.systemTicker());
        default:
            throw new IllegalArgumentException("Unsupported job store type " + storeType);
    }
}
Also used : MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) FileExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.FileExecutionGraphInfoStore) Time(org.apache.flink.api.common.time.Time) File(java.io.File) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions)

Example 2 with MemoryExecutionGraphInfoStore

use of org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore in project flink by apache.

In the class TestingMiniCluster, method createDispatcherResourceManagerComponents.

/**
 * Creates {@code numberDispatcherResourceManagerComponents} dispatcher/resource-manager
 * components from a single factory.
 *
 * <p>Every component gets a freshly generated {@link ResourceID}, its own RPC service obtained
 * from {@code rpcServiceFactory}, and its own {@link MemoryExecutionGraphInfoStore}; all other
 * services are the instances passed in and are therefore shared between the components.
 *
 * @return the created components, in creation order
 * @throws Exception if creating the factory, an RPC service, or a component fails
 */
@Override
protected Collection<? extends DispatcherResourceManagerComponent> createDispatcherResourceManagerComponents(Configuration configuration, RpcServiceFactory rpcServiceFactory, HighAvailabilityServices haServices, BlobServer blobServer, HeartbeatServices heartbeatServices, MetricRegistry metricRegistry, MetricQueryServiceRetriever metricQueryServiceRetriever, FatalErrorHandler fatalErrorHandler) throws Exception {
    final DispatcherResourceManagerComponentFactory componentFactory = createDispatcherResourceManagerComponentFactory();
    final List<DispatcherResourceManagerComponent> components = new ArrayList<>(numberDispatcherResourceManagerComponents);

    for (int created = 0; created < numberDispatcherResourceManagerComponents; created++) {
        final DispatcherResourceManagerComponent component =
                componentFactory.create(
                        configuration,
                        ResourceID.generate(),
                        getIOExecutor(),
                        rpcServiceFactory.createRpcService(),
                        haServices,
                        blobServer,
                        heartbeatServices,
                        metricRegistry,
                        new MemoryExecutionGraphInfoStore(),
                        metricQueryServiceRetriever,
                        fatalErrorHandler);
        components.add(component);
    }

    return components;
}
Also used : MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) DispatcherResourceManagerComponent(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent) ArrayList(java.util.ArrayList) DispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory)

Example 3 with MemoryExecutionGraphInfoStore

use of org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore in project flink by apache.

In the class ZooKeeperDefaultDispatcherRunnerTest, method testResourceCleanupUnderLeadershipChange.

/**
 * Verifies that job resources are cleaned up when dispatcher leadership changes around a job
 * submission and a later cancellation.
 *
 * <p>Sequence exercised by the test: grant leadership and submit a job, revoke leadership,
 * re-grant leadership, cancel the job and assert that its result is
 * {@link ApplicationStatus#CANCELED}, revoke leadership again, wait until a job graph store
 * created on the same ZooKeeper client reports no job ids, and finally assert that
 * {@code clusterHaStorageDir} contains no files.
 *
 * <p>See FLINK-11665.
 */
@Test
public void testResourceCleanupUnderLeadershipChange() throws Exception {
    final TestingRpcService rpcService = testingRpcServiceResource.getTestingRpcService();
    final TestingLeaderElectionService dispatcherLeaderElectionService = new TestingLeaderElectionService();
    // NOTE(review): this client is never closed inside this method — confirm it is released
    // elsewhere (e.g. in test teardown) or wrap it in a try/finally.
    final CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration, fatalErrorHandler).asCuratorFramework();
    // HA services use the testing leader election service so the test can grant and revoke
    // dispatcher leadership on demand; job master leader retrieval goes through ZooKeeper.
    try (final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServicesBuilder().setDispatcherLeaderElectionService(dispatcherLeaderElectionService).setJobMasterLeaderRetrieverFunction(jobId -> ZooKeeperUtils.createLeaderRetrievalService(client)).build()) {
        final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, CompletableFuture::new, blobServer, new TestingHeartbeatServices(), UnregisteredMetricGroups::createUnregisteredJobManagerMetricGroup, new MemoryExecutionGraphInfoStore(), fatalErrorHandler, VoidHistoryServerArchivist.INSTANCE, null, ForkJoinPool.commonPool(), new DispatcherOperationCaches());
        final DefaultDispatcherRunnerFactory defaultDispatcherRunnerFactory = DefaultDispatcherRunnerFactory.createSessionRunner(SessionDispatcherFactory.INSTANCE);
        // The persistence factory backs the job graph store with ZooKeeper (same client as above)
        // while keeping job results in an embedded result store.
        try (final DispatcherRunner dispatcherRunner = createDispatcherRunner(rpcService, dispatcherLeaderElectionService, new JobPersistenceComponentFactory() {

            @Override
            public JobGraphStore createJobGraphStore() {
                return createZooKeeperJobGraphStore(client);
            }

            @Override
            public JobResultStore createJobResultStore() {
                return new EmbeddedJobResultStore();
            }
        }, partialDispatcherServices, defaultDispatcherRunnerFactory)) {
            // initial run
            DispatcherGateway dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
            final JobGraph jobGraph = createJobGraphWithBlobs();
            LOG.info("Initial job submission {}.", jobGraph.getJobID());
            dispatcherGateway.submitJob(jobGraph, TESTING_TIMEOUT).get();
            // revoke leadership while the submitted job is still known to the dispatcher
            dispatcherLeaderElectionService.notLeader();
            // recovering submitted jobs
            LOG.info("Re-grant leadership first time.");
            dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
            LOG.info("Cancel recovered job {}.", jobGraph.getJobID());
            // cancellation of the job should remove everything
            // (the result future is requested before cancelling and awaited afterwards)
            final CompletableFuture<JobResult> jobResultFuture = dispatcherGateway.requestJobResult(jobGraph.getJobID(), TESTING_TIMEOUT);
            dispatcherGateway.cancelJob(jobGraph.getJobID(), TESTING_TIMEOUT).get();
            // a successful cancellation should eventually remove all job information
            final JobResult jobResult = jobResultFuture.get();
            assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.CANCELED));
            dispatcherLeaderElectionService.notLeader();
            // check that the job has been removed from ZooKeeper
            // (uses a freshly created store on the same client and polls until no job ids remain)
            final JobGraphStore submittedJobGraphStore = createZooKeeperJobGraphStore(client);
            CommonTestUtils.waitUntilCondition(() -> submittedJobGraphStore.getJobIds().isEmpty(), Deadline.fromNow(VERIFICATION_TIMEOUT), 20L);
        }
    }
    // check resource clean up
    // (performed only after the dispatcher runner and the HA services have been closed)
    assertThat(clusterHaStorageDir.listFiles(), is(emptyArray()));
}
Also used : ZooKeeperUtils(org.apache.flink.runtime.util.ZooKeeperUtils) Deadline(org.apache.flink.api.common.time.Deadline) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) Matchers.emptyArray(org.hamcrest.Matchers.emptyArray) JobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.JobPersistenceComponentFactory) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Assert.assertThat(org.junit.Assert.assertThat) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) After(org.junit.After) Duration(java.time.Duration) TestLogger(org.apache.flink.util.TestLogger) ClassRule(org.junit.ClassRule) HighAvailabilityServicesUtils(org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils) UUID(java.util.UUID) TestingUtils(org.apache.flink.testutils.TestingUtils) Matchers.is(org.hamcrest.Matchers.is) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) Time(org.apache.flink.api.common.time.Time) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) BlobServer(org.apache.flink.runtime.blob.BlobServer) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderConnectionInfo(org.apache.flink.runtime.util.LeaderConnectionInfo) CuratorFramework(org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework) VoidHistoryServerArchivist(org.apache.flink.runtime.dispatcher.VoidHistoryServerArchivist) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) CompletableFuture(java.util.concurrent.CompletableFuture) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) 
JobResult(org.apache.flink.runtime.jobmaster.JobResult) PermanentBlobKey(org.apache.flink.runtime.blob.PermanentBlobKey) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) Before(org.junit.Before) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Logger(org.slf4j.Logger) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) Test(org.junit.Test) IOException(java.io.IOException) File(java.io.File) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) ForkJoinPool(java.util.concurrent.ForkJoinPool) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) TestingRpcServiceResource(org.apache.flink.runtime.rpc.TestingRpcServiceResource) BlobUtils(org.apache.flink.runtime.blob.BlobUtils) SessionDispatcherFactory(org.apache.flink.runtime.dispatcher.SessionDispatcherFactory) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ZooKeeperResource(org.apache.flink.runtime.zookeeper.ZooKeeperResource) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions) TemporaryFolder(org.junit.rules.TemporaryFolder) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) JobResult(org.apache.flink.runtime.jobmaster.JobResult) 
JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) JobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.JobPersistenceComponentFactory) CuratorFramework(org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework) CompletableFuture(java.util.concurrent.CompletableFuture) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) Test(org.junit.Test)

Example 4 with MemoryExecutionGraphInfoStore

use of org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore in project flink by apache.

In the class ProcessFailureCancelingITCase, method testCancelingOnProcessFailure.

/**
 * Verifies that a running program fails (instead of hanging) when the TaskManager process
 * executing it is killed.
 *
 * <p>The test starts a dispatcher/resource-manager component in this JVM with ZooKeeper HA
 * enabled, launches a TaskManager in a separate process, and submits a program whose map function
 * blocks forever. Once at least one task has been deployed, the TaskManager process is destroyed
 * and the program thread is expected to terminate within {@code TIMEOUT} with a
 * {@link ProgramInvocationException}.
 *
 * <p>The test is skipped when no java executable can be found.
 *
 * @throws Throwable any failure of the test itself, combined with the program's exception (if
 *     any) via {@code ExceptionUtils.firstOrSuppressed}
 */
@Test
public void testCancelingOnProcessFailure() throws Throwable {
    Assume.assumeTrue("---- Skipping Process Failure test : Could not find java executable ----", getJavaCommandPath() != null);
    TestProcess taskManagerProcess = null;
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();
    Configuration config = new Configuration();
    config.setString(JobManagerOptions.ADDRESS, "localhost");
    config.set(AkkaOptions.ASK_TIMEOUT_DURATION, Duration.ofSeconds(100));
    config.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString());
    config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().getAbsolutePath());
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    config.setInteger(RestOptions.PORT, 0);
    // The RPC service is started with port "0"; the port it actually got is written back into
    // the configuration so that the TaskManager process (which receives this config) sees it.
    final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    final int jobManagerPort = rpcService.getPort();
    config.setInteger(JobManagerOptions.PORT, jobManagerPort);
    final DispatcherResourceManagerComponentFactory resourceManagerComponentFactory = DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(StandaloneResourceManagerFactory.getInstance());
    DispatcherResourceManagerComponent dispatcherResourceManagerComponent = null;
    final ScheduledExecutorService ioExecutor = TestingUtils.defaultExecutor();
    final HighAvailabilityServices haServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(config, ioExecutor, AddressResolution.NO_ADDRESS_RESOLUTION, RpcSystem.load(), NoOpFatalErrorHandler.INSTANCE);
    // Holds whatever the program thread failed with; read by the assertions and the catch block.
    final AtomicReference<Throwable> programException = new AtomicReference<>();
    try {
        dispatcherResourceManagerComponent = resourceManagerComponentFactory.create(config, ResourceID.generate(), ioExecutor, rpcService, haServices, blobServerResource.getBlobServer(), new HeartbeatServices(100L, 10000L, 2), NoOpMetricRegistry.INSTANCE, new MemoryExecutionGraphInfoStore(), VoidMetricQueryServiceRetriever.INSTANCE, fatalErrorHandler);
        TestProcessBuilder taskManagerProcessBuilder = new TestProcessBuilder(TaskExecutorProcessEntryPoint.class.getName());
        taskManagerProcessBuilder.addConfigAsMainClassArgs(config);
        taskManagerProcess = taskManagerProcessBuilder.start();
        // start the test program, which infinitely blocks
        Runnable programRunner = new Runnable() {

            @Override
            public void run() {
                try {
                    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", 1337, config);
                    env.setParallelism(2);
                    env.setRestartStrategy(RestartStrategies.noRestart());
                    env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {

                        @Override
                        public Long map(Long value) throws Exception {
                            // Print the marker the test waits for, then block forever on
                            // this function's own monitor.
                            synchronized (this) {
                                System.out.println(TASK_DEPLOYED_MARKER);
                                wait();
                            }
                            return 0L;
                        }
                    }).output(new DiscardingOutputFormat<>());
                    env.execute();
                } catch (Throwable t) {
                    programException.set(t);
                }
            }
        };
        Thread programThread = new Thread(programRunner);
        programThread.start();
        waitUntilAtLeastOneTaskHasBeenDeployed(taskManagerProcess);
        // kill the TaskManager after the job started to run
        taskManagerProcess.destroy();
        // cleared so that neither the catch block nor the finally block touches the dead process
        taskManagerProcess = null;
        // the job should fail within a few seconds due to heartbeat timeouts
        // since the CI environment is often slow, we conservatively give it up to 2 minutes
        programThread.join(TIMEOUT.toMillis());
        assertFalse("The program did not cancel in time", programThread.isAlive());
        Throwable error = programException.get();
        assertNotNull("The program did not fail properly", error);
        assertTrue(error instanceof ProgramInvocationException);
        // all seems well :-)
    } catch (Exception | Error e) {
        // Dump the TaskManager output for debugging if the process is still tracked.
        if (taskManagerProcess != null) {
            printOutput("TaskManager OUT", taskManagerProcess.getProcessOutput().toString());
            printOutput("TaskManager ERR", taskManagerProcess.getErrorOutput().toString());
        }
        throw ExceptionUtils.firstOrSuppressed(e, programException.get());
    } finally {
        // The cleanup steps are nested in try/finally so that a failure in an earlier step
        // (in particular a fatal error rethrown by the handler) can no longer skip the
        // termination of the RPC service or the cleanup of the HA services and their data.
        try {
            if (taskManagerProcess != null) {
                taskManagerProcess.destroy();
            }
            if (dispatcherResourceManagerComponent != null) {
                dispatcherResourceManagerComponent.stopApplication(ApplicationStatus.SUCCEEDED, null);
            }
            fatalErrorHandler.rethrowError();
        } finally {
            try {
                RpcUtils.terminateRpcService(rpcService, Time.seconds(100L));
            } finally {
                haServices.closeAndCleanupAllData();
            }
        }
    }
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) MapFunction(org.apache.flink.api.common.functions.MapFunction) TestProcess(org.apache.flink.test.util.TestProcessBuilder.TestProcess) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) DispatcherResourceManagerComponent(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AtomicReference(java.util.concurrent.atomic.AtomicReference) TaskExecutorProcessEntryPoint(org.apache.flink.test.recovery.utils.TaskExecutorProcessEntryPoint) TestProcessBuilder(org.apache.flink.test.util.TestProcessBuilder) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) TimeoutException(java.util.concurrent.TimeoutException) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) RpcService(org.apache.flink.runtime.rpc.RpcService) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) DefaultDispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DefaultDispatcherResourceManagerComponentFactory) DispatcherResourceManagerComponentFactory(org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory) Test(org.junit.Test)

Aggregations

MemoryExecutionGraphInfoStore (org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore)4 File (java.io.File)2 Time (org.apache.flink.api.common.time.Time)2 Configuration (org.apache.flink.configuration.Configuration)2 DispatcherResourceManagerComponent (org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponent)2 DispatcherResourceManagerComponentFactory (org.apache.flink.runtime.entrypoint.component.DispatcherResourceManagerComponentFactory)2 TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)2 Test (org.junit.Test)2 IOException (java.io.IOException)1 Duration (java.time.Duration)1 ArrayList (java.util.ArrayList)1 UUID (java.util.UUID)1 CompletableFuture (java.util.concurrent.CompletableFuture)1 ForkJoinPool (java.util.concurrent.ForkJoinPool)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 TimeoutException (java.util.concurrent.TimeoutException)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 MapFunction (org.apache.flink.api.common.functions.MapFunction)1 Deadline (org.apache.flink.api.common.time.Deadline)1 ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment)1