Search in sources :

Example 1 with StartpointManager

use of org.apache.samza.startpoint.StartpointManager in project samza by apache.

In class ClusterBasedJobCoordinator, method run:

/**
 * Runs the cluster based job coordinator on the calling thread.
 *
 * <p>The first invocation sets up the JMX server (when enabled), writes the diagnostics metadata file, creates
 * the metadata and state backend resources, fans out startpoints when {@code shouldFanoutStartpoint()} allows it,
 * updates the changelog partition mapping and starts the coordinator components. It then sleeps in a loop until
 * the container process manager requests shutdown, {@code checkAndThrowException()} returns true, the thread is
 * interrupted, or {@code checkcontainerPlacementRequestAllocatorThreadIsAlive()} returns false.
 *
 * <p>Any later invocation only logs a warning and returns. {@code onShutDown()} is invoked whenever the
 * start-up/polling section is left, whether normally or exceptionally.
 *
 * @throws SamzaException wrapping any {@link Throwable} raised during start-up or while polling
 */
public void run() {
    boolean firstStart = isStarted.compareAndSet(false, true);
    if (!firstStart) {
        LOG.warn("Attempting to start an already started job coordinator. ");
        return;
    }

    // The JMX server is optional; its URLs are published on the coordinator state only when it is enabled.
    if (!isJmxEnabled) {
        jmxServer = null;
    } else {
        jmxServer = new JmxServer();
        state.jmxUrl = jmxServer.getJmxUrl();
        state.jmxTunnelingUrl = jmxServer.getTunnelingJmxUrl();
    }

    try {
        LOG.info("Starting cluster based job coordinator");

        // Diagnostics metadata file, tagged with the execution environment container id when one is set.
        JobConfig jobConfig = new JobConfig(config);
        String jobName = jobConfig.getName().get();
        String jobId = jobConfig.getJobId();
        DiagnosticsUtil.writeMetadataFile(jobName, jobId, METRICS_SOURCE_NAME,
            Optional.ofNullable(System.getenv("CONTAINER_ID")), config);

        // Checkpoint and changelog streams are created here if they do not exist yet.
        JobModel jobModel = jobModelManager.jobModel();
        new MetadataResourceUtil(jobModel, this.metrics, config).createResources();

        // Every configured state backend gets its resources created first and validated second.
        StorageConfig storageConfig = new StorageConfig(config);
        for (String backupFactoryName : storageConfig.getBackupFactories()) {
            StateBackendFactory backendFactory = ReflectionUtil.getObj(backupFactoryName, StateBackendFactory.class);
            StateBackendAdmin backendAdmin = backendFactory.getAdmin(jobModel, config);
            backendAdmin.createResources();
            backendAdmin.validateResources();
        }

        /*
         * Startpoints are fanned out if and only if
         *  1. Startpoint is enabled in configuration
         *  2. If AM HA is enabled, fanout only if startpoint enabled and job coordinator metadata changed
         */
        if (shouldFanoutStartpoint()) {
            StartpointManager fanOutManager = createStartpointManager();
            fanOutManager.start();
            try {
                fanOutManager.fanOut(JobModelUtil.getTaskToSystemStreamPartitions(jobModel));
            } finally {
                fanOutManager.stop();
            }
        }

        // Remap changelog partitions to tasks, based on the task models of the current job model.
        Map<TaskName, Integer> previousMappings = changelogStreamManager.readPartitionMapping();
        Map<TaskName, Integer> currentMappings = new HashMap<>();
        jobModel.getContainers().values().forEach(container ->
            container.getTasks().values().forEach(task ->
                currentMappings.put(task.getTaskName(), task.getChangelogPartition().getPartitionId())));
        changelogStreamManager.updatePartitionMapping(previousMappings, currentMappings);

        containerProcessManager.start();
        systemAdmins.start();
        partitionMonitor.start();
        inputStreamRegexMonitor.ifPresent(StreamRegexMonitor::start);

        // The placement request allocator thread must only be started once the process manager is running.
        LOG.info("Starting the container placement handler thread");
        containerPlacementMetadataStore.start();
        containerPlacementRequestAllocatorThread.start();

        boolean interrupted = false;
        while (true) {
            // The exit conditions are evaluated in this exact order on every pass, including the pass that
            // follows an interrupt.
            if (containerProcessManager.shouldShutdown()
                || checkAndThrowException()
                || interrupted
                || !checkcontainerPlacementRequestAllocatorThreadIsAlive()) {
                break;
            }
            try {
                Thread.sleep(jobCoordinatorSleepInterval);
            } catch (InterruptedException e) {
                interrupted = true;
                LOG.error("Interrupted in job coordinator loop", e);
                // Restore the interrupt flag for whoever inspects this thread afterwards.
                Thread.currentThread().interrupt();
            }
        }
    } catch (Throwable e) {
        LOG.error("Exception thrown in the JobCoordinator loop", e);
        throw new SamzaException(e);
    } finally {
        onShutDown();
    }
}
Also used : JmxServer(org.apache.samza.metrics.JmxServer) StorageConfig(org.apache.samza.config.StorageConfig) HashMap(java.util.HashMap) StateBackendAdmin(org.apache.samza.storage.StateBackendAdmin) SamzaException(org.apache.samza.SamzaException) JobConfig(org.apache.samza.config.JobConfig) ContainerModel(org.apache.samza.job.model.ContainerModel) StateBackendFactory(org.apache.samza.storage.StateBackendFactory) StreamRegexMonitor(org.apache.samza.coordinator.StreamRegexMonitor) TaskName(org.apache.samza.container.TaskName) JobModel(org.apache.samza.job.model.JobModel) StartpointManager(org.apache.samza.startpoint.StartpointManager) MetadataResourceUtil(org.apache.samza.coordinator.MetadataResourceUtil) TaskModel(org.apache.samza.job.model.TaskModel)

Example 2 with StartpointManager

use of org.apache.samza.startpoint.StartpointManager in project samza by apache.

In class TestClusterBasedJobCoordinator, method testVerifyStartpointManagerFanOut:

/**
 * Checks that {@code ClusterBasedJobCoordinator#run} calls {@code start()}, {@code fanOut(...)} and
 * {@code stop()} on the StartpointManager.
 *
 * <p>The mocked {@code stop()} throws, which aborts {@code run()} right after the fan-out; the test expects that
 * exception to surface as the cause of a {@link SamzaException}.
 */
@Test
public void testVerifyStartpointManagerFanOut() throws IOException {
    configMap.put(JobConfig.JOB_CONTAINER_COUNT, "1");
    configMap.put("job.jmx.enabled", "false");
    when(CoordinatorStreamUtil.readConfigFromCoordinatorStream(anyObject())).thenReturn(new MapConfig(configMap));

    Config jobConfig = new MapConfig(configMap);
    ClusterBasedJobCoordinator coordinator = spy(ClusterBasedJobCoordinatorRunner.createFromMetadataStore(jobConfig));

    ContainerProcessManager processManager = mock(ContainerProcessManager.class);
    doReturn(true).when(processManager).shouldShutdown();

    // Stop ClusterBasedJobCoordinator#run after stop() method by throwing an exception to stop the run loop.
    // ClusterBasedJobCoordinator will need to be refactored for better mock support.
    MockitoException stopSignal = new MockitoException("Stop");
    StartpointManager startpointManager = mock(StartpointManager.class);
    doThrow(stopSignal).when(startpointManager).stop();

    doReturn(processManager).when(coordinator).createContainerProcessManager();
    doReturn(startpointManager).when(coordinator).createStartpointManager();

    SamzaException thrown = null;
    try {
        coordinator.run();
    } catch (SamzaException ex) {
        thrown = ex;
    }

    if (thrown == null) {
        fail("Expected run() method to stop after StartpointManager#stop()");
    }
    assertEquals(stopSignal, thrown.getCause());
    verify(startpointManager).start();
    verify(startpointManager).fanOut(any());
    verify(startpointManager).stop();
}
Also used : JobConfig(org.apache.samza.config.JobConfig) ApplicationConfig(org.apache.samza.config.ApplicationConfig) MapConfig(org.apache.samza.config.MapConfig) Config(org.apache.samza.config.Config) MockitoException(org.mockito.exceptions.base.MockitoException) MapConfig(org.apache.samza.config.MapConfig) StartpointManager(org.apache.samza.startpoint.StartpointManager) SamzaException(org.apache.samza.SamzaException) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 3 with StartpointManager

use of org.apache.samza.startpoint.StartpointManager in project samza by apache.

In class ZkJobCoordinator, method loadMetadataResources:

/**
 * Creates the metadata resources of the job and, when a coordinator stream store is registered, publishes the
 * job configuration to it and fans out startpoints.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Create the resources managed by {@link MetadataResourceUtil} for the given job model.</li>
 *   <li>If no coordinator stream store is registered, log a warning and stop here.</li>
 *   <li>Write every config entry to the {@code SetConfig.TYPE} namespace of the store and flush it.</li>
 *   <li>If startpoints are enabled in the job config, fan them out for the tasks of the job model.</li>
 * </ol>
 *
 * @param jobModel job model whose metadata resources are created and whose task to system-stream-partition
 *                 assignment drives the startpoint fan-out
 * @throws SamzaException if an {@link IOException} is raised while loading the metadata resources
 */
@VisibleForTesting
void loadMetadataResources(JobModel jobModel) {
    try {
        MetadataResourceUtil metadataResourceUtil = createMetadataResourceUtil(jobModel, config);
        metadataResourceUtil.createResources();

        if (coordinatorStreamStore == null) {
            LOG.warn("No metadata store registered to this job coordinator. Config not written to the metadata store and no Startpoints fan out.");
            return;
        }

        // TODO: SAMZA-2273 - publish configs async
        CoordinatorStreamValueSerde jsonSerde = new CoordinatorStreamValueSerde(SetConfig.TYPE);
        NamespaceAwareCoordinatorStreamStore configStore = new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetConfig.TYPE);
        for (Map.Entry<String, String> entry : config.entrySet()) {
            configStore.put(entry.getKey(), jsonSerde.toBytes(entry.getValue()));
        }
        configStore.flush();

        if (new JobConfig(config).getStartpointEnabled()) {
            // Fan out the startpoints; the manager is stopped even when the fan-out fails.
            StartpointManager startpointManager = createStartpointManager();
            startpointManager.start();
            try {
                startpointManager.fanOut(JobModelUtil.getTaskToSystemStreamPartitions(jobModel));
            } finally {
                startpointManager.stop();
            }
        }
    } catch (IOException ex) {
        // The message is a constant, so it is passed directly instead of going through String.format.
        throw new SamzaException("IO exception while loading metadata resources.", ex);
    }
}
Also used : NamespaceAwareCoordinatorStreamStore(org.apache.samza.coordinator.metadatastore.NamespaceAwareCoordinatorStreamStore) StartpointManager(org.apache.samza.startpoint.StartpointManager) IOException(java.io.IOException) MetadataResourceUtil(org.apache.samza.coordinator.MetadataResourceUtil) CoordinatorStreamValueSerde(org.apache.samza.coordinator.stream.CoordinatorStreamValueSerde) Map(java.util.Map) HashMap(java.util.HashMap) SamzaException(org.apache.samza.SamzaException) JobConfig(org.apache.samza.config.JobConfig) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 4 with StartpointManager

use of org.apache.samza.startpoint.StartpointManager in project samza by apache.

In class ContainerLaunchUtil, method run:

/**
 * Builds a {@code SamzaContainer} for the given application and job model, runs it on the calling thread, and
 * exits the process with a non-zero code when the container (or its set-up) failed.
 *
 * <p>The coordinator stream store opened here is closed in the {@code finally} block before any process exit is
 * triggered. When no error is observed the method simply returns.
 */
@VisibleForTesting
static void run(
    ApplicationDescriptorImpl<? extends ApplicationDescriptor> appDesc,
    String jobName,
    String jobId,
    String containerId,
    Optional<String> executionEnvContainerId,
    Optional<String> samzaEpochId,
    JobModel jobModel,
    Config config,
    Optional<ExternalContext> externalContextOptional) {
    CoordinatorStreamStore coordinatorStreamStore = buildCoordinatorStreamStore(config, new MetricsRegistryMap());
    coordinatorStreamStore.init();

    /*
     * The exit code is only recorded here and acted upon in the finally block, so that every clean up step gets
     * a chance to run. A previous implementation exited early and skipped some of the clean up.
     */
    int exitCode = 0;
    try {
        TaskFactory taskFactory = TaskFactoryUtil.getTaskFactory(appDesc);
        LocalityManager localityManager = new LocalityManager(
            new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetContainerHostMapping.TYPE));

        // StartpointManager wraps the coordinatorStreamStore in the namespaces internally
        JobConfig jobConfig = new JobConfig(config);
        StartpointManager startpointManager =
            jobConfig.getStartpointEnabled() ? new StartpointManager(coordinatorStreamStore) : null;

        Map<String, MetricsReporter> reporters = loadMetricsReporters(appDesc, containerId, config);

        // Diagnostics manager (and its reporter) is optional and handed to the container below.
        Optional<DiagnosticsManager> diagnostics = DiagnosticsUtil.buildDiagnosticsManager(
            jobName, jobId, jobModel, containerId, executionEnvContainerId, samzaEpochId, config);
        MetricsRegistryMap containerMetricsRegistry = new MetricsRegistryMap();

        SamzaContainer container = SamzaContainer$.MODULE$.apply(
            containerId,
            jobModel,
            ScalaJavaUtil.toScalaMap(reporters),
            containerMetricsRegistry,
            taskFactory,
            JobContextImpl.fromConfigWithDefaults(config, jobModel),
            Option.apply(appDesc.getApplicationContainerContextFactory().orElse(null)),
            Option.apply(appDesc.getApplicationTaskContextFactory().orElse(null)),
            Option.apply(externalContextOptional.orElse(null)),
            localityManager,
            startpointManager,
            Option.apply(diagnostics.orElse(null)));

        ProcessorLifecycleListener userLifecycleListener =
            appDesc.getProcessorLifecycleListenerFactory().createInstance(new ProcessorContext() {
            }, config);
        ClusterBasedProcessorLifecycleListener clusterListener =
            new ClusterBasedProcessorLifecycleListener(config, userLifecycleListener, container::shutdown);
        container.setContainerListener(clusterListener);

        ContainerHeartbeatMonitor heartbeatMonitor = createContainerHeartbeatMonitor(
            container, new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetConfig.TYPE), config);
        boolean heartbeatConfigured = heartbeatMonitor != null;
        if (heartbeatConfigured) {
            heartbeatMonitor.start();
        }

        // With AM high availability on, record which execution environment container hosts this container id.
        if (jobConfig.getApplicationMasterHighAvailabilityEnabled() && executionEnvContainerId.isPresent()) {
            ExecutionContainerIdManager executionContainerIdManager = new ExecutionContainerIdManager(
                new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetExecutionEnvContainerIdMapping.TYPE));
            executionContainerIdManager.writeExecutionEnvironmentContainerIdMapping(containerId, executionEnvContainerId.get());
        }

        container.run();

        if (heartbeatConfigured) {
            heartbeatMonitor.stop();
        }

        // Fall back to the exception reported by the listener when none has been recorded yet.
        if (containerRunnerException == null) {
            containerRunnerException = clusterListener.getContainerException();
        }
        if (containerRunnerException != null) {
            log.error("Container stopped with Exception. Exiting process now.", containerRunnerException);
            exitCode = 1;
        }
    } catch (Throwable e) {
        /*
         * Two log statements are used on purpose so that both stack traces end up in the logs in full. A single
         * statement with a custom format would need explicit stack trace extraction and null checks, which is
         * harder to read than this.
         */
        log.error("Exiting the process due to", e);
        log.error("Container runner exception: ", containerRunnerException);
        exitCode = 1;
    } finally {
        coordinatorStreamStore.close();
        /*
         * Exit only for a non-zero exit code, to stay in line with the existing behavior where the method
         * completes normally when no errors are encountered.
         */
        if (exitCode != 0) {
            exitProcess(exitCode);
        }
    }
}
Also used : DiagnosticsManager(org.apache.samza.diagnostics.DiagnosticsManager) ContainerHeartbeatMonitor(org.apache.samza.container.ContainerHeartbeatMonitor) JobConfig(org.apache.samza.config.JobConfig) SamzaContainer(org.apache.samza.container.SamzaContainer) NamespaceAwareCoordinatorStreamStore(org.apache.samza.coordinator.metadatastore.NamespaceAwareCoordinatorStreamStore) ExecutionContainerIdManager(org.apache.samza.container.ExecutionContainerIdManager) CoordinatorStreamStore(org.apache.samza.coordinator.metadatastore.CoordinatorStreamStore) NamespaceAwareCoordinatorStreamStore(org.apache.samza.coordinator.metadatastore.NamespaceAwareCoordinatorStreamStore) MetricsReporter(org.apache.samza.metrics.MetricsReporter) TaskFactory(org.apache.samza.task.TaskFactory) StartpointManager(org.apache.samza.startpoint.StartpointManager) MetricsRegistryMap(org.apache.samza.metrics.MetricsRegistryMap) LocalityManager(org.apache.samza.container.LocalityManager) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 5 with StartpointManager

use of org.apache.samza.startpoint.StartpointManager in project samza by apache.

In class StaticResourceJobCoordinator, method start:

/**
 * Starts the job coordinator: computes a new job model, compares the resulting coordinator metadata with what
 * {@code checkForMetadataChanges} reports, and then either requests a job restart or prepares and starts the
 * components needed for worker execution.
 *
 * @throws SamzaException wrapping any {@link Exception} raised after the system admins and the optional
 *                        startpoint manager have been started
 */
@Override
public void start() {
    LOG.info("Starting job coordinator");
    this.systemAdmins.start();
    this.startpointManager.ifPresent(StartpointManager::start);
    try {
        JobModel jobModel = newJobModel();
        doSetLoggingContextConfig(jobModel.getConfig());
        // monitors should be created right after job model is calculated (see jobModelMonitors() for more details)
        JobModelMonitors monitors = jobModelMonitors(jobModel);
        Optional<DiagnosticsManager> diagnostics = diagnosticsManager(jobModel);
        JobCoordinatorMetadata metadata =
            this.jobCoordinatorMetadataManager.generateJobCoordinatorMetadata(jobModel, jobModel.getConfig());
        Set<JobMetadataChange> metadataChanges = checkForMetadataChanges(metadata);

        /*
         * A restart is triggered when the coordinator came up for a reason other than a new deployment and the
         * metadata changed in the meantime (e.g. the coordinator died and the job model had to change while it
         * was down). Without metadata changes the current workers simply keep running. For a new deployment
         * (including a restart requested by the coordinator itself) the external resource manager is relied upon
         * to restart the previous workers, so no additional restart is needed.
         */
        boolean restartRequired =
            !metadataChanges.isEmpty() && !metadataChanges.contains(JobMetadataChange.NEW_DEPLOYMENT);
        if (restartRequired) {
            LOG.info("Triggering job restart");
            this.jobRestartSignal.restartJob();
            return;
        }

        prepareWorkerExecution(jobModel, metadata, metadataChanges);

        // Keep hold of the job-model dependent components so their lifecycle can be managed later on.
        this.currentDiagnosticsManager = diagnostics;
        this.currentJobModelMonitors = Optional.of(monitors);
        this.currentJobModel = Optional.of(jobModel);

        // lifecycle: start components
        this.coordinatorCommunication.start();
        this.jobCoordinatorListener.ifPresent(listener -> listener.onNewJobModel(this.processorId, jobModel));
        this.currentDiagnosticsManager.ifPresent(DiagnosticsManager::start);
        monitors.start();
        this.jobPreparationComplete.set(true);
    } catch (Exception e) {
        LOG.error("Error while running job coordinator; exiting", e);
        throw new SamzaException("Error while running job coordinator", e);
    }
}
Also used : DiagnosticsManager(org.apache.samza.diagnostics.DiagnosticsManager) JobCoordinatorMetadata(org.apache.samza.job.JobCoordinatorMetadata) JobModelMonitors(org.apache.samza.coordinator.JobModelMonitors) JobMetadataChange(org.apache.samza.job.JobMetadataChange) JobModel(org.apache.samza.job.model.JobModel) StartpointManager(org.apache.samza.startpoint.StartpointManager) SamzaException(org.apache.samza.SamzaException) ConfigException(org.apache.samza.config.ConfigException) IOException(java.io.IOException) SamzaException(org.apache.samza.SamzaException)

Aggregations

StartpointManager (org.apache.samza.startpoint.StartpointManager)13 SamzaException (org.apache.samza.SamzaException)8 Test (org.junit.Test)7 JobConfig (org.apache.samza.config.JobConfig)6 CoordinatorStreamStore (org.apache.samza.coordinator.metadatastore.CoordinatorStreamStore)5 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)4 CountDownLatch (java.util.concurrent.CountDownLatch)4 ApplicationRunner (org.apache.samza.runtime.ApplicationRunner)4 IncomingMessageEnvelope (org.apache.samza.system.IncomingMessageEnvelope)4 TaskCallback (org.apache.samza.task.TaskCallback)4 TestTaskApplication (org.apache.samza.test.processor.TestTaskApplication)4 TestKafkaEvent (org.apache.samza.test.util.TestKafkaEvent)4 ExpectedException (org.junit.rules.ExpectedException)4 VisibleForTesting (com.google.common.annotations.VisibleForTesting)3 MetadataResourceUtil (org.apache.samza.coordinator.MetadataResourceUtil)3 NamespaceAwareCoordinatorStreamStore (org.apache.samza.coordinator.metadatastore.NamespaceAwareCoordinatorStreamStore)3 DiagnosticsManager (org.apache.samza.diagnostics.DiagnosticsManager)3 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 RecordMetadata (org.apache.kafka.clients.producer.RecordMetadata)2