use of org.apache.samza.startpoint.StartpointManager in project samza by apache.
the class ClusterBasedJobCoordinator method run.
/**
* Starts the JobCoordinator.
*/
public void run() {
if (!isStarted.compareAndSet(false, true)) {
LOG.warn("Attempting to start an already started job coordinator. ");
return;
}
// set up JmxServer (if jmx is enabled)
if (isJmxEnabled) {
jmxServer = new JmxServer();
state.jmxUrl = jmxServer.getJmxUrl();
state.jmxTunnelingUrl = jmxServer.getTunnelingJmxUrl();
} else {
jmxServer = null;
}
try {
// initialize JobCoordinator state
LOG.info("Starting cluster based job coordinator");
// write the diagnostics metadata file
String jobName = new JobConfig(config).getName().get();
String jobId = new JobConfig(config).getJobId();
Optional<String> execEnvContainerId = Optional.ofNullable(System.getenv("CONTAINER_ID"));
DiagnosticsUtil.writeMetadataFile(jobName, jobId, METRICS_SOURCE_NAME, execEnvContainerId, config);
// create necessary checkpoint and changelog streams, if not created
JobModel jobModel = jobModelManager.jobModel();
MetadataResourceUtil metadataResourceUtil = new MetadataResourceUtil(jobModel, this.metrics, config);
metadataResourceUtil.createResources();
// create all the resources required for state backend factories
StorageConfig storageConfig = new StorageConfig(config);
storageConfig.getBackupFactories().forEach(stateStorageBackendBackupFactory -> {
StateBackendFactory stateBackendFactory = ReflectionUtil.getObj(stateStorageBackendBackupFactory, StateBackendFactory.class);
StateBackendAdmin stateBackendAdmin = stateBackendFactory.getAdmin(jobModel, config);
// Create resources required for state backend admin
stateBackendAdmin.createResources();
// Validate resources required for state backend admin
stateBackendAdmin.validateResources();
});
/*
* We fanout startpoint if and only if
* 1. Startpoint is enabled in configuration
* 2. If AM HA is enabled, fanout only if startpoint enabled and job coordinator metadata changed
*/
if (shouldFanoutStartpoint()) {
StartpointManager startpointManager = createStartpointManager();
startpointManager.start();
try {
startpointManager.fanOut(JobModelUtil.getTaskToSystemStreamPartitions(jobModel));
} finally {
startpointManager.stop();
}
}
// Remap changelog partitions to tasks
Map<TaskName, Integer> prevPartitionMappings = changelogStreamManager.readPartitionMapping();
Map<TaskName, Integer> taskPartitionMappings = new HashMap<>();
Map<String, ContainerModel> containers = jobModel.getContainers();
for (ContainerModel containerModel : containers.values()) {
for (TaskModel taskModel : containerModel.getTasks().values()) {
taskPartitionMappings.put(taskModel.getTaskName(), taskModel.getChangelogPartition().getPartitionId());
}
}
changelogStreamManager.updatePartitionMapping(prevPartitionMappings, taskPartitionMappings);
containerProcessManager.start();
systemAdmins.start();
partitionMonitor.start();
inputStreamRegexMonitor.ifPresent(StreamRegexMonitor::start);
// containerPlacementRequestAllocator thread has to start after the cpm is started
LOG.info("Starting the container placement handler thread");
containerPlacementMetadataStore.start();
containerPlacementRequestAllocatorThread.start();
boolean isInterrupted = false;
while (!containerProcessManager.shouldShutdown() && !checkAndThrowException() && !isInterrupted && checkcontainerPlacementRequestAllocatorThreadIsAlive()) {
try {
Thread.sleep(jobCoordinatorSleepInterval);
} catch (InterruptedException e) {
isInterrupted = true;
LOG.error("Interrupted in job coordinator loop", e);
Thread.currentThread().interrupt();
}
}
} catch (Throwable e) {
LOG.error("Exception thrown in the JobCoordinator loop", e);
throw new SamzaException(e);
} finally {
onShutDown();
}
}
use of org.apache.samza.startpoint.StartpointManager in project samza by apache.
the class TestClusterBasedJobCoordinator method testVerifyStartpointManagerFanOut.
@Test
public void testVerifyStartpointManagerFanOut() throws IOException {
configMap.put(JobConfig.JOB_CONTAINER_COUNT, "1");
configMap.put("job.jmx.enabled", "false");
when(CoordinatorStreamUtil.readConfigFromCoordinatorStream(anyObject())).thenReturn(new MapConfig(configMap));
Config config = new MapConfig(configMap);
MockitoException stopException = new MockitoException("Stop");
ClusterBasedJobCoordinator clusterCoordinator = spy(ClusterBasedJobCoordinatorRunner.createFromMetadataStore(config));
ContainerProcessManager mockContainerProcessManager = mock(ContainerProcessManager.class);
doReturn(true).when(mockContainerProcessManager).shouldShutdown();
StartpointManager mockStartpointManager = mock(StartpointManager.class);
// Stop ClusterBasedJobCoordinator#run after stop() method by throwing an exception to stop the run loop.
// ClusterBasedJobCoordinator will need to be refactored for better mock support.
doThrow(stopException).when(mockStartpointManager).stop();
doReturn(mockContainerProcessManager).when(clusterCoordinator).createContainerProcessManager();
doReturn(mockStartpointManager).when(clusterCoordinator).createStartpointManager();
try {
clusterCoordinator.run();
} catch (SamzaException ex) {
assertEquals(stopException, ex.getCause());
verify(mockStartpointManager).start();
verify(mockStartpointManager).fanOut(any());
verify(mockStartpointManager).stop();
return;
}
fail("Expected run() method to stop after StartpointManager#stop()");
}
use of org.apache.samza.startpoint.StartpointManager in project samza by apache.
the class ZkJobCoordinator method loadMetadataResources.
/**
* Stores the configuration of the job in the coordinator stream.
*/
@VisibleForTesting
void loadMetadataResources(JobModel jobModel) {
try {
MetadataResourceUtil metadataResourceUtil = createMetadataResourceUtil(jobModel, config);
metadataResourceUtil.createResources();
if (coordinatorStreamStore != null) {
// TODO: SAMZA-2273 - publish configs async
CoordinatorStreamValueSerde jsonSerde = new CoordinatorStreamValueSerde(SetConfig.TYPE);
NamespaceAwareCoordinatorStreamStore configStore = new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetConfig.TYPE);
for (Map.Entry<String, String> entry : config.entrySet()) {
byte[] serializedValue = jsonSerde.toBytes(entry.getValue());
configStore.put(entry.getKey(), serializedValue);
}
configStore.flush();
if (new JobConfig(config).getStartpointEnabled()) {
// fan out the startpoints
StartpointManager startpointManager = createStartpointManager();
startpointManager.start();
try {
startpointManager.fanOut(JobModelUtil.getTaskToSystemStreamPartitions(jobModel));
} finally {
startpointManager.stop();
}
}
} else {
LOG.warn("No metadata store registered to this job coordinator. Config not written to the metadata store and no Startpoints fan out.");
}
} catch (IOException ex) {
throw new SamzaException(String.format("IO exception while loading metadata resources."), ex);
}
}
use of org.apache.samza.startpoint.StartpointManager in project samza by apache.
the class ContainerLaunchUtil method run.
@VisibleForTesting
static void run(ApplicationDescriptorImpl<? extends ApplicationDescriptor> appDesc, String jobName, String jobId, String containerId, Optional<String> executionEnvContainerId, Optional<String> samzaEpochId, JobModel jobModel, Config config, Optional<ExternalContext> externalContextOptional) {
CoordinatorStreamStore coordinatorStreamStore = buildCoordinatorStreamStore(config, new MetricsRegistryMap());
coordinatorStreamStore.init();
/*
* We track the exit code and only trigger exit in the finally block to make sure we are able to execute all the
* clean up steps. Prior implementation had short circuited exit causing some of the clean up steps to be missed.
*/
int exitCode = 0;
try {
TaskFactory taskFactory = TaskFactoryUtil.getTaskFactory(appDesc);
LocalityManager localityManager = new LocalityManager(new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetContainerHostMapping.TYPE));
// StartpointManager wraps the coordinatorStreamStore in the namespaces internally
StartpointManager startpointManager = null;
if (new JobConfig(config).getStartpointEnabled()) {
startpointManager = new StartpointManager(coordinatorStreamStore);
}
Map<String, MetricsReporter> metricsReporters = loadMetricsReporters(appDesc, containerId, config);
// Creating diagnostics manager and reporter, and wiring it respectively
Optional<DiagnosticsManager> diagnosticsManager = DiagnosticsUtil.buildDiagnosticsManager(jobName, jobId, jobModel, containerId, executionEnvContainerId, samzaEpochId, config);
MetricsRegistryMap metricsRegistryMap = new MetricsRegistryMap();
SamzaContainer container = SamzaContainer$.MODULE$.apply(containerId, jobModel, ScalaJavaUtil.toScalaMap(metricsReporters), metricsRegistryMap, taskFactory, JobContextImpl.fromConfigWithDefaults(config, jobModel), Option.apply(appDesc.getApplicationContainerContextFactory().orElse(null)), Option.apply(appDesc.getApplicationTaskContextFactory().orElse(null)), Option.apply(externalContextOptional.orElse(null)), localityManager, startpointManager, Option.apply(diagnosticsManager.orElse(null)));
ProcessorLifecycleListener processorLifecycleListener = appDesc.getProcessorLifecycleListenerFactory().createInstance(new ProcessorContext() {
}, config);
ClusterBasedProcessorLifecycleListener listener = new ClusterBasedProcessorLifecycleListener(config, processorLifecycleListener, container::shutdown);
container.setContainerListener(listener);
ContainerHeartbeatMonitor heartbeatMonitor = createContainerHeartbeatMonitor(container, new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetConfig.TYPE), config);
if (heartbeatMonitor != null) {
heartbeatMonitor.start();
}
if (new JobConfig(config).getApplicationMasterHighAvailabilityEnabled()) {
executionEnvContainerId.ifPresent(execEnvContainerId -> {
ExecutionContainerIdManager executionContainerIdManager = new ExecutionContainerIdManager(new NamespaceAwareCoordinatorStreamStore(coordinatorStreamStore, SetExecutionEnvContainerIdMapping.TYPE));
executionContainerIdManager.writeExecutionEnvironmentContainerIdMapping(containerId, execEnvContainerId);
});
}
container.run();
if (heartbeatMonitor != null) {
heartbeatMonitor.stop();
}
// overriding the value with what the listener returns
if (containerRunnerException == null) {
containerRunnerException = listener.getContainerException();
}
if (containerRunnerException != null) {
log.error("Container stopped with Exception. Exiting process now.", containerRunnerException);
exitCode = 1;
}
} catch (Throwable e) {
/*
* Two separate log statements are intended to print the entire stack trace as part of the logs. Using
* single log statement with custom format requires explicitly fetching stack trace and null checks which makes
* the code slightly hard to read in comparison with the current choice.
*/
log.error("Exiting the process due to", e);
log.error("Container runner exception: ", containerRunnerException);
exitCode = 1;
} finally {
coordinatorStreamStore.close();
/*
* Only exit in the scenario of non-zero exit code in order to maintain parity with current implementation where
* the method completes when no errors are encountered.
*/
if (exitCode != 0) {
exitProcess(exitCode);
}
}
}
use of org.apache.samza.startpoint.StartpointManager in project samza by apache.
the class StaticResourceJobCoordinator method start.
@Override
public void start() {
LOG.info("Starting job coordinator");
this.systemAdmins.start();
this.startpointManager.ifPresent(StartpointManager::start);
try {
JobModel jobModel = newJobModel();
doSetLoggingContextConfig(jobModel.getConfig());
// monitors should be created right after job model is calculated (see jobModelMonitors() for more details)
JobModelMonitors jobModelMonitors = jobModelMonitors(jobModel);
Optional<DiagnosticsManager> diagnosticsManager = diagnosticsManager(jobModel);
JobCoordinatorMetadata newMetadata = this.jobCoordinatorMetadataManager.generateJobCoordinatorMetadata(jobModel, jobModel.getConfig());
Set<JobMetadataChange> jobMetadataChanges = checkForMetadataChanges(newMetadata);
if (!jobMetadataChanges.isEmpty() && !jobMetadataChanges.contains(JobMetadataChange.NEW_DEPLOYMENT)) {
/*
* If the job coordinator comes up, but not due to a new deployment, and the metadata changed, then trigger a
* restart. This case applies if the job coordinator died and the job model needed to change while it was down.
* If there were no metadata changes, then just let the current workers continue to run.
* If there was a new deployment (which includes the case where the coordinator requested a restart), then we
* rely on the external resource manager to make sure the previous workers restarted, so we don't need to
* restart again.
*/
LOG.info("Triggering job restart");
this.jobRestartSignal.restartJob();
} else {
prepareWorkerExecution(jobModel, newMetadata, jobMetadataChanges);
// save components that depend on job model in order to manage lifecycle or access later
this.currentDiagnosticsManager = diagnosticsManager;
this.currentJobModelMonitors = Optional.of(jobModelMonitors);
this.currentJobModel = Optional.of(jobModel);
// lifecycle: start components
this.coordinatorCommunication.start();
this.jobCoordinatorListener.ifPresent(listener -> listener.onNewJobModel(this.processorId, jobModel));
this.currentDiagnosticsManager.ifPresent(DiagnosticsManager::start);
jobModelMonitors.start();
this.jobPreparationComplete.set(true);
}
} catch (Exception e) {
LOG.error("Error while running job coordinator; exiting", e);
throw new SamzaException("Error while running job coordinator", e);
}
}
Aggregations