use of org.apache.samza.config.ClusterManagerConfig in project samza by apache.
the class JobModelCalculator method calculateJobModel.
/**
 * Does the following:
 * 1. Fetches the metadata of the input streams defined in the configuration through {@code streamMetadataCache}.
 * 2. Applies the SSP grouper and the task name grouper defined in the configuration to build the {@link JobModel}.
 * @param originalConfig the configuration of the job.
 * @param changeLogPartitionMapping the task to changelog partition mapping of the job.
 * @param streamMetadataCache the cache that holds the partition metadata of the input streams.
 * @param grouperMetadata provides the historical metadata of the application.
 * @return the built {@link JobModel}.
 */
public JobModel calculateJobModel(Config originalConfig, Map<TaskName, Integer> changeLogPartitionMapping,
    StreamMetadataCache streamMetadataCache, GrouperMetadata grouperMetadata) {
  // Refresh the config if the regex topic rewriter is enabled
  Config refreshedConfig = refreshConfigByRegexTopicRewriter(originalConfig);
  TaskConfig taskConfig = new TaskConfig(refreshedConfig);
  // Do the grouping to fetch the TaskName to SSP mapping
  Set<SystemStreamPartition> allSystemStreamPartitions = getMatchedInputStreamPartitions(refreshedConfig, streamMetadataCache);
  // The processor list is required by some of the groupers, so pass it as part of the config.
  // Copy the config and add the processor list to the copy.
  // TODO: It is not ideal to use the config as a medium to transmit the locality information, especially if the
  // locality information evolves. Evaluate options for using context objects to pass dependent components.
  Map<String, String> configMap = new HashMap<>(refreshedConfig);
  configMap.put(JobConfig.PROCESSOR_LIST, String.join(",", grouperMetadata.getProcessorLocality().keySet()));
  SystemStreamPartitionGrouper grouper = getSystemStreamPartitionGrouper(new MapConfig(configMap));
  JobConfig jobConfig = new JobConfig(refreshedConfig);
  Map<TaskName, Set<SystemStreamPartition>> groups;
  if (jobConfig.isSSPGrouperProxyEnabled()) {
    SSPGrouperProxy sspGrouperProxy = new SSPGrouperProxy(refreshedConfig, grouper);
    groups = sspGrouperProxy.group(allSystemStreamPartitions, grouperMetadata);
  } else {
    LOG.warn(String.format("SSPGrouperProxy is disabled (%s = false). Stateful jobs may produce erroneous results if this is not enabled.",
        JobConfig.SSP_INPUT_EXPANSION_ENABLED));
    groups = grouper.group(allSystemStreamPartitions);
  }
  LOG.info(String.format("SystemStreamPartitionGrouper %s has grouped the SystemStreamPartitions into %d tasks with the following taskNames: %s",
      grouper, groups.size(), groups));
  // If no mappings are present (i.e., the first time the job runs) we return -1, which allows 0 to be the first
  // changelog partition assigned.
  int maxChangelogPartitionId = changeLogPartitionMapping.values().stream().max(Comparator.naturalOrder()).orElse(-1);
  // Sort the groups prior to assigning the changelog mapping so that the mapping is reproducible and intuitive
  TreeMap<TaskName, Set<SystemStreamPartition>> sortedGroups = new TreeMap<>(groups);
  Set<TaskModel> taskModels = new HashSet<>();
  for (Map.Entry<TaskName, Set<SystemStreamPartition>> group : sortedGroups.entrySet()) {
    TaskName taskName = group.getKey();
    Set<SystemStreamPartition> systemStreamPartitions = group.getValue();
    Optional<Integer> changelogPartitionId = Optional.ofNullable(changeLogPartitionMapping.get(taskName));
    Partition changelogPartition;
    if (changelogPartitionId.isPresent()) {
      changelogPartition = new Partition(changelogPartitionId.get());
    } else {
      // If we've never seen this TaskName before, assign it a new changelog partition.
      maxChangelogPartitionId++;
      LOG.info(String.format("New task %s is being assigned changelog partition %s.", taskName, maxChangelogPartitionId));
      changelogPartition = new Partition(maxChangelogPartitionId);
    }
    taskModels.add(new TaskModel(taskName, systemStreamPartitions, changelogPartition));
  }
  // Here is where we should put in a pluggable option for the SSPTaskNameGrouper for locality, load-balancing, etc.
  TaskNameGrouperFactory containerGrouperFactory =
      ReflectionUtil.getObj(taskConfig.getTaskNameGrouperFactory(), TaskNameGrouperFactory.class);
  boolean standbyTasksEnabled = jobConfig.getStandbyTasksEnabled();
  int standbyTaskReplicationFactor = jobConfig.getStandbyTaskReplicationFactor();
  TaskNameGrouperProxy taskNameGrouperProxy =
      new TaskNameGrouperProxy(containerGrouperFactory.build(refreshedConfig), standbyTasksEnabled, standbyTaskReplicationFactor);
  Set<ContainerModel> containerModels;
  boolean isHostAffinityEnabled = new ClusterManagerConfig(refreshedConfig).getHostAffinityEnabled();
  if (isHostAffinityEnabled) {
    containerModels = taskNameGrouperProxy.group(taskModels, grouperMetadata);
  } else {
    containerModels = taskNameGrouperProxy.group(taskModels, new ArrayList<>(grouperMetadata.getProcessorLocality().keySet()));
  }
  Map<String, ContainerModel> containerMap =
      containerModels.stream().collect(Collectors.toMap(ContainerModel::getId, Function.identity()));
  return new JobModel(refreshedConfig, containerMap);
}
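For orientation, here is a minimal caller sketch. It assumes an accessible JobModelCalculator instance plus pre-built streamMetadataCache and grouperMetadata objects, and an empty changelog mapping as on a first deploy; all names outside the method above are illustrative.
// Hypothetical usage sketch, not Samza source: the inputs calculateJobModel expects and what it returns.
Config config = new MapConfig(ImmutableMap.of("job.name", "my-job", "job.host-affinity.enabled", "true"));
Map<TaskName, Integer> changelogMapping = new HashMap<>(); // empty on a first deploy, so partition ids start at 0
JobModel jobModel = jobModelCalculator.calculateJobModel(config, changelogMapping, streamMetadataCache, grouperMetadata);
jobModel.getContainers().forEach((id, containerModel) ->
    LOG.info("Container {} runs tasks {}", id, containerModel.getTasks().keySet()));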
use of org.apache.samza.config.ClusterManagerConfig in project samza by apache.
the class ExecutionPlanner method validateConfig.
private void validateConfig() {
  ApplicationConfig appConfig = new ApplicationConfig(config);
  ClusterManagerConfig clusterConfig = new ClusterManagerConfig(config);
  // Currently we don't support host affinity in batch mode
  if (appConfig.getAppMode() == ApplicationConfig.ApplicationMode.BATCH && clusterConfig.getHostAffinityEnabled()) {
    throw new SamzaException(String.format("Host affinity is not supported in batch mode. Please configure %s=false.",
        ClusterManagerConfig.JOB_HOST_AFFINITY_ENABLED));
  }
}
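As an illustration, the following sketch shows the config combination this check rejects. The key literals are assumed to correspond to ApplicationConfig.APP_MODE and ClusterManagerConfig.JOB_HOST_AFFINITY_ENABLED; the surrounding setup is hypothetical.
// Hypothetical sketch: batch mode plus host affinity is the combination validateConfig rejects.
Config badConfig = new MapConfig(ImmutableMap.of(
    "app.mode", "BATCH",                    // ApplicationConfig.APP_MODE
    "job.host-affinity.enabled", "true"));  // ClusterManagerConfig.JOB_HOST_AFFINITY_ENABLED
// Planning a job with badConfig throws:
// SamzaException: Host affinity is not supported in batch mode. Please configure job.host-affinity.enabled=false.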
use of org.apache.samza.config.ClusterManagerConfig in project samza by apache.
the class TestContainerProcessManager method testNewContainerRequestedOnFailureWithKnownCode.
/**
 * Tests that the AM requests a new container when a container fails.
 * Covers exit codes with the same retry behavior: application error, preemption, and abort.
 */
@Test
public void testNewContainerRequestedOnFailureWithKnownCode() throws Exception {
  Config conf = getConfig();
  Map<String, String> config = new HashMap<>();
  config.putAll(getConfig());
  SamzaApplicationState state = new SamzaApplicationState(getJobModelManager(1));
  MockClusterResourceManagerCallback callback = new MockClusterResourceManagerCallback();
  MockClusterResourceManager clusterResourceManager = new MockClusterResourceManager(callback, state);
  ClusterManagerConfig clusterManagerConfig = spy(new ClusterManagerConfig(new MapConfig(config)));
  ContainerManager containerManager = buildContainerManager(containerPlacementMetadataStore, state, clusterResourceManager,
      clusterManagerConfig.getHostAffinityEnabled(), false);
  MockContainerAllocatorWithoutHostAffinity allocator =
      new MockContainerAllocatorWithoutHostAffinity(clusterResourceManager, conf, state, containerManager);
  ContainerProcessManager cpm =
      spy(buildContainerProcessManager(clusterManagerConfig, state, clusterResourceManager, Optional.of(allocator)));
  // Start the container process manager
  cpm.start();
  assertFalse(cpm.shouldShutdown());
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  SamzaResource container1 = new SamzaResource(1, 1000, "host1", "id1");
  cpm.onResourceAllocated(container1);
  // Allow the container to run and update state
  if (!allocator.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  assertEquals(0, allocator.getContainerRequestState().numPendingRequests());
  cpm.onStreamProcessorLaunchSuccess(container1);
  // Create a container failure with an unrecognized exit code (1, "App error")
  SamzaResourceStatus resourceStatusOnAppError = new SamzaResourceStatus(container1.getContainerId(), "App error", 1);
  cpm.onResourceCompleted(resourceStatusOnAppError);
  verify(cpm).onResourceCompletedWithUnknownStatus(eq(resourceStatusOnAppError), anyString(), anyString(), anyInt());
  // The above failure should trigger a container request
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  assertFalse(cpm.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(2, clusterResourceManager.resourceRequests.size());
  assertEquals(0, clusterResourceManager.releasedResources.size());
  assertEquals(ResourceRequestState.ANY_HOST, allocator.getContainerRequestState().peekPendingRequest().getPreferredHost());
  SamzaResource container2 = new SamzaResource(1, 1000, "host1", "id2");
  cpm.onResourceAllocated(container2);
  // Allow the container to run and update state
  if (!allocator.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  cpm.onStreamProcessorLaunchSuccess(container2);
  // Create a container failure with SamzaResourceStatus.PREEMPTED
  SamzaResourceStatus resourceStatusOnPreemption =
      new SamzaResourceStatus(container2.getContainerId(), "Preemption", SamzaResourceStatus.PREEMPTED);
  cpm.onResourceCompleted(resourceStatusOnPreemption);
  verify(cpm, never()).onResourceCompletedWithUnknownStatus(eq(resourceStatusOnPreemption), anyString(), anyString(), anyInt());
  assertEquals(3, clusterResourceManager.resourceRequests.size());
  // The above failure should trigger a container request
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  assertFalse(cpm.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(ResourceRequestState.ANY_HOST, allocator.getContainerRequestState().peekPendingRequest().getPreferredHost());
  SamzaResource container3 = new SamzaResource(1, 1000, "host1", "id3");
  cpm.onResourceAllocated(container3);
  // Allow the container to run and update state
  if (!allocator.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  cpm.onStreamProcessorLaunchSuccess(container3);
  // Create a container failure with SamzaResourceStatus.ABORTED
  SamzaResourceStatus resourceStatusOnAborted =
      new SamzaResourceStatus(container3.getContainerId(), "Aborted", SamzaResourceStatus.ABORTED);
  cpm.onResourceCompleted(resourceStatusOnAborted);
  verify(cpm, never()).onResourceCompletedWithUnknownStatus(eq(resourceStatusOnAborted), anyString(), anyString(), anyInt());
  // The above failure should trigger a container request
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  assertEquals(4, clusterResourceManager.resourceRequests.size());
  assertEquals(0, clusterResourceManager.releasedResources.size());
  assertFalse(cpm.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(ResourceRequestState.ANY_HOST, allocator.getContainerRequestState().peekPendingRequest().getPreferredHost());
  cpm.stop();
}
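For reference, a schematic of the exit-code dispatch this test exercises. The constants are from SamzaResourceStatus; the handler shape and the requestReplacement helper are hypothetical, not the actual ContainerProcessManager source.
// Schematic sketch: known exit codes are handled directly, anything else takes the
// onResourceCompletedWithUnknownStatus path verified above.
void handleResourceCompleted(SamzaResourceStatus status, String containerId, String processorId) {
  switch (status.getExitCode()) {
    case SamzaResourceStatus.SUCCESS:    // clean exit; no replacement needed
      break;
    case SamzaResourceStatus.DISK_FAIL:  // known failure codes: request a replacement container
    case SamzaResourceStatus.PREEMPTED:
    case SamzaResourceStatus.ABORTED:
      requestReplacement(status);        // hypothetical helper
      break;
    default:                             // e.g. exit code 1 ("App error") in the first step above
      onResourceCompletedWithUnknownStatus(status, containerId, processorId, status.getExitCode());
  }
}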
use of org.apache.samza.config.ClusterManagerConfig in project samza by apache.
the class TestContainerProcessManager method testAllBufferedResourcesAreUtilized.
@Test
public void testAllBufferedResourcesAreUtilized() throws Exception {
  Map<String, String> config = new HashMap<>();
  config.putAll(getConfigWithHostAffinity());
  config.put("job.container.count", "2");
  config.put("cluster-manager.container.retry.count", "2");
  config.put("cluster-manager.container.request.timeout.ms", "10000");
  Config cfg = new MapConfig(config);
  // 1. Request two containers on hosts - host1 and host2
  SamzaApplicationState state = new SamzaApplicationState(getJobModelManager(2));
  MockClusterResourceManagerCallback callback = new MockClusterResourceManagerCallback();
  MockClusterResourceManager clusterResourceManager = new MockClusterResourceManager(callback, state);
  FaultDomainManager faultDomainManager = mock(FaultDomainManager.class);
  LocalityManager mockLocalityManager = mock(LocalityManager.class);
  when(mockLocalityManager.readLocality())
      .thenReturn(new LocalityModel(ImmutableMap.of("0", new ProcessorLocality("0", "host1"), "1", new ProcessorLocality("1", "host2"))));
  ContainerManager containerManager = buildContainerManager(containerPlacementMetadataStore, state, clusterResourceManager,
      Boolean.parseBoolean(config.get(ClusterManagerConfig.HOST_AFFINITY_ENABLED)), false, mockLocalityManager, faultDomainManager);
  MockContainerAllocatorWithHostAffinity allocator =
      new MockContainerAllocatorWithHostAffinity(clusterResourceManager, cfg, state, containerManager);
  ContainerProcessManager cpm = spy(buildContainerProcessManager(new ClusterManagerConfig(cfg), state, clusterResourceManager,
      Optional.of(allocator), mockLocalityManager, false, faultDomainManager));
  cpm.start();
  assertFalse(cpm.shouldShutdown());
  // 2. When the task manager starts, there should be a pending request on host1 and host2
  assertEquals(2, allocator.getContainerRequestState().numPendingRequests());
  // 3. Allocate an extra resource on host1 and no resource on host2 yet.
  SamzaResource resource1 = new SamzaResource(1, 1000, "host1", "id1");
  SamzaResource resource2 = new SamzaResource(1, 1000, "host1", "id2");
  cpm.onResourceAllocated(resource1);
  cpm.onResourceAllocated(resource2);
  // 4. Wait for the container to start on host1, then fail it immediately
  if (!allocator.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  cpm.onStreamProcessorLaunchSuccess(resource1);
  assertEquals("host2", allocator.getContainerRequestState().peekPendingRequest().getPreferredHost());
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  cpm.onResourceCompleted(new SamzaResourceStatus(resource1.getContainerId(), "App Error", 1));
  verify(cpm).onResourceCompletedWithUnknownStatus(any(SamzaResourceStatus.class), anyString(), anyString(), anyInt());
  assertEquals(2, allocator.getContainerRequestState().numPendingRequests());
  assertFalse(cpm.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(3, clusterResourceManager.resourceRequests.size());
  assertEquals(0, clusterResourceManager.releasedResources.size());
  // 5. Do not allocate any further resources on host1, and verify that the re-run of the container on host1 uses the
  // previously allocated extra resource
  SamzaResource resource3 = new SamzaResource(1, 1000, "host2", "id3");
  cpm.onResourceAllocated(resource3);
  if (!allocator.awaitContainersStart(2, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  cpm.onStreamProcessorLaunchSuccess(resource2);
  cpm.onStreamProcessorLaunchSuccess(resource3);
  assertTrue(state.jobHealthy.get());
  cpm.stop();
}
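The config helpers used in these tests are not shown on this page; a plausible sketch of getConfigWithHostAffinity() follows, where only the host-affinity flag is load-bearing for this test and the other entries are illustrative.
// Hypothetical sketch of the helper referenced above.
private static Map<String, String> getConfigWithHostAffinity() {
  Map<String, String> map = new HashMap<>();
  map.put(ClusterManagerConfig.HOST_AFFINITY_ENABLED, "true"); // read back via Boolean.parseBoolean above
  map.put("job.name", "test-job");
  return map;
}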
use of org.apache.samza.config.ClusterManagerConfig in project samza by apache.
the class TestContainerProcessManager method testDuplicateNotificationsDoNotAffectJobHealth.
@Test
public void testDuplicateNotificationsDoNotAffectJobHealth() throws Exception {
  Config conf = getConfig();
  Map<String, String> config = new HashMap<>();
  config.putAll(getConfig());
  SamzaApplicationState state = new SamzaApplicationState(getJobModelManager(1));
  MockClusterResourceManagerCallback callback = new MockClusterResourceManagerCallback();
  MockClusterResourceManager clusterResourceManager = new MockClusterResourceManager(callback, state);
  ClusterManagerConfig clusterManagerConfig = spy(new ClusterManagerConfig(new MapConfig(conf)));
  ContainerManager containerManager = buildContainerManager(containerPlacementMetadataStore, state, clusterResourceManager,
      clusterManagerConfig.getHostAffinityEnabled(), false);
  MockContainerAllocatorWithoutHostAffinity allocator =
      new MockContainerAllocatorWithoutHostAffinity(clusterResourceManager, conf, state, containerManager);
  ContainerProcessManager cpm =
      spy(buildContainerProcessManager(clusterManagerConfig, state, clusterResourceManager, Optional.of(allocator)));
  // Start the container process manager
  cpm.start();
  assertFalse(cpm.shouldShutdown());
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  SamzaResource container1 = new SamzaResource(1, 1000, "host1", "id1");
  cpm.onResourceAllocated(container1);
  // Allow the container to run and update state
  if (!allocator.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  cpm.onStreamProcessorLaunchSuccess(container1);
  assertEquals(0, allocator.getContainerRequestState().numPendingRequests());
  // Create a container failure with SamzaResourceStatus.DISK_FAIL
  cpm.onResourceCompleted(new SamzaResourceStatus(container1.getContainerId(), "Disk failure", SamzaResourceStatus.DISK_FAIL));
  verify(cpm, never()).onResourceCompletedWithUnknownStatus(any(SamzaResourceStatus.class), anyString(), anyString(), anyInt());
  // The above failure should trigger a container request
  assertEquals(1, allocator.getContainerRequestState().numPendingRequests());
  assertFalse(cpm.shouldShutdown());
  assertFalse(state.jobHealthy.get());
  assertEquals(2, clusterResourceManager.resourceRequests.size());
  assertEquals(0, clusterResourceManager.releasedResources.size());
  assertEquals(ResourceRequestState.ANY_HOST, allocator.getContainerRequestState().peekPendingRequest().getPreferredHost());
  SamzaResource container2 = new SamzaResource(1, 1000, "host1", "id2");
  cpm.onResourceAllocated(container2);
  // Allow the container to run and update state
  if (!allocator.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  cpm.onStreamProcessorLaunchSuccess(container2);
  assertTrue(state.jobHealthy.get());
  // Simulate a duplicate notification for container 1 with a different exit code
  cpm.onResourceCompleted(new SamzaResourceStatus(container1.getContainerId(), "Disk failure", SamzaResourceStatus.PREEMPTED));
  verify(cpm, never()).onResourceCompletedWithUnknownStatus(any(SamzaResourceStatus.class), anyString(), anyString(), anyInt());
  // Assert that a duplicate notification does not change metrics (including job health)
  assertEquals(1, state.redundantNotifications.get());
  assertEquals(2, clusterResourceManager.resourceRequests.size());
  assertEquals(0, clusterResourceManager.releasedResources.size());
  assertTrue(state.jobHealthy.get());
  cpm.stop();
}
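A schematic of the de-duplication behavior this test relies on, assuming redundantNotifications is an atomic counter on SamzaApplicationState; the seen-set shape and the handleCompletion helper are illustrative, not the actual ContainerProcessManager source.
// Schematic sketch: completed container ids are remembered, and a repeat notification
// only bumps the redundant-notification counter asserted above.
private final Set<String> completedContainerIds = new HashSet<>();

void onResourceCompletedOnce(SamzaResourceStatus status, SamzaApplicationState state) {
  if (!completedContainerIds.add(status.getContainerId())) {
    state.redundantNotifications.incrementAndGet(); // duplicate: count it, do not re-handle
    return;
  }
  handleCompletion(status); // hypothetical helper for the normal completion path
}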