Use of org.apache.samza.job.model.ContainerModel in the Apache Samza project.
Class JobModelCalculator, method calculateJobModel.
/**
 * Builds the {@link JobModel} of a job from its configuration and input stream metadata.
 *
 * <p>The work happens in these steps:
 * <ol>
 *   <li>The config is refreshed through the regex topic rewriter and the matching input
 *       {@link SystemStreamPartition}s are resolved using {@code streamMetadataCache}.</li>
 *   <li>The configured SSP grouper (wrapped in an {@link SSPGrouperProxy} when that is enabled)
 *       groups the partitions into tasks.</li>
 *   <li>Every task receives a changelog partition: tasks present in
 *       {@code changeLogPartitionMapping} keep their partition, unseen tasks get the next id above
 *       the current maximum, handed out in sorted task-name order.</li>
 *   <li>The configured task name grouper distributes the tasks over containers.</li>
 * </ol>
 *
 * @param originalConfig the configuration of the job.
 * @param changeLogPartitionMapping the task to changelog partition mapping of the job.
 * @param streamMetadataCache the cache that holds the partition metadata of the input streams.
 * @param grouperMetadata provides the historical metadata of the application.
 * @return the built {@link JobModel}.
 */
public JobModel calculateJobModel(Config originalConfig, Map<TaskName, Integer> changeLogPartitionMapping,
    StreamMetadataCache streamMetadataCache, GrouperMetadata grouperMetadata) {
  // Run the regex topic rewriter (when enabled) first; everything below reads the refreshed config.
  Config refreshedConfig = refreshConfigByRegexTopicRewriter(originalConfig);

  // The full set of input partitions that has to be spread across tasks.
  Set<SystemStreamPartition> inputPartitions = getMatchedInputStreamPartitions(refreshedConfig, streamMetadataCache);

  // Some groupers need the processor list, so it is handed over through a copy of the config.
  // TODO: It is non-ideal to have config as a medium to transmit the locality information; especially, if the
  // locality information evolves. Evaluate options on using context objects to pass dependent components.
  Map<String, String> grouperConfigMap = new HashMap<>(refreshedConfig);
  grouperConfigMap.put(JobConfig.PROCESSOR_LIST, String.join(",", grouperMetadata.getProcessorLocality().keySet()));
  SystemStreamPartitionGrouper grouper = getSystemStreamPartitionGrouper(new MapConfig(grouperConfigMap));

  JobConfig jobConfig = new JobConfig(refreshedConfig);
  Map<TaskName, Set<SystemStreamPartition>> groups;
  if (!jobConfig.isSSPGrouperProxyEnabled()) {
    LOG.warn(String.format("SSPGrouperProxy is disabled (%s = false). Stateful jobs may produce erroneous results if this is not enabled.", JobConfig.SSP_INPUT_EXPANSION_ENABLED));
    groups = grouper.group(inputPartitions);
  } else {
    groups = new SSPGrouperProxy(refreshedConfig, grouper).group(inputPartitions, grouperMetadata);
  }
  LOG.info(String.format("SystemStreamPartitionGrouper %s has grouped the SystemStreamPartitions into %d tasks with the following taskNames: %s", grouper, groups.size(), groups));

  // With no existing mappings (first run of the job) start from -1 so that the first assigned id is 0.
  int highestChangelogPartitionId = changeLogPartitionMapping.values().stream()
      .max(Comparator.naturalOrder())
      .orElse(-1);

  // Walk the tasks in sorted order so that newly assigned changelog partitions are reproducible.
  Map<TaskName, Set<SystemStreamPartition>> orderedGroups = new TreeMap<>(groups);
  Set<TaskModel> taskModels = new HashSet<>();
  for (Map.Entry<TaskName, Set<SystemStreamPartition>> entry : orderedGroups.entrySet()) {
    TaskName taskName = entry.getKey();
    Integer knownPartitionId = changeLogPartitionMapping.get(taskName);
    Partition changelogPartition;
    if (knownPartitionId == null) {
      // A task that was never seen before gets a brand new changelog partition.
      highestChangelogPartitionId++;
      LOG.info(String.format("New task %s is being assigned changelog partition %s.", taskName, highestChangelogPartitionId));
      changelogPartition = new Partition(highestChangelogPartitionId);
    } else {
      changelogPartition = new Partition(knownPartitionId);
    }
    taskModels.add(new TaskModel(taskName, entry.getValue(), changelogPartition));
  }

  // Here is where we should put in a pluggable option for the SSPTaskNameGrouper for locality, load-balancing, etc.
  TaskConfig taskConfig = new TaskConfig(refreshedConfig);
  TaskNameGrouperFactory containerGrouperFactory =
      ReflectionUtil.getObj(taskConfig.getTaskNameGrouperFactory(), TaskNameGrouperFactory.class);
  boolean standbyTasksEnabled = jobConfig.getStandbyTasksEnabled();
  int standbyTaskReplicationFactor = jobConfig.getStandbyTaskReplicationFactor();
  TaskNameGrouperProxy containerGrouper = new TaskNameGrouperProxy(
      containerGrouperFactory.build(refreshedConfig), standbyTasksEnabled, standbyTaskReplicationFactor);

  // With host affinity the grouper gets the full metadata; otherwise only the processor ids.
  Set<ContainerModel> containerModels = new ClusterManagerConfig(refreshedConfig).getHostAffinityEnabled()
      ? containerGrouper.group(taskModels, grouperMetadata)
      : containerGrouper.group(taskModels, new ArrayList<>(grouperMetadata.getProcessorLocality().keySet()));

  return new JobModel(refreshedConfig,
      containerModels.stream().collect(Collectors.toMap(ContainerModel::getId, Function.identity())));
}
Use of org.apache.samza.job.model.ContainerModel in the Apache Samza project.
Class TestGroupByContainerIds, method testMoreTasksThanProcessors.
/**
 * Groups the task models returned by {@code generateTaskModels(1)} using grouper metadata that
 * knows two processors, and expects exactly one container model: the one for
 * {@code testProcessorId1}, holding all of the generated tasks.
 */
@Test
public void testMoreTasksThanProcessors() {
  String testProcessorId1 = "testProcessorId1";
  String testProcessorId2 = "testProcessorId2";
  LocationId testLocationId1 = new LocationId("testLocationId1");
  LocationId testLocationId2 = new LocationId("testLocationId2");
  LocationId testLocationId3 = new LocationId("testLocationId3");
  TaskName testTaskName1 = new TaskName("testTasKId1");
  TaskName testTaskName2 = new TaskName("testTaskId2");
  TaskName testTaskName3 = new TaskName("testTaskId3");

  // Two processors are known to the metadata, while locality is recorded for three task names.
  Map<String, LocationId> processorLocality = ImmutableMap.of(testProcessorId1, testLocationId1, testProcessorId2, testLocationId2);
  Map<TaskName, LocationId> taskLocality = ImmutableMap.of(testTaskName1, testLocationId1, testTaskName2, testLocationId2, testTaskName3, testLocationId3);
  GrouperMetadataImpl grouperMetadata = new GrouperMetadataImpl(processorLocality, taskLocality, new HashMap<>(), new HashMap<>());

  Set<TaskModel> taskModels = generateTaskModels(1);

  // The expected container carries every generated task, keyed by task name.
  Map<TaskName, TaskModel> expectedTasks = taskModels.stream().collect(Collectors.toMap(TaskModel::getTaskName, x -> x));
  ContainerModel expectedContainerModel = new ContainerModel(testProcessorId1, expectedTasks);

  // The processor ids come from grouperMetadata here, so no explicit container id list is needed.
  Set<ContainerModel> actualContainerModels = buildSimpleGrouper().group(taskModels, grouperMetadata);

  assertEquals(1, actualContainerModels.size());
  assertEquals(ImmutableSet.of(expectedContainerModel), actualContainerModels);
}
Use of org.apache.samza.job.model.ContainerModel in the Apache Samza project.
Class TestGroupByContainerIds, method testGroupHappyPathWithListOfContainers.
/**
 * Groups the task models from {@code generateTaskModels(5)} over the container ids "4" and "2"
 * (in that order) and checks the resulting assignment: container "4" holds tasks 0, 2 and 4,
 * container "2" holds tasks 1 and 3.
 */
@Test
public void testGroupHappyPathWithListOfContainers() {
  Set<TaskModel> taskModels = generateTaskModels(5);

  // Plain list instead of double-brace initialization, which would create an anonymous ArrayList subclass.
  List<String> containerIds = new ArrayList<>();
  containerIds.add("4");
  containerIds.add("2");

  Set<ContainerModel> containers = buildSimpleGrouper().group(taskModels, containerIds);

  // Index the result by container id so each container can be inspected individually.
  Map<String, ContainerModel> containersMap = new HashMap<>();
  for (ContainerModel container : containers) {
    containersMap.put(container.getId(), container);
  }

  assertEquals(2, containers.size());
  ContainerModel container0 = containersMap.get("4");
  ContainerModel container1 = containersMap.get("2");
  assertNotNull(container0);
  assertNotNull(container1);
  assertEquals("4", container0.getId());
  assertEquals("2", container1.getId());
  assertEquals(3, container0.getTasks().size());
  assertEquals(2, container1.getTasks().size());

  // Tasks alternate between the two containers, starting with the first id in the list.
  assertTrue(container0.getTasks().containsKey(getTaskName(0)));
  assertTrue(container0.getTasks().containsKey(getTaskName(2)));
  assertTrue(container0.getTasks().containsKey(getTaskName(4)));
  assertTrue(container1.getTasks().containsKey(getTaskName(1)));
  assertTrue(container1.getTasks().containsKey(getTaskName(3)));
}
Use of org.apache.samza.job.model.ContainerModel in the Apache Samza project.
Class TestGroupByContainerIds, method testShouldUseTaskLocalityWhenGeneratingContainerModels.
/**
 * Sets up three processors and three tasks where task i was last seen at the location of
 * processor i, and expects the grouper to place each task in the container of the processor
 * that shares its location.
 */
@Test
public void testShouldUseTaskLocalityWhenGeneratingContainerModels() {
  TaskNameGrouper grouperUnderTest = buildSimpleGrouper(3);

  // Processor ids.
  String processorA = "testProcessorId1";
  String processorB = "testProcessorId2";
  String processorC = "testProcessorId3";

  // One distinct location per processor.
  LocationId locationA = new LocationId("testLocationId1");
  LocationId locationB = new LocationId("testLocationId2");
  LocationId locationC = new LocationId("testLocationId3");

  // Task names and their models; every model has no input partitions and its own changelog partition.
  TaskName taskNameA = new TaskName("testTasKId1");
  TaskName taskNameB = new TaskName("testTaskId2");
  TaskName taskNameC = new TaskName("testTaskId3");
  TaskModel taskModelA = new TaskModel(taskNameA, new HashSet<>(), new Partition(0));
  TaskModel taskModelB = new TaskModel(taskNameB, new HashSet<>(), new Partition(1));
  TaskModel taskModelC = new TaskModel(taskNameC, new HashSet<>(), new Partition(2));

  // Locality metadata pairs processor i and task i through the same location.
  Map<String, LocationId> processorLocality =
      ImmutableMap.of(processorA, locationA, processorB, locationB, processorC, locationC);
  Map<TaskName, LocationId> taskLocality =
      ImmutableMap.of(taskNameA, locationA, taskNameB, locationB, taskNameC, locationC);
  GrouperMetadataImpl metadata =
      new GrouperMetadataImpl(processorLocality, taskLocality, new HashMap<>(), new HashMap<>());

  Set<TaskModel> allTasks = ImmutableSet.of(taskModelA, taskModelB, taskModelC);

  // Each processor is expected to end up with exactly the task that shares its location.
  ContainerModel expectedA = new ContainerModel(processorA, ImmutableMap.of(taskNameA, taskModelA));
  ContainerModel expectedB = new ContainerModel(processorB, ImmutableMap.of(taskNameB, taskModelB));
  ContainerModel expectedC = new ContainerModel(processorC, ImmutableMap.of(taskNameC, taskModelC));
  Set<ContainerModel> expected = ImmutableSet.of(expectedA, expectedB, expectedC);

  Set<ContainerModel> actual = grouperUnderTest.group(allTasks, metadata);

  assertEquals(expected, actual);
}
Use of org.apache.samza.job.model.ContainerModel in the Apache Samza project.
Class TestGroupByContainerIds, method testFewerTasksThanContainers.
/**
 * Groups the task models from {@code generateTaskModels(1)} over two container ids and expects
 * a single container model, with id "1", that holds all of the generated tasks.
 */
@Test
public void testFewerTasksThanContainers() {
  Set<TaskModel> tasks = generateTaskModels(1);

  String firstContainerId = "1";
  String secondContainerId = "2";
  List<String> availableContainerIds = ImmutableList.of(firstContainerId, secondContainerId);

  // Only the first container id is expected to appear in the result, carrying every task.
  Map<TaskName, TaskModel> tasksByName =
      tasks.stream().collect(Collectors.toMap(TaskModel::getTaskName, task -> task));
  Set<ContainerModel> expected = ImmutableSet.of(new ContainerModel(firstContainerId, tasksByName));

  Set<ContainerModel> grouped = buildSimpleGrouper().group(tasks, availableContainerIds);

  assertEquals(1, grouped.size());
  assertEquals(expected, grouped);
}
Aggregations