Use of org.apache.samza.job.model.ContainerModel in project samza by apache:
the class GroupByContainerIds, method group.
/**
 * {@inheritDoc}
 *
 * When there are `t` tasks and `p` processors, where p &lt;= t, a fair task distribution should ideally assign
 * (t / p) tasks to each processor. In addition to guaranteeing a fair distribution, this {@link TaskNameGrouper}
 * implementation generates a locationId aware task assignment to processors where it makes best efforts in assigning
 * the tasks to processors with the same locality.
 *
 * Task assignment to processors is accomplished through the following two phases:
 *
 * 1. In the first phase, each task(T) is assigned to a processor(P) that satisfies the following constraints:
 *    A. The processor(P) should have the same locality of the task(T).
 *    B. Number of tasks already assigned to the processor should be less than the (number of tasks / number of processors).
 *
 * 2. Each unassigned task from phase 1 is then mapped to any processor with task count less than the
 *    (number of tasks / number of processors). When no such processor exists, the unassigned
 *    task is mapped to any processor from available processors in a round robin fashion.
 */
@Override
public Set<ContainerModel> group(Set<TaskModel> taskModels, GrouperMetadata grouperMetadata) {
  Map<TaskName, LocationId> taskLocality = grouperMetadata.getTaskLocality();
  // Validate that the task models are not empty: a job with zero tasks cannot run.
  Preconditions.checkArgument(!taskModels.isEmpty(), "No tasks found. Likely due to no input partitions. Can't run a job with no tasks.");

  // Invoke the default grouper when the processor locality does not exist.
  if (MapUtils.isEmpty(grouperMetadata.getProcessorLocality())) {
    LOG.info("ProcessorLocality is empty. Generating with the default group method.");
    return group(taskModels, new ArrayList<>());
  }

  // TreeMap gives a deterministic, lexicographic processor iteration order for the phases below.
  Map<String, LocationId> processorLocality = new TreeMap<>(grouperMetadata.getProcessorLocality());
  /*
   * When there are more processors than tasks, choose the lexicographically least `x`
   * processors (where x = taskModels.size()); the remainder would receive no tasks anyway.
   * Collect into a TreeMap so the deterministic ordering survives the truncation —
   * the default Collectors.toMap would otherwise hand back an unordered HashMap.
   */
  if (processorLocality.size() > taskModels.size()) {
    processorLocality = processorLocality.entrySet().stream()
        .limit(taskModels.size())
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (primary, duplicate) -> primary, TreeMap::new));
  }

  Map<LocationId, List<String>> locationIdToProcessors = new HashMap<>();
  Map<String, TaskGroup> processorIdToTaskGroup = new HashMap<>();
  // Generate the LocationId to processors mapping and the processorId to TaskGroup mapping.
  processorLocality.forEach((processorId, locationId) -> {
    locationIdToProcessors.computeIfAbsent(locationId, location -> new ArrayList<>()).add(processorId);
    processorIdToTaskGroup.put(processorId, new TaskGroup(processorId, new ArrayList<>()));
  });

  // Integer division; phase 2's round-robin fallback absorbs the remainder tasks.
  int numTasksPerProcessor = taskModels.size() / processorLocality.size();
  Set<TaskName> assignedTasks = new HashSet<>();

  /*
   * Phase 1: a processor is considered under-assigned when the number of tasks assigned to it is
   * less than (number of tasks / number of processors). Map each task to an under-assigned
   * processor that shares the task's last known locality, when one exists.
   */
  for (TaskModel taskModel : taskModels) {
    LocationId taskLocationId = taskLocality.get(taskModel.getTaskName());
    if (taskLocationId != null) {
      List<String> processorIds = locationIdToProcessors.getOrDefault(taskLocationId, new ArrayList<>());
      for (String processorId : processorIds) {
        TaskGroup taskGroup = processorIdToTaskGroup.get(processorId);
        if (taskGroup.size() < numTasksPerProcessor) {
          taskGroup.addTaskName(taskModel.getTaskName().getTaskName());
          assignedTasks.add(taskModel.getTaskName());
          break;
        }
      }
    }
  }

  /*
   * A task either might not have any previous locality, or might not have any processor that
   * maps to its previous locality. This cyclic processorId iterator assigns processors to
   * such tasks in a round robin fashion.
   */
  Iterator<String> processorIdsCyclicIterator = Iterators.cycle(processorLocality.keySet());

  // Order the taskGroups to choose a task group in a deterministic fashion for unassigned tasks.
  List<TaskGroup> taskGroups = new ArrayList<>(processorIdToTaskGroup.values());
  taskGroups.sort(Comparator.comparing(TaskGroup::getContainerId));

  /*
   * Phase 2: for the tasks left over from phase 1, map them to any under-assigned processor.
   * When no under-assigned processor exists, map them to any processor from the available
   * processors in a round robin manner.
   */
  for (TaskModel taskModel : taskModels) {
    if (!assignedTasks.contains(taskModel.getTaskName())) {
      Optional<TaskGroup> underAssignedTaskGroup =
          taskGroups.stream().filter(taskGroup -> taskGroup.size() < numTasksPerProcessor).findFirst();
      if (underAssignedTaskGroup.isPresent()) {
        underAssignedTaskGroup.get().addTaskName(taskModel.getTaskName().getTaskName());
      } else {
        TaskGroup taskGroup = processorIdToTaskGroup.get(processorIdsCyclicIterator.next());
        taskGroup.addTaskName(taskModel.getTaskName().getTaskName());
      }
      assignedTasks.add(taskModel.getTaskName());
    }
  }
  return TaskGroup.buildContainerModels(taskModels, taskGroups);
}
Use of org.apache.samza.job.model.ContainerModel in project samza by apache:
the class TestJobModelCalculator, method testCustomSSPGrouper.
/**
 * A custom SSP grouper that yields exactly two tasks should produce a two-container job model.
 */
@Test
public void testCustomSSPGrouper() {
  // The custom grouper only groups into two tasks, so two changelog partitions suffice.
  Map<TaskName, Integer> changelogPartitions = changelogPartitionMapping(2);
  Config config = config(
      ImmutableList.of(SYSTEM_STREAM0, SYSTEM_STREAM1),
      ImmutableMap.of(JobConfig.SSP_GROUPER_FACTORY, Partition0SeparateFactory.class.getName()));
  when(this.grouperMetadata.getProcessorLocality())
      .thenReturn(ImmutableMap.of("0", mock(LocationId.class), "1", mock(LocationId.class)));
  // Task 1 is expected to receive every non-zero partition of both streams.
  Set<SystemStreamPartition> task1Ssps = ImmutableSet.of(
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(1)),
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(2)),
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(3)),
      new SystemStreamPartition(SYSTEM_STREAM1, new Partition(1)),
      new SystemStreamPartition(SYSTEM_STREAM1, new Partition(2)));
  ContainerModel container0 =
      new ContainerModel("0", ImmutableMap.of(taskName(0), taskModel(0, 0, 0)));
  ContainerModel container1 =
      new ContainerModel("1", ImmutableMap.of(taskName(1), new TaskModel(taskName(1), task1Ssps, new Partition(1))));
  JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
  JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(
      config, changelogPartitions, this.streamMetadataCache, this.grouperMetadata);
  assertEquals(expected, actual);
}
Use of org.apache.samza.job.model.ContainerModel in project samza by apache:
the class TestJobModelCalculator, method testHostAffinityEnabled.
/**
 * With host affinity enabled, the calculator must use the group overload that accepts
 * {@code GrouperMetadata}; the override grouper factory verifies that path.
 */
@Test
public void testHostAffinityEnabled() {
  Map<TaskName, Integer> changelogPartitions = changelogPartitionMapping(4);
  Config config = config(
      ImmutableList.of(SYSTEM_STREAM0, SYSTEM_STREAM1),
      ImmutableMap.of(
          ClusterManagerConfig.HOST_AFFINITY_ENABLED, "true",
          // make sure the group method which accepts GrouperMetadata is used
          TaskConfig.GROUPER_FACTORY, GroupByContainerCountOverrideFactory.class.getName()));
  ContainerModel container0 = new ContainerModel("0",
      ImmutableMap.of(taskName(0), taskModel(0, 0, 0), taskName(2), taskModel(2, 2, 2)));
  // NOTE(review): taskModel(3, 3) uses a two-argument overload unlike the sibling three-argument
  // calls — confirm against the helper definitions that this is intentional.
  ContainerModel container1 = new ContainerModel("1",
      ImmutableMap.of(taskName(1), taskModel(1, 1, 1), taskName(3), taskModel(3, 3)));
  JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
  JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(
      config, changelogPartitions, this.streamMetadataCache, this.grouperMetadata);
  assertEquals(expected, actual);
}
Use of org.apache.samza.job.model.ContainerModel in project samza by apache:
the class TestJobModelCalculator, method testWithSSPFilter.
/**
 * When an SSP matcher class is configured and the job factory matches the configured regex,
 * only the partitions accepted by the filter should appear in the job model.
 */
@Test
public void testWithSSPFilter() {
  Map<TaskName, Integer> changelogPartitions = changelogPartitionMapping(4);
  Config config = config(
      ImmutableList.of(SYSTEM_STREAM0, SYSTEM_STREAM1),
      ImmutableMap.of(
          JobConfig.SSP_MATCHER_CLASS, Partition0Or1Filter.class.getName(),
          JobConfig.SSP_MATCHER_CONFIG_JOB_FACTORY_REGEX, ".*MyJobFactory",
          // this needs to match the regex in the line above
          JobConfig.STREAM_JOB_FACTORY_CLASS, "org.apache.samza.custom.MyJobFactory"));
  ContainerModel container0 =
      new ContainerModel("0", ImmutableMap.of(taskName(0), taskModel(0, 0, 0)));
  ContainerModel container1 =
      new ContainerModel("1", ImmutableMap.of(taskName(1), taskModel(1, 1, 1)));
  JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
  JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(
      config, changelogPartitions, this.streamMetadataCache, this.grouperMetadata);
  assertEquals(expected, actual);
}
Use of org.apache.samza.job.model.ContainerModel in project samza by apache:
the class TestJobModelCalculator, method testSSPGrouperProxyUsed.
/**
 * Configuring a store factory should route grouping through the SSPGrouperProxy, which honors
 * the previous task-to-SSP assignment instead of the raw custom grouper output.
 */
@Test
public void testSSPGrouperProxyUsed() {
  addStreamMetadataCacheMetadata(this.streamMetadataCache,
      ImmutableMap.of(SYSTEM_STREAM0, buildSystemStreamMetadata(4)));
  Map<TaskName, Integer> changelogPartitions = changelogPartitionMapping(2);
  Config config = config(
      ImmutableList.of(SYSTEM_STREAM0),
      ImmutableMap.of(
          JobConfig.SSP_GROUPER_FACTORY, Partition0SeparateFactory.class.getName(),
          // need this to trigger SSPGrouperProxy logic
          String.format(StorageConfig.FACTORY, "myStore"), "MyCustomStore"));
  // custom SSP grouper expects a certain processor locality for another test, so add the locality here too
  when(this.grouperMetadata.getProcessorLocality())
      .thenReturn(ImmutableMap.of("0", mock(LocationId.class), "1", mock(LocationId.class)));
  /*
   * Even though the custom grouper factory would normally send the additional SSPs to task 1,
   * the SSP grouper proxy should give task 0 some of the SSPs.
   */
  when(this.grouperMetadata.getPreviousTaskToSSPAssignment()).thenReturn(ImmutableMap.of(
      taskName(0), ImmutableList.of(new SystemStreamPartition(SYSTEM_STREAM0, new Partition(0))),
      taskName(1), ImmutableList.of(new SystemStreamPartition(SYSTEM_STREAM0, new Partition(1)))));
  Set<SystemStreamPartition> task0Ssps = ImmutableSet.of(
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(0)),
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(2)));
  Set<SystemStreamPartition> task1Ssps = ImmutableSet.of(
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(1)),
      new SystemStreamPartition(SYSTEM_STREAM0, new Partition(3)));
  ContainerModel container0 = new ContainerModel("0",
      ImmutableMap.of(taskName(0), new TaskModel(taskName(0), task0Ssps, new Partition(0))));
  ContainerModel container1 = new ContainerModel("1",
      ImmutableMap.of(taskName(1), new TaskModel(taskName(1), task1Ssps, new Partition(1))));
  JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
  JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(
      config, changelogPartitions, this.streamMetadataCache, this.grouperMetadata);
  assertEquals(expected, actual);
}
Aggregations