Search in sources :

Example 81 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

From the class GroupByContainerIds, method group.

/**
 * {@inheritDoc}
 *
 * When there are `t` tasks and `p` processors, where p &lt;= t (the processor set is trimmed
 * down to at most `t` entries below), a fair task distribution should ideally assign
 * floor(t / p) tasks to each processor. In addition to guaranteeing a fair distribution, this
 * {@link TaskNameGrouper} implementation generates a locationId aware task assignment to
 * processors where it makes best efforts in assigning the tasks to processors with the same locality.
 *
 * Task assignment to processors is accomplished through the following two phases:
 *
 * 1. In the first phase, each task(T) is assigned to a processor(P) that satisfies the following constraints:
 *    A. The processor(P) should have the same locality of the task(T).
 *    B. Number of tasks already assigned to the processor should be less than the (number of tasks / number of processors).
 *
 * 2. Each unassigned task from phase 1 is then mapped to any processor with task count less than the
 * (number of tasks / number of processors). When no such processor exists, then the unassigned
 * task is mapped to any processor from available processors in a round robin fashion.
 */
@Override
public Set<ContainerModel> group(Set<TaskModel> taskModels, GrouperMetadata grouperMetadata) {
    // Fail fast on an empty task set before doing any other work.
    Preconditions.checkArgument(!taskModels.isEmpty(), "No tasks found. Likely due to no input partitions. Can't run a job with no tasks.");
    Map<TaskName, LocationId> taskLocality = grouperMetadata.getTaskLocality();
    // Fall back to the default (locality-unaware) grouper when processor locality is unavailable.
    if (MapUtils.isEmpty(grouperMetadata.getProcessorLocality())) {
        LOG.info("ProcessorLocality is empty. Generating with the default group method.");
        return group(taskModels, new ArrayList<>());
    }
    // TreeMap gives a deterministic, lexicographic processor ordering.
    Map<String, LocationId> processorLocality = new TreeMap<>(grouperMetadata.getProcessorLocality());
    /*
     * When there are more processors than tasks, choose the lexicographically least `x`
     * processors (where x = taskModels.size()) so that no processor is left task-less.
     */
    if (processorLocality.size() > taskModels.size()) {
        processorLocality = processorLocality.entrySet().stream().limit(taskModels.size()).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }
    Map<LocationId, List<String>> locationIdToProcessors = new HashMap<>();
    Map<String, TaskGroup> processorIdToTaskGroup = new HashMap<>();
    // Generate the LocationId to processors mapping and the processorId to TaskGroup mapping.
    processorLocality.forEach((processorId, locationId) -> {
        locationIdToProcessors.computeIfAbsent(locationId, id -> new ArrayList<>()).add(processorId);
        processorIdToTaskGroup.put(processorId, new TaskGroup(processorId, new ArrayList<>()));
    });
    // Integer division: the fair per-processor quota; any remainder is handled by the round-robin phase.
    int numTasksPerProcessor = taskModels.size() / processorLocality.size();
    Set<TaskName> assignedTasks = new HashSet<>();
    /*
     * Phase 1: a processor is considered under-assigned when the number of tasks assigned to it
     * is less than (number of tasks / number of processors). Map each task to an under-assigned
     * processor with the same locality, when one exists.
     */
    for (TaskModel taskModel : taskModels) {
        LocationId taskLocationId = taskLocality.get(taskModel.getTaskName());
        if (taskLocationId != null) {
            // Read-only lookup; an empty list simply skips the inner loop.
            List<String> processorIds = locationIdToProcessors.getOrDefault(taskLocationId, Collections.emptyList());
            for (String processorId : processorIds) {
                TaskGroup taskGroup = processorIdToTaskGroup.get(processorId);
                if (taskGroup.size() < numTasksPerProcessor) {
                    taskGroup.addTaskName(taskModel.getTaskName().getTaskName());
                    assignedTasks.add(taskModel.getTaskName());
                    break;
                }
            }
        }
    }
    /*
     * In some scenarios, the task either might not have any previous locality or might not have
     * any processor that maps to its previous locality. This cyclic processorId iterator helps
     * assign processorIds to those tasks in a round robin fashion.
     */
    Iterator<String> processorIdsCyclicIterator = Iterators.cycle(processorLocality.keySet());
    // Order the taskGroups so that unassigned tasks are placed deterministically.
    List<TaskGroup> taskGroups = new ArrayList<>(processorIdToTaskGroup.values());
    taskGroups.sort(Comparator.comparing(TaskGroup::getContainerId));
    /*
     * Phase 2: map each task left over from phase 1 to any under-assigned processor. When no
     * under-assigned processor exists, fall back to round robin over all processors.
     */
    for (TaskModel taskModel : taskModels) {
        if (!assignedTasks.contains(taskModel.getTaskName())) {
            Optional<TaskGroup> underAssignedTaskGroup = taskGroups.stream().filter(taskGroup -> taskGroup.size() < numTasksPerProcessor).findFirst();
            if (underAssignedTaskGroup.isPresent()) {
                underAssignedTaskGroup.get().addTaskName(taskModel.getTaskName().getTaskName());
            } else {
                TaskGroup taskGroup = processorIdToTaskGroup.get(processorIdsCyclicIterator.next());
                taskGroup.addTaskName(taskModel.getTaskName().getTaskName());
            }
            assignedTasks.add(taskModel.getTaskName());
        }
    }
    return TaskGroup.buildContainerModels(taskModels, taskGroups);
}
Also used : MapUtils(org.apache.commons.collections4.MapUtils) Arrays(java.util.Arrays) TaskName(org.apache.samza.container.TaskName) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) LoggerFactory(org.slf4j.LoggerFactory) Set(java.util.Set) HashMap(java.util.HashMap) TaskModel(org.apache.samza.job.model.TaskModel) Collectors(java.util.stream.Collectors) Iterators(com.google.common.collect.Iterators) ArrayList(java.util.ArrayList) LocationId(org.apache.samza.runtime.LocationId) HashSet(java.util.HashSet) List(java.util.List) TreeMap(java.util.TreeMap) ContainerModel(org.apache.samza.job.model.ContainerModel) Map(java.util.Map) Optional(java.util.Optional) Preconditions(com.google.common.base.Preconditions) Comparator(java.util.Comparator) Collections(java.util.Collections) HashMap(java.util.HashMap) LocationId(org.apache.samza.runtime.LocationId) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) TaskName(org.apache.samza.container.TaskName) ArrayList(java.util.ArrayList) List(java.util.List) TaskModel(org.apache.samza.job.model.TaskModel) HashSet(java.util.HashSet)

Example 82 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

From the class TestJobModelCalculator, method testCustomSSPGrouper.

@Test
public void testCustomSSPGrouper() {
    // custom grouper only groups into two tasks, so only need 2 changelog partitions
    Map<TaskName, Integer> changelogPartitionMapping = changelogPartitionMapping(2);
    Config config = config(ImmutableList.of(SYSTEM_STREAM0, SYSTEM_STREAM1), ImmutableMap.of(JobConfig.SSP_GROUPER_FACTORY, Partition0SeparateFactory.class.getName()));
    when(this.grouperMetadata.getProcessorLocality()).thenReturn(ImmutableMap.of("0", mock(LocationId.class), "1", mock(LocationId.class)));
    // Expected SSPs for task 1; task 0 only holds taskModel(0, 0, 0) in the expected model below.
    Set<SystemStreamPartition> task1Ssps = ImmutableSet.of(
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(1)),
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(2)),
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(3)),
        new SystemStreamPartition(SYSTEM_STREAM1, new Partition(1)),
        new SystemStreamPartition(SYSTEM_STREAM1, new Partition(2)));
    ContainerModel container0 = new ContainerModel("0", ImmutableMap.of(taskName(0), taskModel(0, 0, 0)));
    ContainerModel container1 = new ContainerModel("1", ImmutableMap.of(taskName(1), new TaskModel(taskName(1), task1Ssps, new Partition(1))));
    JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
    JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(config, changelogPartitionMapping, this.streamMetadataCache, this.grouperMetadata);
    assertEquals(expected, actual);
}
Also used : SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) MapConfig(org.apache.samza.config.MapConfig) StorageConfig(org.apache.samza.config.StorageConfig) Config(org.apache.samza.config.Config) JobConfig(org.apache.samza.config.JobConfig) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) TaskConfig(org.apache.samza.config.TaskConfig) ContainerModel(org.apache.samza.job.model.ContainerModel) TaskName(org.apache.samza.container.TaskName) JobModel(org.apache.samza.job.model.JobModel) TaskModel(org.apache.samza.job.model.TaskModel) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 83 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

From the class TestJobModelCalculator, method testHostAffinityEnabled.

@Test
public void testHostAffinityEnabled() {
    Map<TaskName, Integer> changelogPartitionMapping = changelogPartitionMapping(4);
    // make sure the group method which accepts GrouperMetadata is used
    Config config = config(ImmutableList.of(SYSTEM_STREAM0, SYSTEM_STREAM1), ImmutableMap.of(ClusterManagerConfig.HOST_AFFINITY_ENABLED, "true", TaskConfig.GROUPER_FACTORY, GroupByContainerCountOverrideFactory.class.getName()));
    // NOTE(review): taskModel(3, 3) passes two args while sibling calls pass three — presumably an overload; confirm.
    ContainerModel container0 = new ContainerModel("0", ImmutableMap.of(taskName(0), taskModel(0, 0, 0), taskName(2), taskModel(2, 2, 2)));
    ContainerModel container1 = new ContainerModel("1", ImmutableMap.of(taskName(1), taskModel(1, 1, 1), taskName(3), taskModel(3, 3)));
    JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
    JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(config, changelogPartitionMapping, this.streamMetadataCache, this.grouperMetadata);
    assertEquals(expected, actual);
}
Also used : TaskName(org.apache.samza.container.TaskName) MapConfig(org.apache.samza.config.MapConfig) StorageConfig(org.apache.samza.config.StorageConfig) Config(org.apache.samza.config.Config) JobConfig(org.apache.samza.config.JobConfig) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) TaskConfig(org.apache.samza.config.TaskConfig) JobModel(org.apache.samza.job.model.JobModel) ContainerModel(org.apache.samza.job.model.ContainerModel) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 84 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

From the class TestJobModelCalculator, method testWithSSPFilter.

@Test
public void testWithSSPFilter() {
    Map<TaskName, Integer> changelogPartitionMapping = changelogPartitionMapping(4);
    // The STREAM_JOB_FACTORY_CLASS value must match the SSP_MATCHER_CONFIG_JOB_FACTORY_REGEX pattern.
    Config config = config(ImmutableList.of(SYSTEM_STREAM0, SYSTEM_STREAM1), ImmutableMap.of(JobConfig.SSP_MATCHER_CLASS, Partition0Or1Filter.class.getName(), JobConfig.SSP_MATCHER_CONFIG_JOB_FACTORY_REGEX, ".*MyJobFactory", JobConfig.STREAM_JOB_FACTORY_CLASS, "org.apache.samza.custom.MyJobFactory"));
    ContainerModel container0 = new ContainerModel("0", ImmutableMap.of(taskName(0), taskModel(0, 0, 0)));
    ContainerModel container1 = new ContainerModel("1", ImmutableMap.of(taskName(1), taskModel(1, 1, 1)));
    JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
    JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(config, changelogPartitionMapping, this.streamMetadataCache, this.grouperMetadata);
    assertEquals(expected, actual);
}
Also used : TaskName(org.apache.samza.container.TaskName) MapConfig(org.apache.samza.config.MapConfig) StorageConfig(org.apache.samza.config.StorageConfig) Config(org.apache.samza.config.Config) JobConfig(org.apache.samza.config.JobConfig) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) TaskConfig(org.apache.samza.config.TaskConfig) JobModel(org.apache.samza.job.model.JobModel) ContainerModel(org.apache.samza.job.model.ContainerModel) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 85 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

From the class TestJobModelCalculator, method testSSPGrouperProxyUsed.

@Test
public void testSSPGrouperProxyUsed() {
    addStreamMetadataCacheMetadata(this.streamMetadataCache, ImmutableMap.of(SYSTEM_STREAM0, buildSystemStreamMetadata(4)));
    Map<TaskName, Integer> changelogPartitionMapping = changelogPartitionMapping(2);
    // Configuring a store factory is what triggers the SSPGrouperProxy logic.
    Config config = config(ImmutableList.of(SYSTEM_STREAM0), ImmutableMap.of(JobConfig.SSP_GROUPER_FACTORY, Partition0SeparateFactory.class.getName(), String.format(StorageConfig.FACTORY, "myStore"), "MyCustomStore"));
    // custom SSP grouper expects a certain processor locality for another test, so add the locality here too
    when(this.grouperMetadata.getProcessorLocality()).thenReturn(ImmutableMap.of("0", mock(LocationId.class), "1", mock(LocationId.class)));
    /*
     * Even though the custom grouper factory would normally send the additional SSPs to task 1, the SSP grouper proxy
     * should give task 0 some of the SSPs.
     */
    when(this.grouperMetadata.getPreviousTaskToSSPAssignment()).thenReturn(ImmutableMap.of(
        taskName(0), ImmutableList.of(new SystemStreamPartition(SYSTEM_STREAM0, new Partition(0))),
        taskName(1), ImmutableList.of(new SystemStreamPartition(SYSTEM_STREAM0, new Partition(1)))));
    Set<SystemStreamPartition> task0Ssps = ImmutableSet.of(
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(0)),
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(2)));
    Set<SystemStreamPartition> task1Ssps = ImmutableSet.of(
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(1)),
        new SystemStreamPartition(SYSTEM_STREAM0, new Partition(3)));
    ContainerModel container0 = new ContainerModel("0", ImmutableMap.of(taskName(0), new TaskModel(taskName(0), task0Ssps, new Partition(0))));
    ContainerModel container1 = new ContainerModel("1", ImmutableMap.of(taskName(1), new TaskModel(taskName(1), task1Ssps, new Partition(1))));
    JobModel expected = new JobModel(config, ImmutableMap.of("0", container0, "1", container1));
    JobModel actual = JobModelCalculator.INSTANCE.calculateJobModel(config, changelogPartitionMapping, this.streamMetadataCache, this.grouperMetadata);
    assertEquals(expected, actual);
}
Also used : SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) TaskName(org.apache.samza.container.TaskName) MapConfig(org.apache.samza.config.MapConfig) StorageConfig(org.apache.samza.config.StorageConfig) Config(org.apache.samza.config.Config) JobConfig(org.apache.samza.config.JobConfig) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) TaskConfig(org.apache.samza.config.TaskConfig) JobModel(org.apache.samza.job.model.JobModel) TaskModel(org.apache.samza.job.model.TaskModel) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) ContainerModel(org.apache.samza.job.model.ContainerModel) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Aggregations

ContainerModel (org.apache.samza.job.model.ContainerModel)96 TaskModel (org.apache.samza.job.model.TaskModel)68 TaskName (org.apache.samza.container.TaskName)60 Test (org.junit.Test)57 HashMap (java.util.HashMap)53 JobModel (org.apache.samza.job.model.JobModel)37 MapConfig (org.apache.samza.config.MapConfig)30 Config (org.apache.samza.config.Config)28 Partition (org.apache.samza.Partition)24 SystemStreamPartition (org.apache.samza.system.SystemStreamPartition)22 StorageConfig (org.apache.samza.config.StorageConfig)19 Map (java.util.Map)18 JobConfig (org.apache.samza.config.JobConfig)18 TaskConfig (org.apache.samza.config.TaskConfig)18 HashSet (java.util.HashSet)16 ArrayList (java.util.ArrayList)14 ClusterManagerConfig (org.apache.samza.config.ClusterManagerConfig)12 LocationId (org.apache.samza.runtime.LocationId)12 Collectors (java.util.stream.Collectors)10 SystemStream (org.apache.samza.system.SystemStream)10