Search in sources :

Example 16 with LocationId

use of org.apache.samza.runtime.LocationId in project samza by apache.

the class GroupByContainerIds method group.

/**
 * {@inheritDoc}
 *
 * When the are `t` tasks and `p` processors, where t <= p, a fair task distribution should ideally assign
 * (t / p) tasks to each processor. In addition to guaranteeing a fair distribution, this {@link TaskNameGrouper}
 * implementation generates a locationId aware task assignment to processors where it makes best efforts in assigning
 * the tasks to processors with the same locality.
 *
 * Task assignment to processors is accomplished through the following two phases:
 *
 * 1. In the first phase, each task(T) is assigned to a processor(P) that satisfies the following constraints:
 *    A. The processor(P) should have the same locality of the task(T).
 *    B. Number of tasks already assigned to the processor should be less than the (number of tasks / number of processors).
 *
 * 2. Each unassigned task from phase 1 are then mapped to any processor with task count less than the
 * (number of tasks / number of processors). When no such processor exists, then the unassigned
 * task is mapped to any processor from available processors in a round robin fashion.
 */
@Override
public Set<ContainerModel> group(Set<TaskModel> taskModels, GrouperMetadata grouperMetadata) {
    // Validate that the task models are not empty.
    Map<TaskName, LocationId> taskLocality = grouperMetadata.getTaskLocality();
    Preconditions.checkArgument(!taskModels.isEmpty(), "No tasks found. Likely due to no input partitions. Can't run a job with no tasks.");
    // Invoke the default grouper when the processor locality does not exist.
    if (MapUtils.isEmpty(grouperMetadata.getProcessorLocality())) {
        LOG.info("ProcessorLocality is empty. Generating with the default group method.");
        return group(taskModels, new ArrayList<>());
    }
    Map<String, LocationId> processorLocality = new TreeMap<>(grouperMetadata.getProcessorLocality());
    /**
     * When there're more task models than processors then choose the lexicographically least `x` processors(where x = tasks.size()).
     */
    if (processorLocality.size() > taskModels.size()) {
        processorLocality = processorLocality.entrySet().stream().limit(taskModels.size()).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }
    Map<LocationId, List<String>> locationIdToProcessors = new HashMap<>();
    Map<String, TaskGroup> processorIdToTaskGroup = new HashMap<>();
    // Generate the {@see LocationId} to processors mapping and processorId to {@see TaskGroup} mapping.
    processorLocality.forEach((processorId, locationId) -> {
        List<String> processorIds = locationIdToProcessors.getOrDefault(locationId, new ArrayList<>());
        processorIds.add(processorId);
        locationIdToProcessors.put(locationId, processorIds);
        processorIdToTaskGroup.put(processorId, new TaskGroup(processorId, new ArrayList<>()));
    });
    int numTasksPerProcessor = taskModels.size() / processorLocality.size();
    Set<TaskName> assignedTasks = new HashSet<>();
    /**
     * A processor is considered under-assigned when number of tasks assigned to it is less than
     * (number of tasks / number of processors).
     * Map the tasks to the under-assigned processors with same locality.
     */
    for (TaskModel taskModel : taskModels) {
        LocationId taskLocationId = taskLocality.get(taskModel.getTaskName());
        if (taskLocationId != null) {
            List<String> processorIds = locationIdToProcessors.getOrDefault(taskLocationId, new ArrayList<>());
            for (String processorId : processorIds) {
                TaskGroup taskGroup = processorIdToTaskGroup.get(processorId);
                if (taskGroup.size() < numTasksPerProcessor) {
                    taskGroup.addTaskName(taskModel.getTaskName().getTaskName());
                    assignedTasks.add(taskModel.getTaskName());
                    break;
                }
            }
        }
    }
    /**
     * In some scenarios, the task either might not have any previous locality or might not have any
     * processor that maps to its previous locality. This cyclic processorId's iterator helps us in
     * those scenarios to assign the processorIds to those kind of tasks in a round robin fashion.
     */
    Iterator<String> processorIdsCyclicIterator = Iterators.cycle(processorLocality.keySet());
    // Order the taskGroups to choose a task group in a deterministic fashion for unassigned tasks.
    List<TaskGroup> taskGroups = new ArrayList<>(processorIdToTaskGroup.values());
    taskGroups.sort(Comparator.comparing(TaskGroup::getContainerId));
    /**
     * For the tasks left over from the previous stage, map them to any under-assigned processor.
     * When a under-assigned processor doesn't exist, then map them to any processor from the
     * available processors in a round robin manner.
     */
    for (TaskModel taskModel : taskModels) {
        if (!assignedTasks.contains(taskModel.getTaskName())) {
            Optional<TaskGroup> underAssignedTaskGroup = taskGroups.stream().filter(taskGroup -> taskGroup.size() < numTasksPerProcessor).findFirst();
            if (underAssignedTaskGroup.isPresent()) {
                underAssignedTaskGroup.get().addTaskName(taskModel.getTaskName().getTaskName());
            } else {
                TaskGroup taskGroup = processorIdToTaskGroup.get(processorIdsCyclicIterator.next());
                taskGroup.addTaskName(taskModel.getTaskName().getTaskName());
            }
            assignedTasks.add(taskModel.getTaskName());
        }
    }
    return TaskGroup.buildContainerModels(taskModels, taskGroups);
}
Also used : MapUtils(org.apache.commons.collections4.MapUtils) Arrays(java.util.Arrays) TaskName(org.apache.samza.container.TaskName) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) LoggerFactory(org.slf4j.LoggerFactory) Set(java.util.Set) HashMap(java.util.HashMap) TaskModel(org.apache.samza.job.model.TaskModel) Collectors(java.util.stream.Collectors) Iterators(com.google.common.collect.Iterators) ArrayList(java.util.ArrayList) LocationId(org.apache.samza.runtime.LocationId) HashSet(java.util.HashSet) List(java.util.List) TreeMap(java.util.TreeMap) ContainerModel(org.apache.samza.job.model.ContainerModel) Map(java.util.Map) Optional(java.util.Optional) Preconditions(com.google.common.base.Preconditions) Comparator(java.util.Comparator) Collections(java.util.Collections) HashMap(java.util.HashMap) LocationId(org.apache.samza.runtime.LocationId) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) TaskName(org.apache.samza.container.TaskName) ArrayList(java.util.ArrayList) List(java.util.List) TaskModel(org.apache.samza.job.model.TaskModel) HashSet(java.util.HashSet)

Aggregations

LocationId (org.apache.samza.runtime.LocationId)16 TaskName (org.apache.samza.container.TaskName)13 ContainerModel (org.apache.samza.job.model.ContainerModel)11 Test (org.junit.Test)11 TaskModel (org.apache.samza.job.model.TaskModel)9 HashMap (java.util.HashMap)7 Partition (org.apache.samza.Partition)6 ContainerMocks.getTaskName (org.apache.samza.container.mock.ContainerMocks.getTaskName)6 ArrayList (java.util.ArrayList)5 List (java.util.List)4 ProcessorLocality (org.apache.samza.job.model.ProcessorLocality)4 HashSet (java.util.HashSet)3 Map (java.util.Map)3 Set (java.util.Set)3 Collectors (java.util.stream.Collectors)3 Collections (java.util.Collections)2 Optional (java.util.Optional)2 StringUtils (org.apache.commons.lang3.StringUtils)2 Config (org.apache.samza.config.Config)2 JobConfig (org.apache.samza.config.JobConfig)2