Examples with SSPGrouperProxy - org.apache.samza.container.grouper.stream.SSPGrouperProxy

Example 1 with SSPGrouperProxy

Usage of org.apache.samza.container.grouper.stream.SSPGrouperProxy in the Apache Samza project.

From the class JobModelCalculator, method calculateJobModel.

/**
 * Computes the {@link JobModel} of a job.
 * <ol>
 *   <li>Resolves the input stream partitions of the job using {@code streamMetadataCache}.</li>
 *   <li>Groups those partitions into tasks with the configured SSP grouper (wrapped in an
 *       {@link SSPGrouperProxy} when the job config enables it).</li>
 *   <li>Gives every task a changelog partition: tasks already present in
 *       {@code changeLogPartitionMapping} keep their partition, unseen tasks receive the next id
 *       after the current maximum (starting at 0 when the mapping is empty).</li>
 *   <li>Groups the resulting task models into containers with the configured task name grouper.</li>
 * </ol>
 *
 * @param originalConfig the configuration of the job.
 * @param changeLogPartitionMapping the task to changelog partition mapping of the job.
 * @param streamMetadataCache the cache that holds the partition metadata of the input streams.
 * @param grouperMetadata provides the historical metadata of the application.
 * @return the built {@link JobModel}.
 */
public JobModel calculateJobModel(Config originalConfig, Map<TaskName, Integer> changeLogPartitionMapping, StreamMetadataCache streamMetadataCache, GrouperMetadata grouperMetadata) {
    // Everything below works off the config produced by the regex topic rewriter step.
    Config config = refreshConfigByRegexTopicRewriter(originalConfig);
    TaskConfig taskCfg = new TaskConfig(config);
    Set<SystemStreamPartition> inputPartitions = getMatchedInputStreamPartitions(config, streamMetadataCache);

    // Some groupers need the processor list, so it is handed over through a copy of the config.
    // TODO: It is non-ideal to have config as a medium to transmit the locality information; especially, if the locality information evolves. Evaluate options on using context objects to pass dependent components.
    Map<String, String> grouperConfigMap = new HashMap<>(config);
    String processorIds = String.join(",", grouperMetadata.getProcessorLocality().keySet());
    grouperConfigMap.put(JobConfig.PROCESSOR_LIST, processorIds);
    SystemStreamPartitionGrouper sspGrouper = getSystemStreamPartitionGrouper(new MapConfig(grouperConfigMap));

    // Task name -> input partitions, either straight from the grouper or through the proxy.
    JobConfig jobCfg = new JobConfig(config);
    Map<TaskName, Set<SystemStreamPartition>> partitionsByTask;
    if (!jobCfg.isSSPGrouperProxyEnabled()) {
        LOG.warn(String.format("SSPGrouperProxy is disabled (%s = false). Stateful jobs may produce erroneous results if this is not enabled.", JobConfig.SSP_INPUT_EXPANSION_ENABLED));
        partitionsByTask = sspGrouper.group(inputPartitions);
    } else {
        partitionsByTask = new SSPGrouperProxy(config, sspGrouper).group(inputPartitions, grouperMetadata);
    }
    LOG.info(String.format("SystemStreamPartitionGrouper %s has grouped the SystemStreamPartitions into %d tasks with the following taskNames: %s", sspGrouper, partitionsByTask.size(), partitionsByTask));

    // Highest changelog partition handed out so far; -1 on a first run so that the first new
    // task is assigned partition 0.
    int lastAssignedChangelogPartition = changeLogPartitionMapping.values()
        .stream()
        .max(Comparator.naturalOrder())
        .orElse(-1);

    // Walk the tasks in sorted order so that new changelog partitions are handed out reproducibly.
    Map<TaskName, Set<SystemStreamPartition>> orderedTasks = new TreeMap<>(partitionsByTask);
    Set<TaskModel> taskModels = new HashSet<>();
    for (Map.Entry<TaskName, Set<SystemStreamPartition>> entry : orderedTasks.entrySet()) {
        TaskName taskName = entry.getKey();
        Integer knownPartitionId = changeLogPartitionMapping.get(taskName);
        Partition changelogPartition;
        if (knownPartitionId == null) {
            // First time this TaskName shows up: allocate the next free changelog partition.
            lastAssignedChangelogPartition++;
            LOG.info(String.format("New task %s is being assigned changelog partition %s.", taskName, lastAssignedChangelogPartition));
            changelogPartition = new Partition(lastAssignedChangelogPartition);
        } else {
            changelogPartition = new Partition(knownPartitionId);
        }
        taskModels.add(new TaskModel(taskName, entry.getValue(), changelogPartition));
    }

    // Here is where we should put in a pluggable option for the SSPTaskNameGrouper for locality, load-balancing, etc.
    TaskNameGrouperFactory taskNameGrouperFactory = ReflectionUtil.getObj(taskCfg.getTaskNameGrouperFactory(), TaskNameGrouperFactory.class);
    boolean standbyEnabled = jobCfg.getStandbyTasksEnabled();
    int standbyReplicationFactor = jobCfg.getStandbyTaskReplicationFactor();
    TaskNameGrouperProxy containerGrouper = new TaskNameGrouperProxy(taskNameGrouperFactory.build(config), standbyEnabled, standbyReplicationFactor);

    // With host affinity the grouper receives the full grouper metadata; otherwise only the
    // processor ids are passed.
    boolean hostAffinityEnabled = new ClusterManagerConfig(config).getHostAffinityEnabled();
    Set<ContainerModel> containers = hostAffinityEnabled
        ? containerGrouper.group(taskModels, grouperMetadata)
        : containerGrouper.group(taskModels, new ArrayList<>(grouperMetadata.getProcessorLocality().keySet()));

    // Index the container models by their id for the job model.
    Map<String, ContainerModel> containersById = containers.stream()
        .collect(Collectors.toMap(ContainerModel::getId, Function.identity()));
    return new JobModel(config, containersById);
}

Also used : HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) SSPGrouperProxy(org.apache.samza.container.grouper.stream.SSPGrouperProxy) JobConfig(org.apache.samza.config.JobConfig) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) MapConfig(org.apache.samza.config.MapConfig) TaskConfig(org.apache.samza.config.TaskConfig) Config(org.apache.samza.config.Config) ArrayList(java.util.ArrayList) TaskConfig(org.apache.samza.config.TaskConfig) JobConfig(org.apache.samza.config.JobConfig) ContainerModel(org.apache.samza.job.model.ContainerModel) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) JobModel(org.apache.samza.job.model.JobModel) MapConfig(org.apache.samza.config.MapConfig) TaskNameGrouperFactory(org.apache.samza.container.grouper.task.TaskNameGrouperFactory) HashSet(java.util.HashSet) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) TreeMap(java.util.TreeMap) SystemStreamPartitionGrouper(org.apache.samza.container.grouper.stream.SystemStreamPartitionGrouper) TaskNameGrouperProxy(org.apache.samza.container.grouper.task.TaskNameGrouperProxy) TaskName(org.apache.samza.container.TaskName) HashMap(java.util.HashMap) Map(java.util.Map) TreeMap(java.util.TreeMap) TaskModel(org.apache.samza.job.model.TaskModel) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition)

Aggregations

ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1 Set (java.util.Set)1 TreeMap (java.util.TreeMap)1 Partition (org.apache.samza.Partition)1 ClusterManagerConfig (org.apache.samza.config.ClusterManagerConfig)1 Config (org.apache.samza.config.Config)1 JobConfig (org.apache.samza.config.JobConfig)1 MapConfig (org.apache.samza.config.MapConfig)1 TaskConfig (org.apache.samza.config.TaskConfig)1 TaskName (org.apache.samza.container.TaskName)1 SSPGrouperProxy (org.apache.samza.container.grouper.stream.SSPGrouperProxy)1 SystemStreamPartitionGrouper (org.apache.samza.container.grouper.stream.SystemStreamPartitionGrouper)1 TaskNameGrouperFactory (org.apache.samza.container.grouper.task.TaskNameGrouperFactory)1 TaskNameGrouperProxy (org.apache.samza.container.grouper.task.TaskNameGrouperProxy)1 ContainerModel (org.apache.samza.job.model.ContainerModel)1 JobModel (org.apache.samza.job.model.JobModel)1 TaskModel (org.apache.samza.job.model.TaskModel)1