Search in sources :

Example 71 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

the class JobModelCalculator method calculateJobModel.

/**
 * Computes the {@link JobModel} of a job. The computation proceeds as follows:
 * 1. The input stream partitions are resolved from the configuration with the help of {@code streamMetadataCache}.
 * 2. The configured SSP grouper turns those partitions into tasks, each task is given a changelog partition,
 *    and the configured task name grouper distributes the tasks over containers.
 * @param originalConfig the configuration of the job.
 * @param changeLogPartitionMapping the task to changelog partition mapping of the job.
 * @param streamMetadataCache the cache that holds the partition metadata of the input streams.
 * @param grouperMetadata provides the historical metadata of the application.
 * @return the built {@link JobModel}.
 */
public JobModel calculateJobModel(Config originalConfig, Map<TaskName, Integer> changeLogPartitionMapping,
    StreamMetadataCache streamMetadataCache, GrouperMetadata grouperMetadata) {
    // The regex topic rewriter (when enabled) may change the config, so work off the refreshed copy from here on.
    Config refreshedConfig = refreshConfigByRegexTopicRewriter(originalConfig);
    TaskConfig taskConfig = new TaskConfig(refreshedConfig);

    // Input partitions that will be distributed over the tasks.
    Set<SystemStreamPartition> inputPartitions = getMatchedInputStreamPartitions(refreshedConfig, streamMetadataCache);

    // Some groupers need the processor list, so it is handed over through a copy of the config.
    // TODO: Config is a non-ideal medium for transmitting locality information, especially if that information
    // evolves. Evaluate using context objects to pass dependent components instead.
    Map<String, String> grouperConfigEntries = new HashMap<>(refreshedConfig);
    String processorList = String.join(",", grouperMetadata.getProcessorLocality().keySet());
    grouperConfigEntries.put(JobConfig.PROCESSOR_LIST, processorList);
    SystemStreamPartitionGrouper grouper = getSystemStreamPartitionGrouper(new MapConfig(grouperConfigEntries));

    JobConfig jobConfig = new JobConfig(refreshedConfig);
    Map<TaskName, Set<SystemStreamPartition>> groups;
    if (!jobConfig.isSSPGrouperProxyEnabled()) {
        LOG.warn(String.format("SSPGrouperProxy is disabled (%s = false). Stateful jobs may produce erroneous results if this is not enabled.", JobConfig.SSP_INPUT_EXPANSION_ENABLED));
        groups = grouper.group(inputPartitions);
    } else {
        SSPGrouperProxy proxiedGrouper = new SSPGrouperProxy(refreshedConfig, grouper);
        groups = proxiedGrouper.group(inputPartitions, grouperMetadata);
    }
    LOG.info(String.format("SystemStreamPartitionGrouper %s has grouped the SystemStreamPartitions into %d tasks with the following taskNames: %s", grouper, groups.size(), groups));

    // Highest changelog partition id handed out so far; -1 when there is no mapping yet (first run of the job),
    // which makes 0 the first id to be assigned.
    int highestChangelogPartitionId = changeLogPartitionMapping.values().stream()
        .max(Comparator.naturalOrder())
        .orElse(-1);

    // Walk the tasks in sorted order so that newly assigned changelog partitions are reproducible and intuitive.
    TreeMap<TaskName, Set<SystemStreamPartition>> orderedGroups = new TreeMap<>(groups);
    Set<TaskModel> taskModels = new HashSet<>();
    for (Map.Entry<TaskName, Set<SystemStreamPartition>> entry : orderedGroups.entrySet()) {
        TaskName taskName = entry.getKey();
        Integer knownPartitionId = changeLogPartitionMapping.get(taskName);
        Partition changelogPartition;
        if (knownPartitionId == null) {
            // A TaskName that has not been seen before gets the next free changelog partition.
            highestChangelogPartitionId += 1;
            LOG.info(String.format("New task %s is being assigned changelog partition %s.", taskName, highestChangelogPartitionId));
            changelogPartition = new Partition(highestChangelogPartitionId);
        } else {
            changelogPartition = new Partition(knownPartitionId);
        }
        taskModels.add(new TaskModel(taskName, entry.getValue(), changelogPartition));
    }

    // This is the place for a pluggable SSPTaskNameGrouper option (locality, load-balancing, etc.).
    TaskNameGrouperFactory taskNameGrouperFactory =
        ReflectionUtil.getObj(taskConfig.getTaskNameGrouperFactory(), TaskNameGrouperFactory.class);
    boolean standbyEnabled = jobConfig.getStandbyTasksEnabled();
    int standbyReplicationFactor = jobConfig.getStandbyTaskReplicationFactor();
    TaskNameGrouperProxy containerGrouper =
        new TaskNameGrouperProxy(taskNameGrouperFactory.build(refreshedConfig), standbyEnabled, standbyReplicationFactor);

    // With host affinity the full grouper metadata is used; otherwise only the processor ids are passed.
    Set<ContainerModel> containerModels = new ClusterManagerConfig(refreshedConfig).getHostAffinityEnabled()
        ? containerGrouper.group(taskModels, grouperMetadata)
        : containerGrouper.group(taskModels, new ArrayList<>(grouperMetadata.getProcessorLocality().keySet()));

    Map<String, ContainerModel> containersById = containerModels.stream()
        .collect(Collectors.toMap(ContainerModel::getId, model -> model));
    return new JobModel(refreshedConfig, containersById);
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) SSPGrouperProxy(org.apache.samza.container.grouper.stream.SSPGrouperProxy) JobConfig(org.apache.samza.config.JobConfig) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) MapConfig(org.apache.samza.config.MapConfig) TaskConfig(org.apache.samza.config.TaskConfig) Config(org.apache.samza.config.Config) ArrayList(java.util.ArrayList) TaskConfig(org.apache.samza.config.TaskConfig) JobConfig(org.apache.samza.config.JobConfig) ContainerModel(org.apache.samza.job.model.ContainerModel) ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) JobModel(org.apache.samza.job.model.JobModel) MapConfig(org.apache.samza.config.MapConfig) TaskNameGrouperFactory(org.apache.samza.container.grouper.task.TaskNameGrouperFactory) HashSet(java.util.HashSet) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) TreeMap(java.util.TreeMap) SystemStreamPartitionGrouper(org.apache.samza.container.grouper.stream.SystemStreamPartitionGrouper) TaskNameGrouperProxy(org.apache.samza.container.grouper.task.TaskNameGrouperProxy) TaskName(org.apache.samza.container.TaskName) HashMap(java.util.HashMap) Map(java.util.Map) TreeMap(java.util.TreeMap) TaskModel(org.apache.samza.job.model.TaskModel) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition)

Example 72 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

the class TestGroupByContainerIds method testMoreTasksThanProcessors.

/**
 * Groups task models through the {@code GrouperMetadata} overload when the task locality lists three tasks
 * but only two processors are registered. The test expects a single container model, identified by
 * {@code testProcessorId1}, that holds every generated task model.
 */
@Test
public void testMoreTasksThanProcessors() {
    String testProcessorId1 = "testProcessorId1";
    String testProcessorId2 = "testProcessorId2";
    LocationId testLocationId1 = new LocationId("testLocationId1");
    LocationId testLocationId2 = new LocationId("testLocationId2");
    LocationId testLocationId3 = new LocationId("testLocationId3");
    TaskName testTaskName1 = new TaskName("testTasKId1");
    TaskName testTaskName2 = new TaskName("testTaskId2");
    TaskName testTaskName3 = new TaskName("testTaskId3");
    // Two processors, each on its own location.
    Map<String, LocationId> processorLocality = ImmutableMap.of(testProcessorId1, testLocationId1, testProcessorId2, testLocationId2);
    // Three tasks with recorded locations; the third location has no processor registered on it.
    Map<TaskName, LocationId> taskLocality = ImmutableMap.of(testTaskName1, testLocationId1, testTaskName2, testLocationId2, testTaskName3, testLocationId3);
    GrouperMetadataImpl grouperMetadata = new GrouperMetadataImpl(processorLocality, taskLocality, new HashMap<>(), new HashMap<>());
    // NOTE(review): presumably generateTaskModels(1) yields exactly one task model - confirm against ContainerMocks.
    Set<TaskModel> taskModels = generateTaskModels(1);
    Map<TaskName, TaskModel> expectedTasks = taskModels.stream().collect(Collectors.toMap(TaskModel::getTaskName, x -> x));
    ContainerModel expectedContainerModel = new ContainerModel(testProcessorId1, expectedTasks);
    Set<ContainerModel> actualContainerModels = buildSimpleGrouper().group(taskModels, grouperMetadata);
    assertEquals(1, actualContainerModels.size());
    assertEquals(ImmutableSet.of(expectedContainerModel), actualContainerModels);
}
Also used : ImmutableSet(com.google.common.collect.ImmutableSet) TaskName(org.apache.samza.container.TaskName) ImmutableMap(com.google.common.collect.ImmutableMap) Assert.assertNotNull(org.junit.Assert.assertNotNull) Partition(org.apache.samza.Partition) Set(java.util.Set) Assert.assertTrue(org.junit.Assert.assertTrue) HashMap(java.util.HashMap) TaskModel(org.apache.samza.job.model.TaskModel) Test(org.junit.Test) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) LocationId(org.apache.samza.runtime.LocationId) HashSet(java.util.HashSet) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ContainerModel(org.apache.samza.job.model.ContainerModel) ContainerMocks.generateTaskModels(org.apache.samza.container.mock.ContainerMocks.generateTaskModels) Map(java.util.Map) ContainerMocks.getTaskName(org.apache.samza.container.mock.ContainerMocks.getTaskName) Config(org.apache.samza.config.Config) Collections(java.util.Collections) MapConfig(org.apache.samza.config.MapConfig) Assert.assertEquals(org.junit.Assert.assertEquals) LocationId(org.apache.samza.runtime.LocationId) ContainerModel(org.apache.samza.job.model.ContainerModel) TaskName(org.apache.samza.container.TaskName) ContainerMocks.getTaskName(org.apache.samza.container.mock.ContainerMocks.getTaskName) TaskModel(org.apache.samza.job.model.TaskModel) Test(org.junit.Test)

Example 73 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

the class TestGroupByContainerIds method testGroupHappyPathWithListOfContainers.

/**
 * Groups five generated task models over the explicit container id list {@code ["4", "2"]} and checks the
 * resulting distribution: container "4" receives tasks 0, 2 and 4, container "2" receives tasks 1 and 3.
 */
@Test
public void testGroupHappyPathWithListOfContainers() {
    Set<TaskModel> taskModels = generateTaskModels(5);
    // Plain list instead of double-brace initialization, which would create an anonymous ArrayList subclass
    // holding a reference to this test instance.
    List<String> containerIds = new ArrayList<>();
    containerIds.add("4");
    containerIds.add("2");
    Set<ContainerModel> containers = buildSimpleGrouper().group(taskModels, containerIds);
    // Index the result by container id so each container can be inspected individually.
    Map<String, ContainerModel> containersMap = new HashMap<>();
    for (ContainerModel container : containers) {
        containersMap.put(container.getId(), container);
    }
    assertEquals(2, containers.size());
    ContainerModel container0 = containersMap.get("4");
    ContainerModel container1 = containersMap.get("2");
    assertNotNull(container0);
    assertNotNull(container1);
    assertEquals("4", container0.getId());
    assertEquals("2", container1.getId());
    assertEquals(3, container0.getTasks().size());
    assertEquals(2, container1.getTasks().size());
    assertTrue(container0.getTasks().containsKey(getTaskName(0)));
    assertTrue(container0.getTasks().containsKey(getTaskName(2)));
    assertTrue(container0.getTasks().containsKey(getTaskName(4)));
    assertTrue(container1.getTasks().containsKey(getTaskName(1)));
    assertTrue(container1.getTasks().containsKey(getTaskName(3)));
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaskModel(org.apache.samza.job.model.TaskModel) ContainerModel(org.apache.samza.job.model.ContainerModel) Test(org.junit.Test)

Example 74 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

the class TestGroupByContainerIds method testShouldUseTaskLocalityWhenGeneratingContainerModels.

/**
 * Three processors and three tasks are each pinned to one of three locations. The grouper, given this
 * locality through {@code GrouperMetadataImpl}, is expected to place every task in the container of the
 * processor that shares its location.
 */
@Test
public void testShouldUseTaskLocalityWhenGeneratingContainerModels() {
    TaskNameGrouper grouper = buildSimpleGrouper(3);

    // Processor ids.
    String firstProcessor = "testProcessorId1";
    String secondProcessor = "testProcessorId2";
    String thirdProcessor = "testProcessorId3";

    // Locations shared by processors and tasks.
    LocationId firstLocation = new LocationId("testLocationId1");
    LocationId secondLocation = new LocationId("testLocationId2");
    LocationId thirdLocation = new LocationId("testLocationId3");

    // Tasks, each with an empty SSP set and its own changelog partition.
    TaskName firstTaskName = new TaskName("testTasKId1");
    TaskName secondTaskName = new TaskName("testTaskId2");
    TaskName thirdTaskName = new TaskName("testTaskId3");
    TaskModel firstTask = new TaskModel(firstTaskName, new HashSet<>(), new Partition(0));
    TaskModel secondTask = new TaskModel(secondTaskName, new HashSet<>(), new Partition(1));
    TaskModel thirdTask = new TaskModel(thirdTaskName, new HashSet<>(), new Partition(2));

    Map<String, LocationId> processorLocality = ImmutableMap.of(
        firstProcessor, firstLocation,
        secondProcessor, secondLocation,
        thirdProcessor, thirdLocation);
    Map<TaskName, LocationId> taskLocality = ImmutableMap.of(
        firstTaskName, firstLocation,
        secondTaskName, secondLocation,
        thirdTaskName, thirdLocation);
    GrouperMetadataImpl grouperMetadata =
        new GrouperMetadataImpl(processorLocality, taskLocality, new HashMap<>(), new HashMap<>());

    Set<TaskModel> taskModels = ImmutableSet.of(firstTask, secondTask, thirdTask);

    // One container per processor, each holding the task that lives on the same location.
    ContainerModel firstContainer = new ContainerModel(firstProcessor, ImmutableMap.of(firstTaskName, firstTask));
    ContainerModel secondContainer = new ContainerModel(secondProcessor, ImmutableMap.of(secondTaskName, secondTask));
    ContainerModel thirdContainer = new ContainerModel(thirdProcessor, ImmutableMap.of(thirdTaskName, thirdTask));
    Set<ContainerModel> expected = ImmutableSet.of(firstContainer, secondContainer, thirdContainer);

    Set<ContainerModel> actual = grouper.group(taskModels, grouperMetadata);
    assertEquals(expected, actual);
}
Also used : Partition(org.apache.samza.Partition) LocationId(org.apache.samza.runtime.LocationId) ContainerModel(org.apache.samza.job.model.ContainerModel) TaskName(org.apache.samza.container.TaskName) ContainerMocks.getTaskName(org.apache.samza.container.mock.ContainerMocks.getTaskName) TaskModel(org.apache.samza.job.model.TaskModel) Test(org.junit.Test)

Example 75 with ContainerModel

use of org.apache.samza.job.model.ContainerModel in project samza by apache.

the class TestGroupByContainerIds method testFewerTasksThanContainers.

/**
 * Offers two container ids to the grouper while only {@code generateTaskModels(1)} worth of task models
 * exist. The expected outcome is a single container model with id "1" that owns all generated task models.
 */
@Test
public void testFewerTasksThanContainers() {
    String firstContainerId = "1";
    String secondContainerId = "2";
    Set<TaskModel> generatedTasks = generateTaskModels(1);
    List<String> offeredContainerIds = ImmutableList.of(firstContainerId, secondContainerId);

    // Expected result: everything lands in the first container.
    Map<TaskName, TaskModel> tasksByName = generatedTasks.stream()
        .collect(Collectors.toMap(TaskModel::getTaskName, model -> model));
    ContainerModel soleContainer = new ContainerModel(firstContainerId, tasksByName);

    Set<ContainerModel> grouped = buildSimpleGrouper().group(generatedTasks, offeredContainerIds);

    assertEquals(1, grouped.size());
    assertEquals(ImmutableSet.of(soleContainer), grouped);
}
Also used : ImmutableSet(com.google.common.collect.ImmutableSet) TaskName(org.apache.samza.container.TaskName) ImmutableMap(com.google.common.collect.ImmutableMap) Assert.assertNotNull(org.junit.Assert.assertNotNull) Partition(org.apache.samza.Partition) Set(java.util.Set) Assert.assertTrue(org.junit.Assert.assertTrue) HashMap(java.util.HashMap) TaskModel(org.apache.samza.job.model.TaskModel) Test(org.junit.Test) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) LocationId(org.apache.samza.runtime.LocationId) HashSet(java.util.HashSet) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ContainerModel(org.apache.samza.job.model.ContainerModel) ContainerMocks.generateTaskModels(org.apache.samza.container.mock.ContainerMocks.generateTaskModels) Map(java.util.Map) ContainerMocks.getTaskName(org.apache.samza.container.mock.ContainerMocks.getTaskName) Config(org.apache.samza.config.Config) Collections(java.util.Collections) MapConfig(org.apache.samza.config.MapConfig) Assert.assertEquals(org.junit.Assert.assertEquals) TaskName(org.apache.samza.container.TaskName) ContainerMocks.getTaskName(org.apache.samza.container.mock.ContainerMocks.getTaskName) TaskModel(org.apache.samza.job.model.TaskModel) ContainerModel(org.apache.samza.job.model.ContainerModel) Test(org.junit.Test)

Aggregations

ContainerModel (org.apache.samza.job.model.ContainerModel): 96; TaskModel (org.apache.samza.job.model.TaskModel): 68; TaskName (org.apache.samza.container.TaskName): 60; Test (org.junit.Test): 57; HashMap (java.util.HashMap): 53; JobModel (org.apache.samza.job.model.JobModel): 37; MapConfig (org.apache.samza.config.MapConfig): 30; Config (org.apache.samza.config.Config): 28; Partition (org.apache.samza.Partition): 24; SystemStreamPartition (org.apache.samza.system.SystemStreamPartition): 22; StorageConfig (org.apache.samza.config.StorageConfig): 19; Map (java.util.Map): 18; JobConfig (org.apache.samza.config.JobConfig): 18; TaskConfig (org.apache.samza.config.TaskConfig): 18; HashSet (java.util.HashSet): 16; ArrayList (java.util.ArrayList): 14; ClusterManagerConfig (org.apache.samza.config.ClusterManagerConfig): 12; LocationId (org.apache.samza.runtime.LocationId): 12; Collectors (java.util.stream.Collectors): 10; SystemStream (org.apache.samza.system.SystemStream): 10