Search in sources:

Example 6 with GrouperMetadataImpl

use of org.apache.samza.container.grouper.task.GrouperMetadataImpl in project samza by apache.

the class TestGroupByPartitionWithGrouperProxy method testSingleStreamRepartitioning.

/**
 * Verifies grouping when a single stream ("PVE" on system "kafka") goes from four partitions in the
 * previous grouping to eight partitions in the current set. The same grouper metadata is fed to a
 * proxy built with {@code buildSspGrouperProxy(true)} and to one built with
 * {@code buildSspGrouperProxy(false)}, and each result is compared against its own expected map.
 */
@Test
public void testSingleStreamRepartitioning() {
    // Previous assignment: four tasks, each owning exactly one partition of the stream.
    Map<TaskName, List<SystemStreamPartition>> previousGrouping =
        ImmutableMap.<TaskName, List<SystemStreamPartition>>builder()
            .put(new TaskName("Partition 0"), ImmutableList.of(new SystemStreamPartition("kafka", "PVE", new Partition(0))))
            .put(new TaskName("Partition 1"), ImmutableList.of(new SystemStreamPartition("kafka", "PVE", new Partition(1))))
            .put(new TaskName("Partition 2"), ImmutableList.of(new SystemStreamPartition("kafka", "PVE", new Partition(2))))
            .put(new TaskName("Partition 3"), ImmutableList.of(new SystemStreamPartition("kafka", "PVE", new Partition(3))))
            .build();

    // Current input: the same stream, now with partitions 0 through 7.
    Set<SystemStreamPartition> currentPartitions = IntStream.range(0, 8)
        .mapToObj(id -> new SystemStreamPartition("kafka", "PVE", new Partition(id)))
        .collect(Collectors.toSet());

    // Expected result for the proxy built with 'true': the four previous tasks remain and each
    // additionally owns the partition whose id is four higher than its original one.
    Map<TaskName, Set<SystemStreamPartition>> statefulExpectation =
        ImmutableMap.<TaskName, Set<SystemStreamPartition>>builder()
            .put(new TaskName("Partition 1"), ImmutableSet.of(
                new SystemStreamPartition("kafka", "PVE", new Partition(1)),
                new SystemStreamPartition("kafka", "PVE", new Partition(5))))
            .put(new TaskName("Partition 0"), ImmutableSet.of(
                new SystemStreamPartition("kafka", "PVE", new Partition(0)),
                new SystemStreamPartition("kafka", "PVE", new Partition(4))))
            .put(new TaskName("Partition 3"), ImmutableSet.of(
                new SystemStreamPartition("kafka", "PVE", new Partition(7)),
                new SystemStreamPartition("kafka", "PVE", new Partition(3))))
            .put(new TaskName("Partition 2"), ImmutableSet.of(
                new SystemStreamPartition("kafka", "PVE", new Partition(2)),
                new SystemStreamPartition("kafka", "PVE", new Partition(6))))
            .build();

    // Expected result for the proxy built with 'false': one task per current partition.
    Map<TaskName, Set<SystemStreamPartition>> statelessExpectation =
        ImmutableMap.<TaskName, Set<SystemStreamPartition>>builder()
            .put(new TaskName("Partition 0"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(0))))
            .put(new TaskName("Partition 1"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(1))))
            .put(new TaskName("Partition 2"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(2))))
            .put(new TaskName("Partition 3"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(3))))
            .put(new TaskName("Partition 4"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(4))))
            .put(new TaskName("Partition 5"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(5))))
            .put(new TaskName("Partition 6"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(6))))
            .put(new TaskName("Partition 7"), ImmutableSet.of(new SystemStreamPartition("kafka", "PVE", new Partition(7))))
            .build();

    // SSPGrouperProxy for stateful job
    SSPGrouperProxy statefulProxy = buildSspGrouperProxy(true);
    GrouperMetadata metadata = new GrouperMetadataImpl(new HashMap<>(), new HashMap<>(), previousGrouping, new HashMap<>());
    Map<TaskName, Set<SystemStreamPartition>> statefulResult = statefulProxy.group(currentPartitions, metadata);
    Assert.assertEquals(statefulExpectation, statefulResult);

    // SSPGrouperProxy for stateless job
    SSPGrouperProxy statelessProxy = buildSspGrouperProxy(false);
    Map<TaskName, Set<SystemStreamPartition>> statelessResult = statelessProxy.group(currentPartitions, metadata);
    Assert.assertEquals(statelessExpectation, statelessResult);
}
Also used : IntStream(java.util.stream.IntStream) StorageConfig(org.apache.samza.config.StorageConfig) ImmutableSet(com.google.common.collect.ImmutableSet) TaskName(org.apache.samza.container.TaskName) ImmutableMap(com.google.common.collect.ImmutableMap) Partition(org.apache.samza.Partition) Set(java.util.Set) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) HashMap(java.util.HashMap) Test(org.junit.Test) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Collectors(java.util.stream.Collectors) HashSet(java.util.HashSet) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) GrouperMetadata(org.apache.samza.container.grouper.task.GrouperMetadata) Assert(org.junit.Assert) MapConfig(org.apache.samza.config.MapConfig) Partition(org.apache.samza.Partition) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) TaskName(org.apache.samza.container.TaskName) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) GrouperMetadata(org.apache.samza.container.grouper.task.GrouperMetadata) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Test(org.junit.Test)

Example 7 with GrouperMetadataImpl

use of org.apache.samza.container.grouper.task.GrouperMetadataImpl in project samza by apache.

the class ZkJobCoordinator method getGrouperMetadata.

/**
 * Builds the {@link GrouperMetadataImpl} based upon the provided {@code jobModelVersion}
 * and {@code processorNodes}.
 *
 * <p>When {@code jobModelVersion} is non-null, the job model for that version is read through
 * {@code readJobModelFromMetadataStore} and flattened into a task-to-processor map and a
 * task-to-partitions map. When it is {@code null}, both maps are left empty. Processor locality
 * is taken from {@code processorNodes}; task locality is read through {@code zkUtils}.
 *
 * @param jobModelVersion the most recent jobModelVersion available in the zookeeper; may be {@code null}.
 * @param processorNodes the list of live processors in the zookeeper.
 * @return the built grouper metadata.
 */
private GrouperMetadataImpl getGrouperMetadata(String jobModelVersion, List<ProcessorNode> processorNodes) {
    Map<TaskName, String> taskToProcessorId = new HashMap<>();
    Map<TaskName, List<SystemStreamPartition>> taskToSSPs = new HashMap<>();
    if (jobModelVersion != null) {
        JobModel jobModel = readJobModelFromMetadataStore(jobModelVersion);
        for (ContainerModel containerModel : jobModel.getContainers().values()) {
            for (TaskModel taskModel : containerModel.getTasks().values()) {
                TaskName taskName = taskModel.getTaskName();
                taskToProcessorId.put(taskName, containerModel.getId());
                // An entry is only created once a task contributes its first partition, so a task
                // without partitions never appears in taskToSSPs.
                for (SystemStreamPartition partition : taskModel.getSystemStreamPartitions()) {
                    taskToSSPs.computeIfAbsent(taskName, k -> new ArrayList<>()).add(partition);
                }
            }
        }
    }
    Map<String, LocationId> processorLocality = new HashMap<>();
    for (ProcessorNode processorNode : processorNodes) {
        ProcessorData processorData = processorNode.getProcessorData();
        processorLocality.put(processorData.getProcessorId(), processorData.getLocationId());
    }
    Map<TaskName, LocationId> taskLocality = zkUtils.readTaskLocality();
    return new GrouperMetadataImpl(processorLocality, taskLocality, taskToSSPs, taskToProcessorId);
}
Also used : HashMap(java.util.HashMap) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) LocationId(org.apache.samza.runtime.LocationId) ContainerModel(org.apache.samza.job.model.ContainerModel) ProcessorNode(org.apache.samza.zk.ZkUtils.ProcessorNode) TaskName(org.apache.samza.container.TaskName) List(java.util.List) ArrayList(java.util.ArrayList) JobModel(org.apache.samza.job.model.JobModel) TaskModel(org.apache.samza.job.model.TaskModel) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition)

Example 8 with GrouperMetadataImpl

use of org.apache.samza.container.grouper.task.GrouperMetadataImpl in project samza by apache.

the class PassthroughJobCoordinator method getJobModel.

/**
 * Calculates the {@link JobModel} for this coordinator.
 *
 * <p>The grouper metadata contains a single processor-locality entry, mapping the processor id
 * read from {@code JobConfig.PROCESSOR_ID} to this coordinator's {@code locationId}; all other
 * metadata maps are empty. The {@link SystemAdmins} instance created here is started before the
 * calculation and always stopped afterwards, including when the calculation throws.
 *
 * @return the job model produced by {@code JobModelCalculator.INSTANCE.calculateJobModel}.
 */
@Override
public JobModel getJobModel() {
    SystemAdmins systemAdmins = new SystemAdmins(config, this.getClass().getSimpleName());
    // NOTE(review): 5000 is presumably the cache TTL in milliseconds — confirm against StreamMetadataCache.
    StreamMetadataCache streamMetadataCache = new StreamMetadataCache(systemAdmins, 5000, SystemClock.instance());
    systemAdmins.start();
    try {
        // The processor id is configured as an int but used as a String key.
        String containerId = Integer.toString(config.getInt(JobConfig.PROCESSOR_ID));
        GrouperMetadata grouperMetadata = new GrouperMetadataImpl(ImmutableMap.of(containerId, locationId), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap());
        return JobModelCalculator.INSTANCE.calculateJobModel(this.config, Collections.emptyMap(), streamMetadataCache, grouperMetadata);
    } finally {
        systemAdmins.stop();
    }
}
Also used : StreamMetadataCache(org.apache.samza.system.StreamMetadataCache) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) GrouperMetadata(org.apache.samza.container.grouper.task.GrouperMetadata) SystemAdmins(org.apache.samza.system.SystemAdmins)

Example 9 with GrouperMetadataImpl

use of org.apache.samza.container.grouper.task.GrouperMetadataImpl in project samza by apache.

the class AzureJobCoordinator method doOnProcessorChange.

/**
 * Called only by the leader, either when the processor becomes the leader, or when the list of live processors changes.
 *
 * <p>Steps performed, in order:
 * <ol>
 *   <li>If there are more processors than tasks, trims {@code currentProcessorIds} in place down to the
 *       task count, never removing this processor's own id.</li>
 *   <li>Determines the next job model version. If the leader blob holds a newer version than the current
 *       one, the previous barrier is treated as timed out and a TIMEOUT barrier state is published.</li>
 *   <li>Calculates a new job model and publishes the job model, a START barrier state and the live
 *       processor list; calls {@code stop()} if any of those writes fails.</li>
 *   <li>Starts a scheduler that checks for barrier completion.</li>
 * </ol>
 *
 * @param currentProcessorIds New updated list of processor IDs which caused the rebalancing. An empty
 *                            list means the call originates from 'onBecomeLeader'. The list may be
 *                            modified by this method.
 */
private void doOnProcessorChange(List<String> currentProcessorIds) {
    // if list of processors is empty - it means we are called from 'onBecomeLeader'
    // Snapshot taken before trimming: this untrimmed list is what gets published and tracked by the barrier.
    List<String> initialProcessorIds = new ArrayList<>(currentProcessorIds);
    // Check if number of processors is greater than number of tasks
    int numTasks = getMaxNumTasks();
    if (currentProcessorIds.size() > numTasks) {
        // Drop processors other than this one until the count matches the number of tasks.
        // The index only advances when an element is kept: after a removal the next candidate
        // has already shifted into the current slot. The bounds check guarantees termination
        // even if nothing more can be removed.
        int index = 0;
        while (currentProcessorIds.size() > numTasks && index < currentProcessorIds.size()) {
            if (currentProcessorIds.get(index).equals(processorId)) {
                index++;
            } else {
                currentProcessorIds.remove(index);
            }
        }
    }
    LOG.info("currentProcessorIds = {}", currentProcessorIds);
    LOG.info("initialProcessorIds = {}", initialProcessorIds);
    String nextJMVersion;
    String prevJMVersion = currentJMVersion.get();
    JobModel prevJobModel = jobModel;
    AtomicBoolean barrierTimeout = new AtomicBoolean(false);
    if (currentProcessorIds.isEmpty()) {
        if (currentJMVersion.get().equals(INITIAL_STATE)) {
            nextJMVersion = "1";
        } else {
            nextJMVersion = Integer.toString(Integer.parseInt(prevJMVersion) + 1);
        }
        currentProcessorIds = new ArrayList<>(table.getActiveProcessorsList(currentJMVersion));
        initialProcessorIds = currentProcessorIds;
    } else {
        // Check if previous barrier not reached, then previous barrier times out.
        String blobJMV = leaderBlob.getJobModelVersion();
        nextJMVersion = Integer.toString(Integer.parseInt(prevJMVersion) + 1);
        if (blobJMV != null && Integer.parseInt(blobJMV) > Integer.parseInt(prevJMVersion)) {
            // The blob is ahead of the current version: continue from the blob's version and job model.
            prevJMVersion = blobJMV;
            prevJobModel = leaderBlob.getJobModel();
            nextJMVersion = Integer.toString(Integer.parseInt(blobJMV) + 1);
            versionUpgradeDetected.getAndSet(false);
            leaderBarrierScheduler.shutdown();
            leaderBlob.publishBarrierState(BarrierState.TIMEOUT.name() + " " + blobJMV, azureLeaderElector.getLeaseId().get());
        }
    }
    // Generate the new JobModel
    GrouperMetadata grouperMetadata = new GrouperMetadataImpl(Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap());
    JobModel newJobModel = JobModelCalculator.INSTANCE.calculateJobModel(this.config, Collections.emptyMap(), streamMetadataCache, grouperMetadata);
    LOG.info("pid=" + processorId + "Generated new Job Model. Version = " + nextJMVersion);
    // Publish the new job model
    boolean jmWrite = leaderBlob.publishJobModel(prevJobModel, newJobModel, prevJMVersion, nextJMVersion, azureLeaderElector.getLeaseId().get());
    // Publish barrier state
    boolean barrierWrite = leaderBlob.publishBarrierState(BarrierState.START.name() + " " + nextJMVersion, azureLeaderElector.getLeaseId().get());
    barrierTimeout.set(false);
    // Publish list of processors this function was called with
    boolean processorWrite = leaderBlob.publishLiveProcessorList(initialProcessorIds, azureLeaderElector.getLeaseId().get());
    // Shut down processor if write fails even after retries. These writes have an inherent retry policy.
    if (!jmWrite || !barrierWrite || !processorWrite) {
        LOG.info("Leader failed to publish the job model {}. Stopping the processor with PID: {}.", jobModel, processorId);
        stop();
        // NOTE(review): execution continues and still schedules the barrier check after stop() — confirm this is intended.
    }
    LOG.info("pid=" + processorId + "Published new Job Model. Version = " + nextJMVersion);
    // Start scheduler to check if barrier reached
    long startTime = System.currentTimeMillis();
    leaderBarrierScheduler = new LeaderBarrierCompleteScheduler(errorHandler, table, nextJMVersion, initialProcessorIds, startTime, barrierTimeout, currentJMVersion, processorId);
    leaderBarrierScheduler.setStateChangeListener(createLeaderBarrierCompleteListener(nextJMVersion, barrierTimeout));
    leaderBarrierScheduler.scheduleTask();
}
Also used : AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) LeaderBarrierCompleteScheduler(org.apache.samza.coordinator.scheduler.LeaderBarrierCompleteScheduler) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) ArrayList(java.util.ArrayList) JobModel(org.apache.samza.job.model.JobModel) GrouperMetadata(org.apache.samza.container.grouper.task.GrouperMetadata)

Example 10 with GrouperMetadataImpl

use of org.apache.samza.container.grouper.task.GrouperMetadataImpl in project samza by apache.

the class JobModelHelper method getGrouperMetadata.

/**
 * Assembles the {@link GrouperMetadata} from the stored locality, task assignment and
 * task-to-partition assignment information.
 *
 * <p>Only tasks for which {@code isActiveTask} returns true are included in the task-to-processor,
 * task-locality and task-to-partition maps. A task's locality is the locality of the processor it
 * is assigned to; it is omitted when that processor has no known locality.
 *
 * @param config the configuration passed to {@code getProcessorLocality}.
 * @param localityManager the source of processor locality.
 * @param taskAssignmentManager the source of task modes and task-to-container assignments.
 * @param taskPartitionAssignmentManager the source of partition-to-task assignments.
 * @return the grouper metadata built from the inputs above.
 */
private GrouperMetadata getGrouperMetadata(Config config, LocalityManager localityManager, TaskAssignmentManager taskAssignmentManager, TaskPartitionAssignmentManager taskPartitionAssignmentManager) {
    Map<String, LocationId> processorLocality = getProcessorLocality(config, localityManager);
    Map<TaskName, TaskMode> taskModes = taskAssignmentManager.readTaskModes();
    Map<TaskName, String> taskNameToProcessorId = new HashMap<>();
    Map<TaskName, LocationId> taskLocality = new HashMap<>();
    // We read the taskAssignment only for ActiveTasks, i.e., tasks that have no task-mode or have an active task mode
    taskAssignmentManager.readTaskAssignment().forEach((taskNameString, containerId) -> {
        TaskName taskName = new TaskName(taskNameString);
        if (isActiveTask(taskName, taskModes)) {
            taskNameToProcessorId.put(taskName, containerId);
            if (processorLocality.containsKey(containerId)) {
                taskLocality.put(taskName, processorLocality.get(containerId));
            }
        }
    });
    Map<SystemStreamPartition, List<String>> sspToTaskMapping = taskPartitionAssignmentManager.readTaskPartitionAssignments();
    Map<TaskName, List<SystemStreamPartition>> taskPartitionAssignments = new HashMap<>();
    // Task to partition assignments are stored as SystemStreamPartition to list of TaskName in the
    // coordinator stream. This is done due to the 1 MB value size limit in a kafka topic. Conversion to
    // taskName to SystemStreamPartitions is done here to wire-in the data to JobModel.
    sspToTaskMapping.forEach((systemStreamPartition, taskNames) -> taskNames.forEach(taskNameString -> {
        TaskName taskName = new TaskName(taskNameString);
        if (isActiveTask(taskName, taskModes)) {
            // computeIfAbsent creates the list only for a task's first partition and returns it directly.
            taskPartitionAssignments.computeIfAbsent(taskName, k -> new ArrayList<>()).add(systemStreamPartition);
        }
    }));
    return new GrouperMetadataImpl(processorLocality, taskLocality, taskPartitionAssignments, taskNameToProcessorId);
}
Also used : StreamMetadataCache(org.apache.samza.system.StreamMetadataCache) TaskPartitionAssignmentManager(org.apache.samza.container.grouper.task.TaskPartitionAssignmentManager) LoggerFactory(org.slf4j.LoggerFactory) JobConfig(org.apache.samza.config.JobConfig) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) HashMap(java.util.HashMap) TaskModel(org.apache.samza.job.model.TaskModel) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) StringUtils(org.apache.commons.lang3.StringUtils) ArrayList(java.util.ArrayList) LocationId(org.apache.samza.runtime.LocationId) HashSet(java.util.HashSet) Map(java.util.Map) GrouperMetadata(org.apache.samza.container.grouper.task.GrouperMetadata) TaskAssignmentManager(org.apache.samza.container.grouper.task.TaskAssignmentManager) JobModel(org.apache.samza.job.model.JobModel) TaskName(org.apache.samza.container.TaskName) Logger(org.slf4j.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) LocalityManager(org.apache.samza.container.LocalityManager) List(java.util.List) TaskMode(org.apache.samza.job.model.TaskMode) ContainerModel(org.apache.samza.job.model.ContainerModel) ProcessorLocality(org.apache.samza.job.model.ProcessorLocality) Optional(java.util.Optional) Config(org.apache.samza.config.Config) HashMap(java.util.HashMap) GrouperMetadataImpl(org.apache.samza.container.grouper.task.GrouperMetadataImpl) LocationId(org.apache.samza.runtime.LocationId) ArrayList(java.util.ArrayList) TaskMode(org.apache.samza.job.model.TaskMode) TaskName(org.apache.samza.container.TaskName) ArrayList(java.util.ArrayList) List(java.util.List) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition)

Aggregations

GrouperMetadataImpl (org.apache.samza.container.grouper.task.GrouperMetadataImpl)14 GrouperMetadata (org.apache.samza.container.grouper.task.GrouperMetadata)13 List (java.util.List)12 TaskName (org.apache.samza.container.TaskName)12 SystemStreamPartition (org.apache.samza.system.SystemStreamPartition)12 HashMap (java.util.HashMap)11 Set (java.util.Set)11 ImmutableList (com.google.common.collect.ImmutableList)10 ImmutableSet (com.google.common.collect.ImmutableSet)10 Map (java.util.Map)10 Collectors (java.util.stream.Collectors)10 Partition (org.apache.samza.Partition)10 Test (org.junit.Test)10 ImmutableMap (com.google.common.collect.ImmutableMap)9 IntStream (java.util.stream.IntStream)9 MapConfig (org.apache.samza.config.MapConfig)9 StorageConfig (org.apache.samza.config.StorageConfig)9 Assert (org.junit.Assert)9 HashSet (java.util.HashSet)6 ArrayList (java.util.ArrayList)3