
Example 11 with Partition

Use of org.apache.helix.model.Partition in the Apache Helix project.

The class DelayedAutoRebalancer, method computeBestPossiblePartitionState.

/**
 * Compute the best possible state for all partitions of the resource.
 * This is the default implementation; subclasses should override this method
 * if their logic for generating the best possible map for each partition differs from the default one here.
 *
 * @param cache cluster data cache holding instances, configs, and state model definitions
 * @param idealState the ideal state of the resource
 * @param resource the resource whose partitions are being mapped
 * @param currentStateOutput provides the current state and pending state transitions for all partitions
 * @return the best possible instance-to-state assignment for each partition of the resource
 */
@Override
public ResourceAssignment computeBestPossiblePartitionState(ClusterDataCache cache, IdealState idealState, Resource resource, CurrentStateOutput currentStateOutput) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Processing resource:" + resource.getResourceName());
    }
    Set<String> allNodes = cache.getEnabledInstances();
    Set<String> liveNodes = cache.getLiveInstances().keySet();
    ClusterConfig clusterConfig = cache.getClusterConfig();
    long delayTime = getRebalanceDelay(idealState, clusterConfig);
    Set<String> activeNodes = getActiveInstances(allNodes, idealState, liveNodes, cache.getInstanceOfflineTimeMap(), cache.getLiveInstances().keySet(), cache.getInstanceConfigMap(), delayTime, clusterConfig);
    String stateModelDefName = idealState.getStateModelDefRef();
    StateModelDefinition stateModelDef = cache.getStateModelDef(stateModelDefName);
    ResourceAssignment partitionMapping = new ResourceAssignment(resource.getResourceName());
    for (Partition partition : resource.getPartitions()) {
        Map<String, String> currentStateMap = currentStateOutput.getCurrentStateMap(resource.getResourceName(), partition);
        Set<String> disabledInstancesForPartition = cache.getDisabledInstancesForPartition(resource.getResourceName(), partition.toString());
        List<String> preferenceList = getPreferenceList(partition, idealState, activeNodes);
        Map<String, String> bestStateForPartition = computeBestPossibleStateForPartition(liveNodes, stateModelDef, preferenceList, currentStateMap, disabledInstancesForPartition, idealState);
        partitionMapping.addReplicaMap(partition, bestStateForPartition);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Best possible mapping for resource  " + resource.getResourceName() + ": " + partitionMapping);
    }
    return partitionMapping;
}
Also used : Partition(org.apache.helix.model.Partition) ResourceAssignment(org.apache.helix.model.ResourceAssignment) StateModelDefinition(org.apache.helix.model.StateModelDefinition) ClusterConfig(org.apache.helix.model.ClusterConfig)
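
The method above ultimately returns a ResourceAssignment, which maps each Partition to an (instance -> state) replica map. The following minimal sketch (resource, partition, and instance names are hypothetical) shows that structure being built and read back with the same calls the rebalancer uses:

import java.util.Map;

import org.apache.helix.model.Partition;
import org.apache.helix.model.ResourceAssignment;

import com.google.common.collect.ImmutableMap;

/**
 * Minimal sketch of the data structure computeBestPossiblePartitionState returns:
 * a ResourceAssignment maps each Partition to an (instance -> state) replica map.
 * The resource, partition, and instance names here are hypothetical.
 */
public class ResourceAssignmentSketch {
    public static void main(String[] args) {
        ResourceAssignment assignment = new ResourceAssignment("myResource");

        // One replica map per partition, exactly as the rebalancer's loop builds it.
        assignment.addReplicaMap(new Partition("myResource_0"),
            ImmutableMap.of("instance1", "MASTER", "instance2", "SLAVE"));
        assignment.addReplicaMap(new Partition("myResource_1"),
            ImmutableMap.of("instance2", "MASTER", "instance1", "SLAVE"));

        // Reading the mapping back, as a caller of the rebalancer would.
        Map<String, String> replicaMap = assignment.getReplicaMap(new Partition("myResource_0"));
        System.out.println("myResource_0 -> " + replicaMap);
    }
}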

Example 12 with Partition

Use of org.apache.helix.model.Partition in the Apache Helix project.

The class DeprecatedTaskRebalancer, method computeResourceMapping.

private ResourceAssignment computeResourceMapping(String jobResource, WorkflowConfig workflowConfig, JobConfig jobCfg, ResourceAssignment prevAssignment, Collection<String> liveInstances, CurrentStateOutput currStateOutput, WorkflowContext workflowCtx, JobContext jobCtx, Set<Integer> partitionsToDropFromIs, ClusterDataCache cache) {
    TargetState jobTgtState = workflowConfig.getTargetState();
    // Update running status in workflow context
    if (jobTgtState == TargetState.STOP) {
        workflowCtx.setJobState(jobResource, TaskState.STOPPED);
        // Workflow has been stopped if all jobs are stopped
        if (isWorkflowStopped(workflowCtx, workflowConfig)) {
            workflowCtx.setWorkflowState(TaskState.STOPPED);
        }
    } else {
        workflowCtx.setJobState(jobResource, TaskState.IN_PROGRESS);
        // Workflow is in progress if any task is in progress
        workflowCtx.setWorkflowState(TaskState.IN_PROGRESS);
    }
    // Used to keep track of tasks that have already been assigned to instances.
    Set<Integer> assignedPartitions = new HashSet<Integer>();
    // Used to keep track of tasks that have failed, but whose failure is acceptable
    Set<Integer> skippedPartitions = new HashSet<Integer>();
    // Keeps a mapping of (partition) -> (instance, state)
    Map<Integer, PartitionAssignment> paMap = new TreeMap<Integer, PartitionAssignment>();
    Set<String> excludedInstances = getInstancesAssignedToOtherJobs(jobResource, workflowConfig, cache);
    // Process all the current assignments of tasks.
    Set<Integer> allPartitions = getAllTaskPartitions(jobCfg, jobCtx, workflowConfig, workflowCtx, cache);
    Map<String, SortedSet<Integer>> taskAssignments = getTaskPartitionAssignments(liveInstances, prevAssignment, allPartitions);
    long currentTime = System.currentTimeMillis();
    for (String instance : taskAssignments.keySet()) {
        if (excludedInstances.contains(instance)) {
            continue;
        }
        Set<Integer> pSet = taskAssignments.get(instance);
        // Used to keep track of partitions that are in one of the final states: COMPLETED, TIMED_OUT,
        // TASK_ERROR, ERROR.
        Set<Integer> donePartitions = new TreeSet<Integer>();
        for (int pId : pSet) {
            final String pName = pName(jobResource, pId);
            // Check for pending state transitions on this (partition, instance).
            Message pendingMessage = currStateOutput.getPendingState(jobResource, new Partition(pName), instance);
            if (pendingMessage != null) {
                // There is a pending state transition for this (partition, instance). Just copy forward
                // the state assignment from the previous ideal state.
                Map<String, String> stateMap = prevAssignment.getReplicaMap(new Partition(pName));
                if (stateMap != null) {
                    String prevState = stateMap.get(instance);
                    paMap.put(pId, new PartitionAssignment(instance, prevState));
                    assignedPartitions.add(pId);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Task partition %s has a pending state transition on instance %s. Using the previous ideal state which was %s.", pName, instance, prevState));
                    }
                }
                continue;
            }
            TaskPartitionState currState = TaskPartitionState.valueOf(currStateOutput.getCurrentState(jobResource, new Partition(pName), instance));
            jobCtx.setPartitionState(pId, currState);
            // Process any requested state transitions.
            String requestedStateStr = currStateOutput.getRequestedState(jobResource, new Partition(pName), instance);
            if (requestedStateStr != null && !requestedStateStr.isEmpty()) {
                TaskPartitionState requestedState = TaskPartitionState.valueOf(requestedStateStr);
                if (requestedState.equals(currState)) {
                    LOG.warn(String.format("Requested state %s is the same as the current state for instance %s.", requestedState, instance));
                }
                paMap.put(pId, new PartitionAssignment(instance, requestedState.name()));
                assignedPartitions.add(pId);
                LOG.debug(String.format("Instance %s requested a state transition to %s for partition %s.", instance, requestedState, pName));
                continue;
            }
            switch(currState) {
                case RUNNING:
                case STOPPED:
                    {
                        TaskPartitionState nextState;
                        if (jobTgtState == TargetState.START) {
                            nextState = TaskPartitionState.RUNNING;
                        } else {
                            nextState = TaskPartitionState.STOPPED;
                        }
                        paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
                        assignedPartitions.add(pId);
                        LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, nextState, instance));
                    }
                    break;
                case COMPLETED:
                    {
                        // The task has completed on this partition. Mark as such in the context object.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has completed with state %s. Marking as such in rebalancer context.", pName, currState));
                        partitionsToDropFromIs.add(pId);
                        markPartitionCompleted(jobCtx, pId);
                    }
                    break;
                case TIMED_OUT:
                case TASK_ERROR:
                case ERROR:
                    {
                        // The task may be rescheduled on a different instance.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has error state %s. Marking as such in rebalancer context.", pName, currState));
                        markPartitionError(jobCtx, pId, currState, true);
                        // Give up on the task if it has reached its maximum number of attempts.
                        if (jobCtx.getPartitionNumAttempts(pId) >= jobCfg.getMaxAttemptsPerTask()) {
                            // If the user does not require this task to succeed in order for the job to succeed,
                            // then we don't have to fail the job right now
                            boolean successOptional = false;
                            String taskId = jobCtx.getTaskIdForPartition(pId);
                            if (taskId != null) {
                                TaskConfig taskConfig = jobCfg.getTaskConfig(taskId);
                                if (taskConfig != null) {
                                    successOptional = taskConfig.isSuccessOptional();
                                }
                            }
                            // If the failure threshold has not been reached yet, we don't have to fail the job immediately.
                            if (skippedPartitions.size() < jobCfg.getFailureThreshold()) {
                                successOptional = true;
                            }
                            if (!successOptional) {
                                long finishTime = currentTime;
                                workflowCtx.setJobState(jobResource, TaskState.FAILED);
                                if (workflowConfig.isTerminable()) {
                                    workflowCtx.setWorkflowState(TaskState.FAILED);
                                    workflowCtx.setFinishTime(finishTime);
                                }
                                jobCtx.setFinishTime(finishTime);
                                markAllPartitionsError(jobCtx, currState, false);
                                addAllPartitions(allPartitions, partitionsToDropFromIs);
                                return emptyAssignment(jobResource, currStateOutput);
                            } else {
                                skippedPartitions.add(pId);
                                partitionsToDropFromIs.add(pId);
                            }
                        } else {
                            // Mark the task to be started at some later time (if enabled)
                            markPartitionDelayed(jobCfg, jobCtx, pId);
                        }
                    }
                    break;
                case INIT:
                case DROPPED:
                    {
                        // currState in [INIT, DROPPED]. Do nothing, the partition is eligible to be reassigned.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has state %s. It will be dropped from the current ideal state.", pName, currState));
                    }
                    break;
                default:
                    throw new AssertionError("Unknown enum symbol: " + currState);
            }
        }
        // Remove the set of task partitions that are completed or in one of the error states.
        pSet.removeAll(donePartitions);
    }
    // For delayed tasks, trigger a rebalance event for the closest upcoming ready time
    scheduleForNextTask(jobResource, jobCtx, currentTime);
    if (isJobComplete(jobCtx, allPartitions, skippedPartitions, jobCfg)) {
        workflowCtx.setJobState(jobResource, TaskState.COMPLETED);
        jobCtx.setFinishTime(currentTime);
        if (isWorkflowComplete(workflowCtx, workflowConfig)) {
            workflowCtx.setWorkflowState(TaskState.COMPLETED);
            workflowCtx.setFinishTime(currentTime);
        }
    }
    // Make additional task assignments if needed.
    if (jobTgtState == TargetState.START) {
        // Contains the set of task partitions that must be excluded from consideration when making
        // any new assignments.
        // This includes all completed, failed, delayed, and already assigned partitions.
        Set<Integer> excludeSet = Sets.newTreeSet(assignedPartitions);
        addCompletedPartitions(excludeSet, jobCtx, allPartitions);
        addGiveupPartitions(excludeSet, jobCtx, allPartitions, jobCfg);
        excludeSet.addAll(skippedPartitions);
        excludeSet.addAll(getNonReadyPartitions(jobCtx, currentTime));
        // Get instance->[partition, ...] mappings for the target resource.
        Map<String, SortedSet<Integer>> tgtPartitionAssignments = getTaskAssignment(currStateOutput, prevAssignment, liveInstances, jobCfg, jobCtx, workflowConfig, workflowCtx, allPartitions, cache);
        for (Map.Entry<String, SortedSet<Integer>> entry : taskAssignments.entrySet()) {
            String instance = entry.getKey();
            if (!tgtPartitionAssignments.containsKey(instance) || excludedInstances.contains(instance)) {
                continue;
            }
            // Contains the set of task partitions currently assigned to the instance.
            Set<Integer> pSet = entry.getValue();
            int numToAssign = jobCfg.getNumConcurrentTasksPerInstance() - pSet.size();
            if (numToAssign > 0) {
                List<Integer> nextPartitions = getNextPartitions(tgtPartitionAssignments.get(instance), excludeSet, numToAssign);
                for (Integer pId : nextPartitions) {
                    String pName = pName(jobResource, pId);
                    paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.RUNNING.name()));
                    excludeSet.add(pId);
                    jobCtx.setAssignedParticipant(pId, instance);
                    jobCtx.setPartitionState(pId, TaskPartitionState.INIT);
                    LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, TaskPartitionState.RUNNING, instance));
                }
            }
        }
    }
    // Construct a ResourceAssignment object from the map of partition assignments.
    ResourceAssignment ra = new ResourceAssignment(jobResource);
    for (Map.Entry<Integer, PartitionAssignment> e : paMap.entrySet()) {
        PartitionAssignment pa = e.getValue();
        ra.addReplicaMap(new Partition(pName(jobResource, e.getKey())), ImmutableMap.of(pa._instance, pa._state));
    }
    return ra;
}
Also used : Message(org.apache.helix.model.Message) SortedSet(java.util.SortedSet) ResourceAssignment(org.apache.helix.model.ResourceAssignment) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Partition(org.apache.helix.model.Partition) TreeMap(java.util.TreeMap) HashMap(java.util.HashMap) Map(java.util.Map) BiMap(com.google.common.collect.BiMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashBiMap(com.google.common.collect.HashBiMap)
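
The final step of computeResourceMapping converts the (partition id) -> (instance, state) map into a ResourceAssignment. A minimal standalone sketch of that conversion follows; the Assignment holder class and the pName() helper are hypothetical stand-ins for the rebalancer's private PartitionAssignment class and its partition-naming convention (assumed here to be <jobResource>_<partitionId>):

import java.util.Map;
import java.util.TreeMap;

import org.apache.helix.model.Partition;
import org.apache.helix.model.ResourceAssignment;

import com.google.common.collect.ImmutableMap;

/**
 * Minimal sketch of the final step of computeResourceMapping: turning a
 * (partition id -> (instance, state)) map into a ResourceAssignment.
 */
public class PaMapToAssignmentSketch {

    // Hypothetical stand-in for the rebalancer's private PartitionAssignment holder.
    static class Assignment {
        final String instance;
        final String state;
        Assignment(String instance, String state) {
            this.instance = instance;
            this.state = state;
        }
    }

    // Assumed naming convention: "<jobResource>_<partitionId>".
    static String pName(String jobResource, int pId) {
        return jobResource + "_" + pId;
    }

    public static void main(String[] args) {
        String jobResource = "myJob";
        Map<Integer, Assignment> paMap = new TreeMap<>();
        paMap.put(0, new Assignment("instance1", "RUNNING"));
        paMap.put(1, new Assignment("instance2", "RUNNING"));

        // Same conversion loop as the end of computeResourceMapping.
        ResourceAssignment ra = new ResourceAssignment(jobResource);
        for (Map.Entry<Integer, Assignment> e : paMap.entrySet()) {
            Assignment pa = e.getValue();
            ra.addReplicaMap(new Partition(pName(jobResource, e.getKey())),
                ImmutableMap.of(pa.instance, pa.state));
        }
        System.out.println(ra);
    }
}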

Example 13 with Partition

Use of org.apache.helix.model.Partition in the Apache Helix project.

The class FixedTargetTaskAssignmentCalculator, method getTgtPartitionAssignment.

/**
 * Get partition assignments for the target resource, but only for the partitions of interest.
 * @param currStateOutput The current state of the instances in the cluster.
 * @param instances The instances to consider for assignment.
 * @param tgtIs The ideal state of the target resource.
 * @param tgtStates Only partitions in this set of states will be considered. If null, partitions
 *          do not need to be in any specific state to be considered.
 * @param includeSet The set of partition ids to consider.
 * @param jobCtx The job context, used to map target partition names to task partition ids.
 * @return A map from instance name to the set of partition ids assigned to that instance.
 */
private static Map<String, SortedSet<Integer>> getTgtPartitionAssignment(CurrentStateOutput currStateOutput, Iterable<String> instances, IdealState tgtIs, Set<String> tgtStates, Set<Integer> includeSet, JobContext jobCtx) {
    Map<String, SortedSet<Integer>> result = new HashMap<String, SortedSet<Integer>>();
    for (String instance : instances) {
        result.put(instance, new TreeSet<Integer>());
    }
    Map<String, List<Integer>> partitionsByTarget = jobCtx.getPartitionsByTarget();
    for (String pName : tgtIs.getPartitionSet()) {
        List<Integer> partitions = partitionsByTarget.get(pName);
        if (partitions == null || partitions.size() < 1) {
            continue;
        }
        int pId = partitions.get(0);
        if (includeSet.contains(pId)) {
            for (String instance : instances) {
                Message pendingMessage = currStateOutput.getPendingState(tgtIs.getResourceName(), new Partition(pName), instance);
                if (pendingMessage != null) {
                    continue;
                }
                String s = currStateOutput.getCurrentState(tgtIs.getResourceName(), new Partition(pName), instance);
                if (s != null && (tgtStates == null || tgtStates.contains(s))) {
                    result.get(instance).add(pId);
                }
            }
        }
    }
    return result;
}
Also used : Partition(org.apache.helix.model.Partition) Message(org.apache.helix.model.Message) HashMap(java.util.HashMap) List(java.util.List) SortedSet(java.util.SortedSet)
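
The core of the loop above is a per-instance check against CurrentStateOutput: skip instances with a pending transition, then keep those whose current state is in the target-state set. A minimal sketch of just that check (resource, partition, and instance names are hypothetical) is shown here:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.helix.controller.stages.CurrentStateOutput;
import org.apache.helix.model.Message;
import org.apache.helix.model.Partition;

/**
 * Minimal sketch of the per-instance check inside getTgtPartitionAssignment:
 * skip instances with a pending transition, then keep those whose current
 * state is in the target-state set.
 */
public class TargetStateFilterSketch {
    public static void main(String[] args) {
        String targetResource = "targetDB";
        Partition partition = new Partition("targetDB_0");
        Set<String> tgtStates = new HashSet<>(Arrays.asList("MASTER"));

        CurrentStateOutput currStateOutput = new CurrentStateOutput();
        currStateOutput.setCurrentState(targetResource, partition, "instance1", "MASTER");
        currStateOutput.setCurrentState(targetResource, partition, "instance2", "SLAVE");

        for (String instance : Arrays.asList("instance1", "instance2")) {
            Message pending = currStateOutput.getPendingState(targetResource, partition, instance);
            if (pending != null) {
                continue; // in-flight transition: skip this (partition, instance) pair
            }
            String state = currStateOutput.getCurrentState(targetResource, partition, instance);
            if (state != null && (tgtStates == null || tgtStates.contains(state))) {
                System.out.println(instance + " hosts " + partition + " in target state " + state);
            }
        }
    }
}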

Example 14 with Partition

Use of org.apache.helix.model.Partition in the Apache Helix project.

The class TestRebalancerMetrics, method copyCurrentStateFromBestPossible.

private CurrentStateOutput copyCurrentStateFromBestPossible(BestPossibleStateOutput bestPossibleStateOutput, String resource) {
    CurrentStateOutput currentStateOutput = new CurrentStateOutput();
    PartitionStateMap partitionStateMap = bestPossibleStateOutput.getPartitionStateMap(resource);
    for (Partition partition : partitionStateMap.partitionSet()) {
        Map<String, String> stateMap = partitionStateMap.getPartitionMap(partition);
        for (String instance : stateMap.keySet()) {
            currentStateOutput.setCurrentState(resource, partition, instance, stateMap.get(instance));
        }
    }
    return currentStateOutput;
}
Also used : PartitionStateMap(org.apache.helix.controller.common.PartitionStateMap) Partition(org.apache.helix.model.Partition) CurrentStateOutput(org.apache.helix.controller.stages.CurrentStateOutput)
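
The test helper above replays a best-possible mapping as if it were the cluster's current state. A minimal sketch of the same copy pattern, seeded from a hand-built map instead of a BestPossibleStateOutput (all names hypothetical), looks like this:

import java.util.HashMap;
import java.util.Map;

import org.apache.helix.controller.stages.CurrentStateOutput;
import org.apache.helix.model.Partition;

/**
 * Minimal sketch of the copy pattern used in the test: replay a
 * (partition -> instance -> state) mapping into a CurrentStateOutput so a
 * later pipeline stage sees it as the cluster's current state.
 */
public class SeedCurrentStateSketch {
    public static void main(String[] args) {
        String resource = "testDB";

        // Hand-built source mapping; a real test would take this from a BestPossibleStateOutput.
        Map<Partition, Map<String, String>> desired = new HashMap<>();
        Map<String, String> p0States = new HashMap<>();
        p0States.put("instance1", "MASTER");
        p0States.put("instance2", "SLAVE");
        desired.put(new Partition("testDB_0"), p0States);

        CurrentStateOutput currentStateOutput = new CurrentStateOutput();
        for (Map.Entry<Partition, Map<String, String>> e : desired.entrySet()) {
            for (Map.Entry<String, String> instanceState : e.getValue().entrySet()) {
                currentStateOutput.setCurrentState(resource, e.getKey(),
                    instanceState.getKey(), instanceState.getValue());
            }
        }
        System.out.println("Seeded current state for " + resource);
    }
}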

Example 15 with Partition

Use of org.apache.helix.model.Partition in the Apache Helix project.

The class JobRebalancer, method computeResourceMapping.

private ResourceAssignment computeResourceMapping(String jobResource, WorkflowConfig workflowConfig, JobConfig jobCfg, ResourceAssignment prevTaskToInstanceStateAssignment, Collection<String> liveInstances, CurrentStateOutput currStateOutput, WorkflowContext workflowCtx, JobContext jobCtx, Set<Integer> partitionsToDropFromIs, ClusterDataCache cache) {
    TargetState jobTgtState = workflowConfig.getTargetState();
    TaskState jobState = workflowCtx.getJobState(jobResource);
    TaskState workflowState = workflowCtx.getWorkflowState();
    if (jobState == TaskState.IN_PROGRESS && (isTimeout(jobCtx.getStartTime(), jobCfg.getTimeout()) || TaskState.TIMED_OUT.equals(workflowState))) {
        jobState = TaskState.TIMING_OUT;
        workflowCtx.setJobState(jobResource, TaskState.TIMING_OUT);
    } else if (jobState != TaskState.TIMING_OUT && jobState != TaskState.FAILING) {
        // Update running status in workflow context
        if (jobTgtState == TargetState.STOP) {
            if (checkJobStopped(jobCtx)) {
                workflowCtx.setJobState(jobResource, TaskState.STOPPED);
            } else {
                workflowCtx.setJobState(jobResource, TaskState.STOPPING);
            }
            // Workflow has been stopped if all in progress jobs are stopped
            if (isWorkflowStopped(workflowCtx, workflowConfig)) {
                workflowCtx.setWorkflowState(TaskState.STOPPED);
            } else {
                workflowCtx.setWorkflowState(TaskState.STOPPING);
            }
        } else {
            workflowCtx.setJobState(jobResource, TaskState.IN_PROGRESS);
            // Workflow is in progress if any task is in progress
            workflowCtx.setWorkflowState(TaskState.IN_PROGRESS);
        }
    }
    // Used to keep track of tasks that have already been assigned to instances.
    Set<Integer> assignedPartitions = new HashSet<Integer>();
    // Used to keep track of tasks that have failed, but whose failure is acceptable
    Set<Integer> skippedPartitions = new HashSet<Integer>();
    // Keeps a mapping of (partition) -> (instance, state)
    Map<Integer, PartitionAssignment> paMap = new TreeMap<Integer, PartitionAssignment>();
    Set<String> excludedInstances = getExcludedInstances(jobResource, workflowConfig, cache);
    // Process all the current assignments of tasks.
    TaskAssignmentCalculator taskAssignmentCal = getAssignmentCalulator(jobCfg);
    Set<Integer> allPartitions = taskAssignmentCal.getAllTaskPartitions(jobCfg, jobCtx, workflowConfig, workflowCtx, cache.getIdealStates());
    if (allPartitions == null || allPartitions.isEmpty()) {
        // Empty target partitions, mark the job as FAILED.
        String failureMsg = "Empty task partition mapping for job " + jobResource + ", marked the job as FAILED!";
        LOG.info(failureMsg);
        jobCtx.setInfo(failureMsg);
        failJob(jobResource, workflowCtx, jobCtx, workflowConfig, cache.getJobConfigMap());
        markAllPartitionsError(jobCtx, TaskPartitionState.ERROR, false);
        return new ResourceAssignment(jobResource);
    }
    Map<String, SortedSet<Integer>> prevInstanceToTaskAssignments = getPrevInstanceToTaskAssignments(liveInstances, prevTaskToInstanceStateAssignment, allPartitions);
    long currentTime = System.currentTimeMillis();
    LOG.debug("All partitions: " + allPartitions + " taskAssignment: " + prevInstanceToTaskAssignments + " excludedInstances: " + excludedInstances);
    // Iterate through all instances
    for (String instance : prevInstanceToTaskAssignments.keySet()) {
        if (excludedInstances.contains(instance)) {
            continue;
        }
        Set<Integer> pSet = prevInstanceToTaskAssignments.get(instance);
        // Used to keep track of partitions that are in one of the final states: COMPLETED, TIMED_OUT,
        // TASK_ERROR, ERROR.
        Set<Integer> donePartitions = new TreeSet<Integer>();
        for (int pId : pSet) {
            final String pName = pName(jobResource, pId);
            TaskPartitionState currState = updateJobContextAndGetTaskCurrentState(currStateOutput, jobResource, pId, pName, instance, jobCtx);
            // Check for pending state transitions on this (partition, instance).
            Message pendingMessage = currStateOutput.getPendingState(jobResource, new Partition(pName), instance);
            if (pendingMessage != null && !pendingMessage.getToState().equals(currState.name())) {
                processTaskWithPendingMessage(prevTaskToInstanceStateAssignment, pId, pName, instance, pendingMessage, jobState, currState, paMap, assignedPartitions);
                continue;
            }
            // Process any requested state transitions.
            String requestedStateStr = currStateOutput.getRequestedState(jobResource, new Partition(pName), instance);
            if (requestedStateStr != null && !requestedStateStr.isEmpty()) {
                TaskPartitionState requestedState = TaskPartitionState.valueOf(requestedStateStr);
                if (requestedState.equals(currState)) {
                    LOG.warn(String.format("Requested state %s is the same as the current state for instance %s.", requestedState, instance));
                }
                paMap.put(pId, new PartitionAssignment(instance, requestedState.name()));
                assignedPartitions.add(pId);
                LOG.debug(String.format("Instance %s requested a state transition to %s for partition %s.", instance, requestedState, pName));
                continue;
            }
            switch(currState) {
                case RUNNING:
                    {
                        TaskPartitionState nextState = TaskPartitionState.RUNNING;
                        if (jobState == TaskState.TIMING_OUT) {
                            nextState = TaskPartitionState.TASK_ABORTED;
                        } else if (jobTgtState == TargetState.STOP) {
                            nextState = TaskPartitionState.STOPPED;
                        }
                        paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
                        assignedPartitions.add(pId);
                        LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, nextState, instance));
                    }
                    break;
                case STOPPED:
                    {
                        TaskPartitionState nextState;
                        if (jobTgtState == TargetState.START) {
                            nextState = TaskPartitionState.RUNNING;
                        } else {
                            nextState = TaskPartitionState.STOPPED;
                        }
                        paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
                        assignedPartitions.add(pId);
                        LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, nextState, instance));
                    }
                    break;
                case COMPLETED:
                    {
                        // The task has completed on this partition. Mark as such in the context object.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has completed with state %s. Marking as such in rebalancer context.", pName, currState));
                        partitionsToDropFromIs.add(pId);
                        markPartitionCompleted(jobCtx, pId);
                    }
                    break;
                case TIMED_OUT:
                case TASK_ERROR:
                case TASK_ABORTED:
                case ERROR:
                    {
                        // The task may be rescheduled on a different instance.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has error state %s with msg %s. Marking as such in rebalancer context.", pName, currState, jobCtx.getPartitionInfo(pId)));
                        markPartitionError(jobCtx, pId, currState, true);
                        // If the job is timing out or has timed out, skip retry handling: after all tasks are aborted, they will be dropped because of the job timeout.
                        if (jobState != TaskState.TIMED_OUT && jobState != TaskState.TIMING_OUT) {
                            if (jobCtx.getPartitionNumAttempts(pId) >= jobCfg.getMaxAttemptsPerTask() || currState.equals(TaskPartitionState.TASK_ABORTED) || currState.equals(TaskPartitionState.ERROR)) {
                                skippedPartitions.add(pId);
                                partitionsToDropFromIs.add(pId);
                                LOG.debug("skippedPartitions:" + skippedPartitions);
                            } else {
                                // Mark the task to be started at some later time (if enabled)
                                markPartitionDelayed(jobCfg, jobCtx, pId);
                            }
                        }
                    }
                    break;
                case INIT:
                case DROPPED:
                    {
                        // currState in [INIT, DROPPED]. Do nothing, the partition is eligible to be reassigned.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has state %s. It will be dropped from the current ideal state.", pName, currState));
                    }
                    break;
                default:
                    throw new AssertionError("Unknown enum symbol: " + currState);
            }
        }
        // Remove the set of task partitions that are completed or in one of the error states.
        pSet.removeAll(donePartitions);
    }
    addGiveupPartitions(skippedPartitions, jobCtx, allPartitions, jobCfg);
    if (jobState == TaskState.IN_PROGRESS && skippedPartitions.size() > jobCfg.getFailureThreshold()) {
        if (isJobFinished(jobCtx, jobResource, currStateOutput)) {
            failJob(jobResource, workflowCtx, jobCtx, workflowConfig, cache.getJobConfigMap());
            return buildEmptyAssignment(jobResource, currStateOutput);
        }
        workflowCtx.setJobState(jobResource, TaskState.FAILING);
        // Drop all assigned but not given-up tasks
        for (int pId : jobCtx.getPartitionSet()) {
            String instance = jobCtx.getAssignedParticipant(pId);
            if (jobCtx.getPartitionState(pId) != null && !isTaskGivenup(jobCtx, jobCfg, pId)) {
                paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.TASK_ABORTED.name()));
            }
            Partition partition = new Partition(pName(jobResource, pId));
            Message pendingMessage = currStateOutput.getPendingState(jobResource, partition, instance);
            // If an INIT task still has a pending message, keep it mapped to INIT so that Helix will cancel the transition.
            if (jobCtx.getPartitionState(pId) == TaskPartitionState.INIT && pendingMessage != null) {
                paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.INIT.name()));
            }
        }
        return toResourceAssignment(jobResource, paMap);
    }
    if (jobState == TaskState.FAILING && isJobFinished(jobCtx, jobResource, currStateOutput)) {
        failJob(jobResource, workflowCtx, jobCtx, workflowConfig, cache.getJobConfigMap());
        return buildEmptyAssignment(jobResource, currStateOutput);
    }
    if (isJobComplete(jobCtx, allPartitions, jobCfg)) {
        markJobComplete(jobResource, jobCtx, workflowConfig, workflowCtx, cache.getJobConfigMap());
        _clusterStatusMonitor.updateJobCounters(jobCfg, TaskState.COMPLETED, jobCtx.getFinishTime() - jobCtx.getStartTime());
        _rebalanceScheduler.removeScheduledRebalance(jobResource);
        TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobResource);
        return buildEmptyAssignment(jobResource, currStateOutput);
    }
    // If the job is timing out and all tasks have finished, mark it TIMED_OUT; the remaining task partitions can be dropped (note that Helix doesn't track whether the drop succeeds or not).
    if (jobState == TaskState.TIMING_OUT && isJobFinished(jobCtx, jobResource, currStateOutput)) {
        jobCtx.setFinishTime(System.currentTimeMillis());
        workflowCtx.setJobState(jobResource, TaskState.TIMED_OUT);
        // Mark all INIT task to TASK_ABORTED
        for (int pId : jobCtx.getPartitionSet()) {
            if (jobCtx.getPartitionState(pId) == TaskPartitionState.INIT) {
                jobCtx.setPartitionState(pId, TaskPartitionState.TASK_ABORTED);
            }
        }
        _clusterStatusMonitor.updateJobCounters(jobCfg, TaskState.TIMED_OUT);
        _rebalanceScheduler.removeScheduledRebalance(jobResource);
        TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobResource);
        return buildEmptyAssignment(jobResource, currStateOutput);
    }
    // For delayed tasks, trigger a rebalance event for the closest upcoming ready time
    scheduleForNextTask(jobResource, jobCtx, currentTime);
    // Make additional task assignments if needed.
    if (jobState != TaskState.TIMING_OUT && jobState != TaskState.TIMED_OUT && jobTgtState == TargetState.START) {
        // Contains the set of task partitions that must be excluded from consideration when making
        // any new assignments.
        // This includes all completed, failed, delayed, and already assigned partitions.
        Set<Integer> excludeSet = Sets.newTreeSet(assignedPartitions);
        addCompletedTasks(excludeSet, jobCtx, allPartitions);
        addGiveupPartitions(excludeSet, jobCtx, allPartitions, jobCfg);
        excludeSet.addAll(skippedPartitions);
        excludeSet.addAll(getNonReadyPartitions(jobCtx, currentTime));
        // Get instance->[partition, ...] mappings for the target resource.
        Map<String, SortedSet<Integer>> tgtPartitionAssignments = taskAssignmentCal.getTaskAssignment(currStateOutput, prevTaskToInstanceStateAssignment, liveInstances, jobCfg, jobCtx, workflowConfig, workflowCtx, allPartitions, cache.getIdealStates());
        if (!isGenericTaskJob(jobCfg) || jobCfg.isRebalanceRunningTask()) {
            dropRebalancedRunningTasks(tgtPartitionAssignments, prevInstanceToTaskAssignments, paMap, jobCtx);
        }
        for (Map.Entry<String, SortedSet<Integer>> entry : prevInstanceToTaskAssignments.entrySet()) {
            String instance = entry.getKey();
            if (!tgtPartitionAssignments.containsKey(instance) || excludedInstances.contains(instance)) {
                continue;
            }
            // 1. throttled by job configuration
            // Contains the set of task partitions currently assigned to the instance.
            Set<Integer> pSet = entry.getValue();
            int jobCfgLimitation = jobCfg.getNumConcurrentTasksPerInstance() - pSet.size();
            // 2. throttled by participant capacity
            int participantCapacity = cache.getInstanceConfigMap().get(instance).getMaxConcurrentTask();
            if (participantCapacity == InstanceConfig.MAX_CONCURRENT_TASK_NOT_SET) {
                participantCapacity = cache.getClusterConfig().getMaxConcurrentTaskPerInstance();
            }
            int participantLimitation = participantCapacity - cache.getParticipantActiveTaskCount(instance);
            // New tasks to be assigned
            int numToAssign = Math.min(jobCfgLimitation, participantLimitation);
            LOG.debug(String.format("Throttle tasks to be assigned to instance %s using limitation: Job Concurrent Task(%d), " + "Participant Max Task(%d). Remaining capacity %d.", instance, jobCfgLimitation, participantCapacity, numToAssign));
            if (numToAssign > 0) {
                Set<Integer> throttledSet = new HashSet<Integer>();
                List<Integer> nextPartitions = getNextPartitions(tgtPartitionAssignments.get(instance), excludeSet, throttledSet, numToAssign);
                for (Integer pId : nextPartitions) {
                    String pName = pName(jobResource, pId);
                    paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.RUNNING.name()));
                    excludeSet.add(pId);
                    jobCtx.setAssignedParticipant(pId, instance);
                    jobCtx.setPartitionState(pId, TaskPartitionState.INIT);
                    jobCtx.setPartitionStartTime(pId, System.currentTimeMillis());
                    LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, TaskPartitionState.RUNNING, instance));
                }
                cache.setParticipantActiveTaskCount(instance, cache.getParticipantActiveTaskCount(instance) + nextPartitions.size());
                if (!throttledSet.isEmpty()) {
                    LOG.debug(throttledSet.size() + " tasks are ready but throttled when assigned to participant.");
                }
            }
        }
    }
    return toResourceAssignment(jobResource, paMap);
}
Also used : Partition(org.apache.helix.model.Partition) Message(org.apache.helix.model.Message) TreeMap(java.util.TreeMap) SortedSet(java.util.SortedSet) ResourceAssignment(org.apache.helix.model.ResourceAssignment) TreeSet(java.util.TreeSet) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashSet(java.util.HashSet)
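
The throttling logic in the assignment phase caps new tasks per instance at the smaller of the job-level and participant-level headroom. A minimal sketch of that arithmetic with hypothetical numbers:

/**
 * Minimal sketch of the throttling arithmetic in JobRebalancer: the number of
 * new tasks an instance may receive is the smaller of the job-level and the
 * participant-level headroom. All numbers here are hypothetical.
 */
public class TaskThrottleSketch {
    public static void main(String[] args) {
        int numConcurrentTasksPerInstance = 4; // from JobConfig
        int currentlyAssignedToInstance = 1;   // size of pSet for this instance
        int jobCfgLimitation = numConcurrentTasksPerInstance - currentlyAssignedToInstance;

        int participantCapacity = 10;          // instance (or cluster default) max concurrent tasks
        int participantActiveTaskCount = 8;    // tasks already running on the instance
        int participantLimitation = participantCapacity - participantActiveTaskCount;

        int numToAssign = Math.min(jobCfgLimitation, participantLimitation);
        System.out.println("Can assign up to " + numToAssign + " new task(s) to this instance");
        // With these numbers: min(3, 2) = 2 new tasks.
    }
}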

Aggregations

Partition (org.apache.helix.model.Partition): 50
Message (org.apache.helix.model.Message): 18
Test (org.testng.annotations.Test): 17
HashMap (java.util.HashMap): 16
Resource (org.apache.helix.model.Resource): 16
Map (java.util.Map): 12
ArrayList (java.util.ArrayList): 10
Date (java.util.Date): 10
HelixDataAccessor (org.apache.helix.HelixDataAccessor): 9
HelixManager (org.apache.helix.HelixManager): 9
ZNRecord (org.apache.helix.ZNRecord): 9
IdealState (org.apache.helix.model.IdealState): 9
StateModelDefinition (org.apache.helix.model.StateModelDefinition): 9
ResourceAssignment (org.apache.helix.model.ResourceAssignment): 8
PartitionStateMap (org.apache.helix.controller.common.PartitionStateMap): 7
Pipeline (org.apache.helix.controller.pipeline.Pipeline): 7
ZKHelixDataAccessor (org.apache.helix.manager.zk.ZKHelixDataAccessor): 6
HashSet (java.util.HashSet): 5
List (java.util.List): 5
BestPossibleStateOutput (org.apache.helix.controller.stages.BestPossibleStateOutput): 5