Example 6 with ResourceAssignment

use of org.apache.helix.model.ResourceAssignment in project helix by apache.

the class JobRebalancer method computeResourceMapping.

private ResourceAssignment computeResourceMapping(String jobResource, WorkflowConfig workflowConfig, JobConfig jobCfg, ResourceAssignment prevTaskToInstanceStateAssignment, Collection<String> liveInstances, CurrentStateOutput currStateOutput, WorkflowContext workflowCtx, JobContext jobCtx, Set<Integer> partitionsToDropFromIs, ClusterDataCache cache) {
    TargetState jobTgtState = workflowConfig.getTargetState();
    TaskState jobState = workflowCtx.getJobState(jobResource);
    TaskState workflowState = workflowCtx.getWorkflowState();
    if (jobState == TaskState.IN_PROGRESS && (isTimeout(jobCtx.getStartTime(), jobCfg.getTimeout()) || TaskState.TIMED_OUT.equals(workflowState))) {
        jobState = TaskState.TIMING_OUT;
        workflowCtx.setJobState(jobResource, TaskState.TIMING_OUT);
    } else if (jobState != TaskState.TIMING_OUT && jobState != TaskState.FAILING) {
        // Update running status in workflow context
        if (jobTgtState == TargetState.STOP) {
            if (checkJobStopped(jobCtx)) {
                workflowCtx.setJobState(jobResource, TaskState.STOPPED);
            } else {
                workflowCtx.setJobState(jobResource, TaskState.STOPPING);
            }
            // Workflow has been stopped if all in progress jobs are stopped
            if (isWorkflowStopped(workflowCtx, workflowConfig)) {
                workflowCtx.setWorkflowState(TaskState.STOPPED);
            } else {
                workflowCtx.setWorkflowState(TaskState.STOPPING);
            }
        } else {
            workflowCtx.setJobState(jobResource, TaskState.IN_PROGRESS);
            // Workflow is in progress if any task is in progress
            workflowCtx.setWorkflowState(TaskState.IN_PROGRESS);
        }
    }
    // Used to keep track of tasks that have already been assigned to instances.
    Set<Integer> assignedPartitions = new HashSet<Integer>();
    // Used to keep track of tasks that have failed, but whose failure is acceptable
    Set<Integer> skippedPartitions = new HashSet<Integer>();
    // Keeps a mapping of (partition) -> (instance, state)
    Map<Integer, PartitionAssignment> paMap = new TreeMap<Integer, PartitionAssignment>();
    Set<String> excludedInstances = getExcludedInstances(jobResource, workflowConfig, cache);
    // Process all the current assignments of tasks.
    TaskAssignmentCalculator taskAssignmentCal = getAssignmentCalulator(jobCfg);
    Set<Integer> allPartitions = taskAssignmentCal.getAllTaskPartitions(jobCfg, jobCtx, workflowConfig, workflowCtx, cache.getIdealStates());
    if (allPartitions == null || allPartitions.isEmpty()) {
        // Empty target partitions, mark the job as FAILED.
        String failureMsg = "Empty task partition mapping for job " + jobResource + ", marking the job as FAILED!";
        LOG.info(failureMsg);
        jobCtx.setInfo(failureMsg);
        failJob(jobResource, workflowCtx, jobCtx, workflowConfig, cache.getJobConfigMap());
        markAllPartitionsError(jobCtx, TaskPartitionState.ERROR, false);
        return new ResourceAssignment(jobResource);
    }
    Map<String, SortedSet<Integer>> prevInstanceToTaskAssignments = getPrevInstanceToTaskAssignments(liveInstances, prevTaskToInstanceStateAssignment, allPartitions);
    long currentTime = System.currentTimeMillis();
    LOG.debug("All partitions: " + allPartitions + " taskAssignment: " + prevInstanceToTaskAssignments + " excludedInstances: " + excludedInstances);
    // Iterate through all instances
    for (String instance : prevInstanceToTaskAssignments.keySet()) {
        if (excludedInstances.contains(instance)) {
            continue;
        }
        Set<Integer> pSet = prevInstanceToTaskAssignments.get(instance);
        // Used to keep track of partitions that are in one of the final states: COMPLETED, TIMED_OUT,
        // TASK_ERROR, ERROR.
        Set<Integer> donePartitions = new TreeSet<Integer>();
        for (int pId : pSet) {
            final String pName = pName(jobResource, pId);
            TaskPartitionState currState = updateJobContextAndGetTaskCurrentState(currStateOutput, jobResource, pId, pName, instance, jobCtx);
            // Check for pending state transitions on this (partition, instance).
            Message pendingMessage = currStateOutput.getPendingState(jobResource, new Partition(pName), instance);
            if (pendingMessage != null && !pendingMessage.getToState().equals(currState.name())) {
                processTaskWithPendingMessage(prevTaskToInstanceStateAssignment, pId, pName, instance, pendingMessage, jobState, currState, paMap, assignedPartitions);
                continue;
            }
            // Process any requested state transitions.
            String requestedStateStr = currStateOutput.getRequestedState(jobResource, new Partition(pName), instance);
            if (requestedStateStr != null && !requestedStateStr.isEmpty()) {
                TaskPartitionState requestedState = TaskPartitionState.valueOf(requestedStateStr);
                if (requestedState.equals(currState)) {
                    LOG.warn(String.format("Requested state %s is the same as the current state for instance %s.", requestedState, instance));
                }
                paMap.put(pId, new PartitionAssignment(instance, requestedState.name()));
                assignedPartitions.add(pId);
                LOG.debug(String.format("Instance %s requested a state transition to %s for partition %s.", instance, requestedState, pName));
                continue;
            }
            switch(currState) {
                case RUNNING:
                    {
                        TaskPartitionState nextState = TaskPartitionState.RUNNING;
                        if (jobState == TaskState.TIMING_OUT) {
                            nextState = TaskPartitionState.TASK_ABORTED;
                        } else if (jobTgtState == TargetState.STOP) {
                            nextState = TaskPartitionState.STOPPED;
                        }
                        paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
                        assignedPartitions.add(pId);
                        LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, nextState, instance));
                    }
                    break;
                case STOPPED:
                    {
                        TaskPartitionState nextState;
                        if (jobTgtState == TargetState.START) {
                            nextState = TaskPartitionState.RUNNING;
                        } else {
                            nextState = TaskPartitionState.STOPPED;
                        }
                        paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
                        assignedPartitions.add(pId);
                        LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, nextState, instance));
                    }
                    break;
                case COMPLETED:
                    {
                        // The task has completed on this partition. Mark as such in the context object.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has completed with state %s. Marking as such in rebalancer context.", pName, currState));
                        partitionsToDropFromIs.add(pId);
                        markPartitionCompleted(jobCtx, pId);
                    }
                    break;
                case TIMED_OUT:
                case TASK_ERROR:
                case TASK_ABORTED:
                case ERROR:
                    {
                        // The task may be rescheduled on a different instance.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has error state %s with msg %s. Marking as such in rebalancer context.", pName, currState, jobCtx.getPartitionInfo(pId)));
                        markPartitionError(jobCtx, pId, currState, true);
                        // When the job is timing out, all tasks will be aborted and then dropped, so skip the retry logic below.
                        if (jobState != TaskState.TIMED_OUT && jobState != TaskState.TIMING_OUT) {
                            if (jobCtx.getPartitionNumAttempts(pId) >= jobCfg.getMaxAttemptsPerTask() || currState.equals(TaskPartitionState.TASK_ABORTED) || currState.equals(TaskPartitionState.ERROR)) {
                                skippedPartitions.add(pId);
                                partitionsToDropFromIs.add(pId);
                                LOG.debug("skippedPartitions:" + skippedPartitions);
                            } else {
                                // Mark the task to be started at some later time (if enabled)
                                markPartitionDelayed(jobCfg, jobCtx, pId);
                            }
                        }
                    }
                    break;
                case INIT:
                case DROPPED:
                    {
                        // currState in [INIT, DROPPED]. Do nothing, the partition is eligible to be reassigned.
                        donePartitions.add(pId);
                        LOG.debug(String.format("Task partition %s has state %s. It will be dropped from the current ideal state.", pName, currState));
                    }
                    break;
                default:
                    throw new AssertionError("Unknown enum symbol: " + currState);
            }
        }
        // Remove the set of task partitions that are completed or in one of the error states.
        pSet.removeAll(donePartitions);
    }
    addGiveupPartitions(skippedPartitions, jobCtx, allPartitions, jobCfg);
    if (jobState == TaskState.IN_PROGRESS && skippedPartitions.size() > jobCfg.getFailureThreshold()) {
        if (isJobFinished(jobCtx, jobResource, currStateOutput)) {
            failJob(jobResource, workflowCtx, jobCtx, workflowConfig, cache.getJobConfigMap());
            return buildEmptyAssignment(jobResource, currStateOutput);
        }
        workflowCtx.setJobState(jobResource, TaskState.FAILING);
        // Drop all assigned but not given-up tasks
        for (int pId : jobCtx.getPartitionSet()) {
            String instance = jobCtx.getAssignedParticipant(pId);
            if (jobCtx.getPartitionState(pId) != null && !isTaskGivenup(jobCtx, jobCfg, pId)) {
                paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.TASK_ABORTED.name()));
            }
            Partition partition = new Partition(pName(jobResource, pId));
            Message pendingMessage = currStateOutput.getPendingState(jobResource, partition, instance);
            // If the task is still pending on the INIT->RUNNING transition, keep it assigned to INIT
            // so that Helix will cancel the transition.
            if (jobCtx.getPartitionState(pId) == TaskPartitionState.INIT && pendingMessage != null) {
                paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.INIT.name()));
            }
        }
        return toResourceAssignment(jobResource, paMap);
    }
    if (jobState == TaskState.FAILING && isJobFinished(jobCtx, jobResource, currStateOutput)) {
        failJob(jobResource, workflowCtx, jobCtx, workflowConfig, cache.getJobConfigMap());
        return buildEmptyAssignment(jobResource, currStateOutput);
    }
    if (isJobComplete(jobCtx, allPartitions, jobCfg)) {
        markJobComplete(jobResource, jobCtx, workflowConfig, workflowCtx, cache.getJobConfigMap());
        _clusterStatusMonitor.updateJobCounters(jobCfg, TaskState.COMPLETED, jobCtx.getFinishTime() - jobCtx.getStartTime());
        _rebalanceScheduler.removeScheduledRebalance(jobResource);
        TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobResource);
        return buildEmptyAssignment(jobResource, currStateOutput);
    }
    // If the job is timing out and all tasks are finished, the job can be marked TIMED_OUT and its
    // remaining partitions can be dropped (note that Helix doesn't track whether the drop succeeds).
    if (jobState == TaskState.TIMING_OUT && isJobFinished(jobCtx, jobResource, currStateOutput)) {
        jobCtx.setFinishTime(System.currentTimeMillis());
        workflowCtx.setJobState(jobResource, TaskState.TIMED_OUT);
        // Mark all INIT tasks as TASK_ABORTED
        for (int pId : jobCtx.getPartitionSet()) {
            if (jobCtx.getPartitionState(pId) == TaskPartitionState.INIT) {
                jobCtx.setPartitionState(pId, TaskPartitionState.TASK_ABORTED);
            }
        }
        _clusterStatusMonitor.updateJobCounters(jobCfg, TaskState.TIMED_OUT);
        _rebalanceScheduler.removeScheduledRebalance(jobResource);
        TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobResource);
        return buildEmptyAssignment(jobResource, currStateOutput);
    }
    // For delayed tasks, trigger a rebalance event for the closest upcoming ready time
    scheduleForNextTask(jobResource, jobCtx, currentTime);
    // Make additional task assignments if needed.
    if (jobState != TaskState.TIMING_OUT && jobState != TaskState.TIMED_OUT && jobTgtState == TargetState.START) {
        // Contains the set of task partitions that must be excluded from consideration when making
        // any new assignments.
        // This includes all completed, failed, delayed, and already assigned partitions.
        Set<Integer> excludeSet = Sets.newTreeSet(assignedPartitions);
        addCompletedTasks(excludeSet, jobCtx, allPartitions);
        addGiveupPartitions(excludeSet, jobCtx, allPartitions, jobCfg);
        excludeSet.addAll(skippedPartitions);
        excludeSet.addAll(getNonReadyPartitions(jobCtx, currentTime));
        // Get instance->[partition, ...] mappings for the target resource.
        Map<String, SortedSet<Integer>> tgtPartitionAssignments = taskAssignmentCal.getTaskAssignment(currStateOutput, prevTaskToInstanceStateAssignment, liveInstances, jobCfg, jobCtx, workflowConfig, workflowCtx, allPartitions, cache.getIdealStates());
        if (!isGenericTaskJob(jobCfg) || jobCfg.isRebalanceRunningTask()) {
            dropRebalancedRunningTasks(tgtPartitionAssignments, prevInstanceToTaskAssignments, paMap, jobCtx);
        }
        for (Map.Entry<String, SortedSet<Integer>> entry : prevInstanceToTaskAssignments.entrySet()) {
            String instance = entry.getKey();
            if (!tgtPartitionAssignments.containsKey(instance) || excludedInstances.contains(instance)) {
                continue;
            }
            // Contains the set of task partitions currently assigned to the instance.
            Set<Integer> pSet = entry.getValue();
            // 1. Throttled by the job configuration's per-instance concurrency limit.
            int jobCfgLimitation = jobCfg.getNumConcurrentTasksPerInstance() - pSet.size();
            // 2. throttled by participant capacity
            int participantCapacity = cache.getInstanceConfigMap().get(instance).getMaxConcurrentTask();
            if (participantCapacity == InstanceConfig.MAX_CONCURRENT_TASK_NOT_SET) {
                participantCapacity = cache.getClusterConfig().getMaxConcurrentTaskPerInstance();
            }
            int participantLimitation = participantCapacity - cache.getParticipantActiveTaskCount(instance);
            // New tasks to be assigned
            int numToAssign = Math.min(jobCfgLimitation, participantLimitation);
            LOG.debug(String.format("Throttle tasks to be assigned to instance %s using limitation: Job Concurrent Task(%d), " + "Participant Max Task(%d). Remaining capacity %d.", instance, jobCfgLimitation, participantCapacity, numToAssign));
            if (numToAssign > 0) {
                Set<Integer> throttledSet = new HashSet<Integer>();
                List<Integer> nextPartitions = getNextPartitions(tgtPartitionAssignments.get(instance), excludeSet, throttledSet, numToAssign);
                for (Integer pId : nextPartitions) {
                    String pName = pName(jobResource, pId);
                    paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.RUNNING.name()));
                    excludeSet.add(pId);
                    jobCtx.setAssignedParticipant(pId, instance);
                    jobCtx.setPartitionState(pId, TaskPartitionState.INIT);
                    jobCtx.setPartitionStartTime(pId, System.currentTimeMillis());
                    LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName, TaskPartitionState.RUNNING, instance));
                }
                cache.setParticipantActiveTaskCount(instance, cache.getParticipantActiveTaskCount(instance) + nextPartitions.size());
                if (!throttledSet.isEmpty()) {
                    LOG.debug(throttledSet.size() + " tasks are ready but throttled when assigned to participant.");
                }
            }
        }
    }
    return toResourceAssignment(jobResource, paMap);
}
Also used : Partition(org.apache.helix.model.Partition) Message(org.apache.helix.model.Message) TreeMap(java.util.TreeMap) SortedSet(java.util.SortedSet) ResourceAssignment(org.apache.helix.model.ResourceAssignment) TreeSet(java.util.TreeSet) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TreeMap(java.util.TreeMap) HashSet(java.util.HashSet)
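
The assignment loop above throttles new tasks with two independent limits, the job's per-instance concurrency and the participant's remaining capacity, and the tighter of the two wins. A minimal standalone sketch of that arithmetic, with hypothetical values standing in for the JobConfig, InstanceConfig, and ClusterConfig lookups:

public class ThrottleSketch {
    public static void main(String[] args) {
        int numConcurrentTasksPerInstance = 4; // assumed JobConfig limit
        int tasksAssignedByThisJob = 1;        // plays the role of pSet.size() above
        int participantCapacity = 10;          // assumed InstanceConfig/ClusterConfig limit
        int participantActiveTaskCount = 8;    // tasks already running on the participant
        // 1. Throttled by the job configuration.
        int jobCfgLimitation = numConcurrentTasksPerInstance - tasksAssignedByThisJob; // 3
        // 2. Throttled by the participant's capacity.
        int participantLimitation = participantCapacity - participantActiveTaskCount; // 2
        // The tighter of the two limits bounds the number of new assignments.
        int numToAssign = Math.min(jobCfgLimitation, participantLimitation);
        System.out.println("numToAssign = " + numToAssign); // prints: numToAssign = 2
    }
}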

Example 7 with ResourceAssignment

use of org.apache.helix.model.ResourceAssignment in project helix by apache.

the class DeprecatedTaskRebalancer method emptyAssignment.

private static ResourceAssignment emptyAssignment(String name, CurrentStateOutput currStateOutput) {
    ResourceAssignment assignment = new ResourceAssignment(name);
    Set<Partition> partitions = currStateOutput.getCurrentStateMappedPartitions(name);
    for (Partition partition : partitions) {
        Map<String, String> currentStateMap = currStateOutput.getCurrentStateMap(name, partition);
        Map<String, String> replicaMap = Maps.newHashMap();
        for (String instanceName : currentStateMap.keySet()) {
            replicaMap.put(instanceName, HelixDefinedState.DROPPED.toString());
        }
        assignment.addReplicaMap(partition, replicaMap);
    }
    return assignment;
}
Also used : Partition(org.apache.helix.model.Partition) ResourceAssignment(org.apache.helix.model.ResourceAssignment)
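
The effect of emptyAssignment is to tell every instance that currently hosts a partition to move its replica to the Helix-defined DROPPED state. A minimal standalone sketch of the resulting mapping (the job, partition, and instance names are hypothetical):

import com.google.common.collect.ImmutableMap;
import org.apache.helix.HelixDefinedState;
import org.apache.helix.model.Partition;
import org.apache.helix.model.ResourceAssignment;

public class EmptyAssignmentSketch {
    public static void main(String[] args) {
        // Hypothetical job, partition, and instance names.
        ResourceAssignment assignment = new ResourceAssignment("myJob");
        Partition partition = new Partition("myJob_0");
        // Map the hosting instance to DROPPED, as emptyAssignment does for every current replica.
        assignment.addReplicaMap(partition,
            ImmutableMap.of("localhost_12913", HelixDefinedState.DROPPED.toString()));
        System.out.println(assignment.getReplicaMap(partition)); // {localhost_12913=DROPPED}
    }
}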

Example 8 with ResourceAssignment

use of org.apache.helix.model.ResourceAssignment in project helix by apache.

the class DeprecatedTaskRebalancer method computeBestPossiblePartitionState.

@Override
public ResourceAssignment computeBestPossiblePartitionState(ClusterDataCache clusterData, IdealState taskIs, Resource resource, CurrentStateOutput currStateOutput) {
    final String resourceName = resource.getResourceName();
    LOG.debug("Computer Best Partition for resource: " + resourceName);
    // Fetch job configuration
    JobConfig jobCfg = (JobConfig) clusterData.getResourceConfig(resourceName);
    if (jobCfg == null) {
        LOG.debug("Job configuration is NULL for " + resourceName);
        return emptyAssignment(resourceName, currStateOutput);
    }
    String workflowResource = jobCfg.getWorkflow();
    // Fetch workflow configuration and context
    WorkflowConfig workflowCfg = clusterData.getWorkflowConfig(workflowResource);
    if (workflowCfg == null) {
        LOG.debug("Workflow configuration is NULL for " + resourceName);
        return emptyAssignment(resourceName, currStateOutput);
    }
    WorkflowContext workflowCtx = clusterData.getWorkflowContext(workflowResource);
    // Initialize workflow context if needed
    if (workflowCtx == null) {
        workflowCtx = new WorkflowContext(new ZNRecord(TaskUtil.WORKFLOW_CONTEXT_KW));
        workflowCtx.setStartTime(System.currentTimeMillis());
        workflowCtx.setName(workflowResource);
        LOG.info("Workflow context for " + resourceName + " created!");
    }
    // check ancestor job status
    int notStartedCount = 0;
    int inCompleteCount = 0;
    for (String ancestor : workflowCfg.getJobDag().getAncestors(resourceName)) {
        TaskState jobState = workflowCtx.getJobState(ancestor);
        if (jobState == null || jobState == TaskState.NOT_STARTED) {
            ++notStartedCount;
        } else if (jobState == TaskState.IN_PROGRESS || jobState == TaskState.STOPPED) {
            ++inCompleteCount;
        }
    }
    if (notStartedCount > 0 || (workflowCfg.isJobQueue() && inCompleteCount >= workflowCfg.getParallelJobs())) {
        LOG.debug("Job is not ready to be scheduled due to pending dependent jobs " + resourceName);
        return emptyAssignment(resourceName, currStateOutput);
    }
    // Clean up if workflow marked for deletion
    TargetState targetState = workflowCfg.getTargetState();
    if (targetState == TargetState.DELETE) {
        LOG.info("Workflow is marked as deleted " + workflowResource + " cleaning up the workflow context.");
        cleanup(_manager, resourceName, workflowCfg, workflowResource);
        return emptyAssignment(resourceName, currStateOutput);
    }
    // Check if this workflow has been finished past its expiry.
    if (workflowCtx.getFinishTime() != WorkflowContext.UNFINISHED && workflowCtx.getFinishTime() + workflowCfg.getExpiry() <= System.currentTimeMillis()) {
        LOG.info("Workflow " + workflowResource + " is completed and passed expiry time, cleaning up the workflow context.");
        markForDeletion(_manager, workflowResource);
        cleanup(_manager, resourceName, workflowCfg, workflowResource);
        return emptyAssignment(resourceName, currStateOutput);
    }
    // Fetch any existing context information from the property store.
    JobContext jobCtx = clusterData.getJobContext(resourceName);
    if (jobCtx == null) {
        jobCtx = new JobContext(new ZNRecord(TaskUtil.TASK_CONTEXT_KW));
        jobCtx.setStartTime(System.currentTimeMillis());
        jobCtx.setName(resourceName);
    }
    // Check for expired jobs for non-terminable workflows
    long jobFinishTime = jobCtx.getFinishTime();
    if (!workflowCfg.isTerminable() && jobFinishTime != WorkflowContext.UNFINISHED && jobFinishTime + workflowCfg.getExpiry() <= System.currentTimeMillis()) {
        LOG.info("Job " + resourceName + " is completed and passed expiry time, cleaning up the job context.");
        cleanup(_manager, resourceName, workflowCfg, workflowResource);
        return emptyAssignment(resourceName, currStateOutput);
    }
    // The job is already in a final state (completed/failed).
    if (workflowCtx.getJobState(resourceName) == TaskState.FAILED || workflowCtx.getJobState(resourceName) == TaskState.COMPLETED) {
        LOG.debug("Job " + resourceName + " is failed or already completed.");
        return emptyAssignment(resourceName, currStateOutput);
    }
    // Check for readiness, and stop processing if it's not ready
    boolean isReady = scheduleIfNotReady(workflowCfg, workflowCtx, workflowResource, resourceName, clusterData);
    if (!isReady) {
        LOG.debug("Job " + resourceName + " is not ready to be scheduled.");
        return emptyAssignment(resourceName, currStateOutput);
    }
    // Fetch the previous resource assignment from the property store (required because of
    // HELIX-230), or start from an empty one if it doesn't exist.
    ResourceAssignment prevAssignment = getPrevResourceAssignment(_manager, resourceName);
    if (prevAssignment == null) {
        prevAssignment = new ResourceAssignment(resourceName);
    }
    // Will contain the list of partitions that must be explicitly dropped from the ideal state
    // that is stored in ZooKeeper.
    Set<Integer> partitionsToDrop = new TreeSet<Integer>();
    ResourceAssignment newAssignment = computeResourceMapping(resourceName, workflowCfg, jobCfg, prevAssignment, clusterData.getLiveInstances().keySet(), currStateOutput, workflowCtx, jobCtx, partitionsToDrop, clusterData);
    if (!partitionsToDrop.isEmpty()) {
        for (Integer pId : partitionsToDrop) {
            taskIs.getRecord().getMapFields().remove(pName(resourceName, pId));
        }
        HelixDataAccessor accessor = _manager.getHelixDataAccessor();
        PropertyKey propertyKey = accessor.keyBuilder().idealStates(resourceName);
        accessor.setProperty(propertyKey, taskIs);
    }
    // Update Workflow and Job context in data cache and ZK.
    clusterData.updateJobContext(resourceName, jobCtx, _manager.getHelixDataAccessor());
    clusterData.updateWorkflowContext(workflowResource, workflowCtx, _manager.getHelixDataAccessor());
    setPrevResourceAssignment(_manager, resourceName, newAssignment);
    LOG.debug("Job " + resourceName + " new assignment " + Arrays.toString(newAssignment.getMappedPartitions().toArray()));
    return newAssignment;
}
Also used : ResourceAssignment(org.apache.helix.model.ResourceAssignment) HelixDataAccessor(org.apache.helix.HelixDataAccessor) TreeSet(java.util.TreeSet) ZNRecord(org.apache.helix.ZNRecord) PropertyKey(org.apache.helix.PropertyKey)
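
When computeResourceMapping reports partitions to drop, the method above deletes each partition's map field from the ideal state's underlying ZNRecord and writes the ideal state back through the data accessor. A minimal sketch of that step in isolation (the class and method names are hypothetical):

import org.apache.helix.HelixDataAccessor;
import org.apache.helix.PropertyKey;
import org.apache.helix.model.IdealState;

public class DropPartitionSketch {
    // Remove one task partition's mapping from the ideal state and persist the change.
    static void dropPartition(HelixDataAccessor accessor, IdealState taskIs, String resourceName, String partitionName) {
        taskIs.getRecord().getMapFields().remove(partitionName);
        PropertyKey propertyKey = accessor.keyBuilder().idealStates(resourceName);
        accessor.setProperty(propertyKey, taskIs);
    }
}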

Example 9 with ResourceAssignment

use of org.apache.helix.model.ResourceAssignment in project helix by apache.

the class JobRebalancer method computeBestPossiblePartitionState.

@Override
public ResourceAssignment computeBestPossiblePartitionState(ClusterDataCache clusterData, IdealState taskIs, Resource resource, CurrentStateOutput currStateOutput) {
    final String jobName = resource.getResourceName();
    LOG.debug("Computer Best Partition for job: " + jobName);
    // Fetch job configuration
    JobConfig jobCfg = clusterData.getJobConfig(jobName);
    if (jobCfg == null) {
        LOG.error("Job configuration is NULL for " + jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    String workflowResource = jobCfg.getWorkflow();
    // Fetch workflow configuration and context
    WorkflowConfig workflowCfg = clusterData.getWorkflowConfig(workflowResource);
    if (workflowCfg == null) {
        LOG.error("Workflow configuration is NULL for " + jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    WorkflowContext workflowCtx = clusterData.getWorkflowContext(workflowResource);
    if (workflowCtx == null) {
        LOG.error("Workflow context is NULL for " + jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    TargetState targetState = workflowCfg.getTargetState();
    if (targetState != TargetState.START && targetState != TargetState.STOP) {
        LOG.info("Target state is " + targetState.name() + " for workflow " + workflowResource + ".Stop scheduling job " + jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    // Stop current run of the job if workflow or job is already in final state (failed or completed)
    TaskState workflowState = workflowCtx.getWorkflowState();
    TaskState jobState = workflowCtx.getJobState(jobName);
    // The job is already in a final state (completed/failed).
    if (workflowState == TaskState.FAILED || workflowState == TaskState.COMPLETED || jobState == TaskState.FAILED || jobState == TaskState.COMPLETED) {
        LOG.info(String.format("Workflow %s or job %s is already failed or completed, workflow state (%s), job state (%s), clean up job IS.", workflowResource, jobName, workflowState, jobState));
        TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobName);
        _rebalanceScheduler.removeScheduledRebalance(jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    if (!isWorkflowReadyForSchedule(workflowCfg)) {
        LOG.info("Job is not ready to be run since workflow is not ready " + jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    if (!isJobStarted(jobName, workflowCtx) && !isJobReadyToSchedule(jobName, workflowCfg, workflowCtx, getInCompleteJobCount(workflowCfg, workflowCtx), clusterData.getJobConfigMap())) {
        LOG.info("Job is not ready to run " + jobName);
        return buildEmptyAssignment(jobName, currStateOutput);
    }
    // Fetch any existing context information from the property store.
    JobContext jobCtx = clusterData.getJobContext(jobName);
    if (jobCtx == null) {
        jobCtx = new JobContext(new ZNRecord(TaskUtil.TASK_CONTEXT_KW));
        jobCtx.setStartTime(System.currentTimeMillis());
        jobCtx.setName(jobName);
        workflowCtx.setJobState(jobName, TaskState.IN_PROGRESS);
    }
    if (!TaskState.TIMED_OUT.equals(workflowCtx.getJobState(jobName))) {
        scheduleRebalanceForTimeout(jobCfg.getJobId(), jobCtx.getStartTime(), jobCfg.getTimeout());
    }
    // Fetch the previous resource assignment from the property store (required because of
    // HELIX-230), or start from an empty one if it doesn't exist.
    ResourceAssignment prevAssignment = getPrevResourceAssignment(jobName);
    if (prevAssignment == null) {
        prevAssignment = new ResourceAssignment(jobName);
    }
    // Determine the set of live instances eligible to run this job, honoring any instance group tag.
    Set<String> liveInstances = jobCfg.getInstanceGroupTag() == null ? clusterData.getEnabledLiveInstances() : clusterData.getEnabledLiveInstancesWithTag(jobCfg.getInstanceGroupTag());
    if (liveInstances.isEmpty()) {
        LOG.error("No available instance found for job!");
    }
    // Will contain the list of partitions that must be explicitly dropped from the ideal state
    // that is stored in ZooKeeper.
    Set<Integer> partitionsToDrop = new TreeSet<Integer>();
    ResourceAssignment newAssignment = computeResourceMapping(jobName, workflowCfg, jobCfg, prevAssignment, liveInstances, currStateOutput, workflowCtx, jobCtx, partitionsToDrop, clusterData);
    HelixDataAccessor accessor = _manager.getHelixDataAccessor();
    PropertyKey propertyKey = accessor.keyBuilder().idealStates(jobName);
    taskIs = clusterData.getIdealState(jobName);
    if (!partitionsToDrop.isEmpty() && taskIs != null) {
        for (Integer pId : partitionsToDrop) {
            taskIs.getRecord().getMapFields().remove(pName(jobName, pId));
        }
        accessor.setProperty(propertyKey, taskIs);
    }
    // Update Workflow and Job context in data cache and ZK.
    clusterData.updateJobContext(jobName, jobCtx, _manager.getHelixDataAccessor());
    clusterData.updateWorkflowContext(workflowResource, workflowCtx, _manager.getHelixDataAccessor());
    setPrevResourceAssignment(jobName, newAssignment);
    LOG.debug("Job " + jobName + " new assignment " + Arrays.toString(newAssignment.getMappedPartitions().toArray()));
    return newAssignment;
}
Also used : ResourceAssignment(org.apache.helix.model.ResourceAssignment) HelixDataAccessor(org.apache.helix.HelixDataAccessor) TreeSet(java.util.TreeSet) ZNRecord(org.apache.helix.ZNRecord) PropertyKey(org.apache.helix.PropertyKey)
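
The ResourceAssignment returned here is a partition to (instance, state) mapping that downstream pipeline stages consume. A minimal standalone sketch of building and reading one back (job, partition, and instance names are hypothetical):

import java.util.Map;
import com.google.common.collect.ImmutableMap;
import org.apache.helix.model.Partition;
import org.apache.helix.model.ResourceAssignment;

public class ReadAssignmentSketch {
    public static void main(String[] args) {
        ResourceAssignment assignment = new ResourceAssignment("myJob"); // hypothetical names throughout
        assignment.addReplicaMap(new Partition("myJob_0"), ImmutableMap.of("localhost_12913", "RUNNING"));
        // Walk the partition -> (instance -> state) mapping that the rebalancer returns.
        for (Partition partition : assignment.getMappedPartitions()) {
            Map<String, String> replicaMap = assignment.getReplicaMap(partition);
            System.out.println(partition.getPartitionName() + " -> " + replicaMap);
        }
    }
}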

Example 10 with ResourceAssignment

use of org.apache.helix.model.ResourceAssignment in project helix by apache.

the class JobRebalancer method toResourceAssignment.

private ResourceAssignment toResourceAssignment(String jobResource, Map<Integer, PartitionAssignment> paMap) {
    // Construct a ResourceAssignment object from the map of partition assignments.
    ResourceAssignment ra = new ResourceAssignment(jobResource);
    for (Map.Entry<Integer, PartitionAssignment> e : paMap.entrySet()) {
        PartitionAssignment pa = e.getValue();
        ra.addReplicaMap(new Partition(pName(jobResource, e.getKey())), ImmutableMap.of(pa._instance, pa._state));
    }
    return ra;
}
Also used : Partition(org.apache.helix.model.Partition) ResourceAssignment(org.apache.helix.model.ResourceAssignment) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) TreeMap(java.util.TreeMap)
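
PartitionAssignment here is JobRebalancer's private (instance, state) pair, and pName follows the "<job>_<partitionId>" naming convention for task partitions. A standalone sketch of the same conversion, with a hypothetical pair class standing in for the private inner one:

import java.util.Map;
import java.util.TreeMap;
import com.google.common.collect.ImmutableMap;
import org.apache.helix.model.Partition;
import org.apache.helix.model.ResourceAssignment;

public class ToAssignmentSketch {
    // Hypothetical stand-in for JobRebalancer's private PartitionAssignment pair.
    static class InstanceState {
        final String instance;
        final String state;
        InstanceState(String instance, String state) { this.instance = instance; this.state = state; }
    }

    static ResourceAssignment toResourceAssignment(String jobResource, Map<Integer, InstanceState> paMap) {
        ResourceAssignment ra = new ResourceAssignment(jobResource);
        for (Map.Entry<Integer, InstanceState> e : paMap.entrySet()) {
            // Task partition names follow the "<job>_<partitionId>" convention used by pName.
            Partition partition = new Partition(jobResource + "_" + e.getKey());
            ra.addReplicaMap(partition, ImmutableMap.of(e.getValue().instance, e.getValue().state));
        }
        return ra;
    }

    public static void main(String[] args) {
        Map<Integer, InstanceState> paMap = new TreeMap<Integer, InstanceState>();
        paMap.put(0, new InstanceState("localhost_12913", "RUNNING"));
        System.out.println(toResourceAssignment("myJob", paMap).getMappedPartitions().size()); // 1
    }
}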

Aggregations

ResourceAssignment (org.apache.helix.model.ResourceAssignment) 11
Partition (org.apache.helix.model.Partition) 8
TreeSet (java.util.TreeSet) 4
ImmutableMap (com.google.common.collect.ImmutableMap) 3
HashMap (java.util.HashMap) 3
Map (java.util.Map) 3
TreeMap (java.util.TreeMap) 3
StateModelDefinition (org.apache.helix.model.StateModelDefinition) 3
HashSet (java.util.HashSet) 2
SortedSet (java.util.SortedSet) 2
HelixDataAccessor (org.apache.helix.HelixDataAccessor) 2
PropertyKey (org.apache.helix.PropertyKey) 2
ZNRecord (org.apache.helix.ZNRecord) 2
Message (org.apache.helix.model.Message) 2
BiMap (com.google.common.collect.BiMap) 1
HashBiMap (com.google.common.collect.HashBiMap) 1
ClusterConfig (org.apache.helix.model.ClusterConfig) 1
Test (org.testng.annotations.Test) 1