Example 16 with JobEvent

Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent in project hadoop by apache.

The class RMContainerAllocator, method getResources().

@SuppressWarnings("unchecked")
private List<Container> getResources() throws Exception {
    applyConcurrentTaskLimits();
    // will be null the first time
    Resource headRoom = Resources.clone(getAvailableResources());
    AllocateResponse response;
    /*
     * If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
     * milliseconds before aborting. During this interval, AM will still try
     * to contact the RM.
     */
    try {
        response = makeRemoteRequest();
        // Reset retry count if no exception occurred.
        retrystartTime = System.currentTimeMillis();
    } catch (ApplicationAttemptNotFoundException e) {
        // This can happen if the RM has been restarted. If it is in that state,
        // this application must clean itself up.
        eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
        throw new RMContainerAllocationException("Resource Manager doesn't recognize AttemptId: " + this.getContext().getApplicationAttemptId(), e);
    } catch (ApplicationMasterNotRegisteredException e) {
        LOG.info("ApplicationMaster is out of sync with ResourceManager," + " hence resync and send outstanding requests.");
        // RM may have restarted, re-register with RM.
        lastResponseID = 0;
        register();
        addOutstandingRequestOnResync();
        return null;
    } catch (InvalidLabelResourceRequestException e) {
        // An invalid-label exception means the job has no access to the
        // requested node label, so kill the job in this case.
        String diagMsg = "Requested node-label-expression is invalid: " + StringUtils.stringifyException(e);
        LOG.info(diagMsg);
        JobId jobId = this.getJob().getID();
        eventHandler.handle(new JobDiagnosticsUpdateEvent(jobId, diagMsg));
        eventHandler.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
        throw e;
    } catch (Exception e) {
        // Keep re-trying until the retryInterval has expired.
        if (System.currentTimeMillis() - retrystartTime >= retryInterval) {
            LOG.error("Could not contact RM after " + retryInterval + " milliseconds.");
            eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
            throw new RMContainerAllocationException("Could not contact RM after " + retryInterval + " milliseconds.");
        }
        // continue to attempt to contact the RM.
        throw e;
    }
    Resource newHeadRoom = getAvailableResources();
    List<Container> newContainers = response.getAllocatedContainers();
    // Setting NMTokens
    if (response.getNMTokens() != null) {
        for (NMToken nmToken : response.getNMTokens()) {
            NMTokenCache.setNMToken(nmToken.getNodeId().toString(), nmToken.getToken());
        }
    }
    // Setting AMRMToken
    if (response.getAMRMToken() != null) {
        updateAMRMToken(response.getAMRMToken());
    }
    List<ContainerStatus> finishedContainers = response.getCompletedContainersStatuses();
    // propagate preemption requests
    final PreemptionMessage preemptReq = response.getPreemptionMessage();
    if (preemptReq != null) {
        preemptionPolicy.preempt(new PreemptionContext(assignedRequests), preemptReq);
    }
    if (newContainers.size() + finishedContainers.size() > 0 || !headRoom.equals(newHeadRoom)) {
        // Something changed.
        recalculateReduceSchedule = true;
        if (LOG.isDebugEnabled() && !headRoom.equals(newHeadRoom)) {
            LOG.debug("headroom=" + newHeadRoom);
        }
    }
    if (LOG.isDebugEnabled()) {
        for (Container cont : newContainers) {
            LOG.debug("Received new Container :" + cont);
        }
    }
    // Called on each allocation; picks up newly blacklisted/added hosts.
    computeIgnoreBlacklisting();
    handleUpdatedNodes(response);
    handleJobPriorityChange(response);
    // handle receiving the timeline collector address for this app
    String collectorAddr = response.getCollectorAddr();
    MRAppMaster.RunningAppContext appContext = (MRAppMaster.RunningAppContext) this.getContext();
    if (collectorAddr != null && !collectorAddr.isEmpty() && appContext.getTimelineV2Client() != null) {
        appContext.getTimelineV2Client().setTimelineServiceAddress(response.getCollectorAddr());
    }
    for (ContainerStatus cont : finishedContainers) {
        processFinishedContainer(cont);
    }
    return newContainers;
}
Also used: PreemptionMessage (org.apache.hadoop.yarn.api.records.PreemptionMessage), MRAppMaster (org.apache.hadoop.mapreduce.v2.app.MRAppMaster), NMToken (org.apache.hadoop.yarn.api.records.NMToken), Resource (org.apache.hadoop.yarn.api.records.Resource), JobDiagnosticsUpdateEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobDiagnosticsUpdateEvent), ApplicationMasterNotRegisteredException (org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException), InvalidLabelResourceRequestException (org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException), IOException (java.io.IOException), ApplicationAttemptNotFoundException (org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException), YarnRuntimeException (org.apache.hadoop.yarn.exceptions.YarnRuntimeException), AllocateResponse (org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse), Container (org.apache.hadoop.yarn.api.records.Container), ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus), JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent), JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId)
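
The try/catch above is a retry-window pattern worth calling out: the allocator records the time of the last successful RM call (retrystartTime) and, on an unrecognized failure, only aborts once retryInterval has elapsed with no successful contact. Below is a minimal, self-contained sketch of that pattern; RetryWindowClient, RemoteService, and retryIntervalMs are illustrative names, not part of the Hadoop API.

// A minimal sketch of the retry-window pattern used in getResources().
// RetryWindowClient, RemoteService, and retryIntervalMs are hypothetical
// names for illustration; they are not Hadoop APIs.
public class RetryWindowClient {

    interface RemoteService {
        String call() throws Exception;
    }

    private final RemoteService service;
    private final long retryIntervalMs;
    // Time of the last successful call; the abort window is measured from here.
    private long retryStartTime = System.currentTimeMillis();

    RetryWindowClient(RemoteService service, long retryIntervalMs) {
        this.service = service;
        this.retryIntervalMs = retryIntervalMs;
    }

    String callWithRetryWindow() throws Exception {
        try {
            String response = service.call();
            // Success: reset the window, as getResources() resets retrystartTime.
            retryStartTime = System.currentTimeMillis();
            return response;
        } catch (Exception e) {
            if (System.currentTimeMillis() - retryStartTime >= retryIntervalMs) {
                // Window exhausted: give up for good, as the allocator does by
                // throwing RMContainerAllocationException.
                throw new IllegalStateException(
                        "No successful contact for " + retryIntervalMs + " ms", e);
            }
            // Still inside the window: rethrow so the caller can try again later.
            throw e;
        }
    }
}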

Example 17 with JobEvent

Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent in project hadoop by apache.

The class RMContainerAllocator, method handleReduceContainerRequest().

@SuppressWarnings({ "unchecked" })
private void handleReduceContainerRequest(ContainerRequestEvent reqEvent) {
    assert (reqEvent.getAttemptID().getTaskId().getTaskType().equals(TaskType.REDUCE));
    Resource supportedMaxContainerCapability = getMaxContainerCapability();
    JobId jobId = getJob().getID();
    if (reduceResourceRequest.equals(Resources.none())) {
        reduceResourceRequest = reqEvent.getCapability();
        eventHandler.handle(new JobHistoryEvent(jobId, new NormalizedResourceEvent(org.apache.hadoop.mapreduce.TaskType.REDUCE, reduceResourceRequest.getMemorySize())));
        LOG.info("reduceResourceRequest:" + reduceResourceRequest);
    }
    boolean reduceContainerRequestAccepted = true;
    if (reduceResourceRequest.getMemorySize() > supportedMaxContainerCapability.getMemorySize() || reduceResourceRequest.getVirtualCores() > supportedMaxContainerCapability.getVirtualCores()) {
        reduceContainerRequestAccepted = false;
    }
    if (reduceContainerRequestAccepted) {
        // set the resources
        reqEvent.getCapability().setVirtualCores(reduceResourceRequest.getVirtualCores());
        reqEvent.getCapability().setMemorySize(reduceResourceRequest.getMemorySize());
        if (reqEvent.getEarlierAttemptFailed()) {
            // Previously failed reducers are added to the front so they fail fast.
            pendingReduces.addFirst(new ContainerRequest(reqEvent, PRIORITY_REDUCE, reduceNodeLabelExpression));
        } else {
            // New reduces are added to the pending queue and ramped up slowly.
            pendingReduces.add(new ContainerRequest(reqEvent, PRIORITY_REDUCE, reduceNodeLabelExpression));
        }
    } else {
        String diagMsg = "REDUCE capability required is more than the " + "supported max container capability in the cluster. Killing" + " the Job. reduceResourceRequest: " + reduceResourceRequest + " maxContainerCapability:" + supportedMaxContainerCapability;
        LOG.info(diagMsg);
        eventHandler.handle(new JobDiagnosticsUpdateEvent(jobId, diagMsg));
        eventHandler.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
    }
}
Also used: NormalizedResourceEvent (org.apache.hadoop.mapreduce.jobhistory.NormalizedResourceEvent), JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent), Resource (org.apache.hadoop.yarn.api.records.Resource), JobHistoryEvent (org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent), JobDiagnosticsUpdateEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobDiagnosticsUpdateEvent), JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId)
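
The accept/reject logic above reduces to a simple capability check: a reduce request is queued only if both its memory and vcore demands fit within the cluster's maximum container capability; otherwise the job is killed with a diagnostic. A self-contained sketch of that check follows, with a hypothetical Spec class standing in for YARN's Resource.

// Sketch of the capability check in handleReduceContainerRequest().
// Spec is a hypothetical stand-in for org.apache.hadoop.yarn.api.records.Resource.
public class CapabilityCheck {

    static final class Spec {
        final long memoryMb;
        final int vcores;

        Spec(long memoryMb, int vcores) {
            this.memoryMb = memoryMb;
            this.vcores = vcores;
        }

        boolean fitsWithin(Spec max) {
            // Both dimensions must fit, matching the memory/vcores test above.
            return memoryMb <= max.memoryMb && vcores <= max.vcores;
        }
    }

    public static void main(String[] args) {
        Spec maxCapability = new Spec(8192, 8);
        Spec reduceRequest = new Spec(16384, 4);
        if (reduceRequest.fitsWithin(maxCapability)) {
            System.out.println("Queue the container request");
        } else {
            // Mirrors the JobDiagnosticsUpdateEvent + JOB_KILL path above.
            System.out.println("REDUCE capability required is more than the "
                    + "supported max container capability; killing the job");
        }
    }
}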

Example 18 with JobEvent

Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent in project hadoop by apache.

The class JobImpl, method handle().

/**
 * The only entry point to change the Job.
 */
@Override
public void handle(JobEvent event) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Processing " + event.getJobId() + " of type " + event.getType());
    }
    try {
        writeLock.lock();
        JobStateInternal oldState = getInternalState();
        try {
            getStateMachine().doTransition(event.getType(), event);
        } catch (InvalidStateTransitionException e) {
            LOG.error("Can't handle this event at current state", e);
            addDiagnostic("Invalid event " + event.getType() + " on Job " + this.jobId);
            eventHandler.handle(new JobEvent(this.jobId, JobEventType.INTERNAL_ERROR));
        }
        // Notify the event handler of the state change.
        if (oldState != getInternalState()) {
            LOG.info(jobId + " Job Transitioned from " + oldState + " to " + getInternalState());
            rememberLastNonFinalState(oldState);
        }
    } finally {
        writeLock.unlock();
    }
}
Also used: InvalidStateTransitionException (org.apache.hadoop.yarn.state.InvalidStateTransitionException), JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent), JobStateInternal (org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal)
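
handle() shows a common state-machine guard: take the write lock, attempt the transition, convert an invalid transition into an internal-error event rather than crashing, and log any resulting state change. Here is a compact, self-contained sketch of that shape; the State and Event enums and the transition table are invented for illustration and are far simpler than JobImpl's real state machine.

// Sketch of the locked state-transition pattern in JobImpl.handle().
// State, Event, and transition() are illustrative, not Hadoop types.
import java.util.concurrent.locks.ReentrantReadWriteLock;

public class StatefulJob {

    enum State { NEW, RUNNING, SUCCEEDED, FAILED }
    enum Event { START, COMPLETE }

    private final ReentrantReadWriteLock.WriteLock writeLock =
            new ReentrantReadWriteLock().writeLock();
    private State state = State.NEW;

    void handle(Event event) {
        writeLock.lock();
        try {
            State oldState = state;
            try {
                state = transition(oldState, event);
            } catch (IllegalStateException e) {
                // Mirrors InvalidStateTransitionException -> INTERNAL_ERROR above.
                System.err.println("Can't handle " + event + " at state " + oldState);
                state = State.FAILED;
            }
            if (oldState != state) {
                System.out.println("Job transitioned from " + oldState + " to " + state);
            }
        } finally {
            writeLock.unlock();
        }
    }

    private State transition(State from, Event event) {
        if (from == State.NEW && event == Event.START) return State.RUNNING;
        if (from == State.RUNNING && event == Event.COMPLETE) return State.SUCCEEDED;
        throw new IllegalStateException("Invalid event " + event + " on " + from);
    }
}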

Example 19 with JobEvent

Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent in project hadoop by apache.

The class MRAppMaster, method startJobs().

/**
   * This can be overridden to instantiate multiple jobs and create a 
   * workflow.
   *
   * TODO:  Rework the design to actually support this.  Currently much of the
   * job stuff has been moved to init() above to support uberization (MR-1220).
   * In a typical workflow, one presumably would want to uberize only a subset
   * of the jobs (the "small" ones), which is awkward with the current design.
   */
@SuppressWarnings("unchecked")
protected void startJobs() {
    // Create a job-start event to get the ball rolling.
    JobEvent startJobEvent = new JobStartEvent(job.getID(), recoveredJobStartTime);
    // Send the job-start event; this triggers the job execution.
    dispatcher.getEventHandler().handle(startJobEvent);
}
Also used: JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent), JobStartEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobStartEvent)
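
For context, this is roughly how such an event travels end to end: a handler is registered with the AsyncDispatcher for JobEventType, and a JobStartEvent (a JobEvent subclass) is handed to the dispatcher's event handler. The sketch below uses the Hadoop classes named elsewhere on this page, but the logging handler and the generated JobId are illustrative, and exact constructor and registration signatures may differ across Hadoop versions.

// Hedged sketch: driving a JobStartEvent through a YARN AsyncDispatcher,
// the same pattern startJobs() uses with the MRAppMaster's dispatcher.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobStartEvent;
import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;

public class JobStartDemo {

    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        AsyncDispatcher dispatcher = new AsyncDispatcher();
        // Route all JobEventType events to a simple logging handler
        // (a stand-in for JobImpl, the real handler in the AM).
        dispatcher.register(JobEventType.class, (EventHandler<JobEvent>) event ->
                System.out.println("Got " + event.getType() + " for " + event.getJobId()));
        dispatcher.init(new Configuration());
        dispatcher.start();

        JobId jobId = MRBuilderUtils.newJobId(
                ApplicationId.newInstance(System.currentTimeMillis(), 1), 1);
        // Same call shape as startJobs(): a JobStartEvent is-a JobEvent.
        dispatcher.getEventHandler().handle(new JobStartEvent(jobId, 0));

        dispatcher.stop();
    }
}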

Example 20 with JobEvent

Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent in project hadoop by apache.

The class RMContainerAllocator, method serviceStart().

@Override
protected void serviceStart() throws Exception {
    this.eventHandlingThread = new Thread() {

        @SuppressWarnings("unchecked")
        @Override
        public void run() {
            ContainerAllocatorEvent event;
            while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
                try {
                    event = RMContainerAllocator.this.eventQueue.take();
                } catch (InterruptedException e) {
                    if (!stopped.get()) {
                        LOG.error("Returning, interrupted : " + e);
                    }
                    return;
                }
                try {
                    handleEvent(event);
                } catch (Throwable t) {
                    LOG.error("Error in handling event type " + event.getType() + " to the ContainreAllocator", t);
                    // Kill the AM
                    eventHandler.handle(new JobEvent(getJob().getID(), JobEventType.INTERNAL_ERROR));
                    return;
                }
            }
        }
    };
    this.eventHandlingThread.start();
    super.serviceStart();
}
Also used: JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent)
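
The serviceStart() override is a standard single-consumer event loop: a dedicated thread drains a BlockingQueue until the service is stopped or the thread is interrupted, and any Throwable escaping a handler is treated as fatal to the loop. A self-contained sketch of the same loop follows, with Runnable standing in for ContainerAllocatorEvent and a hypothetical EventLoop class name.

// Sketch of the event-loop pattern in serviceStart(). EventLoop is an
// illustrative name; Runnable stands in for ContainerAllocatorEvent.
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

public class EventLoop {

    private final BlockingQueue<Runnable> eventQueue = new LinkedBlockingQueue<>();
    private final AtomicBoolean stopped = new AtomicBoolean(false);
    private Thread eventHandlingThread;

    void start() {
        eventHandlingThread = new Thread(() -> {
            while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
                Runnable event;
                try {
                    event = eventQueue.take();
                } catch (InterruptedException e) {
                    // Interrupted during shutdown: exit quietly, as above.
                    if (!stopped.get()) {
                        System.err.println("Returning, interrupted: " + e);
                    }
                    return;
                }
                try {
                    event.run();
                } catch (Throwable t) {
                    // A failing handler is fatal to the loop, mirroring the
                    // JobEventType.INTERNAL_ERROR path in the allocator.
                    t.printStackTrace();
                    return;
                }
            }
        });
        eventHandlingThread.start();
    }

    void stop() {
        stopped.set(true);
        if (eventHandlingThread != null) {
            eventHandlingThread.interrupt();
        }
    }

    void submit(Runnable event) {
        eventQueue.add(event);
    }
}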

Aggregations

JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent): 33 usages
Test (org.junit.Test): 21 usages
Configuration (org.apache.hadoop.conf.Configuration): 19 usages
JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId): 16 usages
AsyncDispatcher (org.apache.hadoop.yarn.event.AsyncDispatcher): 13 usages
CommitterEventHandler (org.apache.hadoop.mapreduce.v2.app.commit.CommitterEventHandler): 12 usages
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 9 usages
JobStartEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobStartEvent): 9 usages
Job (org.apache.hadoop.mapreduce.v2.app.job.Job): 7 usages
Task (org.apache.hadoop.mapreduce.v2.app.job.Task): 6 usages
JobDiagnosticsUpdateEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobDiagnosticsUpdateEvent): 6 usages
JobContext (org.apache.hadoop.mapreduce.JobContext): 5 usages
AppContext (org.apache.hadoop.mapreduce.v2.app.AppContext): 5 usages
IOException (java.io.IOException): 4 usages
TaskAttempt (org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt): 4 usages
CountDownLatch (java.util.concurrent.CountDownLatch): 3 usages
CyclicBarrier (java.util.concurrent.CyclicBarrier): 3 usages
JobID (org.apache.hadoop.mapreduce.JobID): 3 usages
JobHistoryEvent (org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent): 3 usages
JobTokenSecretManager (org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager): 3 usages