Search in sources :

Example 1 with ApplicationMasterNotRegisteredException

use of org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException in project hadoop by apache.

the class RMCommunicator method doUnregistration.

/**
 * Unregisters this application master from the ResourceManager.
 *
 * <p>The final application status is derived from the job's internal state
 * (it stays {@code UNDEFINED} when no branch below matches), the job
 * diagnostics are concatenated one per line, and the resulting
 * finish request is re-sent every {@code rmPollInterval} milliseconds until
 * the response reports that the application is unregistered.
 *
 * <p>If the ResourceManager answers with
 * {@link ApplicationMasterNotRegisteredException}, this method calls
 * {@code register()} and then invokes itself again from the start.
 *
 * @throws YarnException declared by the calls made to the scheduler
 * @throws IOException declared by the calls made to the scheduler
 * @throws InterruptedException if the thread is interrupted while sleeping
 *         between unregistration attempts
 */
@VisibleForTesting
protected void doUnregistration() throws YarnException, IOException, InterruptedException {
    FinalApplicationStatus finishState = FinalApplicationStatus.UNDEFINED;
    JobImpl jobImpl = (JobImpl) job;
    if (jobImpl.getInternalState() == JobStateInternal.SUCCEEDED) {
        finishState = FinalApplicationStatus.SUCCEEDED;
    } else if (jobImpl.getInternalState() == JobStateInternal.KILLED || (jobImpl.getInternalState() == JobStateInternal.RUNNING && isSignalled)) {
        // A job that is still RUNNING while the AM has been signalled is
        // reported as KILLED.
        finishState = FinalApplicationStatus.KILLED;
    } else if (jobImpl.getInternalState() == JobStateInternal.FAILED || jobImpl.getInternalState() == JobStateInternal.ERROR) {
        finishState = FinalApplicationStatus.FAILED;
    }
    // The buffer never leaves this method, so the unsynchronized
    // StringBuilder is sufficient.
    StringBuilder sb = new StringBuilder();
    for (String s : job.getDiagnostics()) {
        sb.append(s).append("\n");
    }
    String diagnostics = sb.toString();
    LOG.info("Setting job diagnostics to " + diagnostics);
    String historyUrl = MRWebAppUtil.getApplicationWebURLOnJHSWithScheme(getConfig(), context.getApplicationID());
    LOG.info("History url is " + historyUrl);
    FinishApplicationMasterRequest request = FinishApplicationMasterRequest.newInstance(finishState, diagnostics, historyUrl);
    try {
        while (true) {
            FinishApplicationMasterResponse response = scheduler.finishApplicationMaster(request);
            if (response.getIsUnregistered()) {
                // Except for ClientService, the other services are already
                // stopped, so it is safe to let clients know the final states.
                // ClientService should wait for some time so clients have
                // enough time to learn the final states.
                RunningAppContext raContext = (RunningAppContext) context;
                raContext.markSuccessfulUnregistration();
                break;
            }
            LOG.info("Waiting for application to be successfully unregistered.");
            Thread.sleep(rmPollInterval);
        }
    } catch (ApplicationMasterNotRegisteredException e) {
        // RM might have restarted or failed over and so lost the fact that AM had
        // registered before. Register again and restart the unregistration.
        register();
        doUnregistration();
    }
}
Also used : ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) JobImpl(org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl) FinalApplicationStatus(org.apache.hadoop.yarn.api.records.FinalApplicationStatus) RunningAppContext(org.apache.hadoop.mapreduce.v2.app.MRAppMaster.RunningAppContext) FinishApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse) FinishApplicationMasterRequest(org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with ApplicationMasterNotRegisteredException

use of org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException in project hadoop by apache.

the class RMContainerAllocator method getResources.

/**
 * Performs one allocate round-trip with the ResourceManager and processes
 * the response.
 *
 * <p>On success the method stores NM tokens and a new AMRM token (when
 * present), forwards any preemption message to the preemption policy, flags
 * the reduce schedule for recalculation when containers or headroom changed,
 * handles node updates and job priority changes, passes the timeline
 * collector address on when one is supplied, and processes every finished
 * container.
 *
 * @return the containers newly allocated by the ResourceManager, or
 *         {@code null} when the AM had to re-register with the RM and
 *         only re-queued its outstanding requests
 * @throws Exception an {@code RMContainerAllocationException} when the RM
 *         does not recognize the attempt or could not be contacted within
 *         {@code retryInterval} milliseconds; otherwise the exception raised
 *         by the remote request is rethrown
 */
@SuppressWarnings("unchecked")
private List<Container> getResources() throws Exception {
    applyConcurrentTaskLimits();
    // will be null the first time
    Resource headRoom = Resources.clone(getAvailableResources());
    AllocateResponse response;
    /*
     * If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
     * milliseconds before aborting. During this interval, AM will still try
     * to contact the RM.
     */
    try {
        response = makeRemoteRequest();
        // Reset retry count if no exception occurred.
        retrystartTime = System.currentTimeMillis();
    } catch (ApplicationAttemptNotFoundException e) {
        // This can happen if the RM has been restarted. If it is in that state,
        // this application must clean itself up.
        eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
        throw new RMContainerAllocationException("Resource Manager doesn't recognize AttemptId: " + this.getContext().getApplicationAttemptId(), e);
    } catch (ApplicationMasterNotRegisteredException e) {
        LOG.info("ApplicationMaster is out of sync with ResourceManager," + " hence resync and send outstanding requests.");
        // RM may have restarted, re-register with RM.
        lastResponseID = 0;
        register();
        addOutstandingRequestOnResync();
        return null;
    } catch (InvalidLabelResourceRequestException e) {
        // An invalid-label exception means the requested label is not
        // accessible, so the job is killed in this case.
        String diagMsg = "Requested node-label-expression is invalid: " + StringUtils.stringifyException(e);
        LOG.info(diagMsg);
        JobId jobId = this.getJob().getID();
        eventHandler.handle(new JobDiagnosticsUpdateEvent(jobId, diagMsg));
        eventHandler.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
        throw e;
    } catch (Exception e) {
        // re-trying until the retryInterval has expired.
        if (System.currentTimeMillis() - retrystartTime >= retryInterval) {
            // Keep the triggering exception in both the log and the thrown
            // exception so the root cause of the lost contact is not dropped.
            LOG.error("Could not contact RM after " + retryInterval + " milliseconds.", e);
            eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
            throw new RMContainerAllocationException("Could not contact RM after " + retryInterval + " milliseconds.", e);
        }
        // continue to attempt to contact the RM.
        throw e;
    }
    Resource newHeadRoom = getAvailableResources();
    List<Container> newContainers = response.getAllocatedContainers();
    // Setting NMTokens
    if (response.getNMTokens() != null) {
        for (NMToken nmToken : response.getNMTokens()) {
            NMTokenCache.setNMToken(nmToken.getNodeId().toString(), nmToken.getToken());
        }
    }
    // Setting AMRMToken
    if (response.getAMRMToken() != null) {
        updateAMRMToken(response.getAMRMToken());
    }
    List<ContainerStatus> finishedContainers = response.getCompletedContainersStatuses();
    // propagate preemption requests
    final PreemptionMessage preemptReq = response.getPreemptionMessage();
    if (preemptReq != null) {
        preemptionPolicy.preempt(new PreemptionContext(assignedRequests), preemptReq);
    }
    if (newContainers.size() + finishedContainers.size() > 0 || !headRoom.equals(newHeadRoom)) {
        //something changed
        recalculateReduceSchedule = true;
        if (LOG.isDebugEnabled() && !headRoom.equals(newHeadRoom)) {
            LOG.debug("headroom=" + newHeadRoom);
        }
    }
    if (LOG.isDebugEnabled()) {
        for (Container cont : newContainers) {
            LOG.debug("Received new Container :" + cont);
        }
    }
    //Called on each allocation. Will know about newly blacklisted/added hosts.
    computeIgnoreBlacklisting();
    handleUpdatedNodes(response);
    handleJobPriorityChange(response);
    // handle receiving the timeline collector address for this app
    String collectorAddr = response.getCollectorAddr();
    MRAppMaster.RunningAppContext appContext = (MRAppMaster.RunningAppContext) this.getContext();
    if (collectorAddr != null && !collectorAddr.isEmpty() && appContext.getTimelineV2Client() != null) {
        // Use the value that was just validated rather than re-reading it
        // from the response.
        appContext.getTimelineV2Client().setTimelineServiceAddress(collectorAddr);
    }
    for (ContainerStatus cont : finishedContainers) {
        processFinishedContainer(cont);
    }
    return newContainers;
}
Also used : PreemptionMessage(org.apache.hadoop.yarn.api.records.PreemptionMessage) MRAppMaster(org.apache.hadoop.mapreduce.v2.app.MRAppMaster) NMToken(org.apache.hadoop.yarn.api.records.NMToken) Resource(org.apache.hadoop.yarn.api.records.Resource) JobDiagnosticsUpdateEvent(org.apache.hadoop.mapreduce.v2.app.job.event.JobDiagnosticsUpdateEvent) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) InvalidLabelResourceRequestException(org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException) IOException(java.io.IOException) ApplicationAttemptNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ApplicationAttemptNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) Container(org.apache.hadoop.yarn.api.records.Container) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) JobEvent(org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent) InvalidLabelResourceRequestException(org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException) JobId(org.apache.hadoop.mapreduce.v2.api.records.JobId)

Example 3 with ApplicationMasterNotRegisteredException

use of org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException in project hadoop by apache.

the class TestApplicationMasterService method testFinishApplicationMasterBeforeRegistering.

/**
 * Verifies that an attempt to unregister an application master that has not
 * registered yet is rejected with
 * {@link ApplicationMasterNotRegisteredException}, and that unregistering
 * after a subsequent registration moves the attempt to {@code FINISHING}.
 */
@Test(timeout = 1200000)
public void testFinishApplicationMasterBeforeRegistering() throws Exception {
    final String expectedExceptionMsg = "ApplicationMasterNotRegisteredException should be thrown";
    final MockRM rm = new MockRM(conf);
    try {
        rm.start();
        // One node and one application whose AM is launched but not registered.
        final MockNM node = rm.registerNode("127.0.0.1:1234", 6 * GB);
        final RMApp app = rm.submitApp(2048);
        final MockAM am = MockRM.launchAM(app, rm, node);
        final FinishApplicationMasterRequest finishRequest = FinishApplicationMasterRequest.newInstance(FinalApplicationStatus.FAILED, "", "");

        // Unregistering before registering must be refused.
        try {
            am.unregisterAppAttempt(finishRequest, false);
            Assert.fail(expectedExceptionMsg);
        } catch (ApplicationMasterNotRegisteredException notRegistered) {
            Assert.assertNotNull(notRegistered);
            final String message = notRegistered.getMessage();
            Assert.assertNotNull(message);
            Assert.assertTrue(message.contains("Application Master is trying to unregister before registering for:"));
        } catch (Exception unexpected) {
            Assert.fail(expectedExceptionMsg);
        }

        // After registering, the same request is accepted.
        am.registerAppAttempt();
        am.unregisterAppAttempt(finishRequest, false);
        rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FINISHING);
    } finally {
        // rm is assigned before the try block, so it is never null here.
        rm.stop();
    }
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) InvalidContainerReleaseException(org.apache.hadoop.yarn.exceptions.InvalidContainerReleaseException) FinishApplicationMasterRequest(org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest) Test(org.junit.Test)

Example 4 with ApplicationMasterNotRegisteredException

use of org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException in project hadoop by apache.

the class AMRMClientImpl method unregisterApplicationMaster.

/**
 * Unregisters the application master from the ResourceManager, re-sending
 * the finish request every 100 ms until the response reports that the
 * application is unregistered.
 *
 * <p>If the ResourceManager reports that the application master is not
 * registered, the master is registered again and the unregistration is
 * restarted. If the waiting thread is interrupted, the method logs the
 * interruption, restores the thread's interrupt status and returns without
 * further retries.
 *
 * @param appStatus final status of the application; must not be {@code null}
 * @param appMessage message passed on in the finish request
 * @param appTrackingUrl tracking URL passed on in the finish request
 * @throws YarnException declared by the calls made to the ResourceManager
 * @throws IOException declared by the calls made to the ResourceManager
 * @throws IllegalArgumentException if {@code appStatus} is {@code null}
 */
@Override
public void unregisterApplicationMaster(FinalApplicationStatus appStatus, String appMessage, String appTrackingUrl) throws YarnException, IOException {
    Preconditions.checkArgument(appStatus != null, "AppStatus should not be null.");
    FinishApplicationMasterRequest request = FinishApplicationMasterRequest.newInstance(appStatus, appMessage, appTrackingUrl);
    try {
        while (true) {
            FinishApplicationMasterResponse response = rmClient.finishApplicationMaster(request);
            if (response.getIsUnregistered()) {
                break;
            }
            LOG.info("Waiting for application to be successfully unregistered.");
            Thread.sleep(100);
        }
    } catch (InterruptedException e) {
        LOG.info("Interrupted while waiting for application" + " to be removed from RMStateStore");
        // The exception cannot be propagated through this signature, so
        // restore the interrupt status for callers that check it.
        Thread.currentThread().interrupt();
    } catch (ApplicationMasterNotRegisteredException e) {
        LOG.warn("ApplicationMaster is out of sync with ResourceManager," + " hence resyncing.");
        // re register with RM
        registerApplicationMaster();
        unregisterApplicationMaster(appStatus, appMessage, appTrackingUrl);
    }
}
Also used : ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) FinishApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse) FinishApplicationMasterRequest(org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest)

Example 5 with ApplicationMasterNotRegisteredException

use of org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException in project hadoop by apache.

the class AMRMClientImpl method allocate.

/**
 * Sends the pending asks, releases, container updates and blacklist changes
 * to the ResourceManager in a single allocate call and records the outcome.
 *
 * <p>The pending collections are snapshotted and cleared under the instance
 * lock before the RPC. If no response is obtained, the {@code finally} block
 * puts the snapshots back so the next call can send them again. If the
 * ResourceManager reports that the application master is not registered,
 * the pending state is rebuilt, the master is registered again and the
 * call is repeated recursively.
 *
 * @param progressIndicator progress value sent with the request; must not be
 *        negative
 * @return the response returned by the ResourceManager
 * @throws YarnException declared by the calls made to the ResourceManager
 * @throws IOException declared by the calls made to the ResourceManager
 * @throws IllegalArgumentException if {@code progressIndicator} is negative
 */
@Override
public AllocateResponse allocate(float progressIndicator) throws YarnException, IOException {
    Preconditions.checkArgument(progressIndicator >= 0, "Progress indicator should not be negative");
    AllocateResponse allocateResponse = null;
    List<ResourceRequest> askList = null;
    List<ContainerId> releaseList = null;
    AllocateRequest allocateRequest = null;
    List<String> blacklistToAdd = new ArrayList<String>();
    List<String> blacklistToRemove = new ArrayList<String>();
    // Snapshot of the change map taken before it is cleared; used by the
    // finally block to restore entries when the call fails.
    Map<ContainerId, SimpleEntry<Container, UpdateContainerRequest>> oldChange = new HashMap<>();
    try {
        synchronized (this) {
            askList = cloneAsks();
            // Save the current change for recovery
            oldChange.putAll(change);
            List<UpdateContainerRequest> updateList = createUpdateList();
            releaseList = new ArrayList<ContainerId>(release);
            // optimistically clear this collection assuming no RPC failure
            ask.clear();
            release.clear();
            change.clear();
            blacklistToAdd.addAll(blacklistAdditions);
            blacklistToRemove.addAll(blacklistRemovals);
            ResourceBlacklistRequest blacklistRequest = ResourceBlacklistRequest.newInstance(blacklistToAdd, blacklistToRemove);
            allocateRequest = AllocateRequest.newBuilder().responseId(lastResponseId).progress(progressIndicator).askList(askList).resourceBlacklistRequest(blacklistRequest).releaseList(releaseList).updateRequests(updateList).build();
            // clear blacklistAdditions and blacklistRemovals before
            // unsynchronized part
            blacklistAdditions.clear();
            blacklistRemovals.clear();
        }
        try {
            // The RPC itself is made outside the synchronized blocks.
            allocateResponse = rmClient.allocate(allocateRequest);
        } catch (ApplicationMasterNotRegisteredException e) {
            LOG.warn("ApplicationMaster is out of sync with ResourceManager," + " hence resyncing.");
            synchronized (this) {
                // Re-queue the pending releases, blacklisted nodes, remote
                // requests and pending changes so the retried call sends them.
                release.addAll(this.pendingRelease);
                blacklistAdditions.addAll(this.blacklistedNodes);
                for (RemoteRequestsTable remoteRequestsTable : remoteRequests.values()) {
                    @SuppressWarnings("unchecked") Iterator<ResourceRequestInfo<T>> reqIter = remoteRequestsTable.iterator();
                    while (reqIter.hasNext()) {
                        addResourceRequestToAsk(reqIter.next().remoteRequest);
                    }
                }
                change.putAll(this.pendingChange);
            }
            // re register with RM
            registerApplicationMaster();
            // Retry the whole allocation; if the retry throws,
            // allocateResponse is still null and the finally block below
            // restores the snapshots taken by this invocation.
            allocateResponse = allocate(progressIndicator);
            return allocateResponse;
        }
        synchronized (this) {
            // update these on successful RPC
            clusterNodeCount = allocateResponse.getNumClusterNodes();
            lastResponseId = allocateResponse.getResponseId();
            clusterAvailableResources = allocateResponse.getAvailableResources();
            if (!allocateResponse.getNMTokens().isEmpty()) {
                populateNMTokens(allocateResponse.getNMTokens());
            }
            if (allocateResponse.getAMRMToken() != null) {
                updateAMRMToken(allocateResponse.getAMRMToken());
            }
            if (!pendingRelease.isEmpty() && !allocateResponse.getCompletedContainersStatuses().isEmpty()) {
                removePendingReleaseRequests(allocateResponse.getCompletedContainersStatuses());
            }
            if (!pendingChange.isEmpty()) {
                List<ContainerStatus> completed = allocateResponse.getCompletedContainersStatuses();
                List<UpdatedContainer> changed = new ArrayList<>();
                changed.addAll(allocateResponse.getUpdatedContainers());
                // remove all pending change requests that belong to the
                // completed containers
                for (ContainerStatus status : completed) {
                    ContainerId containerId = status.getContainerId();
                    pendingChange.remove(containerId);
                }
                // remove all pending change requests that have been satisfied
                if (!changed.isEmpty()) {
                    removePendingChangeRequests(changed);
                }
            }
        }
    } finally {
        // TODO how to differentiate remote yarn exception vs error in rpc
        if (allocateResponse == null) {
            // preserve ask and release for next call to allocate()
            synchronized (this) {
                release.addAll(releaseList);
                // Put back only the asks that are not present in ask any
                // more; an equal request already in ask is left as it is.
                // NOTE(review): presumably this relies on there being no
                // concurrent allocate() calls that modify ask between the
                // synchronized block at the beginning of this method and
                // here - confirm.
                for (ResourceRequest oldAsk : askList) {
                    if (!ask.contains(oldAsk)) {
                        ask.add(oldAsk);
                    }
                }
                // Entries already in the change map are kept as they are.
                // Only insert entries from the cached oldChange map
                // that do not exist in the current change map:
                for (Map.Entry<ContainerId, SimpleEntry<Container, UpdateContainerRequest>> entry : oldChange.entrySet()) {
                    ContainerId oldContainerId = entry.getKey();
                    Container oldContainer = entry.getValue().getKey();
                    UpdateContainerRequest oldupdate = entry.getValue().getValue();
                    if (change.get(oldContainerId) == null) {
                        change.put(oldContainerId, new SimpleEntry<>(oldContainer, oldupdate));
                    }
                }
                blacklistAdditions.addAll(blacklistToAdd);
                blacklistRemovals.addAll(blacklistToRemove);
            }
        }
    }
    return allocateResponse;
}
Also used : HashMap(java.util.HashMap) ResourceBlacklistRequest(org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest) AllocateRequest(org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest) ArrayList(java.util.ArrayList) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) UpdatedContainer(org.apache.hadoop.yarn.api.records.UpdatedContainer) Container(org.apache.hadoop.yarn.api.records.Container) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) Iterator(java.util.Iterator) SimpleEntry(java.util.AbstractMap.SimpleEntry) UpdatedContainer(org.apache.hadoop.yarn.api.records.UpdatedContainer) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) UpdateContainerRequest(org.apache.hadoop.yarn.api.records.UpdateContainerRequest) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

ApplicationMasterNotRegisteredException (org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException)9 AllocateResponse (org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse)5 ApplicationAttemptNotFoundException (org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException)4 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)4 IOException (java.io.IOException)3 FinishApplicationMasterRequest (org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest)3 ArrayList (java.util.ArrayList)2 JobEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent)2 AllocateRequest (org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest)2 FinishApplicationMasterResponse (org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse)2 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)2 Container (org.apache.hadoop.yarn.api.records.Container)2 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)2 ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)2 ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest)2 YarnRuntimeException (org.apache.hadoop.yarn.exceptions.YarnRuntimeException)2 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)2 Test (org.junit.Test)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 SimpleEntry (java.util.AbstractMap.SimpleEntry)1