
Example 36 with YarnClient

use of org.apache.hadoop.yarn.client.api.YarnClient in project apex-core by apache.

the class StreamingAppMasterService method execute.

/**
   * Main run function for the application master
   *
   * @throws YarnException
   */
@SuppressWarnings("SleepWhileInLoop")
private void execute() throws YarnException, IOException {
    LOG.info("Starting ApplicationMaster");
    final Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
    LOG.info("number of tokens: {}", credentials.getAllTokens().size());
    Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
    while (iter.hasNext()) {
        Token<?> token = iter.next();
        LOG.debug("token: {}", token);
    }
    final Configuration conf = getConfig();
    long tokenLifeTime = (long) (dag.getValue(LogicalPlan.TOKEN_REFRESH_ANTICIPATORY_FACTOR) * Math.min(dag.getValue(LogicalPlan.HDFS_TOKEN_LIFE_TIME), dag.getValue(LogicalPlan.RM_TOKEN_LIFE_TIME)));
    long expiryTime = System.currentTimeMillis() + tokenLifeTime;
    LOG.debug(" expiry token time {}", tokenLifeTime);
    String principal = dag.getValue(LogicalPlan.PRINCIPAL);
    String hdfsKeyTabFile = dag.getValue(LogicalPlan.KEY_TAB_FILE);
    // Register self with ResourceManager
    RegisterApplicationMasterResponse response = amRmClient.registerApplicationMaster(appMasterHostname, 0, appMasterTrackingUrl);
    // Dump out information about cluster capability as seen by the resource manager
    int maxMem = response.getMaximumResourceCapability().getMemory();
    int maxVcores = response.getMaximumResourceCapability().getVirtualCores();
    int minMem = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0);
    int minVcores = conf.getInt("yarn.scheduler.minimum-allocation-vcores", 0);
    LOG.info("Max mem {}m, Min mem {}m, Max vcores {} and Min vcores {} capabililty of resources in this cluster ", maxMem, minMem, maxVcores, minVcores);
    long blacklistRemovalTime = dag.getValue(DAGContext.BLACKLISTED_NODE_REMOVAL_TIME_MILLIS);
    int maxConsecutiveContainerFailures = dag.getValue(DAGContext.MAX_CONSECUTIVE_CONTAINER_FAILURES_FOR_BLACKLIST);
    LOG.info("Blacklist removal time in millis = {}, max consecutive node failure count = {}", blacklistRemovalTime, maxConsecutiveContainerFailures);
    // for locality relaxation fall back
    Map<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> requestedResources = Maps.newHashMap();
    // Setup heartbeat emitter
    // TODO poll RM every now and then with an empty request to let RM know that we are alive
    // The heartbeat interval after which an AM is timed out by the RM is defined by a config setting:
    // RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
    // The allocate calls to the RM count as heartbeat so, for now, this additional heartbeat emitter
    // is not required.
    int loopCounter = -1;
    long nodeReportUpdateTime = 0;
    // keep track of already requested containers to not request them again while waiting for allocation
    int numRequestedContainers = 0;
    int numReleasedContainers = 0;
    int nextRequestPriority = 0;
    // Use override for resource requestor in case of cloudera distribution, to handle host specific requests
    ResourceRequestHandler resourceRequestor = System.getenv().containsKey("CDH_HADOOP_BIN") ? new BlacklistBasedResourceRequestHandler() : new ResourceRequestHandler();
    List<ContainerStartRequest> pendingContainerStartRequests = new LinkedList<>();
    YarnClient clientRMService = YarnClient.createYarnClient();
    try {
        // YARN-435
        // we need getClusterNodes to populate the initial node list,
        // subsequent updates come through the heartbeat response
        clientRMService.init(conf);
        clientRMService.start();
        ApplicationReport ar = StramClientUtils.getStartedAppInstanceByName(clientRMService, dag.getAttributes().get(DAG.APPLICATION_NAME), UserGroupInformation.getLoginUser().getUserName(), dag.getAttributes().get(DAG.APPLICATION_ID));
        if (ar != null) {
            appDone = true;
            dnmgr.shutdownDiagnosticsMessage = String.format("Application master failed because application %s with duplicate name \"%s\" by the same user \"%s\" is already started.", ar.getApplicationId().toString(), ar.getName(), ar.getUser());
            LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
            finishApplication(FinalApplicationStatus.FAILED);
            return;
        }
        resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
        nodeReportUpdateTime = System.currentTimeMillis() + UPDATE_NODE_REPORTS_INTERVAL;
    } catch (Exception e) {
        throw new RuntimeException("Failed to retrieve cluster nodes report.", e);
    } finally {
        clientRMService.stop();
    }
    List<Container> containers = response.getContainersFromPreviousAttempts();
    // Running containers might take a while to register with the new app master and send the heartbeat signal.
    int waitForRecovery = containers.size() > 0 ? dag.getValue(LogicalPlan.HEARTBEAT_TIMEOUT_MILLIS) / 1000 : 0;
    List<ContainerId> releasedContainers = previouslyAllocatedContainers(containers);
    FinalApplicationStatus finalStatus = FinalApplicationStatus.SUCCEEDED;
    final InetSocketAddress rmAddress = conf.getSocketAddr(YarnConfiguration.RM_ADDRESS, YarnConfiguration.DEFAULT_RM_ADDRESS, YarnConfiguration.DEFAULT_RM_PORT);
    while (!appDone) {
        loopCounter++;
        final long currentTimeMillis = System.currentTimeMillis();
        if (UserGroupInformation.isSecurityEnabled() && currentTimeMillis >= expiryTime && hdfsKeyTabFile != null) {
            String applicationId = appAttemptID.getApplicationId().toString();
            expiryTime = StramUserLogin.refreshTokens(tokenLifeTime, FileUtils.getTempDirectoryPath(), applicationId, conf, principal, hdfsKeyTabFile, credentials, rmAddress, true);
        }
        if (currentTimeMillis > nodeReportUpdateTime) {
            resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
            nodeReportUpdateTime = currentTimeMillis + UPDATE_NODE_REPORTS_INTERVAL;
        }
        Runnable r;
        while ((r = this.pendingTasks.poll()) != null) {
            r.run();
        }
        // need not have any available containers
        try {
            sleep(1000);
        } catch (InterruptedException e) {
            LOG.info("Sleep interrupted " + e.getMessage());
        }
        // Setup request to be sent to RM to allocate containers
        List<ContainerRequest> containerRequests = new ArrayList<>();
        List<ContainerRequest> removedContainerRequests = new ArrayList<>();
        // request containers for pending deploy requests
        if (!dnmgr.containerStartRequests.isEmpty()) {
            StreamingContainerAgent.ContainerStartRequest csr;
            while ((csr = dnmgr.containerStartRequests.poll()) != null) {
                if (csr.container.getRequiredMemoryMB() > maxMem) {
                    LOG.warn("Container memory {}m above max threshold of cluster. Using max value {}m.", csr.container.getRequiredMemoryMB(), maxMem);
                    csr.container.setRequiredMemoryMB(maxMem);
                }
                if (csr.container.getRequiredMemoryMB() < minMem) {
                    csr.container.setRequiredMemoryMB(minMem);
                }
                if (csr.container.getRequiredVCores() > maxVcores) {
                    LOG.warn("Container vcores {} above max threshold of cluster. Using max value {}.", csr.container.getRequiredVCores(), maxVcores);
                    csr.container.setRequiredVCores(maxVcores);
                }
                if (csr.container.getRequiredVCores() < minVcores) {
                    csr.container.setRequiredVCores(minVcores);
                }
                csr.container.setResourceRequestPriority(nextRequestPriority++);
                ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
                if (cr == null) {
                    pendingContainerStartRequests.add(csr);
                } else {
                    resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
                }
            }
        }
        // If all other requests are allocated, retry pending requests which need host availability
        if (containerRequests.isEmpty() && !pendingContainerStartRequests.isEmpty()) {
            List<ContainerStartRequest> removalList = new LinkedList<>();
            for (ContainerStartRequest csr : pendingContainerStartRequests) {
                ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
                if (cr != null) {
                    resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
                    removalList.add(csr);
                }
            }
            pendingContainerStartRequests.removeAll(removalList);
        }
        resourceRequestor.reissueContainerRequests(amRmClient, requestedResources, loopCounter, resourceRequestor, containerRequests, removedContainerRequests);
        /* Remove nodes from blacklist after timeout */
        List<String> blacklistRemovals = new ArrayList<>();
        for (String hostname : failedBlackListedNodes) {
            Long timeDiff = currentTimeMillis - failedContainerNodesMap.get(hostname).blackListAdditionTime;
            if (timeDiff >= blacklistRemovalTime) {
                blacklistRemovals.add(hostname);
                failedContainerNodesMap.remove(hostname);
            }
        }
        if (!blacklistRemovals.isEmpty()) {
            amRmClient.updateBlacklist(null, blacklistRemovals);
            LOG.info("Removing nodes {} from blacklist: time elapsed since last blacklisting due to failure is greater than specified timeout", blacklistRemovals.toString());
            failedBlackListedNodes.removeAll(blacklistRemovals);
        }
        numRequestedContainers += containerRequests.size() - removedContainerRequests.size();
        AllocateResponse amResp = sendContainerAskToRM(containerRequests, removedContainerRequests, releasedContainers);
        if (amResp.getAMCommand() != null) {
            LOG.info(" statement executed:{}", amResp.getAMCommand());
            switch(amResp.getAMCommand()) {
                case AM_RESYNC:
                case AM_SHUTDOWN:
                    throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
                default:
                    throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
            }
        }
        releasedContainers.clear();
        // Retrieve list of allocated containers from the response
        List<Container> newAllocatedContainers = amResp.getAllocatedContainers();
        // LOG.info("Got response from RM for container ask, allocatedCnt=" + newAllocatedContainers.size());
        numRequestedContainers -= newAllocatedContainers.size();
        long timestamp = System.currentTimeMillis();
        for (Container allocatedContainer : newAllocatedContainers) {
            LOG.info("Got new container." + ", containerId=" + allocatedContainer.getId() + ", containerNode=" + allocatedContainer.getNodeId() + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() + ", containerResourceMemory" + allocatedContainer.getResource().getMemory() + ", priority" + allocatedContainer.getPriority());
            // + ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
            boolean alreadyAllocated = true;
            StreamingContainerAgent.ContainerStartRequest csr = null;
            for (Map.Entry<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> entry : requestedResources.entrySet()) {
                if (entry.getKey().container.getResourceRequestPriority() == allocatedContainer.getPriority().getPriority()) {
                    alreadyAllocated = false;
                    csr = entry.getKey();
                    break;
                }
            }
            if (alreadyAllocated) {
                LOG.info("Releasing {} as resource with priority {} was already assigned", allocatedContainer.getId(), allocatedContainer.getPriority());
                releasedContainers.add(allocatedContainer.getId());
                numReleasedContainers++;
                numRequestedContainers--;
                continue;
            }
            if (csr != null) {
                requestedResources.remove(csr);
            }
            // allocate resource to container
            ContainerResource resource = new ContainerResource(allocatedContainer.getPriority().getPriority(), allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), allocatedContainer.getResource().getMemory(), allocatedContainer.getResource().getVirtualCores(), allocatedContainer.getNodeHttpAddress());
            StreamingContainerAgent sca = dnmgr.assignContainer(resource, null);
            if (sca == null) {
                // allocated container no longer needed, add release request
                LOG.warn("Container {} allocated but nothing to deploy, going to release this container.", allocatedContainer.getId());
                releasedContainers.add(allocatedContainer.getId());
            } else {
                AllocatedContainer allocatedContainerHolder = new AllocatedContainer(allocatedContainer);
                this.allocatedContainers.put(allocatedContainer.getId().toString(), allocatedContainerHolder);
                ByteBuffer tokens = null;
                if (UserGroupInformation.isSecurityEnabled()) {
                    UserGroupInformation ugi = UserGroupInformation.getLoginUser();
                    Token<StramDelegationTokenIdentifier> delegationToken = allocateDelegationToken(ugi.getUserName(), heartbeatListener.getAddress());
                    allocatedContainerHolder.delegationToken = delegationToken;
                    //ByteBuffer tokens = LaunchContainerRunnable.getTokens(delegationTokenManager, heartbeatListener.getAddress());
                    tokens = LaunchContainerRunnable.getTokens(ugi, delegationToken);
                }
                LaunchContainerRunnable launchContainer = new LaunchContainerRunnable(allocatedContainer, nmClient, sca, tokens);
                // Thread launchThread = new Thread(runnableLaunchContainer);
                // launchThreads.add(launchThread);
                // launchThread.start();
                // communication with NMs is now async
                launchContainer.run();
                // record container start event
                StramEvent ev = new StramEvent.StartContainerEvent(allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString());
                ev.setTimestamp(timestamp);
                dnmgr.recordEventAsync(ev);
            }
        }
        // track node updates for future locality constraint allocations
        // TODO: it seems 2.0.4-alpha doesn't give us any updates
        resourceRequestor.updateNodeReports(amResp.getUpdatedNodes());
        // Check the completed containers
        List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses();
        // LOG.debug("Got response from RM for container ask, completedCnt=" + completedContainers.size());
        List<String> blacklistAdditions = new ArrayList<>();
        for (ContainerStatus containerStatus : completedContainers) {
            LOG.info("Completed containerId=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
            // non complete containers should not be here
            assert (containerStatus.getState() == ContainerState.COMPLETE);
            AllocatedContainer allocatedContainer = allocatedContainers.remove(containerStatus.getContainerId().toString());
            if (allocatedContainer != null && allocatedContainer.delegationToken != null) {
                UserGroupInformation ugi = UserGroupInformation.getLoginUser();
                delegationTokenManager.cancelToken(allocatedContainer.delegationToken, ugi.getUserName());
            }
            int exitStatus = containerStatus.getExitStatus();
            if (0 != exitStatus) {
                if (allocatedContainer != null) {
                    numFailedContainers.incrementAndGet();
                    if (exitStatus != 1 && maxConsecutiveContainerFailures != Integer.MAX_VALUE) {
                        // If container failure due to framework
                        String hostname = allocatedContainer.container.getNodeId().getHost();
                        if (!failedBlackListedNodes.contains(hostname)) {
                            // Blacklist the node if not already blacklisted
                            if (failedContainerNodesMap.containsKey(hostname)) {
                                NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                                long timeStamp = System.currentTimeMillis();
                                if (timeStamp - stats.lastFailureTimeStamp >= blacklistRemovalTime) {
                                    // Reset failure count if last failure was before Blacklist removal time
                                    stats.failureCount = 1;
                                    stats.lastFailureTimeStamp = timeStamp;
                                } else {
                                    stats.lastFailureTimeStamp = timeStamp;
                                    stats.failureCount++;
                                    if (stats.failureCount >= maxConsecutiveContainerFailures) {
                                        LOG.info("Node {} failed {} times consecutively within {} minutes, marking the node blacklisted", hostname, stats.failureCount, blacklistRemovalTime / (60 * 1000));
                                        blacklistAdditions.add(hostname);
                                        failedBlackListedNodes.add(hostname);
                                    }
                                }
                            } else {
                                failedContainerNodesMap.put(hostname, new NodeFailureStats(System.currentTimeMillis(), 1));
                            }
                        }
                    }
                }
                //          if (exitStatus == 1) {
                //            // non-recoverable StreamingContainer failure
                //            appDone = true;
                //            finalStatus = FinalApplicationStatus.FAILED;
                //            dnmgr.shutdownDiagnosticsMessage = "Unrecoverable failure " + containerStatus.getContainerId();
                //            LOG.info("Exiting due to: {}", dnmgr.shutdownDiagnosticsMessage);
                //          }
                //          else {
                // Recoverable failure or process killed (externally or via stop request by AM)
                // also occurs when a container was released by the application but never assigned/launched
                LOG.debug("Container {} failed or killed.", containerStatus.getContainerId());
                dnmgr.scheduleContainerRestart(containerStatus.getContainerId().toString());
            //          }
            } else {
                // container completed successfully
                numCompletedContainers.incrementAndGet();
                LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
                // Reset counter for node failure, if exists
                String hostname = allocatedContainer.container.getNodeId().getHost();
                NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                if (stats != null) {
                    stats.failureCount = 0;
                }
            }
            String containerIdStr = containerStatus.getContainerId().toString();
            dnmgr.removeContainerAgent(containerIdStr);
            // record container stop event
            StramEvent ev = new StramEvent.StopContainerEvent(containerIdStr, containerStatus.getExitStatus());
            ev.setReason(containerStatus.getDiagnostics());
            dnmgr.recordEventAsync(ev);
        }
        if (!blacklistAdditions.isEmpty()) {
            amRmClient.updateBlacklist(blacklistAdditions, null);
            long timeStamp = System.currentTimeMillis();
            for (String hostname : blacklistAdditions) {
                NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                stats.blackListAdditionTime = timeStamp;
            }
        }
        if (dnmgr.forcedShutdown) {
            LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
            finalStatus = FinalApplicationStatus.FAILED;
            appDone = true;
        } else if (allocatedContainers.isEmpty() && numRequestedContainers == 0 && dnmgr.containerStartRequests.isEmpty()) {
            LOG.debug("Exiting as no more containers are allocated or requested");
            finalStatus = FinalApplicationStatus.SUCCEEDED;
            appDone = true;
        }
        LOG.debug("Current application state: loop={}, appDone={}, requested={}, released={}, completed={}, failed={}, currentAllocated={}, dnmgr.containerStartRequests={}", loopCounter, appDone, numRequestedContainers, numReleasedContainers, numCompletedContainers, numFailedContainers, allocatedContainers.size(), dnmgr.containerStartRequests);
        // monitor child containers
        dnmgr.monitorHeartbeat(waitForRecovery > 0);
        waitForRecovery = Math.max(waitForRecovery - 1, 0);
    }
    finishApplication(finalStatus);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) FinalApplicationStatus(org.apache.hadoop.yarn.api.records.FinalApplicationStatus) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) Token(org.apache.hadoop.security.token.Token) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) Container(org.apache.hadoop.yarn.api.records.Container) StreamingContainer(com.datatorrent.stram.engine.StreamingContainer) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) LinkedList(java.util.LinkedList) ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ContainerResource(com.datatorrent.stram.StreamingContainerManager.ContainerResource) RegisterApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse) Map(java.util.Map) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) StramDelegationTokenIdentifier(com.datatorrent.stram.security.StramDelegationTokenIdentifier) StramEvent(com.datatorrent.stram.api.StramEvent) MutablePair(org.apache.commons.lang3.tuple.MutablePair) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ByteBuffer(java.nio.ByteBuffer) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) Credentials(org.apache.hadoop.security.Credentials)
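
The method above uses the YarnClient only briefly: it starts the client, checks whether an application with the same name is already running for the user, seeds the initial node reports, and stops the client; all subsequent allocation traffic goes through the AMRMClient heartbeat. Below is a minimal, self-contained sketch of that short-lived client lifecycle; the class and method names are illustrative and not part of apex-core.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.client.api.YarnClient;

public class InitialNodeReportFetcher {

    /**
     * Fetch the current cluster node reports with a short-lived YarnClient,
     * mirroring the init/start/getNodeReports/stop sequence used above.
     */
    public static List<NodeReport> fetchNodeReports(Configuration conf) {
        YarnClient clientRMService = YarnClient.createYarnClient();
        try {
            clientRMService.init(conf);
            clientRMService.start();
            return clientRMService.getNodeReports();
        } catch (Exception e) {
            throw new RuntimeException("Failed to retrieve cluster nodes report.", e);
        } finally {
            clientRMService.stop();
        }
    }
}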

Example 37 with YarnClient

use of org.apache.hadoop.yarn.client.api.YarnClient in project apex-core by apache.

the class YarnAppLauncherImpl method shutdownApp.

protected void shutdownApp(YarnAppHandleImpl app, ShutdownMode shutdownMode) throws LauncherException {
    if (shutdownMode == ShutdownMode.KILL) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        try {
            ApplicationId applicationId = app.appId;
            ApplicationReport appReport = yarnClient.getApplicationReport(applicationId);
            if (appReport == null) {
                throw new LauncherException("Application " + app.getApplicationId() + " not found");
            }
            yarnClient.killApplication(applicationId);
        } catch (YarnException | IOException e) {
            throw Throwables.propagate(e);
        } finally {
            IOUtils.closeQuietly(yarnClient);
        }
    } else {
        throw new UnsupportedOperationException("Orderly shutdown not supported, try kill instead");
    }
}
Also used : ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) IOException(java.io.IOException) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) YarnException(org.apache.hadoop.yarn.exceptions.YarnException)
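
For a kill request like the one above, the usual YarnClient lifecycle also includes init and start before the first call to the ResourceManager. A minimal sketch of that pattern, assuming a Configuration and ApplicationId are available (the class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.client.api.YarnClient;

public class KillApplicationExample {

    /** Kill a YARN application by id, verifying it exists first. */
    public static void kill(Configuration conf, ApplicationId applicationId) throws Exception {
        YarnClient yarnClient = YarnClient.createYarnClient();
        try {
            yarnClient.init(conf);
            yarnClient.start();
            // getApplicationReport fails if the application id is unknown to the RM
            ApplicationReport appReport = yarnClient.getApplicationReport(applicationId);
            if (appReport != null) {
                yarnClient.killApplication(applicationId);
            }
        } finally {
            yarnClient.stop();
        }
    }
}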

Example 38 with YarnClient

use of org.apache.hadoop.yarn.client.api.YarnClient in project apex-core by apache.

the class StramUserLogin method refreshTokens.

public static long refreshTokens(long tokenLifeTime, String destinationDir, String destinationFile, final Configuration conf, String principal, String hdfsKeyTabFile, final Credentials credentials, final InetSocketAddress rmAddress, final boolean renewRMToken) throws IOException {
    long expiryTime = System.currentTimeMillis() + tokenLifeTime;
    //renew tokens
    final String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
    if (tokenRenewer == null || tokenRenewer.length() == 0) {
        throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
    }
    File keyTabFile;
    try (FileSystem fs = FileSystem.newInstance(conf)) {
        keyTabFile = FSUtil.copyToLocalFileSystem(fs, destinationDir, destinationFile, hdfsKeyTabFile, conf);
    }
    if (principal == null) {
        principal = UserGroupInformation.getCurrentUser().getUserName();
    }
    UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keyTabFile.getAbsolutePath());
    try {
        ugi.doAs(new PrivilegedExceptionAction<Object>() {

            @Override
            public Object run() throws Exception {
                YarnClient yarnClient = null;
                if (renewRMToken) {
                    yarnClient = YarnClient.createYarnClient();
                    yarnClient.init(conf);
                    yarnClient.start();
                }
                Credentials creds = new Credentials();
                try (FileSystem fs1 = FileSystem.newInstance(conf)) {
                    fs1.addDelegationTokens(tokenRenewer, creds);
                    if (renewRMToken) {
                        new StramClientUtils.ClientRMHelper(yarnClient, conf).addRMDelegationToken(tokenRenewer, creds);
                    }
                } finally {
                    if (renewRMToken) {
                        yarnClient.stop();
                    }
                }
                credentials.addAll(creds);
                return null;
            }
        });
        UserGroupInformation.getCurrentUser().addCredentials(credentials);
    } catch (InterruptedException e) {
        LOG.error("Error while renewing tokens ", e);
        expiryTime = System.currentTimeMillis();
    } catch (IOException e) {
        LOG.error("Error while renewing tokens ", e);
        expiryTime = System.currentTimeMillis();
    }
    LOG.debug("number of tokens: {}", credentials.getAllTokens().size());
    Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
    while (iter.hasNext()) {
        Token<?> token = iter.next();
        LOG.debug("updated token: {}", token);
    }
    keyTabFile.delete();
    return expiryTime;
}
Also used : Token(org.apache.hadoop.security.token.Token) IOException(java.io.IOException) IOException(java.io.IOException) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) FileSystem(org.apache.hadoop.fs.FileSystem) File(java.io.File) Credentials(org.apache.hadoop.security.Credentials) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)
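
The core of refreshTokens is the doAs block: log in from the keytab, obtain fresh HDFS (and optionally RM) delegation tokens as that principal, and merge them into the shared credentials. A stripped-down sketch of just the HDFS part, assuming the renewer principal is already known (class and method names are illustrative):

import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;

public class HdfsTokenRefresher {

    /** Obtain fresh HDFS delegation tokens as the keytab principal and merge them into credentials. */
    public static void refreshHdfsTokens(final Configuration conf, String principal, String keytabPath,
        final String tokenRenewer, Credentials credentials) throws Exception {
        UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytabPath);
        Credentials creds = ugi.doAs(new PrivilegedExceptionAction<Credentials>() {

            @Override
            public Credentials run() throws Exception {
                Credentials newCreds = new Credentials();
                try (FileSystem fs = FileSystem.newInstance(conf)) {
                    // issues new HDFS delegation tokens renewable by tokenRenewer
                    fs.addDelegationTokens(tokenRenewer, newCreds);
                }
                return newCreds;
            }
        });
        credentials.addAll(creds);
    }
}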

Example 39 with YarnClient

use of org.apache.hadoop.yarn.client.api.YarnClient in project cdap by caskdata.

the class YarnCheck method run.

@Override
public void run() {
    int yarnConnectTimeout = cConf.getInt(Constants.Startup.YARN_CONNECT_TIMEOUT_SECONDS, 60);
    LOG.info("Checking YARN availability -- may take up to {} seconds.", yarnConnectTimeout);
    final YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(hConf);
    List<NodeReport> nodeReports;
    // if yarn is not up, yarnClient.start() will hang.
    ExecutorService executorService = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setNameFormat("startup-checker").build());
    try {
        Future<List<NodeReport>> result = executorService.submit(new Callable<List<NodeReport>>() {

            @Override
            public List<NodeReport> call() throws Exception {
                yarnClient.start();
                return yarnClient.getNodeReports();
            }
        });
        nodeReports = result.get(yarnConnectTimeout, TimeUnit.SECONDS);
        LOG.info("  YARN availability successfully verified.");
    } catch (Exception e) {
        throw new RuntimeException("Unable to get status of YARN nodemanagers. " + "Please check that YARN is running " + "and that the correct Hadoop configuration (core-site.xml, yarn-site.xml) and libraries " + "are included in the CDAP master classpath.", e);
    } finally {
        try {
            yarnClient.stop();
        } catch (Exception e) {
            LOG.warn("Error stopping yarn client.", e);
        } finally {
            executorService.shutdown();
        }
    }
    checkResources(nodeReports);
}
Also used : ExecutorService(java.util.concurrent.ExecutorService) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) List(java.util.List) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport)
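
After the node reports are retrieved within the timeout, they are handed to checkResources, which is not shown above. The sketch below illustrates the kind of aggregation such a check might perform over the node reports; it is not CDAP's actual implementation and the class name is illustrative.

import java.util.List;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Resource;

public class ClusterCapacitySummary {

    /** Sum total and used memory and vcores across the reported nodes. */
    public static void summarize(List<NodeReport> nodeReports) {
        long totalMemMb = 0;
        long usedMemMb = 0;
        int totalVcores = 0;
        for (NodeReport report : nodeReports) {
            Resource capability = report.getCapability();
            totalMemMb += capability.getMemory();
            totalVcores += capability.getVirtualCores();
            Resource used = report.getUsed();
            if (used != null) {
                usedMemMb += used.getMemory();
            }
        }
        System.out.printf("nodes=%d, memory used/total=%d/%d MB, vcores total=%d%n",
            nodeReports.size(), usedMemMb, totalMemMb, totalVcores);
    }
}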

Example 40 with YarnClient

use of org.apache.hadoop.yarn.client.api.YarnClient in project cdap by caskdata.

the class YarnApps method collect.

@Override
public synchronized void collect() throws Exception {
    reset();
    YarnClient yarnClient = createYARNClient();
    List<ApplicationReport> applications;
    try {
        applications = yarnClient.getApplications();
    } finally {
        yarnClient.stop();
    }
    for (ApplicationReport application : applications) {
        switch(application.getYarnApplicationState()) {
            case NEW:
            case NEW_SAVING:
                newApps++;
                break;
            case ACCEPTED:
                accepted++;
                break;
            case SUBMITTED:
                submitted++;
                break;
            case RUNNING:
                running++;
                break;
            case FINISHED:
                finished++;
                break;
            case FAILED:
                failed++;
                break;
            case KILLED:
                killed++;
                break;
        }
    }
    total = applications.size();
}
Also used : ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient)
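
The switch above maintains one counter field per application state. An equivalent approach, sketched below, counts into an EnumMap keyed by YarnApplicationState, which avoids adding a field and a case for every state; the class name and client setup are illustrative, not CDAP code.

import java.io.IOException;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.exceptions.YarnException;

public class YarnAppStateCounts {

    /** Count applications per YarnApplicationState. */
    public static Map<YarnApplicationState, Integer> countByState(Configuration conf)
        throws IOException, YarnException {
        YarnClient yarnClient = YarnClient.createYarnClient();
        try {
            yarnClient.init(conf);
            yarnClient.start();
            List<ApplicationReport> applications = yarnClient.getApplications();
            Map<YarnApplicationState, Integer> counts = new EnumMap<>(YarnApplicationState.class);
            for (ApplicationReport application : applications) {
                YarnApplicationState state = application.getYarnApplicationState();
                Integer current = counts.get(state);
                counts.put(state, current == null ? 1 : current + 1);
            }
            return counts;
        } finally {
            yarnClient.stop();
        }
    }
}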

Aggregations

YarnClient (org.apache.hadoop.yarn.client.api.YarnClient): 89
Test (org.junit.Test): 51
YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 50
Configuration (org.apache.hadoop.conf.Configuration): 45
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 37
ApplicationReport (org.apache.hadoop.yarn.api.records.ApplicationReport): 21
CapacitySchedulerConfiguration (org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration): 18
IOException (java.io.IOException): 17
ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 15
MiniYARNCluster (org.apache.hadoop.yarn.server.MiniYARNCluster): 15
YarnException (org.apache.hadoop.yarn.exceptions.YarnException): 14
Path (org.apache.hadoop.fs.Path): 13
ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 13
FileSystem (org.apache.hadoop.fs.FileSystem): 11
Matchers.anyString (org.mockito.Matchers.anyString): 11
UserGroupInformation (org.apache.hadoop.security.UserGroupInformation): 9
NodeId (org.apache.hadoop.yarn.api.records.NodeId): 9
ArrayList (java.util.ArrayList): 8
LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem): 8
ReservationSubmissionRequest (org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest): 7