
Example 6 with ContainerStartRequest

Use of com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest in project apex-core by apache.

From class HostLocalTest, method testUnavailableResources.

@Test
public void testUnavailableResources() {
    LogicalPlan dag = new LogicalPlan();
    dag.getAttributes().put(com.datatorrent.api.Context.DAGContext.APPLICATION_PATH, new File("target", HostLocalTest.class.getName()).getAbsolutePath());
    dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
    dag.getMeta(o1).getAttributes().put(OperatorContext.LOCALITY_HOST, "host2");
    GenericTestOperator partitioned = dag.addOperator("partitioned", GenericTestOperator.class);
    dag.addStream("o1_outport1", o1.outport1, partitioned.inport1).setLocality(Locality.CONTAINER_LOCAL);
    dag.setOperatorAttribute(o1, OperatorContext.MEMORY_MB, 256);
    dag.setOperatorAttribute(o1, OperatorContext.VCORES, 2);
    dag.setOperatorAttribute(partitioned, OperatorContext.VCORES, 1);
    StreamingContainerManager scm = new StreamingContainerManager(dag);
    ResourceRequestHandler rr = new ResourceRequestHandler();
    int containerMem = 1000;
    Map<String, NodeReport> nodeReports = Maps.newHashMap();
    NodeReport nr = BuilderUtils.newNodeReport(BuilderUtils.newNodeId("host1", 0), NodeState.RUNNING, "httpAddress", "rackName", BuilderUtils.newResource(0, 0), BuilderUtils.newResource(containerMem * 2, 2), 0, null, 0);
    nodeReports.put(nr.getNodeId().getHost(), nr);
    nr = BuilderUtils.newNodeReport(BuilderUtils.newNodeId("host2", 0), NodeState.RUNNING, "httpAddress", "rackName", BuilderUtils.newResource(0, 0), BuilderUtils.newResource(containerMem * 2, 2), 0, null, 0);
    nodeReports.put(nr.getNodeId().getHost(), nr);
    // set resources
    rr.updateNodeReports(Lists.newArrayList(nodeReports.values()));
    Assert.assertEquals("number of containers is 1", 1, scm.containerStartRequests.size());
    for (ContainerStartRequest csr : scm.containerStartRequests) {
        String host = rr.getHost(csr, true);
        Assert.assertEquals("number of vcores", 3, csr.container.getRequiredVCores());
        Assert.assertNull("Host is null", host);
    }
}
Also used: ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) LogicalPlan(com.datatorrent.stram.plan.logical.LogicalPlan) File(java.io.File) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport) Test(org.junit.Test)
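
The point of testUnavailableResources is that o1 and partitioned share a CONTAINER_LOCAL container needing 3 vcores while each node only offers 2, so rr.getHost(csr, true) returns null. The helper below is a minimal sketch, not taken from apex-core, of how a caller might treat that null as the signal to fall back to an unconstrained placement. It assumes ResourceRequestHandler and the public csr.container.host field are reachable from the com.datatorrent.stram package, as the same-package tests above suggest.

package com.datatorrent.stram;

import com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest;

// Hypothetical helper, placed in the tests' package so csr.container.host stays accessible.
public class LocalityFallbackSketch
{
  /**
   * Ask the handler for a host that satisfies the request's locality and resource
   * constraints. Returns null when no node currently has enough free capacity, in
   * which case the container host is left unset so any node may be used.
   */
  public static String resolveHost(ResourceRequestHandler rr, ContainerStartRequest csr)
  {
    String host = rr.getHost(csr, true);
    if (host != null) {
      // record the chosen host on the physical container, as testNodeLocal does
      csr.container.host = host;
    }
    return host;
  }
}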

Example 7 with ContainerStartRequest

Use of com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest in project apex-core by apache.

From class LocalityTest, method testNodeLocal.

@Test
public void testNodeLocal() {
    LogicalPlan dag = new LogicalPlan();
    dag.getAttributes().put(com.datatorrent.api.Context.DAGContext.APPLICATION_PATH, new File("target", LocalityTest.class.getName()).getAbsolutePath());
    dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
    GenericTestOperator partitioned = dag.addOperator("partitioned", GenericTestOperator.class);
    dag.getMeta(partitioned).getAttributes().put(OperatorContext.PARTITIONER, new StatelessPartitioner<GenericTestOperator>(2));
    GenericTestOperator partitionedParallel = dag.addOperator("partitionedParallel", GenericTestOperator.class);
    dag.addStream("o1_outport1", o1.outport1, partitioned.inport1).setLocality(null);
    dag.addStream("partitioned_outport1", partitioned.outport1, partitionedParallel.inport2).setLocality(Locality.NODE_LOCAL);
    dag.setInputPortAttribute(partitionedParallel.inport2, PortContext.PARTITION_PARALLEL, true);
    GenericTestOperator single = dag.addOperator("single", GenericTestOperator.class);
    dag.addStream("partitionedParallel_outport1", partitionedParallel.outport1, single.inport1);
    int maxContainers = 7;
    dag.setAttribute(LogicalPlan.CONTAINERS_MAX_COUNT, maxContainers);
    StreamingContainerManager scm = new StreamingContainerManager(dag);
    Assert.assertEquals("number required containers", 6, scm.containerStartRequests.size());
    ResourceRequestHandler rr = new ResourceRequestHandler();
    int containerMem = 2000;
    Map<String, NodeReport> nodeReports = Maps.newHashMap();
    NodeReport nr = BuilderUtils.newNodeReport(BuilderUtils.newNodeId("host1", 0), NodeState.RUNNING, "httpAddress", "rackName", BuilderUtils.newResource(0, 0), BuilderUtils.newResource(containerMem * 2, 2), 0, null, 0);
    nodeReports.put(nr.getNodeId().getHost(), nr);
    nr = BuilderUtils.newNodeReport(BuilderUtils.newNodeId("host2", 0), NodeState.RUNNING, "httpAddress", "rackName", BuilderUtils.newResource(0, 0), BuilderUtils.newResource(containerMem * 2, 2), 0, null, 0);
    nodeReports.put(nr.getNodeId().getHost(), nr);
    // set resources
    rr.updateNodeReports(Lists.newArrayList(nodeReports.values()));
    Map<PTContainer, String> requestedHosts = Maps.newHashMap();
    for (ContainerStartRequest csr : scm.containerStartRequests) {
        String host = rr.getHost(csr, true);
        csr.container.host = host;
        // update the node report
        if (host != null) {
            requestedHosts.put(csr.container, host);
            nr = nodeReports.get(host);
            nr.getUsed().setMemory(nr.getUsed().getMemory() + containerMem);
        }
    }
    Assert.assertEquals("" + requestedHosts, nodeReports.keySet(), Sets.newHashSet(requestedHosts.values()));
    for (Map.Entry<PTContainer, String> e : requestedHosts.entrySet()) {
        for (PTOperator oper : e.getKey().getOperators()) {
            if (oper.getNodeLocalOperators().getOperatorSet().size() > 1) {
                String expHost = null;
                for (PTOperator nodeLocalOper : oper.getNodeLocalOperators().getOperatorSet()) {
                    Assert.assertNotNull("host null " + nodeLocalOper.getContainer(), nodeLocalOper.getContainer().host);
                    if (expHost == null) {
                        expHost = nodeLocalOper.getContainer().host;
                    } else {
                        Assert.assertEquals("expected same host " + nodeLocalOper, expHost, nodeLocalOper.getContainer().host);
                    }
                }
            }
        }
    }
}
Also used: ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) LogicalPlan(com.datatorrent.stram.plan.logical.LogicalPlan) File(java.io.File) Map(java.util.Map) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport) Test(org.junit.Test)
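
After resolving a host for each ContainerStartRequest, the loop above charges the chosen node's cached NodeReport with the container memory so that later getHost() calls see the reduced free capacity. The sketch below is not apex-core code; it isolates that bookkeeping step and uses only the Hadoop record accessors already visible in the test (getUsed(), getMemory(), setMemory()).

import java.util.Map;

import org.apache.hadoop.yarn.api.records.NodeReport;

// Hypothetical helper restating the node-report update performed in testNodeLocal.
public class NodeReportBookkeepingSketch
{
  /** Charge the given host for one container of containerMem MB. */
  public static void reserve(Map<String, NodeReport> nodeReports, String host, int containerMem)
  {
    NodeReport nr = nodeReports.get(host);
    if (nr != null) {
      // getUsed() exposes a mutable Resource, so the used memory can be bumped in place
      nr.getUsed().setMemory(nr.getUsed().getMemory() + containerMem);
    }
  }
}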

Example 8 with ContainerStartRequest

Use of com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest in project apex-core by apache.

From class StreamingAppMasterService, method execute.

/**
 * Main run function for the application master
 *
 * @throws YarnException
 */
@SuppressWarnings("SleepWhileInLoop")
private void execute() throws YarnException, IOException {
    LOG.info("Starting ApplicationMaster");
    final Configuration conf = getConfig();
    if (UserGroupInformation.isSecurityEnabled()) {
        tokenRenewer = new TokenRenewer(dag, true, conf, appAttemptID.getApplicationId().toString());
    }
    // Register self with ResourceManager
    RegisterApplicationMasterResponse response = amRmClient.registerApplicationMaster(appMasterHostname, 0, appMasterTrackingUrl);
    // Dump out information about cluster capability as seen by the resource manager
    int maxMem = response.getMaximumResourceCapability().getMemory();
    int maxVcores = response.getMaximumResourceCapability().getVirtualCores();
    int minMem = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0);
    int minVcores = conf.getInt("yarn.scheduler.minimum-allocation-vcores", 0);
    LOG.info("Max mem {}m, Min mem {}m, Max vcores {} and Min vcores {} capabililty of resources in this cluster ", maxMem, minMem, maxVcores, minVcores);
    long blacklistRemovalTime = dag.getValue(DAGContext.BLACKLISTED_NODE_REMOVAL_TIME_MILLIS);
    int maxConsecutiveContainerFailures = dag.getValue(DAGContext.MAX_CONSECUTIVE_CONTAINER_FAILURES_FOR_BLACKLIST);
    LOG.info("Blacklist removal time in millis = {}, max consecutive node failure count = {}", blacklistRemovalTime, maxConsecutiveContainerFailures);
    // for locality relaxation fall back
    Map<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> requestedResources = Maps.newHashMap();
    // Setup heartbeat emitter
    // TODO poll RM every now and then with an empty request to let RM know that we are alive
    // The heartbeat interval after which an AM is timed out by the RM is defined by a config setting:
    // RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
    // The allocate calls to the RM count as heartbeat so, for now, this additional heartbeat emitter
    // is not required.
    int loopCounter = -1;
    long nodeReportUpdateTime = 0;
    // keep track of already requested containers to not request them again while waiting for allocation
    int numRequestedContainers = 0;
    int numReleasedContainers = 0;
    int nextRequestPriority = 0;
    // Use override for resource requestor in case of cloudera distribution, to handle host specific requests
    ResourceRequestHandler resourceRequestor = System.getenv().containsKey("CDH_HADOOP_BIN") ? new BlacklistBasedResourceRequestHandler() : new ResourceRequestHandler();
    List<ContainerStartRequest> pendingContainerStartRequests = new LinkedList<>();
    try (YarnClient clientRMService = StramClientUtils.createYarnClient(conf)) {
        try {
            // YARN-435
            // we need getClusterNodes to populate the initial node list,
            // subsequent updates come through the heartbeat response
            ApplicationReport ar = StramClientUtils.getStartedAppInstanceByName(clientRMService, dag.getAttributes().get(DAG.APPLICATION_NAME), UserGroupInformation.getLoginUser().getUserName(), dag.getAttributes().get(DAG.APPLICATION_ID));
            if (ar != null) {
                appDone = true;
                dnmgr.shutdownDiagnosticsMessage = String.format("Application master failed due to application %s with duplicate application name \"%s\" by the same user \"%s\" is already started.", ar.getApplicationId().toString(), ar.getName(), ar.getUser());
                LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
                finishApplication(FinalApplicationStatus.FAILED);
                return;
            }
            resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
            nodeReportUpdateTime = System.currentTimeMillis() + UPDATE_NODE_REPORTS_INTERVAL;
        } catch (Exception e) {
            throw new RuntimeException("Failed to retrieve cluster nodes report.", e);
        }
        List<Container> containers = response.getContainersFromPreviousAttempts();
        // Running containers might take a while to register with the new app master and send the heartbeat signal.
        int waitForRecovery = containers.size() > 0 ? dag.getValue(LogicalPlan.HEARTBEAT_TIMEOUT_MILLIS) / 1000 : 0;
        List<ContainerId> releasedContainers = previouslyAllocatedContainers(containers);
        FinalApplicationStatus finalStatus = FinalApplicationStatus.SUCCEEDED;
        final InetSocketAddress rmAddress = conf.getSocketAddr(YarnConfiguration.RM_ADDRESS, YarnConfiguration.DEFAULT_RM_ADDRESS, YarnConfiguration.DEFAULT_RM_PORT);
        while (!appDone) {
            loopCounter++;
            final long currentTimeMillis = System.currentTimeMillis();
            if (tokenRenewer != null) {
                tokenRenewer.checkAndRenew();
            }
            if (currentTimeMillis > nodeReportUpdateTime) {
                resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
                nodeReportUpdateTime = currentTimeMillis + UPDATE_NODE_REPORTS_INTERVAL;
            }
            Runnable r;
            while ((r = this.pendingTasks.poll()) != null) {
                r.run();
            }
            // need not have any available containers
            try {
                sleep(1000);
            } catch (InterruptedException e) {
                LOG.info("Sleep interrupted", e);
            }
            // Setup request to be sent to RM to allocate containers
            List<ContainerRequest> containerRequests = new ArrayList<>();
            List<ContainerRequest> removedContainerRequests = new ArrayList<>();
            // request containers for pending deploy requests
            if (!dnmgr.containerStartRequests.isEmpty()) {
                StreamingContainerAgent.ContainerStartRequest csr;
                while ((csr = dnmgr.containerStartRequests.poll()) != null) {
                    if (csr.container.getRequiredMemoryMB() > maxMem) {
                        LOG.warn("Container memory {}m above max threshold of cluster. Using max value {}m.", csr.container.getRequiredMemoryMB(), maxMem);
                        csr.container.setRequiredMemoryMB(maxMem);
                    }
                    if (csr.container.getRequiredMemoryMB() < minMem) {
                        csr.container.setRequiredMemoryMB(minMem);
                    }
                    if (csr.container.getRequiredVCores() > maxVcores) {
                        LOG.warn("Container vcores {} above max threshold of cluster. Using max value {}.", csr.container.getRequiredVCores(), maxVcores);
                        csr.container.setRequiredVCores(maxVcores);
                    }
                    if (csr.container.getRequiredVCores() < minVcores) {
                        csr.container.setRequiredVCores(minVcores);
                    }
                    csr.container.setResourceRequestPriority(nextRequestPriority++);
                    ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
                    if (cr == null) {
                        pendingContainerStartRequests.add(csr);
                    } else {
                        resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
                    }
                }
            }
            // If all other requests are allocated, retry pending requests which need host availability
            if (containerRequests.isEmpty() && !pendingContainerStartRequests.isEmpty()) {
                List<ContainerStartRequest> removalList = new LinkedList<>();
                for (ContainerStartRequest csr : pendingContainerStartRequests) {
                    ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
                    if (cr != null) {
                        resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
                        removalList.add(csr);
                    }
                }
                pendingContainerStartRequests.removeAll(removalList);
            }
            resourceRequestor.reissueContainerRequests(amRmClient, requestedResources, loopCounter, resourceRequestor, containerRequests, removedContainerRequests);
            /* Remove nodes from blacklist after timeout */
            List<String> blacklistRemovals = new ArrayList<>();
            for (String hostname : failedBlackListedNodes) {
                Long timeDiff = currentTimeMillis - failedContainerNodesMap.get(hostname).blackListAdditionTime;
                if (timeDiff >= blacklistRemovalTime) {
                    blacklistRemovals.add(hostname);
                    failedContainerNodesMap.remove(hostname);
                }
            }
            if (!blacklistRemovals.isEmpty()) {
                amRmClient.updateBlacklist(null, blacklistRemovals);
                LOG.info("Removing nodes {} from blacklist: time elapsed since last blacklisting due to failure is greater than specified timeout", blacklistRemovals.toString());
                failedBlackListedNodes.removeAll(blacklistRemovals);
            }
            numRequestedContainers += containerRequests.size() - removedContainerRequests.size();
            AllocateResponse amResp = sendContainerAskToRM(containerRequests, removedContainerRequests, releasedContainers);
            if (amResp.getAMCommand() != null) {
                LOG.info(" statement executed:{}", amResp.getAMCommand());
                switch(amResp.getAMCommand()) {
                    case AM_RESYNC:
                    case AM_SHUTDOWN:
                        throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
                    default:
                        throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
                }
            }
            releasedContainers.clear();
            // Retrieve list of allocated containers from the response
            List<Container> newAllocatedContainers = amResp.getAllocatedContainers();
            // LOG.info("Got response from RM for container ask, allocatedCnt=" + newAllocatedContainers.size());
            numRequestedContainers -= newAllocatedContainers.size();
            long timestamp = System.currentTimeMillis();
            for (Container allocatedContainer : newAllocatedContainers) {
                LOG.info("Got new container." + ", containerId=" + allocatedContainer.getId() + ", containerNode=" + allocatedContainer.getNodeId() + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() + ", containerResourceMemory" + allocatedContainer.getResource().getMemory() + ", priority" + allocatedContainer.getPriority());
                // + ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
                boolean alreadyAllocated = true;
                StreamingContainerAgent.ContainerStartRequest csr = null;
                for (Map.Entry<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> entry : requestedResources.entrySet()) {
                    if (entry.getKey().container.getResourceRequestPriority() == allocatedContainer.getPriority().getPriority()) {
                        alreadyAllocated = false;
                        csr = entry.getKey();
                        break;
                    }
                }
                if (alreadyAllocated) {
                    LOG.info("Releasing {} as resource with priority {} was already assigned", allocatedContainer.getId(), allocatedContainer.getPriority());
                    releasedContainers.add(allocatedContainer.getId());
                    numReleasedContainers++;
                    // undo the decrement above for this allocated container
                    numRequestedContainers++;
                    continue;
                }
                if (csr != null) {
                    requestedResources.remove(csr);
                }
                // allocate resource to container
                ContainerResource resource = new ContainerResource(allocatedContainer.getPriority().getPriority(), allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), allocatedContainer.getResource().getMemory(), allocatedContainer.getResource().getVirtualCores(), allocatedContainer.getNodeHttpAddress());
                StreamingContainerAgent sca = dnmgr.assignContainer(resource, null);
                if (sca == null) {
                    // allocated container no longer needed, add release request
                    LOG.warn("Container {} allocated but nothing to deploy, going to release this container.", allocatedContainer.getId());
                    releasedContainers.add(allocatedContainer.getId());
                } else {
                    AllocatedContainer allocatedContainerHolder = new AllocatedContainer(allocatedContainer);
                    this.allocatedContainers.put(allocatedContainer.getId().toString(), allocatedContainerHolder);
                    ByteBuffer tokens = null;
                    if (UserGroupInformation.isSecurityEnabled()) {
                        UserGroupInformation ugi = UserGroupInformation.getLoginUser();
                        Token<StramDelegationTokenIdentifier> delegationToken = allocateDelegationToken(ugi.getUserName(), heartbeatListener.getAddress());
                        allocatedContainerHolder.delegationToken = delegationToken;
                        // ByteBuffer tokens = LaunchContainerRunnable.getTokens(delegationTokenManager, heartbeatListener.getAddress());
                        tokens = LaunchContainerRunnable.getTokens(ugi, delegationToken);
                    }
                    LaunchContainerRunnable launchContainer = new LaunchContainerRunnable(allocatedContainer, nmClient, sca, tokens);
                    // Thread launchThread = new Thread(runnableLaunchContainer);
                    // launchThreads.add(launchThread);
                    // launchThread.start();
                    // communication with NMs is now async
                    launchContainer.run();
                    // record container start event
                    StramEvent ev = new StramEvent.StartContainerEvent(allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), groupingManager.getEventGroupIdForAffectedContainer(allocatedContainer.getId().toString()));
                    ev.setTimestamp(timestamp);
                    dnmgr.recordEventAsync(ev);
                }
            }
            // track node updates for future locality constraint allocations
            // TODO: it seems 2.0.4-alpha doesn't give us any updates
            resourceRequestor.updateNodeReports(amResp.getUpdatedNodes());
            // Check the completed containers
            List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses();
            // LOG.debug("Got response from RM for container ask, completedCnt=" + completedContainers.size());
            List<String> blacklistAdditions = new ArrayList<>();
            for (ContainerStatus containerStatus : completedContainers) {
                LOG.info("Completed containerId=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
                // non complete containers should not be here
                assert (containerStatus.getState() == ContainerState.COMPLETE);
                AllocatedContainer allocatedContainer = allocatedContainers.remove(containerStatus.getContainerId().toString());
                if (allocatedContainer != null && allocatedContainer.delegationToken != null) {
                    UserGroupInformation ugi = UserGroupInformation.getLoginUser();
                    delegationTokenManager.cancelToken(allocatedContainer.delegationToken, ugi.getUserName());
                }
                EventGroupId groupId = null;
                int exitStatus = containerStatus.getExitStatus();
                if (0 != exitStatus) {
                    if (allocatedContainer != null) {
                        numFailedContainers.incrementAndGet();
                        if (exitStatus != 1 && maxConsecutiveContainerFailures != Integer.MAX_VALUE) {
                            // If container failure due to framework
                            String hostname = allocatedContainer.container.getNodeId().getHost();
                            if (!failedBlackListedNodes.contains(hostname)) {
                                // Blacklist the node if not already blacklisted
                                if (failedContainerNodesMap.containsKey(hostname)) {
                                    NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                                    long timeStamp = System.currentTimeMillis();
                                    if (timeStamp - stats.lastFailureTimeStamp >= blacklistRemovalTime) {
                                        // Reset failure count if last failure was before Blacklist removal time
                                        stats.failureCount = 1;
                                        stats.lastFailureTimeStamp = timeStamp;
                                    } else {
                                        stats.lastFailureTimeStamp = timeStamp;
                                        stats.failureCount++;
                                        if (stats.failureCount >= maxConsecutiveContainerFailures) {
                                            LOG.info("Node {} failed {} times consecutively within {} minutes, marking the node blacklisted", hostname, stats.failureCount, blacklistRemovalTime / (60 * 1000));
                                            blacklistAdditions.add(hostname);
                                            failedBlackListedNodes.add(hostname);
                                        }
                                    }
                                } else {
                                    failedContainerNodesMap.put(hostname, new NodeFailureStats(System.currentTimeMillis(), 1));
                                }
                            }
                        }
                    }
                    // if (exitStatus == 1) {
                    // // non-recoverable StreamingContainer failure
                    // appDone = true;
                    // finalStatus = FinalApplicationStatus.FAILED;
                    // dnmgr.shutdownDiagnosticsMessage = "Unrecoverable failure " + containerStatus.getContainerId();
                    // LOG.info("Exiting due to: {}", dnmgr.shutdownDiagnosticsMessage);
                    // }
                    // else {
                    // Recoverable failure or process killed (externally or via stop request by AM)
                    // also occurs when a container was released by the application but never assigned/launched
                    LOG.debug("Container {} failed or killed.", containerStatus.getContainerId());
                    String containerIdStr = containerStatus.getContainerId().toString();
                    dnmgr.scheduleContainerRestart(containerIdStr);
                    groupId = groupingManager.getEventGroupIdForAffectedContainer(containerIdStr);
                // }
                } else {
                    // container completed successfully
                    numCompletedContainers.incrementAndGet();
                    LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
                    // Reset counter for node failure, if exists
                    String hostname = allocatedContainer.container.getNodeId().getHost();
                    NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                    if (stats != null) {
                        stats.failureCount = 0;
                    }
                }
                String containerIdStr = containerStatus.getContainerId().toString();
                dnmgr.removeContainerAgent(containerIdStr);
                // record container stop event
                StramEvent ev = new StramEvent.StopContainerEvent(containerIdStr, containerStatus.getExitStatus(), groupId);
                ev.setReason(containerStatus.getDiagnostics());
                dnmgr.recordEventAsync(ev);
            }
            if (!blacklistAdditions.isEmpty()) {
                amRmClient.updateBlacklist(blacklistAdditions, null);
                long timeStamp = System.currentTimeMillis();
                for (String hostname : blacklistAdditions) {
                    NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                    stats.blackListAdditionTime = timeStamp;
                }
            }
            if (dnmgr.forcedShutdown) {
                LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
                finalStatus = FinalApplicationStatus.FAILED;
                appDone = true;
            } else if (allocatedContainers.isEmpty() && numRequestedContainers == 0 && dnmgr.containerStartRequests.isEmpty()) {
                LOG.debug("Exiting as no more containers are allocated or requested");
                finalStatus = FinalApplicationStatus.SUCCEEDED;
                appDone = true;
            }
            LOG.debug("Current application state: loop={}, appDone={}, requested={}, released={}, completed={}, failed={}, currentAllocated={}, dnmgr.containerStartRequests={}", loopCounter, appDone, numRequestedContainers, numReleasedContainers, numCompletedContainers, numFailedContainers, allocatedContainers.size(), dnmgr.containerStartRequests);
            // monitor child containers
            dnmgr.monitorHeartbeat(waitForRecovery > 0);
            waitForRecovery = Math.max(waitForRecovery - 1, 0);
        }
        finishApplication(finalStatus);
    }
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) FinalApplicationStatus(org.apache.hadoop.yarn.api.records.FinalApplicationStatus) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) Container(org.apache.hadoop.yarn.api.records.Container) StreamingContainer(com.datatorrent.stram.engine.StreamingContainer) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) LinkedList(java.util.LinkedList) ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ContainerResource(com.datatorrent.stram.StreamingContainerManager.ContainerResource) RegisterApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse) Map(java.util.Map) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) StramDelegationTokenIdentifier(com.datatorrent.stram.security.StramDelegationTokenIdentifier) StramEvent(com.datatorrent.stram.api.StramEvent) EventGroupId(org.apache.apex.engine.events.grouping.GroupingRequest.EventGroupId) MutablePair(org.apache.commons.lang3.tuple.MutablePair) TokenRenewer(org.apache.apex.engine.security.TokenRenewer) ByteBuffer(java.nio.ByteBuffer) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) IOException(java.io.IOException)
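
One step of execute() worth isolating is the clamping of each ContainerStartRequest to the cluster's allocation limits before a ContainerRequest is built, so the ResourceManager never rejects the ask outright. The helper below is a sketch rather than apex-core code; it assumes the same package so csr.container stays accessible, and it uses only the getters and setters that appear in the loop above.

package com.datatorrent.stram;

import com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest;

// Hypothetical helper mirroring the bounds checks in StreamingAppMasterService.execute().
public class ContainerAskClampSketch
{
  /** Bound the requested memory and vcores by the scheduler minimum and the cluster maximum. */
  public static void clamp(ContainerStartRequest csr, int minMem, int maxMem, int minVcores, int maxVcores)
  {
    if (csr.container.getRequiredMemoryMB() > maxMem) {
      // the cluster cannot hand out more than maxMem per container
      csr.container.setRequiredMemoryMB(maxMem);
    }
    if (csr.container.getRequiredMemoryMB() < minMem) {
      // the scheduler rounds small asks up to its minimum allocation anyway
      csr.container.setRequiredMemoryMB(minMem);
    }
    if (csr.container.getRequiredVCores() > maxVcores) {
      csr.container.setRequiredVCores(maxVcores);
    }
    if (csr.container.getRequiredVCores() < minVcores) {
      csr.container.setRequiredVCores(minVcores);
    }
  }
}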

Example 9 with ContainerStartRequest

Use of com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest in project apex-core by apache.

From class StramLocalCluster, method run.

@Override
@SuppressWarnings({ "SleepWhileInLoop", "ResultOfObjectAllocationIgnored" })
public void run(long runMillis) {
    Thread eventLoopThread = null;
    List<Thread> containerThreads = new LinkedList<>();
    try {
        if (!perContainerBufferServer) {
            eventLoopThread = StreamingContainer.eventloop.start();
            bufferServer = new Server(StreamingContainer.eventloop, 0, 1024 * 1024, 8);
            try {
                bufferServer.setSpoolStorage(new DiskStorage());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            bufferServerAddress = InetSocketAddress.createUnresolved(LOCALHOST, bufferServer.run().getPort());
            LOG.info("Buffer server started: {}", bufferServerAddress);
        }
        long endMillis = System.currentTimeMillis() + runMillis;
        while (!appDone) {
            for (String containerIdStr : dnmgr.containerStopRequests.values()) {
                // teardown child thread
                StreamingContainer c = childContainers.get(containerIdStr);
                if (c != null) {
                    ContainerHeartbeatResponse r = new ContainerHeartbeatResponse();
                    r.shutdown = StreamingContainerUmbilicalProtocol.ShutdownType.ABORT;
                    c.processHeartbeatResponse(r);
                }
                dnmgr.containerStopRequests.remove(containerIdStr);
                LOG.info("Container {} restart.", containerIdStr);
                dnmgr.scheduleContainerRestart(containerIdStr);
            // dnmgr.removeContainerAgent(containerIdStr);
            }
            // start containers
            while (!dnmgr.containerStartRequests.isEmpty()) {
                ContainerStartRequest cdr = dnmgr.containerStartRequests.poll();
                if (cdr != null) {
                    new LocalStreamingContainerLauncher(cdr, containerThreads);
                }
            }
            if (heartbeatMonitoringEnabled) {
                // monitor child containers
                dnmgr.monitorHeartbeat(false);
            }
            if (childContainers.isEmpty() && dnmgr.containerStartRequests.isEmpty()) {
                appDone = true;
            }
            if (runMillis > 0 && System.currentTimeMillis() > endMillis) {
                appDone = true;
            }
            try {
                if (exitCondition != null && exitCondition.call()) {
                    LOG.info("Stopping on exit condition");
                    appDone = true;
                }
            } catch (Exception ex) {
                break;
            }
            if (Thread.interrupted()) {
                break;
            }
            if (!appDone) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    LOG.debug("Sleep interrupted", e);
                    break;
                }
            }
        }
    } finally {
        for (LocalStreamingContainer lsc : childContainers.values()) {
            injectShutdown.put(lsc.getContainerId(), lsc);
            lsc.triggerHeartbeat();
        }
        for (Thread thread : containerThreads) {
            try {
                thread.join(1000);
            } catch (InterruptedException e) {
                LOG.debug("Wait for {} to terminate interrupted", thread, e);
            }
            if (thread.isAlive()) {
                LOG.warn("Container thread {} is still alive", thread.getName());
            }
        }
        try {
            dnmgr.teardown();
        } catch (RuntimeException e) {
            LOG.warn("Exception during StreamingContainerManager teardown", e);
        }
        if (bufferServerAddress != null) {
            try {
                bufferServer.stop();
            } catch (RuntimeException e) {
                LOG.warn("Exception during BufferServer stop", e);
            }
        }
        if (eventLoopThread != null) {
            try {
                StreamingContainer.eventloop.stop();
                eventLoopThread.join(1000);
            } catch (InterruptedException ie) {
                LOG.debug("Wait for {} to terminate interrupted", eventLoopThread.getName(), ie);
            } catch (RuntimeException e) {
                LOG.warn("Exception during {} stop", StreamingContainer.eventloop, e);
            }
            if (StreamingContainer.eventloop.isActive()) {
                LOG.warn("Event loop {} is still active", StreamingContainer.eventloop);
            }
        }
    }
    LOG.info("Application finished.");
}
Also used: StreamingContainer(com.datatorrent.stram.engine.StreamingContainer) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) Server(com.datatorrent.bufferserver.server.Server) ContainerHeartbeatResponse(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse) IOException(java.io.IOException) LinkedList(java.util.LinkedList) DiskStorage(com.datatorrent.bufferserver.storage.DiskStorage)
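
The start-container section of run() is a plain drain loop: poll the containerStartRequests queue until it returns null and hand every request to a launcher. Below is a minimal, generalized sketch of that pattern, assuming the request collection can be treated as a java.util.Queue (poll() and isEmpty() are the only operations the loop uses); the Consumer-based launcher is an illustration stand-in for the LocalStreamingContainerLauncher used above.

import java.util.Queue;
import java.util.function.Consumer;

import com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest;

// Hypothetical drain helper; the real loop lives inside StramLocalCluster.run().
public class StartRequestDrainSketch
{
  public static void drain(Queue<ContainerStartRequest> startRequests,
      Consumer<ContainerStartRequest> launcher)
  {
    ContainerStartRequest cdr;
    // poll() returns null once the queue is empty, which ends the loop
    while ((cdr = startRequests.poll()) != null) {
      launcher.accept(cdr);
    }
  }
}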

Example 10 with ContainerStartRequest

Use of com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest in project apex-core by apache.

From class StreamingContainerManagerTest, method testShutdownOperatorRecovery.

@Test
public void testShutdownOperatorRecovery() throws Exception {
    GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
    GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
    dag.addStream("s1", o1.outport1, o2.inport1);
    dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    StreamingContainerManager scm = new StreamingContainerManager(dag);
    scm.containerStartRequests.poll();
    scm.containerStartRequests.poll();
    PhysicalPlan plan = scm.getPhysicalPlan();
    PTOperator p1 = plan.getOperators(dag.getMeta(o1)).get(0);
    PTOperator p2 = plan.getOperators(dag.getMeta(o2)).get(0);
    shutdownOperator(scm, p1, p2);
    scm.scheduleContainerRestart(p1.getContainer().getExternalId());
    ContainerStartRequest dr = scm.containerStartRequests.poll();
    Assert.assertTrue(dr.container.getOperators().contains(p1));
}
Also used: PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) Test(org.junit.Test) PhysicalPlanTest(com.datatorrent.stram.plan.physical.PhysicalPlanTest)
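
The assertion at the end of the test is a reusable pattern: schedule a restart for the container that hosted an operator and verify that the resulting ContainerStartRequest redeploys that operator. The sketch below is not part of the apex-core test suite; it assumes the same package as the tests so that scm.containerStartRequests and dr.container remain accessible.

package com.datatorrent.stram;

import org.junit.Assert;

import com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest;
import com.datatorrent.stram.plan.physical.PTOperator;

// Hypothetical JUnit helper restating the final assertions of testShutdownOperatorRecovery.
public class RestartAssertionSketch
{
  public static void assertRestartRedeploys(StreamingContainerManager scm, PTOperator oper)
  {
    scm.scheduleContainerRestart(oper.getContainer().getExternalId());
    ContainerStartRequest dr = scm.containerStartRequests.poll();
    Assert.assertNotNull("restart should produce a start request", dr);
    Assert.assertTrue("restarted container should redeploy " + oper, dr.container.getOperators().contains(oper));
  }
}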

Aggregations

ContainerStartRequest (com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest): 15
GenericTestOperator (com.datatorrent.stram.engine.GenericTestOperator): 11
MemoryStorageAgent (com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent): 11
Test (org.junit.Test): 11
LogicalPlan (com.datatorrent.stram.plan.logical.LogicalPlan): 9
NodeReport (org.apache.hadoop.yarn.api.records.NodeReport): 9
File (java.io.File): 7
PTContainer (com.datatorrent.stram.plan.physical.PTContainer): 4
PTOperator (com.datatorrent.stram.plan.physical.PTOperator): 4
ArrayList (java.util.ArrayList): 4
AffinityRule (com.datatorrent.api.AffinityRule): 2
AffinityRulesSet (com.datatorrent.api.AffinityRulesSet): 2
StreamingContainer (com.datatorrent.stram.engine.StreamingContainer): 2
PhysicalPlan (com.datatorrent.stram.plan.physical.PhysicalPlan): 2
PhysicalPlanTest (com.datatorrent.stram.plan.physical.PhysicalPlanTest): 2
IOException (java.io.IOException): 2
LinkedList (java.util.LinkedList): 2
Map (java.util.Map): 2
MutablePair (org.apache.commons.lang3.tuple.MutablePair): 2
Server (com.datatorrent.bufferserver.server.Server): 1