Search in sources :

Example 6 with ContainerHeartbeatResponse

use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse in project apex-core by apache.

the class StreamingContainerManager method getHeartbeatResponse.

private ContainerHeartbeatResponse getHeartbeatResponse(StreamingContainerAgent sca) {
    ContainerHeartbeatResponse rsp = new ContainerHeartbeatResponse();
    if (this.deployChangeInProgress.get() || sca.deployCnt != this.deployChangeCnt) {
        LOG.debug("{} deferred requests due to concurrent plan change.", sca.container.toIdStateString());
        rsp.hasPendingRequests = true;
        return rsp;
    }
    if (!sca.undeployOpers.isEmpty()) {
        rsp.undeployRequest = Lists.newArrayList(sca.undeployOpers);
        rsp.hasPendingRequests = (!sca.deployOpers.isEmpty());
        return rsp;
    }
    Set<PTOperator> deployOperators = sca.deployOpers;
    if (!deployOperators.isEmpty()) {
        // deploy once all containers are running and no undeploy operations are pending.
        for (PTContainer c : getPhysicalPlan().getContainers()) {
            if (c.getState() != PTContainer.State.ACTIVE) {
                LOG.debug("{} waiting for container activation {}", sca.container.toIdStateString(), c.toIdStateString());
                rsp.hasPendingRequests = true;
                return rsp;
            }
            for (PTOperator oper : c.getOperators()) {
                if (oper.getState() == PTOperator.State.PENDING_UNDEPLOY) {
                    LOG.debug("{} waiting for undeploy {} {}", sca.container.toIdStateString(), c.toIdStateString(), oper);
                    rsp.hasPendingRequests = true;
                    return rsp;
                }
            }
        }
        LOG.debug("{} deployable operators: {}", sca.container.toIdStateString(), deployOperators);
        List<OperatorDeployInfo> deployList = sca.getDeployInfoList(deployOperators);
        if (deployList != null && !deployList.isEmpty()) {
            rsp.deployRequest = deployList;
            rsp.nodeRequests = Lists.newArrayList();
            for (PTOperator o : deployOperators) {
                rsp.nodeRequests.addAll(o.deployRequests);
            }
        }
        rsp.hasPendingRequests = false;
        return rsp;
    }
    return rsp;
}
Also used : OperatorDeployInfo(com.datatorrent.stram.api.OperatorDeployInfo) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) ContainerHeartbeatResponse(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse) PTContainer(com.datatorrent.stram.plan.physical.PTContainer)

Example 7 with ContainerHeartbeatResponse

use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse in project apex-core by apache.

the class StreamingContainer method heartbeatLoop.

public void heartbeatLoop() throws Exception {
    logger.debug("Entering heartbeat loop (interval is {} ms)", this.heartbeatIntervalMillis);
    umbilical.log(containerId, "[" + containerId + "] Entering heartbeat loop..");
    final YarnConfiguration conf = new YarnConfiguration();
    long tokenLifeTime = (long) (containerContext.getValue(LogicalPlan.TOKEN_REFRESH_ANTICIPATORY_FACTOR) * containerContext.getValue(LogicalPlan.HDFS_TOKEN_LIFE_TIME));
    long expiryTime = System.currentTimeMillis();
    final Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
    String stackTrace = null;
    Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
    while (iter.hasNext()) {
        Token<?> token = iter.next();
        logger.debug("token: {}", token);
    }
    String principal = containerContext.getValue(LogicalPlan.PRINCIPAL);
    String hdfsKeyTabFile = containerContext.getValue(LogicalPlan.KEY_TAB_FILE);
    while (!exitHeartbeatLoop) {
        if (UserGroupInformation.isSecurityEnabled() && System.currentTimeMillis() >= expiryTime && hdfsKeyTabFile != null) {
            expiryTime = StramUserLogin.refreshTokens(tokenLifeTime, FileUtils.getTempDirectoryPath(), containerId, conf, principal, hdfsKeyTabFile, credentials, null, false);
        }
        synchronized (this.heartbeatTrigger) {
            try {
                this.heartbeatTrigger.wait(heartbeatIntervalMillis);
            } catch (InterruptedException e1) {
                logger.warn("Interrupted in heartbeat loop, exiting..");
                break;
            }
        }
        long currentTime = System.currentTimeMillis();
        ContainerHeartbeat msg = new ContainerHeartbeat();
        msg.jvmName = jvmName;
        if (this.bufferServerAddress != null) {
            msg.bufferServerHost = this.bufferServerAddress.getHostName();
            msg.bufferServerPort = this.bufferServerAddress.getPort();
            if (bufferServer != null && !eventloop.isActive()) {
                logger.warn("Requesting restart due to terminated event loop");
                msg.restartRequested = true;
            }
        }
        msg.memoryMBFree = ((int) (Runtime.getRuntime().freeMemory() / (1024 * 1024)));
        garbageCollectorMXBeans = ManagementFactory.getGarbageCollectorMXBeans();
        for (GarbageCollectorMXBean bean : garbageCollectorMXBeans) {
            msg.gcCollectionTime += bean.getCollectionTime();
            msg.gcCollectionCount += bean.getCollectionCount();
        }
        ContainerHeartbeatResponse rsp;
        do {
            ContainerStats stats = new ContainerStats(containerId);
            // gather heartbeat info for all operators
            for (Map.Entry<Integer, Node<?>> e : nodes.entrySet()) {
                OperatorHeartbeat hb = new OperatorHeartbeat();
                hb.setNodeId(e.getKey());
                hb.setGeneratedTms(currentTime);
                hb.setIntervalMs(heartbeatIntervalMillis);
                if (e.getValue().commandResponse.size() > 0) {
                    BlockingQueue<StatsListener.OperatorResponse> commandResponse = e.getValue().commandResponse;
                    ArrayList<StatsListener.OperatorResponse> response = new ArrayList<>();
                    for (int i = 0; i < commandResponse.size(); i++) {
                        response.add(commandResponse.poll());
                    }
                    hb.requestResponse = response;
                }
                OperatorContext context = e.getValue().context;
                context.drainStats(hb.getOperatorStatsContainer());
                if (context.getThread() == null || context.getThread().getState() != Thread.State.TERMINATED) {
                    hb.setState(DeployState.ACTIVE);
                } else if (failedNodes.contains(hb.nodeId)) {
                    hb.setState(DeployState.FAILED);
                } else {
                    logger.debug("Reporting SHUTDOWN state because thread is {} and failedNodes is {}", context.getThread(), failedNodes);
                    hb.setState(DeployState.SHUTDOWN);
                }
                stats.addNodeStats(hb);
            }
            /**
         * Container stats published for whoever is interested in listening.
         * Currently interested candidates are TupleRecorderCollection and BufferServerStatsSubscriber
         */
            eventBus.publish(new ContainerStatsEvent(stats));
            msg.setContainerStats(stats);
            // heartbeat call and follow-up processing
            //logger.debug("Sending heartbeat for {} operators.", msg.getContainerStats().size());
            msg.sentTms = System.currentTimeMillis();
            msg.stackTrace = stackTrace;
            rsp = umbilical.processHeartbeat(msg);
            if (rsp.stackTraceRequired) {
                stackTrace = StramUtils.getStackTrace().toString();
            } else {
                stackTrace = null;
            }
            processHeartbeatResponse(rsp);
            if (rsp.hasPendingRequests) {
                logger.info("Waiting for pending request.");
                synchronized (this.heartbeatTrigger) {
                    try {
                        this.heartbeatTrigger.wait(500);
                    } catch (InterruptedException ie) {
                        logger.warn("Interrupted in heartbeat loop", ie);
                        break;
                    }
                }
            }
        } while (rsp.hasPendingRequests);
    }
    logger.debug("[{}] Exiting heartbeat loop", containerId);
    umbilical.log(containerId, "[" + containerId + "] Exiting heartbeat loop..");
}
Also used : ContainerStats(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats) OperatorHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat) ContainerStatsEvent(com.datatorrent.stram.api.ContainerEvent.ContainerStatsEvent) GarbageCollectorMXBean(java.lang.management.GarbageCollectorMXBean) ArrayList(java.util.ArrayList) ContainerHeartbeatResponse(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse) Token(org.apache.hadoop.security.token.Token) Checkpoint(com.datatorrent.stram.api.Checkpoint) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) Credentials(org.apache.hadoop.security.Credentials) ContainerHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeat)

Example 8 with ContainerHeartbeatResponse

use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse in project apex-core by apache.

the class StreamingContainerManagerTest method testProcessHeartbeat.

@Test
public void testProcessHeartbeat() throws Exception {
    TestGeneratorInputOperator o1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
    dag.setOperatorAttribute(o1, OperatorContext.STATS_LISTENERS, Arrays.asList(new StatsListener[] { new PartitioningTest.PartitionLoadWatch() }));
    dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    StreamingContainerManager scm = new StreamingContainerManager(dag);
    PhysicalPlan plan = scm.getPhysicalPlan();
    Assert.assertEquals("number required containers", 1, plan.getContainers().size());
    PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);
    // assign container
    String containerId = "container1";
    StreamingContainerAgent sca = scm.assignContainer(new ContainerResource(0, containerId, "localhost", 512, 0, null), InetSocketAddress.createUnresolved("localhost", 0));
    Assert.assertNotNull(sca);
    Assert.assertEquals(PTContainer.State.ALLOCATED, o1p1.getContainer().getState());
    Assert.assertEquals(PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    ContainerStats cstats = new ContainerStats(containerId);
    ContainerHeartbeat hb = new ContainerHeartbeat();
    hb.setContainerStats(cstats);
    // get deploy request
    ContainerHeartbeatResponse chr = scm.processHeartbeat(hb);
    Assert.assertNotNull(chr.deployRequest);
    Assert.assertEquals("" + chr.deployRequest, 1, chr.deployRequest.size());
    Assert.assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    Assert.assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    // first operator heartbeat
    OperatorHeartbeat ohb = new OperatorHeartbeat();
    ohb.setNodeId(o1p1.getId());
    ohb.setState(OperatorHeartbeat.DeployState.ACTIVE);
    OperatorStats stats = new OperatorStats();
    stats.checkpoint = new Checkpoint(2, 0, 0);
    stats.windowId = 3;
    stats.outputPorts = Lists.newArrayList();
    PortStats ps = new PortStats(TestGeneratorInputOperator.OUTPUT_PORT);
    ps.bufferServerBytes = 101;
    ps.tupleCount = 1;
    stats.outputPorts.add(ps);
    ohb.windowStats = Lists.newArrayList(stats);
    cstats.operators.add(ohb);
    // activate operator
    scm.processHeartbeat(hb);
    Assert.assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    Assert.assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState());
    Assert.assertEquals("tuples " + o1p1, 1, o1p1.stats.totalTuplesEmitted.get());
    Assert.assertEquals("tuples " + o1p1, 0, o1p1.stats.totalTuplesProcessed.get());
    Assert.assertEquals("window " + o1p1, 3, o1p1.stats.currentWindowId.get());
    Assert.assertEquals("port stats", 1, o1p1.stats.outputPortStatusList.size());
    PortStatus o1p1ps = o1p1.stats.outputPortStatusList.get(TestGeneratorInputOperator.OUTPUT_PORT);
    Assert.assertNotNull("port stats", o1p1ps);
    Assert.assertEquals("port stats", 1, o1p1ps.totalTuples);
    // second operator heartbeat
    stats = new OperatorStats();
    stats.checkpoint = new Checkpoint(2, 0, 0);
    stats.windowId = 4;
    stats.outputPorts = Lists.newArrayList();
    ps = new PortStats(TestGeneratorInputOperator.OUTPUT_PORT);
    ps.bufferServerBytes = 1;
    ps.tupleCount = 1;
    stats.outputPorts.add(ps);
    ohb.windowStats = Lists.newArrayList(stats);
    cstats.operators.clear();
    cstats.operators.add(ohb);
    scm.processHeartbeat(hb);
    Assert.assertEquals("tuples " + o1p1, 2, o1p1.stats.totalTuplesEmitted.get());
    Assert.assertEquals("window " + o1p1, 4, o1p1.stats.currentWindowId.get());
    Assert.assertEquals("statsQueue " + o1p1, 2, o1p1.stats.listenerStats.size());
    scm.processEvents();
    Assert.assertEquals("statsQueue " + o1p1, 0, o1p1.stats.listenerStats.size());
    Assert.assertEquals("lastStats " + o1p1, 2, o1p1.stats.lastWindowedStats.size());
}
Also used : PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) ContainerStats(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) OperatorHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat) ContainerHeartbeatResponse(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse) TestGeneratorInputOperator(com.datatorrent.stram.engine.TestGeneratorInputOperator) MockOperatorStats(com.datatorrent.stram.MockContainer.MockOperatorStats) OperatorStats(com.datatorrent.api.Stats.OperatorStats) StatsListener(com.datatorrent.api.StatsListener) Checkpoint(com.datatorrent.stram.api.Checkpoint) ContainerResource(com.datatorrent.stram.StreamingContainerManager.ContainerResource) PortStatus(com.datatorrent.stram.plan.physical.OperatorStatus.PortStatus) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) PortStats(com.datatorrent.api.Stats.OperatorStats.PortStats) ContainerHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeat) Test(org.junit.Test) PhysicalPlanTest(com.datatorrent.stram.plan.physical.PhysicalPlanTest)

Aggregations

ContainerHeartbeatResponse (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse)8 ContainerStats (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats)5 Checkpoint (com.datatorrent.stram.api.Checkpoint)4 ContainerHeartbeat (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeat)4 OperatorHeartbeat (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat)4 PTOperator (com.datatorrent.stram.plan.physical.PTOperator)4 OperatorStats (com.datatorrent.api.Stats.OperatorStats)3 Map (java.util.Map)3 StatsListener (com.datatorrent.api.StatsListener)2 OperatorDeployInfo (com.datatorrent.stram.api.OperatorDeployInfo)2 PortStatus (com.datatorrent.stram.plan.physical.OperatorStatus.PortStatus)2 PhysicalPlanTest (com.datatorrent.stram.plan.physical.PhysicalPlanTest)2 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 LinkedHashMap (java.util.LinkedHashMap)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 PortStats (com.datatorrent.api.Stats.OperatorStats.PortStats)1 Server (com.datatorrent.bufferserver.server.Server)1 DiskStorage (com.datatorrent.bufferserver.storage.DiskStorage)1 Pair (com.datatorrent.common.util.Pair)1