Search in sources :

Example 41 with PTContainer

use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.

From the class StreamingContainerManagerTest, method testRecoveryUpstreamInline.

/**
 * Builds a three-operator DAG (o1 and o2 both feeding o3) limited to two containers,
 * checks that o1 and o3 share the first container, and verifies that scheduling a
 * restart of the second container moves every operator of the first container from
 * PENDING_DEPLOY to PENDING_UNDEPLOY.
 */
@Test
public void testRecoveryUpstreamInline() throws Exception {
    // logical plan: o1 -> o3.inport1, o2 -> o3.inport2
    GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
    GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
    GenericTestOperator o3 = dag.addOperator("o3", GenericTestOperator.class);
    dag.addStream("o1o3", o1.outport1, o3.inport1);
    dag.addStream("o2o3", o2.outport1, o3.inport2);

    dag.getAttributes().put(LogicalPlan.CONTAINERS_MAX_COUNT, 2);
    dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());

    StreamingContainerManager scm = new StreamingContainerManager(dag);
    PhysicalPlan plan = scm.getPhysicalPlan();
    Assert.assertEquals(2, plan.getContainers().size());

    PTOperator o1Physical = plan.getOperators(dag.getMeta(o1)).get(0);
    Assert.assertEquals(2, plan.getContainers().size());

    // the first container is expected to hold exactly o1 and o3
    PTContainer firstContainer = plan.getContainers().get(0);
    PTOperator o3Physical = plan.getOperators(dag.getMeta(o3)).get(0);
    Assert.assertEquals(Sets.newHashSet(o1Physical, o3Physical), Sets.newHashSet(firstContainer.getOperators()));
    PTContainer secondContainer = plan.getContainers().get(1);

    assignContainer(scm, "c1");
    assignContainer(scm, "c2");
    for (PTOperator deployed : firstContainer.getOperators()) {
        Assert.assertEquals("state " + deployed, PTOperator.State.PENDING_DEPLOY, deployed.getState());
    }

    // restarting the second container must trigger undeploy of the first container's operators
    scm.scheduleContainerRestart(secondContainer.getExternalId());
    for (PTOperator affected : firstContainer.getOperators()) {
        Assert.assertEquals("state " + affected, PTOperator.State.PENDING_UNDEPLOY, affected.getState());
    }
}
Also used : PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) Test(org.junit.Test) PhysicalPlanTest(com.datatorrent.stram.plan.physical.PhysicalPlanTest)

Example 42 with PTContainer

use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.

From the class StreamingContainerManager, method monitorHeartbeat.

/**
 * Check periodically that deployed containers phone home.
 * Run from the master main loop (single threaded access).
 *
 * <p>In order, this method:
 * <ol>
 *   <li>aborts all containers and sets {@code forcedShutdown} when containers have been pending
 *       allocation for longer than {@code LogicalPlan.RESOURCE_ALLOCATION_TIMEOUT_MILLIS};</li>
 *   <li>queues a stop request for every assigned container that either never sent a heartbeat
 *       within twice the heartbeat timeout, or whose last heartbeat is older than the heartbeat
 *       timeout while the application is not idle;</li>
 *   <li>processes pending events, updates checkpoints, dispatches a commit event when the
 *       committed window id changed, computes end window stats and optionally records stats.</li>
 * </ol>
 *
 * @param waitForRecovery passed through unchanged to {@code updateCheckpoints}
 */
public void monitorHeartbeat(boolean waitForRecovery) {
    long currentTms = clock.getTime();
    // look for resource allocation timeout
    if (!pendingAllocation.isEmpty()) {
        if (lastResourceRequest + plan.getLogicalPlan().getValue(LogicalPlan.RESOURCE_ALLOCATION_TIMEOUT_MILLIS) < currentTms) {
            String msg = String.format("Shutdown due to resource allocation timeout (%s ms) waiting for %s containers", currentTms - lastResourceRequest, pendingAllocation.size());
            LOG.warn(msg);
            for (PTContainer c : pendingAllocation) {
                LOG.warn("Waiting for resource: {}m priority: {} {}", c.getRequiredMemoryMB(), c.getResourceRequestPriority(), c);
            }
            shutdownAllContainers(ShutdownType.ABORT, msg);
            this.forcedShutdown = true;
        } else {
            for (PTContainer c : pendingAllocation) {
                LOG.debug("Waiting for resource: {}m {}", c.getRequiredMemoryMB(), c);
            }
        }
    }
    // monitor currently deployed containers
    for (StreamingContainerAgent sca : containers.values()) {
        PTContainer c = sca.container;
        if (!pendingAllocation.contains(c) && c.getExternalId() != null) {
            if (sca.lastHeartbeatMillis == 0) {
                // container allocated but process was either not launched or is not able to phone home
                if (currentTms - sca.createdMillis > 2 * this.vars.heartbeatTimeoutMillis) {
                    LOG.warn("Container {}@{} startup timeout ({} ms).", c.getExternalId(), c.host, currentTms - sca.createdMillis);
                    containerStopRequests.put(c.getExternalId(), c.getExternalId());
                }
            } else {
                if (currentTms - sca.lastHeartbeatMillis > this.vars.heartbeatTimeoutMillis) {
                    if (!isApplicationIdle()) {
                        // Check if the heartbeat for this agent has already been missed to raise the StramEvent only once
                        if (sca.lastHeartbeatMillis != -1) {
                            // no "%n" here: it would embed a line separator in the log line and the event reason
                            String msg = String.format("Container %s@%s heartbeat timeout  (%d ms).", c.getExternalId(), c.host, currentTms - sca.lastHeartbeatMillis);
                            LOG.warn(msg);
                            StramEvent stramEvent = new StramEvent.ContainerErrorEvent(c.getExternalId(), msg, null);
                            stramEvent.setReason(msg);
                            recordEventAsync(stramEvent);
                            // -1 marks the timeout as already reported for this agent
                            sca.lastHeartbeatMillis = -1;
                        }
                        // request stop (kill) as process may still be hanging around (would have been detected by Yarn otherwise)
                        containerStopRequests.put(c.getExternalId(), c.getExternalId());
                    }
                }
            }
        }
    }
    // events that may modify the plan
    processEvents();
    committedWindowId = updateCheckpoints(waitForRecovery);
    // notify plugins only when the committed window actually advanced/changed
    if (lastCommittedWindowId != committedWindowId) {
        apexPluginDispatcher.dispatch(new DAGExecutionEvent.CommitExecutionEvent(committedWindowId));
        lastCommittedWindowId = committedWindowId;
    }
    calculateEndWindowStats();
    if (this.vars.enableStatsRecording) {
        recordStats(currentTms);
    }
}
Also used : StramEvent(com.datatorrent.stram.api.StramEvent) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) DAGExecutionEvent(org.apache.apex.engine.api.plugin.DAGExecutionEvent)

Example 43 with PTContainer

use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.

From the class StreamingContainerManager, method deploy.

/**
 * Applies a physical plan change.
 *
 * <p>Operators to undeploy are marked {@code PENDING_UNDEPLOY} unless their container is being
 * started, released or is already {@code KILLED}. New containers are requested. For operators to
 * deploy into containers that are not newly started, the buffer server publishers of their
 * non-inline outputs are reset, and the operators are marked {@code PENDING_DEPLOY} unless they
 * are still pending undeploy. Finally, released containers that have an agent are asked to shut
 * down. {@code deployChangeInProgress} is set for the duration of the call and
 * {@code deployChangeCnt} is incremented on exit, even when an exception is thrown.
 *
 * @param releaseContainers containers that are no longer used and should be stopped
 * @param undeploy operators to undeploy
 * @param startContainers containers for which resources need to be requested
 * @param deploy operators to (re)deploy
 */
@Override
public void deploy(Set<PTContainer> releaseContainers, Collection<PTOperator> undeploy, Set<PTContainer> startContainers, Collection<PTOperator> deploy) {
    try {
        this.deployChangeInProgress.set(true);
        Map<PTContainer, List<PTOperator>> undeployGroups = groupByContainer(undeploy);
        // order does not matter, remove all operators in each container in one sweep
        for (Map.Entry<PTContainer, List<PTOperator>> e : undeployGroups.entrySet()) {
            // container may already be in failed or pending deploy state, notified by RM or timed out
            PTContainer c = e.getKey();
            if (!startContainers.contains(c) && !releaseContainers.contains(c) && c.getState() != PTContainer.State.KILLED) {
                LOG.debug("scheduling undeploy {} {}", c.getExternalId(), e.getValue());
                for (PTOperator oper : e.getValue()) {
                    oper.setState(PTOperator.State.PENDING_UNDEPLOY);
                }
            }
        }
        // start new containers
        for (PTContainer c : startContainers) {
            requestContainer(c);
        }
        // (re)deploy affected operators
        // can happen in parallel after buffer server for recovered publishers is reset
        Map<PTContainer, List<PTOperator>> deployGroups = groupByContainer(deploy);
        for (Map.Entry<PTContainer, List<PTOperator>> e : deployGroups.entrySet()) {
            if (!startContainers.contains(e.getKey())) {
                // to reset publishers, clean buffer server past checkpoint so subscribers don't read stale data (including end of stream)
                for (PTOperator operator : e.getValue()) {
                    for (PTOperator.PTOutput out : operator.getOutputs()) {
                        if (!out.isDownStreamInline()) {
                            for (InputPortMeta ipm : out.logicalStream.getSinks()) {
                                StreamCodec<?> streamCodec = ipm.getStreamCodec();
                                Integer codecId = plan.getStreamCodecIdentifier(streamCodec);
                                // following needs to match the concat logic in StreamingContainer
                                String sourceIdentifier = Integer.toString(operator.getId()).concat(Component.CONCAT_SEPARATOR).concat(out.portName).concat(Component.CONCAT_SEPARATOR).concat(codecId.toString());
                                if (operator.getContainer().getState() == PTContainer.State.ACTIVE) {
                                    // TODO: unit test - find way to mock this when testing rest of logic
                                    if (operator.getContainer().bufferServerAddress.getPort() != 0) {
                                        BufferServerController bsc = getBufferServerClient(operator);
                                        // ensures new subscriber starting to read from checkpoint will wait until publisher redeploy cycle is complete
                                        try {
                                            bsc.reset(null, sourceIdentifier, 0);
                                        } catch (Exception ex) {
                                            // best effort: keep going, but log with the stack trace
                                            // (exception passed last, without a placeholder of its own)
                                            LOG.error("Failed to reset buffer server {}", sourceIdentifier, ex);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // add to operators that we expect to deploy
            LOG.debug("scheduling deploy {} {}", e.getKey().getExternalId(), e.getValue());
            for (PTOperator oper : e.getValue()) {
                // operator will be deployed after it has been undeployed, if still referenced by the container
                if (oper.getState() != PTOperator.State.PENDING_UNDEPLOY) {
                    oper.setState(PTOperator.State.PENDING_DEPLOY);
                }
            }
        }
        // stop containers that are no longer used
        for (PTContainer c : releaseContainers) {
            // a container without external id has no agent to shut down
            if (c.getExternalId() == null) {
                continue;
            }
            StreamingContainerAgent sca = containers.get(c.getExternalId());
            if (sca != null) {
                LOG.debug("Container marked for shutdown: {}", c);
                // container already removed from plan
                // TODO: monitor soft shutdown
                sca.requestShutDown(ShutdownType.ABORT);
            }
        }
    } finally {
        this.deployChangeCnt++;
        this.deployChangeInProgress.set(false);
    }
}
Also used : PTOperator(com.datatorrent.stram.plan.physical.PTOperator) InputPortMeta(com.datatorrent.stram.plan.logical.LogicalPlan.InputPortMeta) NotFoundException(org.apache.hadoop.yarn.webapp.NotFoundException) IOException(java.io.IOException) JSONException(org.codehaus.jettison.json.JSONException) KryoException(com.esotericsoftware.kryo.KryoException) PTOutput(com.datatorrent.stram.plan.physical.PTOperator.PTOutput) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap)

Example 44 with PTContainer

use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.

From the class StreamingContainerManager, method assignContainer.

/**
 * Assign operators to allocated container resource.
 *
 * <p>Picks the first container pending allocation that is in state {@code NEW} or
 * {@code KILLED} and whose resource request priority equals the priority of the given
 * resource, binds the resource details to it and registers a new agent for it.
 *
 * @param resource the allocated resource (container id, host, priority, memory, vcores, node HTTP address)
 * @param bufferServerAddr buffer server address to record on the container
 * @return streaming container agent, or {@code null} if no pending container matches the resource
 */
public StreamingContainerAgent assignContainer(ContainerResource resource, InetSocketAddress bufferServerAddr) {
    // match container waiting for resource
    PTContainer matched = null;
    for (PTContainer candidate : pendingAllocation) {
        PTContainer.State candidateState = candidate.getState();
        boolean awaitingResource = candidateState == PTContainer.State.NEW || candidateState == PTContainer.State.KILLED;
        if (awaitingResource && candidate.getResourceRequestPriority() == resource.priority) {
            matched = candidate;
            break;
        }
    }

    if (matched == null) {
        LOG.debug("No container matching allocated resource {}", resource);
        LOG.debug("Containers waiting for allocation {}", pendingAllocation);
        return null;
    }

    pendingAllocation.remove(matched);
    matched.setState(PTContainer.State.ALLOCATED);

    // a previously assigned id means there is an old agent registered under it; drop that agent
    String previousExternalId = matched.getExternalId();
    if (previousExternalId != null) {
        LOG.info("Removing container agent {}", previousExternalId);
        this.containers.remove(previousExternalId);
    }

    // bind the allocated resource to the container
    matched.setExternalId(resource.containerId);
    matched.host = resource.host;
    matched.bufferServerAddress = bufferServerAddr;
    if (UserGroupInformation.isSecurityEnabled()) {
        matched.setBufferServerToken(AuthManager.generateToken());
    }
    matched.nodeHttpAddress = resource.nodeHttpAddress;
    matched.setAllocatedMemoryMB(resource.memoryMB);
    matched.setAllocatedVCores(resource.vCores);
    matched.setStartedTime(-1);
    matched.setFinishedTime(-1);
    writeJournal(matched.getSetContainerState());

    StreamingContainerAgent agent = new StreamingContainerAgent(matched, newStreamingContainerContext(matched), this);
    containers.put(resource.containerId, agent);
    LOG.debug("Assigned container {} priority {}", resource.containerId, resource.priority);
    return agent;
}
Also used : PTContainer(com.datatorrent.stram.plan.physical.PTContainer)

Example 45 with PTContainer

use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.

From the class StreamingContainerManager, method fillLogicalOperatorInfo.

/**
 * Builds the summary for a logical operator by aggregating over all of its physical operators
 * (partitions and unifiers).
 *
 * <p>Throughput, latency and checkpoint time are collected from partitions only; CPU, heartbeat,
 * window ids, per-state counts and container/host sets are collected from every physical operator.
 * Counters, metrics and the checkpoint time average are only filled in when there is at least one
 * physical operator and the checkpoint time aggregate has an average.
 *
 * @param operator the logical operator to summarize
 * @return the populated info object
 */
private LogicalOperatorInfo fillLogicalOperatorInfo(OperatorMeta operator) {
    LogicalOperatorInfo info = new LogicalOperatorInfo();
    info.name = operator.getName();
    info.className = operator.getOperator().getClass().getName();
    info.totalTuplesEmitted = operator.getStatus().totalTuplesEmitted;
    info.totalTuplesProcessed = operator.getStatus().totalTuplesProcessed;
    info.failureCount = operator.getStatus().failureCount;
    info.status = new HashMap<>();
    info.partitions = new TreeSet<>();
    info.unifiers = new TreeSet<>();
    info.containerIds = new TreeSet<>();
    info.hosts = new TreeSet<>();

    Collection<PTOperator> physicalOperators = getPhysicalPlan().getAllOperators(operator);
    NumberAggregate.LongAggregate checkpointTimes = new NumberAggregate.LongAggregate();

    for (PTOperator ptOper : physicalOperators) {
        OperatorStatus stats = ptOper.stats;

        if (!ptOper.isUnifier()) {
            info.partitions.add(ptOper.getId());
            // exclude unifier, not sure if we should include it in the future
            info.tuplesEmittedPSMA += stats.tuplesEmittedPSMA.get();
            info.tuplesProcessedPSMA += stats.tuplesProcessedPSMA.get();
            // calculate maximum latency for all partitions
            long partitionLatency = calculateLatency(ptOper);
            if (partitionLatency > info.latencyMA) {
                info.latencyMA = partitionLatency;
            }
            checkpointTimes.addNumber(stats.checkpointTimeMA.getAvg());
        } else {
            info.unifiers.add(ptOper.getId());
        }

        info.cpuPercentageMA += stats.cpuNanosPMSMA.getAvg() / 10000;

        // keep the smallest heartbeat timestamp seen; 0 means none recorded yet
        if (stats.lastHeartbeat != null) {
            if (info.lastHeartbeat == 0 || info.lastHeartbeat > stats.lastHeartbeat.getGeneratedTms()) {
                info.lastHeartbeat = stats.lastHeartbeat.getGeneratedTms();
            }
        }

        // keep the smallest current window id seen; 0 means none recorded yet
        long windowId = toWsWindowId(stats.currentWindowId.get());
        if (info.currentWindowId == 0 || info.currentWindowId > windowId) {
            info.currentWindowId = windowId;
        }

        // count physical operators per state name
        String stateName = ptOper.getState().toString();
        MutableInt stateCount = info.status.get(stateName);
        if (stateCount == null) {
            stateCount = new MutableInt();
            info.status.put(stateName, stateCount);
        }
        stateCount.increment();

        // keep the smallest recovery window id seen; 0 means none recorded yet
        if (ptOper.getRecoveryCheckpoint() != null) {
            long recoveryId = toWsWindowId(ptOper.getRecoveryCheckpoint().windowId);
            if (info.recoveryWindowId == 0 || info.recoveryWindowId > recoveryId) {
                info.recoveryWindowId = recoveryId;
            }
        }

        // only containers that already have an external id contribute id and host
        PTContainer hostContainer = ptOper.getContainer();
        String containerId = (hostContainer == null) ? null : hostContainer.getExternalId();
        if (containerId != null) {
            info.containerIds.add(containerId);
            info.hosts.add(hostContainer.host);
        }
    }

    if (!physicalOperators.isEmpty() && checkpointTimes.getAvg() != null) {
        info.checkpointTimeMA = checkpointTimes.getAvg().longValue();
        info.counters = latestLogicalCounters.get(operator.getName());
        info.autoMetrics = latestLogicalMetrics.get(operator.getName());
    }
    return info;
}
Also used : NumberAggregate(com.datatorrent.common.util.NumberAggregate) LogicalOperatorInfo(com.datatorrent.stram.webapp.LogicalOperatorInfo) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) OperatorStatus(com.datatorrent.stram.plan.physical.OperatorStatus) LogicalOperatorStatus(com.datatorrent.stram.plan.logical.LogicalOperatorStatus) MutableInt(org.apache.commons.lang3.mutable.MutableInt) PTContainer(com.datatorrent.stram.plan.physical.PTContainer)

Aggregations

PTContainer (com.datatorrent.stram.plan.physical.PTContainer)50 PTOperator (com.datatorrent.stram.plan.physical.PTOperator)34 PhysicalPlan (com.datatorrent.stram.plan.physical.PhysicalPlan)34 Test (org.junit.Test)31 GenericTestOperator (com.datatorrent.stram.engine.GenericTestOperator)30 LogicalPlan (com.datatorrent.stram.plan.logical.LogicalPlan)19 OperatorDeployInfo (com.datatorrent.stram.api.OperatorDeployInfo)18 StramTestSupport (com.datatorrent.stram.support.StramTestSupport)16 MemoryStorageAgent (com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent)10 ArrayList (java.util.ArrayList)10 Checkpoint (com.datatorrent.stram.api.Checkpoint)7 PhysicalPlanTest (com.datatorrent.stram.plan.physical.PhysicalPlanTest)7 TestGeneratorInputOperator (com.datatorrent.stram.engine.TestGeneratorInputOperator)6 Map (java.util.Map)5 Operator (com.datatorrent.api.Operator)4 StatsListener (com.datatorrent.api.StatsListener)3 ContainerStartRequest (com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest)3 TestOutputOperator (com.datatorrent.stram.engine.TestOutputOperator)3 TestPlanContext (com.datatorrent.stram.plan.TestPlanContext)3 OperatorMeta (com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta)3