use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.
the class StreamingContainerManagerTest method testRecoveryUpstreamInline.
/**
 * Builds a three operator DAG (o1 -> o3, o2 -> o3) limited to two containers and
 * verifies that scheduling a restart of the second container flips the operators
 * of the first container (o1 and o3) from PENDING_DEPLOY to PENDING_UNDEPLOY.
 */
@Test
public void testRecoveryUpstreamInline() throws Exception {
  GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
  GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
  GenericTestOperator o3 = dag.addOperator("o3", GenericTestOperator.class);

  // o3 consumes from both o1 and o2
  dag.addStream("o1o3", o1.outport1, o3.inport1);
  dag.addStream("o2o3", o2.outport1, o3.inport2);

  dag.getAttributes().put(LogicalPlan.CONTAINERS_MAX_COUNT, 2);
  dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());

  StreamingContainerManager manager = new StreamingContainerManager(dag);
  PhysicalPlan physicalPlan = manager.getPhysicalPlan();
  Assert.assertEquals(2, physicalPlan.getContainers().size());

  // result intentionally unused; the container count is re-checked right after the lookup
  physicalPlan.getOperators(dag.getMeta(o1)).get(0);
  Assert.assertEquals(2, physicalPlan.getContainers().size());

  // the first container is expected to hold exactly o1 and o3
  PTContainer inlineContainer = physicalPlan.getContainers().get(0);
  PTOperator o1Physical = physicalPlan.getOperators(dag.getMeta(o1)).get(0);
  PTOperator o3Physical = physicalPlan.getOperators(dag.getMeta(o3)).get(0);
  Assert.assertEquals(Sets.newHashSet(o1Physical, o3Physical), Sets.newHashSet(inlineContainer.getOperators()));

  PTContainer restartedContainer = physicalPlan.getContainers().get(1);

  assignContainer(manager, "c1");
  assignContainer(manager, "c2");

  // after allocation every operator of the first container waits for deployment
  for (PTOperator operator : inlineContainer.getOperators()) {
    Assert.assertEquals("state " + operator, PTOperator.State.PENDING_DEPLOY, operator.getState());
  }

  manager.scheduleContainerRestart(restartedContainer.getExternalId());

  // restarting the second container must schedule the first container's operators for undeploy
  for (PTOperator operator : inlineContainer.getOperators()) {
    Assert.assertEquals("state " + operator, PTOperator.State.PENDING_UNDEPLOY, operator.getState());
  }
}
use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.
the class StreamingContainerManager method monitorHeartbeat.
/**
 * Check periodically that deployed containers phone home.
 * Run from the master main loop (single threaded access).
 * <p>
 * The method performs, in order:
 * <ol>
 * <li>resource allocation timeout detection for containers in {@code pendingAllocation};
 * on timeout all containers are shut down with {@code ShutdownType.ABORT} and
 * {@code forcedShutdown} is set,</li>
 * <li>startup and heartbeat timeout detection for the known container agents, which
 * registers a stop request for every offending container,</li>
 * <li>event processing, checkpoint update (dispatching a commit event when the committed
 * window id changed), end window stats calculation and, if enabled, stats recording.</li>
 * </ol>
 *
 * @param waitForRecovery passed through unchanged to {@code updateCheckpoints}
 */
public void monitorHeartbeat(boolean waitForRecovery) {
  long currentTms = clock.getTime();

  // look for resource allocation timeout
  if (!pendingAllocation.isEmpty()) {
    if (lastResourceRequest + plan.getLogicalPlan().getValue(LogicalPlan.RESOURCE_ALLOCATION_TIMEOUT_MILLIS) < currentTms) {
      String msg = String.format("Shutdown due to resource allocation timeout (%s ms) waiting for %s containers", currentTms - lastResourceRequest, pendingAllocation.size());
      LOG.warn(msg);
      for (PTContainer c : pendingAllocation) {
        LOG.warn("Waiting for resource: {}m priority: {} {}", c.getRequiredMemoryMB(), c.getResourceRequestPriority(), c);
      }
      shutdownAllContainers(ShutdownType.ABORT, msg);
      this.forcedShutdown = true;
    } else {
      for (PTContainer c : pendingAllocation) {
        LOG.debug("Waiting for resource: {}m {}", c.getRequiredMemoryMB(), c);
      }
    }
  }

  // monitor currently deployed containers
  for (StreamingContainerAgent sca : containers.values()) {
    PTContainer c = sca.container;
    if (!pendingAllocation.contains(c) && c.getExternalId() != null) {
      if (sca.lastHeartbeatMillis == 0) {
        // container allocated but process was either not launched or is not able to phone home
        if (currentTms - sca.createdMillis > 2 * this.vars.heartbeatTimeoutMillis) {
          LOG.warn("Container {}@{} startup timeout ({} ms).", c.getExternalId(), c.host, currentTms - sca.createdMillis);
          containerStopRequests.put(c.getExternalId(), c.getExternalId());
        }
      } else {
        if (currentTms - sca.lastHeartbeatMillis > this.vars.heartbeatTimeoutMillis) {
          if (!isApplicationIdle()) {
            // Check if the heartbeat for this agent has already been missed to raise the StramEvent only once
            if (sca.lastHeartbeatMillis != -1) {
              // no %n here: the message is also used as the event reason and must stay on a single line
              String msg = String.format("Container %s@%s heartbeat timeout (%d ms).", c.getExternalId(), c.host, currentTms - sca.lastHeartbeatMillis);
              LOG.warn(msg);
              StramEvent stramEvent = new StramEvent.ContainerErrorEvent(c.getExternalId(), msg, null);
              stramEvent.setReason(msg);
              recordEventAsync(stramEvent);
              // -1 marks the timeout as already reported for this agent
              sca.lastHeartbeatMillis = -1;
            }
            // request stop (kill) as process may still be hanging around (would have been detected by Yarn otherwise)
            containerStopRequests.put(c.getExternalId(), c.getExternalId());
          }
        }
      }
    }
  }

  // events that may modify the plan
  processEvents();

  committedWindowId = updateCheckpoints(waitForRecovery);
  if (lastCommittedWindowId != committedWindowId) {
    // notify plugins only when the committed window actually advanced/changed
    apexPluginDispatcher.dispatch(new DAGExecutionEvent.CommitExecutionEvent(committedWindowId));
    lastCommittedWindowId = committedWindowId;
  }

  calculateEndWindowStats();
  if (this.vars.enableStatsRecording) {
    recordStats(currentTms);
  }
}
use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.
the class StreamingContainerManager method deploy.
/**
 * Applies a physical plan change: marks operators for undeploy, requests the new
 * containers, resets buffer server publishers and marks operators for deploy, and
 * finally requests shutdown of the containers that are released.
 * <p>
 * {@code deployChangeInProgress} is set for the duration of the call and
 * {@code deployChangeCnt} is incremented when the call ends, whether or not it
 * completed normally.
 *
 * @param releaseContainers containers that are no longer used and should be shut down
 * @param undeploy operators to be marked {@code PENDING_UNDEPLOY}
 * @param startContainers containers for which a resource is requested
 * @param deploy operators to be marked {@code PENDING_DEPLOY}
 */
@Override
public void deploy(Set<PTContainer> releaseContainers, Collection<PTOperator> undeploy, Set<PTContainer> startContainers, Collection<PTOperator> deploy) {
  try {
    this.deployChangeInProgress.set(true);

    // order does not matter, remove all operators in each container in one sweep
    for (Map.Entry<PTContainer, List<PTOperator>> group : groupByContainer(undeploy).entrySet()) {
      PTContainer target = group.getKey();
      // container may already be in failed or pending deploy state, notified by RM or timed out
      boolean nothingToUndeploy = startContainers.contains(target)
          || releaseContainers.contains(target)
          || target.getState() == PTContainer.State.KILLED;
      if (nothingToUndeploy) {
        continue;
      }
      LOG.debug("scheduling undeploy {} {}", target.getExternalId(), group.getValue());
      for (PTOperator scheduled : group.getValue()) {
        scheduled.setState(PTOperator.State.PENDING_UNDEPLOY);
      }
    }

    // start new containers
    for (PTContainer fresh : startContainers) {
      requestContainer(fresh);
    }

    // (re)deploy affected operators
    // can happen in parallel after buffer server for recovered publishers is reset
    for (Map.Entry<PTContainer, List<PTOperator>> group : groupByContainer(deploy).entrySet()) {
      PTContainer target = group.getKey();
      List<PTOperator> operators = group.getValue();

      if (!startContainers.contains(target)) {
        // to reset publishers, clean buffer server past checkpoint so subscribers don't read stale data (including end of stream)
        for (PTOperator publisher : operators) {
          for (PTOperator.PTOutput output : publisher.getOutputs()) {
            if (output.isDownStreamInline()) {
              continue;
            }
            for (InputPortMeta sink : output.logicalStream.getSinks()) {
              StreamCodec<?> sinkCodec = sink.getStreamCodec();
              Integer codecId = plan.getStreamCodecIdentifier(sinkCodec);
              // following needs to match the concat logic in StreamingContainer
              String sourceIdentifier = Integer.toString(publisher.getId())
                  .concat(Component.CONCAT_SEPARATOR)
                  .concat(output.portName)
                  .concat(Component.CONCAT_SEPARATOR)
                  .concat(codecId.toString());

              if (publisher.getContainer().getState() != PTContainer.State.ACTIVE) {
                continue;
              }
              // TODO: unit test - find way to mock this when testing rest of logic
              if (publisher.getContainer().bufferServerAddress.getPort() == 0) {
                continue;
              }
              BufferServerController controller = getBufferServerClient(publisher);
              // ensures new subscriber starting to read from checkpoint will wait until publisher redeploy cycle is complete
              try {
                controller.reset(null, sourceIdentifier, 0);
              } catch (Exception ex) {
                // best effort: a failed reset is logged and the remaining publishers are still processed
                LOG.error("Failed to reset buffer server {} {}", sourceIdentifier, ex);
              }
            }
          }
        }
      }

      // add to operators that we expect to deploy
      LOG.debug("scheduling deploy {} {}", target.getExternalId(), operators);
      for (PTOperator scheduled : operators) {
        // operator will be deployed after it has been undeployed, if still referenced by the container
        if (scheduled.getState() != PTOperator.State.PENDING_UNDEPLOY) {
          scheduled.setState(PTOperator.State.PENDING_DEPLOY);
        }
      }
    }

    // stop containers that are no longer used
    for (PTContainer released : releaseContainers) {
      String externalId = released.getExternalId();
      if (externalId == null) {
        // no external id, hence no agent can be looked up for it
        continue;
      }
      StreamingContainerAgent agent = containers.get(externalId);
      if (agent == null) {
        continue;
      }
      LOG.debug("Container marked for shutdown: {}", released);
      // container already removed from plan
      // TODO: monitor soft shutdown
      agent.requestShutDown(ShutdownType.ABORT);
    }
  } finally {
    this.deployChangeCnt++;
    this.deployChangeInProgress.set(false);
  }
}
use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.
the class StreamingContainerManager method assignContainer.
/**
 * Assign operators to allocated container resource.
 * <p>
 * Picks the first container in {@code pendingAllocation} that is in state
 * {@code NEW} or {@code KILLED} and whose resource request priority equals the
 * priority of the given resource, copies the resource details onto it, journals
 * the new container state and registers a new agent under the resource's container id.
 *
 * @param resource the allocated resource (container id, host, priority, memory, vcores, node http address)
 * @param bufferServerAddr buffer server address to record on the matched container
 * @return streaming container agent, or {@code null} if no pending container matches the resource
 */
public StreamingContainerAgent assignContainer(ContainerResource resource, InetSocketAddress bufferServerAddr) {
  // match container waiting for resource
  PTContainer matched = null;
  for (PTContainer candidate : pendingAllocation) {
    PTContainer.State state = candidate.getState();
    boolean awaitingResource = state == PTContainer.State.NEW || state == PTContainer.State.KILLED;
    if (awaitingResource && candidate.getResourceRequestPriority() == resource.priority) {
      matched = candidate;
      break;
    }
  }

  if (matched == null) {
    LOG.debug("No container matching allocated resource {}", resource);
    LOG.debug("Containers waiting for allocation {}", pendingAllocation);
    return null;
  }

  pendingAllocation.remove(matched);
  matched.setState(PTContainer.State.ALLOCATED);

  // the container carried an external id before: drop the agent registered under that id
  String previousId = matched.getExternalId();
  if (previousId != null) {
    LOG.info("Removing container agent {}", previousId);
    this.containers.remove(previousId);
  }

  // copy the details of the allocated resource onto the plan container
  matched.setExternalId(resource.containerId);
  matched.host = resource.host;
  matched.bufferServerAddress = bufferServerAddr;
  if (UserGroupInformation.isSecurityEnabled()) {
    matched.setBufferServerToken(AuthManager.generateToken());
  }
  matched.nodeHttpAddress = resource.nodeHttpAddress;
  matched.setAllocatedMemoryMB(resource.memoryMB);
  matched.setAllocatedVCores(resource.vCores);
  matched.setStartedTime(-1);
  matched.setFinishedTime(-1);
  writeJournal(matched.getSetContainerState());

  StreamingContainerAgent agent = new StreamingContainerAgent(matched, newStreamingContainerContext(matched), this);
  containers.put(resource.containerId, agent);
  LOG.debug("Assigned container {} priority {}", resource.containerId, resource.priority);
  return agent;
}
use of com.datatorrent.stram.plan.physical.PTContainer in project apex-core by apache.
the class StreamingContainerManager method fillLogicalOperatorInfo.
/**
 * Builds the aggregated info object for a logical operator from the stats of all
 * of its physical operators (partitions and unifiers).
 * <p>
 * Throughput, latency and checkpoint time are taken from partitions only; CPU,
 * heartbeat, window ids, state counts, container ids and hosts cover unifiers as well.
 *
 * @param operator the logical operator to summarize
 * @return the populated info object
 */
private LogicalOperatorInfo fillLogicalOperatorInfo(OperatorMeta operator) {
  LogicalOperatorInfo info = new LogicalOperatorInfo();
  info.name = operator.getName();
  info.className = operator.getOperator().getClass().getName();
  info.totalTuplesEmitted = operator.getStatus().totalTuplesEmitted;
  info.totalTuplesProcessed = operator.getStatus().totalTuplesProcessed;
  info.failureCount = operator.getStatus().failureCount;
  info.status = new HashMap<>();
  info.partitions = new TreeSet<>();
  info.unifiers = new TreeSet<>();
  info.containerIds = new TreeSet<>();
  info.hosts = new TreeSet<>();

  Collection<PTOperator> instances = getPhysicalPlan().getAllOperators(operator);
  NumberAggregate.LongAggregate checkpointTimes = new NumberAggregate.LongAggregate();

  for (PTOperator instance : instances) {
    OperatorStatus stats = instance.stats;

    if (instance.isUnifier()) {
      info.unifiers.add(instance.getId());
    } else {
      info.partitions.add(instance.getId());
      // exclude unifier, not sure if we should include it in the future
      info.tuplesEmittedPSMA += stats.tuplesEmittedPSMA.get();
      info.tuplesProcessedPSMA += stats.tuplesProcessedPSMA.get();
      // calculate maximum latency for all partitions
      long partitionLatency = calculateLatency(instance);
      if (partitionLatency > info.latencyMA) {
        info.latencyMA = partitionLatency;
      }
      checkpointTimes.addNumber(stats.checkpointTimeMA.getAvg());
    }

    info.cpuPercentageMA += stats.cpuNanosPMSMA.getAvg() / 10000;

    // keep the oldest heartbeat timestamp; 0 means nothing recorded yet
    if (stats.lastHeartbeat != null) {
      long generatedTms = stats.lastHeartbeat.getGeneratedTms();
      if (info.lastHeartbeat == 0 || info.lastHeartbeat > generatedTms) {
        info.lastHeartbeat = generatedTms;
      }
    }

    // keep the smallest current window id; 0 means nothing recorded yet
    long windowId = toWsWindowId(stats.currentWindowId.get());
    if (info.currentWindowId == 0 || info.currentWindowId > windowId) {
      info.currentWindowId = windowId;
    }

    // count the physical operators per state name
    String stateName = instance.getState().toString();
    MutableInt stateCount = info.status.get(stateName);
    if (stateCount == null) {
      stateCount = new MutableInt();
      info.status.put(stateName, stateCount);
    }
    stateCount.increment();

    // keep the smallest recovery window id; 0 means nothing recorded yet
    if (instance.getRecoveryCheckpoint() != null) {
      long recoveryWindowId = toWsWindowId(instance.getRecoveryCheckpoint().windowId);
      if (info.recoveryWindowId == 0 || info.recoveryWindowId > recoveryWindowId) {
        info.recoveryWindowId = recoveryWindowId;
      }
    }

    // only containers that already carry an external id are reported
    PTContainer hostContainer = instance.getContainer();
    String externalId = (hostContainer == null) ? null : hostContainer.getExternalId();
    if (externalId != null) {
      info.containerIds.add(externalId);
      info.hosts.add(hostContainer.host);
    }
  }

  // counters and metrics are only attached when a checkpoint time average is available
  if (!instances.isEmpty() && checkpointTimes.getAvg() != null) {
    info.checkpointTimeMA = checkpointTimes.getAvg().longValue();
    info.counters = latestLogicalCounters.get(operator.getName());
    info.autoMetrics = latestLogicalMetrics.get(operator.getName());
  }
  return info;
}
Aggregations