use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat in project apex-core by apache.
the class MockContainer method sendHeartbeat.
/**
 * Sends one heartbeat for this mock container to the manager.
 * <p>
 * For every entry in {@code stats} an operator heartbeat is created that carries the mocked
 * deploy state plus a single window stats record (checkpoint window and current window).
 * Port level statistics are not populated. The response is asserted to contain no deploy
 * request.
 */
public void sendHeartbeat() {
  final ContainerStats containerStats = new ContainerStats(sca.container.getExternalId());
  final ContainerHeartbeat heartbeat = new ContainerHeartbeat();
  heartbeat.setContainerStats(containerStats);

  for (Map.Entry<Integer, MockOperatorStats> mockEntry : this.stats.entrySet()) {
    final MockOperatorStats mock = mockEntry.getValue();

    final OperatorHeartbeat operatorHeartbeat = new OperatorHeartbeat();
    operatorHeartbeat.setNodeId(mockEntry.getKey());
    operatorHeartbeat.setState(mock.deployState);

    // a single stats record describing the mocked checkpoint and window position
    final OperatorStats windowRecord = new OperatorStats();
    windowRecord.checkpoint = new Checkpoint(mock.checkpointWindowId, 0, 0);
    windowRecord.windowId = mock.currentWindowId;

    operatorHeartbeat.windowStats = Lists.newArrayList(windowRecord);
    containerStats.operators.add(operatorHeartbeat);
  }

  final ContainerHeartbeatResponse response = sca.dnmgr.processHeartbeat(heartbeat);
  Assert.assertNull(response.deployRequest);
}
use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat in project apex-core by apache.
the class StreamingContainerManager method processOperatorDeployStatus.
/**
 * Reconciles the state the manager holds for an operator with the deploy state reported by
 * the container, queuing deploy/undeploy requests on the container agent as needed.
 *
 * @param oper operator whose local state is compared against the report
 * @param ohb  heartbeat of the operator, or {@code null} when the container did not report it
 * @param sca  agent of the container; receives the resulting deploy and undeploy requests
 */
private void processOperatorDeployStatus(final PTOperator oper, OperatorHeartbeat ohb, StreamingContainerAgent sca) {
  final OperatorHeartbeat.DeployState remoteState = (ohb == null) ? null : ohb.getState();
  LOG.debug("heartbeat {} {}/{} {}", oper, oper.getState(), remoteState, oper.getContainer().getExternalId());

  switch (oper.getState()) {
    case ACTIVE:
      // operator is expected to be active, compare with what the container reports
      if (remoteState == null) {
        sca.deployOpers.add(oper);
      } else if (remoteState == OperatorHeartbeat.DeployState.SHUTDOWN) {
        // Register the deactivation against a window id: the last reported window if the
        // heartbeat carries window stats, otherwise the operator's current window.
        // It is processed once the window is committed and all dependent operators
        // completed processing.
        long deactivationWindowId = oper.stats.currentWindowId.get();
        if (ohb.windowStats != null && !ohb.windowStats.isEmpty()) {
          deactivationWindowId = ohb.windowStats.get(ohb.windowStats.size() - 1).windowId;
        }
        LOG.debug("Operator {} deactivated at window {}", oper, deactivationWindowId);
        synchronized (this.shutdownOperators) {
          Set<PTOperator> opersAtWindow = this.shutdownOperators.get(deactivationWindowId);
          if (opersAtWindow == null) {
            opersAtWindow = new HashSet<>();
            this.shutdownOperators.put(deactivationWindowId, opersAtWindow);
          }
          opersAtWindow.add(oper);
        }
        oper.setState(State.INACTIVE);
        sca.undeployOpers.add(oper.getId());
        slowestUpstreamOp.remove(oper);
        // record operator stop event
        recordEventAsync(new StramEvent.StopOperatorEvent(oper.getName(), oper.getId(), oper.getContainer().getExternalId()));
      } else if (remoteState == OperatorHeartbeat.DeployState.FAILED) {
        processOperatorFailure(oper);
        sca.undeployOpers.add(oper.getId());
        slowestUpstreamOp.remove(oper);
        recordEventAsync(new StramEvent.StopOperatorEvent(oper.getName(), oper.getId(), oper.getContainer().getExternalId()));
      }
      // any other remote state (including ACTIVE) needs no action
      break;

    case PENDING_UNDEPLOY:
      if (remoteState != null) {
        // operator is still deployed in the container, request undeploy
        sca.undeployOpers.add(oper.getId());
        slowestUpstreamOp.remove(oper);
      } else {
        // operator is no longer deployed in the container
        recordEventAsync(new StramEvent.StopOperatorEvent(oper.getName(), oper.getId(), oper.getContainer().getExternalId()));
        oper.setState(State.PENDING_DEPLOY);
        sca.deployOpers.add(oper);
      }
      break;

    case PENDING_DEPLOY:
      if (remoteState != null) {
        // operator was deployed in the container
        final PTContainer hostContainer = oper.getContainer();
        LOG.debug("{} marking deployed: {} remote status {}", hostContainer.getExternalId(), oper, remoteState);
        oper.setState(PTOperator.State.ACTIVE);
        // reset on redeploy
        oper.stats.lastHeartbeat = null;
        oper.stats.lastWindowIdChangeTms = clock.getTime();
        recordEventAsync(new StramEvent.StartOperatorEvent(oper.getName(), oper.getId(), hostContainer.getExternalId()));
      } else {
        // operator still has to be deployed
        sca.deployOpers.add(oper);
      }
      break;

    default:
      if (remoteState == null) {
        break;
      }
      // operator was removed and needs to be undeployed from the container
      sca.undeployOpers.add(oper.getId());
      slowestUpstreamOp.remove(oper);
      recordEventAsync(new StramEvent.StopOperatorEvent(oper.getName(), oper.getId(), oper.getContainer().getExternalId()));
  }
}
use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat in project apex-core by apache.
the class TupleRecorderCollection method collected.
/**
 * Stamps the recording ids of the tuple recorders held by this collection onto the
 * container stats carried by the event.
 * <p>
 * All port level recording ids are cleared first. Then, per operator: if a recorder is
 * registered for the operator as a whole (port name {@code null}), its id becomes the
 * recording id of every window stats record; otherwise that id is {@code null} and the ids
 * of recorders registered for individual ports are written to the matching port stats.
 *
 * @param cse event carrying the container stats to annotate
 */
@Handler
public void collected(ContainerStatsEvent cse) {
  final ContainerStats stats = cse.getContainerStats();

  // pass 1: wipe recording ids left on any port stats
  for (OperatorHeartbeat node : stats.operators) {
    for (OperatorStats os : node.windowStats) {
      resetPortRecordingIds(os.inputPorts);
      resetPortRecordingIds(os.outputPorts);
    }
  }

  // pass 2: apply operator level or port level recording ids
  for (OperatorHeartbeat node : stats.operators) {
    final TupleRecorder operatorRecorder = get(new OperatorIdPortNamePair(node.nodeId, null));
    String recordingId = null;
    if (operatorRecorder != null) {
      recordingId = operatorRecorder.getId();
    } else {
      for (Map.Entry<OperatorIdPortNamePair, TupleRecorder> entry : this.entrySet()) {
        if (entry.getKey().operatorId != node.nodeId) {
          continue;
        }
        for (OperatorStats os : node.windowStats) {
          assignPortRecordingId(os.inputPorts, entry.getKey(), entry.getValue());
          assignPortRecordingId(os.outputPorts, entry.getKey(), entry.getValue());
        }
      }
    }
    for (OperatorStats os : node.windowStats) {
      os.recordingId = recordingId;
    }
  }
}

/** Sets the recording id of every port stats element to {@code null}; a {@code null} list is skipped. */
private void resetPortRecordingIds(Iterable<PortStats> ports) {
  if (ports == null) {
    return;
  }
  for (PortStats ps : ports) {
    ps.recordingId = null;
  }
}

/** Writes the recorder's id to the first port stats element whose id equals the key's port name. */
private void assignPortRecordingId(Iterable<PortStats> ports, OperatorIdPortNamePair key, TupleRecorder recorder) {
  if (ports == null) {
    return;
  }
  for (PortStats ps : ports) {
    if (ps.id.equals(key.portName)) {
      ps.recordingId = recorder.getId();
      return;
    }
  }
}
use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat in project apex-core by apache.
the class StreamingContainerManagerTest method shutdownOperator.
/**
 * Drives two operators through deployment and then reports a shutdown of the first one.
 * <p>
 * Both containers send an initial, empty heartbeat. Each operator then reports
 * {@code ACTIVE} with one window stats record (checkpoint 2, window 3) and is asserted to be
 * {@code ACTIVE} in the plan. Finally the first operator reports {@code SHUTDOWN}
 * (checkpoint 4, window 5) and is asserted to be {@code INACTIVE}.
 *
 * @param scm manager under test
 * @param p1  operator that is shut down
 * @param p2  operator that stays active
 */
private static void shutdownOperator(StreamingContainerManager scm, PTOperator p1, PTOperator p2) {
  assignContainer(scm, "c1");
  assignContainer(scm, "c2");

  // initial heartbeats without any operator stats
  ContainerHeartbeat c1hb = new ContainerHeartbeat();
  c1hb.setContainerStats(new ContainerStats(p1.getContainer().getExternalId()));
  scm.processHeartbeat(c1hb);

  ContainerHeartbeat c2hb = new ContainerHeartbeat();
  c2hb.setContainerStats(new ContainerStats(p2.getContainer().getExternalId()));
  scm.processHeartbeat(c2hb);

  // first operator reports ACTIVE
  OperatorHeartbeat o1hb = new OperatorHeartbeat();
  c1hb.getContainerStats().addNodeStats(o1hb);
  o1hb.setNodeId(p1.getId());
  o1hb.setState(DeployState.ACTIVE);
  OperatorStats o1stats = new OperatorStats();
  o1hb.getOperatorStatsContainer().add(o1stats);
  o1stats.checkpoint = new Checkpoint(2, 0, 0);
  o1stats.windowId = 3;
  scm.processHeartbeat(c1hb);
  Assert.assertEquals(PTOperator.State.ACTIVE, p1.getState());

  // second operator reports ACTIVE
  OperatorHeartbeat o2hb = new OperatorHeartbeat();
  c2hb.getContainerStats().addNodeStats(o2hb);
  o2hb.setNodeId(p2.getId());
  o2hb.setState(DeployState.ACTIVE);
  OperatorStats o2stats = new OperatorStats();
  // attach the stats to the heartbeat (as done for o1); without this the populated
  // record below is never sent to the manager
  o2hb.getOperatorStatsContainer().add(o2stats);
  o2stats.checkpoint = new Checkpoint(2, 0, 0);
  o2stats.windowId = 3;
  scm.processHeartbeat(c2hb);
  Assert.assertEquals(PTOperator.State.ACTIVE, p1.getState());
  Assert.assertEquals(PTOperator.State.ACTIVE, p2.getState());

  // first operator reports SHUTDOWN at a later window
  o1hb.setState(DeployState.SHUTDOWN);
  o1stats.checkpoint = new Checkpoint(4, 0, 0);
  o1stats.windowId = 5;
  scm.processHeartbeat(c1hb);
  Assert.assertEquals(PTOperator.State.INACTIVE, p1.getState());
}
use of com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat in project apex-core by apache.
the class StreamingContainerManager method processHeartbeat.
/**
 * Process the heartbeat from each container.
 * Called by the RPC thread for each container. (i.e. called by multiple threads)
 * <p>
 * Steps, in order:
 * <ol>
 * <li>Reject heartbeats of unknown or killed containers with an {@code ABORT} response.</li>
 * <li>On the first heartbeat of an allocated container, mark it active and asynchronously
 * append container and operator info to the respective files.</li>
 * <li>Copy container level figures (stack trace, memory, GC) to the container agent.</li>
 * <li>For every reported operator: handle request responses, synchronize deploy state,
 * and fold the window stats into the operator status and end window stats.</li>
 * <li>Synchronize deploy state of operators assigned to the container but not reported.</li>
 * <li>Assemble the response (shutdown request, pending operator requests, committed window).</li>
 * </ol>
 *
 * @param heartbeat heartbeat received from a container
 * @return heartbeat response
 */
@SuppressWarnings("StatementWithEmptyBody")
public ContainerHeartbeatResponse processHeartbeat(ContainerHeartbeat heartbeat) {
  long currentTimeMillis = clock.getTime();
  final StreamingContainerAgent sca = this.containers.get(heartbeat.getContainerId());
  if (sca == null || sca.container.getState() == PTContainer.State.KILLED) {
    // could be orphaned container that was replaced and needs to terminate
    LOG.error("Unknown container {}", heartbeat.getContainerId());
    ContainerHeartbeatResponse response = new ContainerHeartbeatResponse();
    response.shutdown = ShutdownType.ABORT;
    return response;
  }

  if (sca.container.getState() == PTContainer.State.ALLOCATED) {
    // capture dynamically assigned address from container
    if (sca.container.bufferServerAddress == null && heartbeat.bufferServerHost != null) {
      sca.container.bufferServerAddress = InetSocketAddress.createUnresolved(heartbeat.bufferServerHost, heartbeat.bufferServerPort);
      LOG.info("Container {} buffer server: {}", sca.container.getExternalId(), sca.container.bufferServerAddress);
    }
    final long containerStartTime = System.currentTimeMillis();
    sca.container.setState(PTContainer.State.ACTIVE);
    sca.container.setStartedTime(containerStartTime);
    sca.container.setFinishedTime(-1);
    sca.jvmName = heartbeat.jvmName;
    // file writes are handed to the pool so that they do not run on the calling thread
    poolExecutor.submit(new Runnable() {
      @Override
      public void run() {
        try {
          containerFile.append(sca.getContainerInfo());
        } catch (IOException ex) {
          // keep the cause in the log, consistent with the operator file handling below
          LOG.warn("Cannot write to container file", ex);
        }
        for (PTOperator ptOp : sca.container.getOperators()) {
          try {
            JSONObject operatorInfo = new JSONObject();
            operatorInfo.put("name", ptOp.getName());
            operatorInfo.put("id", ptOp.getId());
            operatorInfo.put("container", sca.container.getExternalId());
            operatorInfo.put("startTime", containerStartTime);
            operatorFile.append(operatorInfo);
          } catch (IOException | JSONException ex) {
            LOG.warn("Cannot write to operator file: ", ex);
          }
        }
      }
    });
  }

  sca.containerStackTrace = heartbeat.stackTrace;
  if (heartbeat.restartRequested) {
    LOG.error("Container {} restart request", sca.container.getExternalId());
    containerStopRequests.put(sca.container.getExternalId(), sca.container.getExternalId());
  }
  sca.memoryMBFree = heartbeat.memoryMBFree;
  sca.gcCollectionCount = heartbeat.gcCollectionCount;
  sca.gcCollectionTime = heartbeat.gcCollectionTime;

  // deploy/undeploy requests are recomputed from scratch on every heartbeat
  sca.undeployOpers.clear();
  sca.deployOpers.clear();
  if (!this.deployChangeInProgress.get()) {
    sca.deployCnt = this.deployChangeCnt;
  }

  Set<Integer> reportedOperators = Sets.newHashSetWithExpectedSize(sca.container.getOperators().size());
  for (OperatorHeartbeat shb : heartbeat.getContainerStats().operators) {
    long maxEndWindowTimestamp = 0;
    reportedOperators.add(shb.nodeId);
    PTOperator oper = this.plan.getAllOperators().get(shb.getNodeId());
    if (oper == null) {
      LOG.info("Heartbeat for unknown operator {} (container {})", shb.getNodeId(), heartbeat.getContainerId());
      sca.undeployOpers.add(shb.nodeId);
      continue;
    }

    if (shb.requestResponse != null) {
      for (StatsListener.OperatorResponse obj : shb.requestResponse) {
        if (obj instanceof OperatorResponse) {
          // This is to identify platform requests
          commandResponse.put((Long) obj.getResponseId(), obj.getResponse());
          LOG.debug(" Got back the response {} for the request {}", obj, obj.getResponseId());
        } else {
          // This is to identify user requests
          oper.stats.responses.add(obj);
        }
      }
    }

    if (!(oper.getState() == PTOperator.State.ACTIVE && shb.getState() == OperatorHeartbeat.DeployState.ACTIVE)) {
      // deploy state may require synchronization
      processOperatorDeployStatus(oper, shb, sca);
    }

    oper.stats.lastHeartbeat = shb;
    List<ContainerStats.OperatorStats> statsList = shb.getOperatorStatsContainer();
    if (!statsList.isEmpty()) {
      long tuplesProcessed = 0;
      long tuplesEmitted = 0;
      long totalCpuTimeUsed = 0;
      int statCount = 0;
      long maxDequeueTimestamp = -1;
      oper.stats.recordingId = null;
      final OperatorStatus status = oper.stats;
      status.statsRevs.checkout();

      // recording ids are re-derived from the stats of this heartbeat
      for (Map.Entry<String, PortStatus> entry : status.inputPortStatusList.entrySet()) {
        entry.getValue().recordingId = null;
      }
      for (Map.Entry<String, PortStatus> entry : status.outputPortStatusList.entrySet()) {
        entry.getValue().recordingId = null;
      }

      for (ContainerStats.OperatorStats stats : statsList) {
        if (stats == null) {
          LOG.warn("Operator {} statistics list contains null element", shb.getNodeId());
          continue;
        }

        /* report checkpoint-ed WindowId status of the operator */
        if (stats.checkpoint instanceof Checkpoint) {
          if (oper.getRecentCheckpoint() == null || oper.getRecentCheckpoint().windowId < stats.checkpoint.getWindowId()) {
            addCheckpoint(oper, (Checkpoint) stats.checkpoint);
            if (stats.checkpointStats != null) {
              status.checkpointStats = stats.checkpointStats;
              status.checkpointTimeMA.add(stats.checkpointStats.checkpointTime);
            }
            oper.failureCount = 0;
          }
        }

        oper.stats.recordingId = stats.recordingId;

        /* report all the other stuff */
        // calculate the stats related to end window
        // end window stats for a particular window id for a particular node
        EndWindowStats endWindowStats = new EndWindowStats();
        Collection<ContainerStats.OperatorStats.PortStats> ports = stats.inputPorts;
        if (ports != null) {
          Set<String> currentInputPortSet = Sets.newHashSetWithExpectedSize(ports.size());
          for (ContainerStats.OperatorStats.PortStats s : ports) {
            currentInputPortSet.add(s.id);
            PortStatus ps = status.inputPortStatusList.get(s.id);
            if (ps == null) {
              ps = status.new PortStatus();
              ps.portName = s.id;
              status.inputPortStatusList.put(s.id, ps);
            }
            ps.totalTuples += s.tupleCount;
            ps.recordingId = s.recordingId;
            tuplesProcessed += s.tupleCount;
            endWindowStats.dequeueTimestamps.put(s.id, s.endWindowTimestamp);

            // elapsed time since the previous end window of this port; falls back to the
            // last stats timestamp when the port has not been seen before
            Pair<Integer, String> operatorPortName = new Pair<>(oper.getId(), s.id);
            Long lastEndWindowTimestamp = operatorPortLastEndWindowTimestamps.get(operatorPortName);
            if (lastEndWindowTimestamp == null) {
              lastEndWindowTimestamp = lastStatsTimestamp;
            }
            long portElapsedMillis = Math.max(s.endWindowTimestamp - lastEndWindowTimestamp, 0);
            ps.tuplesPMSMA.add(s.tupleCount, portElapsedMillis);
            ps.bufferServerBytesPMSMA.add(s.bufferServerBytes, portElapsedMillis);
            ps.queueSizeMA.add(s.queueSize);
            operatorPortLastEndWindowTimestamps.put(operatorPortName, s.endWindowTimestamp);
            if (maxEndWindowTimestamp < s.endWindowTimestamp) {
              maxEndWindowTimestamp = s.endWindowTimestamp;
            }
            if (s.endWindowTimestamp > maxDequeueTimestamp) {
              maxDequeueTimestamp = s.endWindowTimestamp;
            }
          }
          // need to remove dead ports, for unifiers
          Iterator<Map.Entry<String, PortStatus>> it = status.inputPortStatusList.entrySet().iterator();
          while (it.hasNext()) {
            Map.Entry<String, PortStatus> entry = it.next();
            if (!currentInputPortSet.contains(entry.getKey())) {
              it.remove();
            }
          }
        }

        ports = stats.outputPorts;
        if (ports != null) {
          Set<String> currentOutputPortSet = Sets.newHashSetWithExpectedSize(ports.size());
          for (ContainerStats.OperatorStats.PortStats s : ports) {
            currentOutputPortSet.add(s.id);
            PortStatus ps = status.outputPortStatusList.get(s.id);
            if (ps == null) {
              ps = status.new PortStatus();
              ps.portName = s.id;
              status.outputPortStatusList.put(s.id, ps);
            }
            ps.totalTuples += s.tupleCount;
            ps.recordingId = s.recordingId;
            tuplesEmitted += s.tupleCount;

            Pair<Integer, String> operatorPortName = new Pair<>(oper.getId(), s.id);
            Long lastEndWindowTimestamp = operatorPortLastEndWindowTimestamps.get(operatorPortName);
            if (lastEndWindowTimestamp == null) {
              lastEndWindowTimestamp = lastStatsTimestamp;
            }
            long portElapsedMillis = Math.max(s.endWindowTimestamp - lastEndWindowTimestamp, 0);
            ps.tuplesPMSMA.add(s.tupleCount, portElapsedMillis);
            ps.bufferServerBytesPMSMA.add(s.bufferServerBytes, portElapsedMillis);
            operatorPortLastEndWindowTimestamps.put(operatorPortName, s.endWindowTimestamp);
            if (maxEndWindowTimestamp < s.endWindowTimestamp) {
              maxEndWindowTimestamp = s.endWindowTimestamp;
            }
          }
          if (ports.size() > 0) {
            // the end window timestamp of the first output port is used as emit timestamp
            endWindowStats.emitTimestamp = ports.iterator().next().endWindowTimestamp;
          }
          // need to remove dead ports, for unifiers
          Iterator<Map.Entry<String, PortStatus>> it = status.outputPortStatusList.entrySet().iterator();
          while (it.hasNext()) {
            Map.Entry<String, PortStatus> entry = it.next();
            if (!currentOutputPortSet.contains(entry.getKey())) {
              it.remove();
            }
          }
        }

        // no emit timestamp from output ports: use the maximum dequeue timestamp instead
        // (we don't know the latency for output operators because they don't emit tuples)
        if (endWindowStats.emitTimestamp < 0) {
          endWindowStats.emitTimestamp = maxDequeueTimestamp;
        }

        if (status.currentWindowId.get() != stats.windowId) {
          status.lastWindowIdChangeTms = currentTimeMillis;
          status.currentWindowId.set(stats.windowId);
        }
        totalCpuTimeUsed += stats.cpuTimeUsed;
        statCount++;

        if (oper.getOperatorMeta().getValue(OperatorContext.COUNTERS_AGGREGATOR) != null) {
          endWindowStats.counters = stats.counters;
        }
        if (oper.getOperatorMeta().getMetricAggregatorMeta() != null && oper.getOperatorMeta().getMetricAggregatorMeta().getAggregator() != null) {
          endWindowStats.metrics = stats.metrics;
        }

        if (stats.windowId > currentEndWindowStatsWindowId) {
          Map<Integer, EndWindowStats> endWindowStatsMap = endWindowStatsOperatorMap.get(stats.windowId);
          if (endWindowStatsMap == null) {
            // another thread may register the window concurrently; keep whichever map won
            endWindowStatsMap = new ConcurrentSkipListMap<>();
            Map<Integer, EndWindowStats> endWindowStatsMapPrevious = endWindowStatsOperatorMap.putIfAbsent(stats.windowId, endWindowStatsMap);
            if (endWindowStatsMapPrevious != null) {
              endWindowStatsMap = endWindowStatsMapPrevious;
            }
          }
          endWindowStatsMap.put(shb.getNodeId(), endWindowStats);

          // the window is complete once every operator of the plan has reported for it
          Set<Integer> allCurrentOperators = plan.getAllOperators().keySet();
          int numOperators = plan.getAllOperators().size();
          if (allCurrentOperators.containsAll(endWindowStatsMap.keySet()) && endWindowStatsMap.size() == numOperators) {
            completeEndWindowStatsWindowId = stats.windowId;
          }
        }
      }

      status.totalTuplesProcessed.add(tuplesProcessed);
      status.totalTuplesEmitted.add(tuplesEmitted);
      OperatorMeta logicalOperator = oper.getOperatorMeta();
      LogicalOperatorStatus logicalStatus = logicalOperator.getStatus();
      if (!oper.isUnifier()) {
        logicalStatus.totalTuplesProcessed += tuplesProcessed;
        logicalStatus.totalTuplesEmitted += tuplesEmitted;
      }

      // Single lookup instead of containsKey() followed by get(): this method runs on
      // multiple threads, and a removal between the two calls would unbox null.
      Long previousMaxEndWindowTimestamp = operatorLastEndWindowTimestamps.get(oper.getId());
      long lastMaxEndWindowTimestamp = previousMaxEndWindowTimestamp != null ? previousMaxEndWindowTimestamp : lastStatsTimestamp;
      if (maxEndWindowTimestamp >= lastMaxEndWindowTimestamp) {
        double tuplesProcessedPMSMA = 0.0;
        double tuplesEmittedPMSMA = 0.0;
        if (statCount != 0) {
          status.cpuNanosPMSMA.add(totalCpuTimeUsed, maxEndWindowTimestamp - lastMaxEndWindowTimestamp);
        }
        for (PortStatus ps : status.inputPortStatusList.values()) {
          tuplesProcessedPMSMA += ps.tuplesPMSMA.getAvg();
        }
        for (PortStatus ps : status.outputPortStatusList.values()) {
          tuplesEmittedPMSMA += ps.tuplesPMSMA.getAvg();
        }
        // per millisecond averages scaled by 1000
        status.tuplesProcessedPSMA.set(Math.round(tuplesProcessedPMSMA * 1000));
        status.tuplesEmittedPSMA.set(Math.round(tuplesEmittedPMSMA * 1000));
      } else {
        // timestamp is lower than the previously recorded one: averages are left untouched
      }
      operatorLastEndWindowTimestamps.put(oper.getId(), maxEndWindowTimestamp);
      status.listenerStats.add(statsList);
      this.reportStats.put(oper, oper);
      status.statsRevs.commit();
    }

    if (lastStatsTimestamp < maxEndWindowTimestamp) {
      lastStatsTimestamp = maxEndWindowTimestamp;
    }
  }

  sca.lastHeartbeatMillis = currentTimeMillis;

  // operators assigned to the container that were not part of this heartbeat
  for (PTOperator oper : sca.container.getOperators()) {
    if (!reportedOperators.contains(oper.getId())) {
      processOperatorDeployStatus(oper, null, sca);
    }
  }

  ContainerHeartbeatResponse rsp = getHeartbeatResponse(sca);
  if (heartbeat.getContainerStats().operators.isEmpty() && isApplicationIdle()) {
    LOG.info("requesting idle shutdown for container {}", heartbeat.getContainerId());
    rsp.shutdown = ShutdownType.ABORT;
  } else {
    if (sca.isShutdownRequested()) {
      LOG.info("requesting shutdown for container {}", heartbeat.getContainerId());
      rsp.shutdown = sca.shutdownRequest;
    }
  }

  // drain the queued operator requests into the response
  List<StramToNodeRequest> requests = rsp.nodeRequests != null ? rsp.nodeRequests : new ArrayList<StramToNodeRequest>();
  ConcurrentLinkedQueue<StramToNodeRequest> operatorRequests = sca.getOperatorRequests();
  while (true) {
    StramToNodeRequest r = operatorRequests.poll();
    if (r == null) {
      break;
    }
    requests.add(r);
  }
  rsp.nodeRequests = requests;
  rsp.committedWindowId = committedWindowId;
  // the stack trace request is one-shot: hand it out and reset the flag
  rsp.stackTraceRequired = sca.stackTraceRequested;
  sca.stackTraceRequested = false;
  apexPluginDispatcher.dispatch(new DAGExecutionEvent.HeartbeatExecutionEvent(heartbeat));
  return rsp;
}
Aggregations