Use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.
Class StramLocalClusterTest, method testRecovery.
  @Test
  @SuppressWarnings("SleepWhileInLoop")
  public void testRecovery() throws Exception {
    AsyncFSStorageAgent agent = new AsyncFSStorageAgent(testMeta.getPath(), null);
    agent.setSyncCheckpoint(true);
    dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
    TestGeneratorInputOperator node1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
    // data will be added externally from test
    node1.setMaxTuples(0);
    GenericTestOperator node2 = dag.addOperator("o2", GenericTestOperator.class);
    dag.addStream("o1o2", node1.outport, node2.inport1);
    dag.validate();
    dag.getAttributes().put(LogicalPlan.CHECKPOINT_WINDOW_COUNT, 2);
    final ManualScheduledExecutorService wclock = new ManualScheduledExecutorService(1);
    MockComponentFactory mcf = new MockComponentFactory() {
      @Override
      public WindowGenerator setupWindowGenerator() {
        WindowGenerator wingen = StramTestSupport.setupWindowGenerator(wclock);
        wingen.setCheckpointCount(2, 0);
        return wingen;
      }
    };
    StramLocalCluster localCluster = new StramLocalCluster(dag, mcf);
    localCluster.setPerContainerBufferServer(true);
    // driven by test
    localCluster.setHeartbeatMonitoringEnabled(false);
    localCluster.runAsync();
    PTOperator ptNode1 = localCluster.findByLogicalNode(dag.getMeta(node1));
    PTOperator ptNode2 = localCluster.findByLogicalNode(dag.getMeta(node2));
    LocalStreamingContainer c0 = StramTestSupport.waitForActivation(localCluster, ptNode1);
    Map<Integer, Node<?>> nodeMap = c0.getNodes();
    Assert.assertEquals("number operators", 1, nodeMap.size());
    TestGeneratorInputOperator n1 = (TestGeneratorInputOperator) nodeMap.get(ptNode1.getId()).getOperator();
    Assert.assertNotNull(n1);
    LocalStreamingContainer c2 = StramTestSupport.waitForActivation(localCluster, ptNode2);
    Map<Integer, Node<?>> c2NodeMap = c2.getNodes();
    Assert.assertEquals("number operators downstream", 1, c2NodeMap.size());
    GenericTestOperator n2 = (GenericTestOperator) c2NodeMap.get(localCluster.findByLogicalNode(dag.getMeta(node2)).getId()).getOperator();
    Assert.assertNotNull(n2);
    // input data
    String tuple1 = "tuple1";
    n1.addTuple(tuple1);
    OperatorContext n1Context = c0.getNodeContext(ptNode1.getId());
    Assert.assertEquals("initial window id", -1, n1Context.getLastProcessedWindowId());
    // checkpoint window
    wclock.tick(1);
    wclock.tick(1);
    Assert.assertEquals("current window", 2, wclock.getCurrentTimeMillis());
    OperatorContext o2Context = c2.getNodeContext(ptNode2.getId());
    Assert.assertNotNull("context ", o2Context);
    StramTestSupport.waitForWindowComplete(o2Context, 1);
    Assert.assertEquals("o2 received ", tuple1, n2.inport1Tuple);
    wclock.tick(1);
    Assert.assertEquals("current window", 3, wclock.getCurrentTimeMillis());
    // checkpoint between window 1 and 2
    StramTestSupport.waitForWindowComplete(o2Context, 2);
    // propagate checkpoints to master
    c0.triggerHeartbeat();
    // wait for heartbeat cycle to complete
    c0.waitForHeartbeat(5000);
    Assert.assertEquals("checkpoint " + ptNode1, 1, ptNode1.getRecentCheckpoint().windowId);
    c2.triggerHeartbeat();
    //Thread.yield();
    // yield without using yield for heartbeat cycle
    Thread.sleep(1);
    c2.waitForHeartbeat(5000);
    Assert.assertEquals("checkpoint " + ptNode2, 1, ptNode2.getRecentCheckpoint().windowId);
    Assert.assertEquals("checkpoints " + ptNode1, Arrays.asList(new Checkpoint[] { new Checkpoint(1L, 0, 0) }), ptNode1.checkpoints);
    Assert.assertEquals("checkpoints " + ptNode2, Arrays.asList(new Checkpoint[] { new Checkpoint(1L, 0, 0) }), ptNode2.checkpoints);
    //
    // simulate container failure (operator o1)
    //
    localCluster.failContainer(c0);
    // replacement container starts empty
    // operators will deploy after downstream operator was removed
    LocalStreamingContainer c0Replaced = StramTestSupport.waitForActivation(localCluster, ptNode1);
    c0Replaced.triggerHeartbeat();
    // next heartbeat after setup
    c0Replaced.waitForHeartbeat(5000);
    Assert.assertNotSame("old container", c0, c0Replaced);
    Assert.assertNotSame("old container", c0.getContainerId(), c0Replaced.getContainerId());
    // verify change in downstream container
    LOG.debug("triggering c2 heartbeat processing");
    StreamingContainerAgent c2Agent = localCluster.getContainerAgent(c2);
    // wait for downstream re-deploy to complete
    long startTms = System.currentTimeMillis();
    while (c2Agent.hasPendingWork() && StramTestSupport.DEFAULT_TIMEOUT_MILLIS > System.currentTimeMillis() - startTms) {
      Thread.sleep(200);
      c2.triggerHeartbeat();
      LOG.debug("Waiting for {} to complete pending work.", c2.getContainerId());
    }
    Assert.assertEquals(c2.getContainerId() + " operators after redeploy " + c2.getNodes(), 1, c2.getNodes().size());
    // verify downstream operator re-deployed in existing container
    Assert.assertEquals("active " + ptNode2, c2, StramTestSupport.waitForActivation(localCluster, ptNode2));
    GenericTestOperator o2Recovered = (GenericTestOperator) c2NodeMap.get(localCluster.findByLogicalNode(dag.getMeta(node2)).getId()).getOperator();
    Assert.assertNotNull("redeployed " + ptNode2, o2Recovered);
    Assert.assertNotSame("new instance " + ptNode2, n2, o2Recovered);
    Assert.assertEquals("restored state " + ptNode2, tuple1, o2Recovered.inport1Tuple);
    TestGeneratorInputOperator o1Recovered = (TestGeneratorInputOperator) c0Replaced.getNodes().get(ptNode1.getId()).getOperator();
    Assert.assertNotNull(o1Recovered);
    OperatorContext o1RecoveredContext = c0Replaced.getNodeContext(ptNode1.getId());
    Assert.assertNotNull("active " + ptNode1, o1RecoveredContext);
    wclock.tick(1);
    Assert.assertEquals("current window", 4, wclock.getCurrentTimeMillis());
    // refresh context after operator re-deploy
    o2Context = c2.getNodeContext(ptNode2.getId());
    Assert.assertNotNull("active " + ptNode2, o2Context);
    StramTestSupport.waitForWindowComplete(o1RecoveredContext, 3);
    StramTestSupport.waitForWindowComplete(o2Context, 3);
    // checkpoint window
    wclock.tick(1);
    Assert.assertEquals("current window", 5, wclock.getCurrentTimeMillis());
    String tuple2 = "tuple2";
    o1Recovered.addTuple(tuple2);
    StramTestSupport.waitForWindowComplete(o1RecoveredContext, 4);
    StramTestSupport.waitForWindowComplete(o2Context, 4);
    // check data flow after recovery
    Assert.assertEquals("retrieved tuple (after recovery) " + ptNode2, tuple2, o2Recovered.inport1Tuple);
    // propagate checkpoints to master
    c0Replaced.triggerHeartbeat();
    c0Replaced.waitForHeartbeat(5000);
    c2.triggerHeartbeat();
    c2.waitForHeartbeat(5000);
    // purge checkpoints
    localCluster.dnmgr.monitorHeartbeat(false); // checkpoint purging
    Assert.assertEquals("checkpoints " + ptNode1, Arrays.asList(new Checkpoint[] { new Checkpoint(3L, 0, 0) }), ptNode1.checkpoints);
    Assert.assertEquals("checkpoints " + ptNode2, Arrays.asList(new Checkpoint[] { new Checkpoint(3L, 0, 0) }), ptNode2.checkpoints);
    localCluster.shutdown();
  }
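The redeploy wait loop near the end of the test (trigger a heartbeat, sleep, re-check hasPendingWork(), bounded by DEFAULT_TIMEOUT_MILLIS) is a recurring pattern in these tests. Below is a minimal standalone sketch of that pattern factored into a reusable helper; the helper name pollUntil and its functional-interface parameters are illustrative, not part of the apex-core API.

  // Hypothetical helper: poll until 'done' reports true or the timeout elapses,
  // running 'onEachPoll' (for example a heartbeat trigger) between polls.
  static void pollUntil(java.util.function.BooleanSupplier done, Runnable onEachPoll, long timeoutMillis)
      throws InterruptedException {
    long start = System.currentTimeMillis();
    while (!done.getAsBoolean() && System.currentTimeMillis() - start < timeoutMillis) {
      Thread.sleep(200);
      onEachPoll.run();
    }
  }

Under these assumptions, the loop above would read as pollUntil(() -> !c2Agent.hasPendingWork(), c2::triggerHeartbeat, StramTestSupport.DEFAULT_TIMEOUT_MILLIS).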
Use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.
Class StreamingContainerManagerTest, method shutdownOperator.
  private static void shutdownOperator(StreamingContainerManager scm, PTOperator p1, PTOperator p2) {
    assignContainer(scm, "c1");
    assignContainer(scm, "c2");
    ContainerHeartbeat c1hb = new ContainerHeartbeat();
    c1hb.setContainerStats(new ContainerStats(p1.getContainer().getExternalId()));
    scm.processHeartbeat(c1hb);
    ContainerHeartbeat c2hb = new ContainerHeartbeat();
    c2hb.setContainerStats(new ContainerStats(p2.getContainer().getExternalId()));
    scm.processHeartbeat(c2hb);
    OperatorHeartbeat o1hb = new OperatorHeartbeat();
    c1hb.getContainerStats().addNodeStats(o1hb);
    o1hb.setNodeId(p1.getId());
    o1hb.setState(DeployState.ACTIVE);
    OperatorStats o1stats = new OperatorStats();
    o1hb.getOperatorStatsContainer().add(o1stats);
    o1stats.checkpoint = new Checkpoint(2, 0, 0);
    o1stats.windowId = 3;
    scm.processHeartbeat(c1hb);
    Assert.assertEquals(PTOperator.State.ACTIVE, p1.getState());
    OperatorHeartbeat o2hb = new OperatorHeartbeat();
    c2hb.getContainerStats().addNodeStats(o2hb);
    o2hb.setNodeId(p2.getId());
    o2hb.setState(DeployState.ACTIVE);
    OperatorStats o2stats = new OperatorStats();
    o2hb.getOperatorStatsContainer().add(o2stats); // attach stats to the heartbeat, mirroring o1 above (apparently dropped in this excerpt)
    o2stats.checkpoint = new Checkpoint(2, 0, 0);
    o2stats.windowId = 3;
    scm.processHeartbeat(c2hb);
    Assert.assertEquals(PTOperator.State.ACTIVE, p1.getState());
    Assert.assertEquals(PTOperator.State.ACTIVE, p2.getState());
    o1hb.setState(DeployState.SHUTDOWN);
    o1stats.checkpoint = new Checkpoint(4, 0, 0);
    o1stats.windowId = 5;
    scm.processHeartbeat(c1hb);
    Assert.assertEquals(PTOperator.State.INACTIVE, p1.getState());
  }
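The two ACTIVE heartbeats above are assembled with the same call sequence: create an OperatorHeartbeat, attach it to the container stats, set node id and deploy state, then add an OperatorStats record carrying the checkpoint and window id. A hedged sketch of a helper that factors out this repetition follows; reportOperatorState is a hypothetical name, but every call it makes appears verbatim in the method above.

  // Hypothetical helper: attach one operator heartbeat with a single stats record
  // to a container heartbeat, reporting the given deploy state, checkpoint and window.
  private static OperatorHeartbeat reportOperatorState(ContainerHeartbeat chb, PTOperator oper,
      DeployState state, long checkpointWindowId, long windowId) {
    OperatorHeartbeat ohb = new OperatorHeartbeat();
    chb.getContainerStats().addNodeStats(ohb);
    ohb.setNodeId(oper.getId());
    ohb.setState(state);
    OperatorStats stats = new OperatorStats();
    ohb.getOperatorStatsContainer().add(stats);
    stats.checkpoint = new Checkpoint(checkpointWindowId, 0, 0);
    stats.windowId = windowId;
    return ohb;
  }

With such a helper, the o1 setup above would reduce to roughly reportOperatorState(c1hb, p1, DeployState.ACTIVE, 2, 3).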
Use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.
Class StreamingContainerManager, method addCheckpoint.
  @SuppressWarnings("StatementWithEmptyBody")
  void addCheckpoint(PTOperator node, Checkpoint checkpoint) {
    synchronized (node.checkpoints) {
      if (!node.checkpoints.isEmpty()) {
        Checkpoint lastCheckpoint = node.checkpoints.getLast();
        // skip unless checkpoint moves
        if (lastCheckpoint.windowId != checkpoint.windowId) {
          if (lastCheckpoint.windowId > checkpoint.windowId) {
            // list needs to have max windowId last
            LOG.warn("Out of sequence checkpoint {} last {} (operator {})", checkpoint, lastCheckpoint, node);
            ListIterator<Checkpoint> li = node.checkpoints.listIterator();
            while (li.hasNext() && li.next().windowId < checkpoint.windowId) {
              //continue;
            }
            if (li.previous().windowId != checkpoint.windowId) {
              li.add(checkpoint);
            }
          } else {
            node.checkpoints.add(checkpoint);
          }
        }
      } else {
        node.checkpoints.add(checkpoint);
      }
    }
  }
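The listIterator walk above keeps node.checkpoints ordered by windowId even when a checkpoint is reported out of sequence, and skips exact duplicates. A minimal standalone sketch of the same insertion rule, using a LinkedList of plain window ids instead of Checkpoint objects (names are illustrative), follows.

  import java.util.LinkedList;
  import java.util.ListIterator;

  // Sketch of the ordered-insert rule over plain window ids.
  static void addWindowId(LinkedList<Long> checkpoints, long windowId) {
    if (checkpoints.isEmpty()) {
      checkpoints.add(windowId);
      return;
    }
    long last = checkpoints.getLast();
    if (last == windowId) {
      return; // checkpoint did not move
    }
    if (last < windowId) {
      checkpoints.add(windowId); // common case: append the newest checkpoint
      return;
    }
    // out of sequence: scan forward to the first entry >= windowId
    ListIterator<Long> li = checkpoints.listIterator();
    while (li.hasNext() && li.next() < windowId) {
      // keep scanning
    }
    if (li.previous() != windowId) {
      li.add(windowId); // insert before the first larger entry, keeping the list sorted
    }
  }

For example, inserting 5 into [1, 3, 7] yields [1, 3, 5, 7], while inserting 3 again leaves the list unchanged.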
Use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.
Class Node, method reportStats.
  protected void reportStats(ContainerStats.OperatorStats stats, long windowId) {
    stats.outputPorts = new ArrayList<>();
    for (Entry<String, Sink<Object>> e : outputs.entrySet()) {
      ContainerStats.OperatorStats.PortStats portStats = new ContainerStats.OperatorStats.PortStats(e.getKey());
      portStats.tupleCount = e.getValue().getCount(true) - controlTupleCount;
      portStats.endWindowTimestamp = endWindowEmitTime;
      stats.outputPorts.add(portStats);
    }
    controlTupleCount = 0;
    long currentCpuTime = tmb.getCurrentThreadCpuTime();
    stats.cpuTimeUsed = currentCpuTime - lastSampleCpuTime;
    lastSampleCpuTime = currentCpuTime;
    if (checkpoint != null) {
      stats.checkpoint = checkpoint;
      stats.checkpointStats = checkpointStats;
      checkpointStats = null;
      checkpoint = null;
    } else {
      Pair<FutureTask<Stats.CheckpointStats>, CheckpointWindowInfo> pair = taskQueue.peek();
      if (pair != null && pair.getFirst().isDone()) {
        taskQueue.poll();
        try {
          CheckpointWindowInfo checkpointWindowInfo = pair.getSecond();
          stats.checkpointStats = pair.getFirst().get();
          stats.checkpoint = new Checkpoint(checkpointWindowInfo.windowId, checkpointWindowInfo.applicationWindowCount, checkpointWindowInfo.checkpointWindowCount);
          if (operator instanceof Operator.CheckpointListener) {
            ((Operator.CheckpointListener) operator).checkpointed(checkpointWindowInfo.windowId);
          }
        } catch (Exception ex) {
          throw Throwables.propagate(ex);
        }
      }
    }
    context.report(stats, windowId);
  }
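In the else branch the node reports an asynchronously produced checkpoint only once its FutureTask has finished: it peeks at the head of taskQueue and polls it off only when isDone() returns true, so an in-flight checkpoint simply stays queued until a later stats report. A self-contained sketch of that consume-only-when-done pattern, using plain JDK types and illustrative names, is below.

  import java.util.Queue;
  import java.util.concurrent.FutureTask;
  import java.util.function.Consumer;

  // Sketch: report at most one completed result per call; an unfinished task stays queued.
  static <T> void reportIfDone(Queue<FutureTask<T>> taskQueue, Consumer<T> report) throws Exception {
    FutureTask<T> head = taskQueue.peek();
    if (head != null && head.isDone()) {
      taskQueue.poll();            // remove only after we know it finished
      report.accept(head.get());   // get() will not block: the task is already done
    }
  }

This mirrors the at-most-one-checkpoint-per-report behavior of reportStats above.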
Use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.
Class CheckpointTest, method testUpdateCheckpointsProcessingTimeout.
  @Test
  public void testUpdateCheckpointsProcessingTimeout() {
    MockClock clock = new MockClock();
    dag.setAttribute(com.datatorrent.api.Context.OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
    GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
    dag.addStream("o1.outport1", o1.outport1, o2.inport1);
    StreamingContainerManager dnm = new StreamingContainerManager(dag);
    PhysicalPlan plan = dnm.getPhysicalPlan();
    // set all operators as active to enable recovery window id update
    for (PTOperator oper : plan.getAllOperators().values()) {
      oper.setState(PTOperator.State.ACTIVE);
    }
    List<PTOperator> partitions = plan.getOperators(dag.getMeta(o1));
    Assert.assertNotNull(partitions);
    Assert.assertEquals(1, partitions.size());
    PTOperator o1p1 = partitions.get(0);
    partitions = plan.getOperators(dag.getMeta(o2));
    Assert.assertNotNull(partitions);
    Assert.assertEquals(1, partitions.size());
    PTOperator o2p1 = partitions.get(0);
    UpdateCheckpointsContext ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertTrue("no blocked operators", ctx.blocked.isEmpty());
    o1p1.stats.statsRevs.checkout();
    o1p1.stats.currentWindowId.set(1);
    o1p1.stats.lastWindowIdChangeTms = 1;
    o1p1.stats.statsRevs.commit();
    clock.time = o1p1.stats.windowProcessingTimeoutMillis + 1;
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("o2 blocked", Sets.newHashSet(o2p1), ctx.blocked);
    // assign future activation window (state-less or at-most-once).
    Checkpoint cp2 = o2p1.getRecoveryCheckpoint();
    o2p1.setRecoveryCheckpoint(new Checkpoint(o1p1.getRecoveryCheckpoint().windowId + 1, cp2.applicationWindowCount, cp2.checkpointWindowCount));
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("no operators blocked (o2 activation window ahead)", Sets.newHashSet(), ctx.blocked);
    // reset to blocked
    o2p1.setRecoveryCheckpoint(cp2);
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("o2 blocked", Sets.newHashSet(o2p1), ctx.blocked);
    clock.time++;
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("operators blocked", Sets.newHashSet(o1p1, o2p1), ctx.blocked);
    o2p1.stats.statsRevs.checkout();
    o2p1.stats.currentWindowId.set(o1p1.stats.getCurrentWindowId());
    o2p1.stats.statsRevs.commit();
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("operators blocked", Sets.newHashSet(o1p1), ctx.blocked);
    clock.time--;
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("operators blocked", Sets.newHashSet(), ctx.blocked);
  }
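The test drives the blocked decision entirely through the mock clock: an operator is treated as stalled once the time since its window last advanced exceeds windowProcessingTimeoutMillis, which is why setting clock.time to the timeout plus one blocks o2 (whose window never moved) but not o1 (whose lastWindowIdChangeTms is 1), and one more tick blocks both. A hedged standalone sketch of that predicate follows; isStalled is an illustrative name, and the real updateRecoveryCheckpoints logic additionally handles operator state and the future-activation-window case exercised above.

  // Illustrative stall predicate consistent with the clock values used in the test;
  // assumes o2's lastWindowIdChangeTms is still at its initial value of 0.
  static boolean isStalled(long now, long lastWindowIdChangeTms, long windowProcessingTimeoutMillis) {
    return now - lastWindowIdChangeTms > windowProcessingTimeoutMillis;
  }

With the test's values: isStalled(timeout + 1, 0, timeout) is true (o2 blocked), isStalled(timeout + 1, 1, timeout) is false (o1 not yet blocked), and isStalled(timeout + 2, 1, timeout) is true after clock.time++.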