Search in sources :

Example 11 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class StramLocalClusterTest method testRecovery.

/**
 * Exercises checkpointing and recovery of a two-operator DAG (o1 -> o2) in a
 * {@link StramLocalCluster} whose window clock and heartbeats are driven manually by the test.
 *
 * <p>Phases, in order:
 * <ol>
 *   <li>Build the DAG with a synchronous {@link AsyncFSStorageAgent} and a checkpoint window count of 2.</li>
 *   <li>Start the cluster asynchronously with heartbeat monitoring disabled and wait for both operators to activate.</li>
 *   <li>Feed {@code tuple1}, advance the manual clock, and assert that both operators report a checkpoint for window 1.</li>
 *   <li>Fail the container hosting o1, wait for its replacement, and wait for the o2 container to finish pending work.</li>
 *   <li>Assert that o2 is a new instance with its state restored, feed {@code tuple2}, and assert it reaches o2.</li>
 *   <li>Propagate checkpoints, run {@code monitorHeartbeat}, and assert only the window 3 checkpoint remains.</li>
 * </ol>
 *
 * @throws Exception if any cluster, wait or sleep call in the test fails
 */
@Test
@SuppressWarnings("SleepWhileInLoop")
public void testRecovery() throws Exception {
    AsyncFSStorageAgent agent = new AsyncFSStorageAgent(testMeta.getPath(), null);
    agent.setSyncCheckpoint(true);
    dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
    TestGeneratorInputOperator node1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
    // data will be added externally from test
    node1.setMaxTuples(0);
    GenericTestOperator node2 = dag.addOperator("o2", GenericTestOperator.class);
    dag.addStream("o1o2", node1.outport, node2.inport1);
    dag.validate();
    dag.getAttributes().put(LogicalPlan.CHECKPOINT_WINDOW_COUNT, 2);
    // manually advanced clock: windows only progress when the test calls wclock.tick()
    final ManualScheduledExecutorService wclock = new ManualScheduledExecutorService(1);
    MockComponentFactory mcf = new MockComponentFactory() {

        @Override
        public WindowGenerator setupWindowGenerator() {
            WindowGenerator wingen = StramTestSupport.setupWindowGenerator(wclock);
            wingen.setCheckpointCount(2, 0);
            return wingen;
        }
    };
    StramLocalCluster localCluster = new StramLocalCluster(dag, mcf);
    localCluster.setPerContainerBufferServer(true);
    // driven by test
    localCluster.setHeartbeatMonitoringEnabled(false);
    localCluster.runAsync();
    PTOperator ptNode1 = localCluster.findByLogicalNode(dag.getMeta(node1));
    PTOperator ptNode2 = localCluster.findByLogicalNode(dag.getMeta(node2));
    // each operator is expected to be the only one in its container
    LocalStreamingContainer c0 = StramTestSupport.waitForActivation(localCluster, ptNode1);
    Map<Integer, Node<?>> nodeMap = c0.getNodes();
    Assert.assertEquals("number operators", 1, nodeMap.size());
    TestGeneratorInputOperator n1 = (TestGeneratorInputOperator) nodeMap.get(ptNode1.getId()).getOperator();
    Assert.assertNotNull(n1);
    LocalStreamingContainer c2 = StramTestSupport.waitForActivation(localCluster, ptNode2);
    Map<Integer, Node<?>> c2NodeMap = c2.getNodes();
    Assert.assertEquals("number operators downstream", 1, c2NodeMap.size());
    GenericTestOperator n2 = (GenericTestOperator) c2NodeMap.get(localCluster.findByLogicalNode(dag.getMeta(node2)).getId()).getOperator();
    Assert.assertNotNull(n2);
    // input data
    String tuple1 = "tuple1";
    n1.addTuple(tuple1);
    OperatorContext n1Context = c0.getNodeContext(ptNode1.getId());
    Assert.assertEquals("initial window id", -1, n1Context.getLastProcessedWindowId());
    // checkpoint window
    wclock.tick(1);
    wclock.tick(1);
    Assert.assertEquals("current window", 2, wclock.getCurrentTimeMillis());
    OperatorContext o2Context = c2.getNodeContext(ptNode2.getId());
    Assert.assertNotNull("context ", o2Context);
    StramTestSupport.waitForWindowComplete(o2Context, 1);
    Assert.assertEquals("o2 received ", tuple1, n2.inport1Tuple);
    wclock.tick(1);
    Assert.assertEquals("current window", 3, wclock.getCurrentTimeMillis());
    // checkpoint between window 1 and 2
    StramTestSupport.waitForWindowComplete(o2Context, 2);
    // propagate checkpoints to master
    c0.triggerHeartbeat();
    // wait for heartbeat cycle to complete
    c0.waitForHeartbeat(5000);
    Assert.assertEquals("checkpoint " + ptNode1, 1, ptNode1.getRecentCheckpoint().windowId);
    c2.triggerHeartbeat();
    //Thread.yield();
    // yield without using yield for heartbeat cycle
    Thread.sleep(1);
    c2.waitForHeartbeat(5000);
    Assert.assertEquals("checkpoint " + ptNode2, 1, ptNode2.getRecentCheckpoint().windowId);
    // both operators should hold exactly one checkpoint, for window 1
    Assert.assertEquals("checkpoints " + ptNode1, Arrays.asList(new Checkpoint[] { new Checkpoint(1L, 0, 0) }), ptNode1.checkpoints);
    Assert.assertEquals("checkpoints " + ptNode2, Arrays.asList(new Checkpoint[] { new Checkpoint(1L, 0, 0) }), ptNode2.checkpoints);
    //
    // simulate container failure (operator o1)
    //
    localCluster.failContainer(c0);
    // replacement container starts empty
    // operators will deploy after downstream operator was removed
    LocalStreamingContainer c0Replaced = StramTestSupport.waitForActivation(localCluster, ptNode1);
    c0Replaced.triggerHeartbeat();
    // next heartbeat after setup
    c0Replaced.waitForHeartbeat(5000);
    Assert.assertNotSame("old container", c0, c0Replaced);
    Assert.assertNotSame("old container", c0.getContainerId(), c0Replaced.getContainerId());
    // verify change in downstream container
    LOG.debug("triggering c2 heartbeat processing");
    StreamingContainerAgent c2Agent = localCluster.getContainerAgent(c2);
    // wait for downstream re-deploy to complete
    // (poll with heartbeats every 200 ms, giving up after DEFAULT_TIMEOUT_MILLIS)
    long startTms = System.currentTimeMillis();
    while (c2Agent.hasPendingWork() && StramTestSupport.DEFAULT_TIMEOUT_MILLIS > System.currentTimeMillis() - startTms) {
        Thread.sleep(200);
        c2.triggerHeartbeat();
        LOG.debug("Waiting for {} to complete pending work.", c2.getContainerId());
    }
    Assert.assertEquals(c2.getContainerId() + " operators after redeploy " + c2.getNodes(), 1, c2.getNodes().size());
    // verify downstream operator re-deployed in existing container
    Assert.assertEquals("active " + ptNode2, c2, StramTestSupport.waitForActivation(localCluster, ptNode2));
    // NOTE(review): c2NodeMap was obtained before the redeploy; this lookup presumably relies on
    // c2.getNodes() returning a live view of the container's nodes - confirm.
    GenericTestOperator o2Recovered = (GenericTestOperator) c2NodeMap.get(localCluster.findByLogicalNode(dag.getMeta(node2)).getId()).getOperator();
    Assert.assertNotNull("redeployed " + ptNode2, o2Recovered);
    Assert.assertNotSame("new instance " + ptNode2, n2, o2Recovered);
    Assert.assertEquals("restored state " + ptNode2, tuple1, o2Recovered.inport1Tuple);
    TestGeneratorInputOperator o1Recovered = (TestGeneratorInputOperator) c0Replaced.getNodes().get(ptNode1.getId()).getOperator();
    Assert.assertNotNull(o1Recovered);
    OperatorContext o1RecoveredContext = c0Replaced.getNodeContext(ptNode1.getId());
    Assert.assertNotNull("active " + ptNode1, o1RecoveredContext);
    wclock.tick(1);
    Assert.assertEquals("current window", 4, wclock.getCurrentTimeMillis());
    // refresh context after operator re-deploy
    o2Context = c2.getNodeContext(ptNode2.getId());
    Assert.assertNotNull("active " + ptNode2, o2Context);
    StramTestSupport.waitForWindowComplete(o1RecoveredContext, 3);
    StramTestSupport.waitForWindowComplete(o2Context, 3);
    // checkpoint window
    wclock.tick(1);
    Assert.assertEquals("current window", 5, wclock.getCurrentTimeMillis());
    String tuple2 = "tuple2";
    o1Recovered.addTuple(tuple2);
    StramTestSupport.waitForWindowComplete(o1RecoveredContext, 4);
    StramTestSupport.waitForWindowComplete(o2Context, 4);
    // check data flow after recovery
    Assert.assertEquals("retrieved tuple (after recovery) " + ptNode2, tuple2, o2Recovered.inport1Tuple);
    // propagate checkpoints to master
    c0Replaced.triggerHeartbeat();
    c0Replaced.waitForHeartbeat(5000);
    c2.triggerHeartbeat();
    c2.waitForHeartbeat(5000);
    // purge checkpoints
    // checkpoint purging
    localCluster.dnmgr.monitorHeartbeat(false);
    // after purging, only the window 3 checkpoint is expected to remain on each operator
    Assert.assertEquals("checkpoints " + ptNode1, Arrays.asList(new Checkpoint[] { new Checkpoint(3L, 0, 0) }), ptNode1.checkpoints);
    Assert.assertEquals("checkpoints " + ptNode2, Arrays.asList(new Checkpoint[] { new Checkpoint(3L, 0, 0) }), ptNode2.checkpoints);
    localCluster.shutdown();
}
Also used : PTOperator(com.datatorrent.stram.plan.physical.PTOperator) LocalStreamingContainer(com.datatorrent.stram.StramLocalCluster.LocalStreamingContainer) Node(com.datatorrent.stram.engine.Node) AsyncFSStorageAgent(com.datatorrent.common.util.AsyncFSStorageAgent) TestGeneratorInputOperator(com.datatorrent.stram.engine.TestGeneratorInputOperator) Checkpoint(com.datatorrent.stram.api.Checkpoint) MockComponentFactory(com.datatorrent.stram.StramLocalCluster.MockComponentFactory) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) WindowGenerator(com.datatorrent.stram.engine.WindowGenerator) OperatorContext(com.datatorrent.stram.engine.OperatorContext) ManualScheduledExecutorService(com.datatorrent.stram.support.ManualScheduledExecutorService) Test(org.junit.Test)

Example 12 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class StreamingContainerManagerTest method shutdownOperator.

/**
 * Drives {@code scm} through a sequence of simulated heartbeats in which both operators first
 * report {@link DeployState#ACTIVE} with a checkpoint at window 2, and then {@code p1} reports
 * {@link DeployState#SHUTDOWN} with a checkpoint at window 4.
 *
 * <p>Asserts along the way that both operators become {@code ACTIVE} and that {@code p1} ends
 * up {@code INACTIVE} after its shutdown heartbeat has been processed.
 *
 * @param scm the container manager that receives the heartbeats
 * @param p1 the operator that is shut down; its container sends the "c1" heartbeats
 * @param p2 the operator that stays active; its container sends the "c2" heartbeats
 */
private static void shutdownOperator(StreamingContainerManager scm, PTOperator p1, PTOperator p2) {
    assignContainer(scm, "c1");
    assignContainer(scm, "c2");
    // initial heartbeats carrying container stats only, no operator stats yet
    ContainerHeartbeat c1hb = new ContainerHeartbeat();
    c1hb.setContainerStats(new ContainerStats(p1.getContainer().getExternalId()));
    scm.processHeartbeat(c1hb);
    ContainerHeartbeat c2hb = new ContainerHeartbeat();
    c2hb.setContainerStats(new ContainerStats(p2.getContainer().getExternalId()));
    scm.processHeartbeat(c2hb);
    // p1 reports ACTIVE with a checkpoint at window 2, current window 3
    OperatorHeartbeat o1hb = new OperatorHeartbeat();
    c1hb.getContainerStats().addNodeStats(o1hb);
    o1hb.setNodeId(p1.getId());
    o1hb.setState(DeployState.ACTIVE);
    OperatorStats o1stats = new OperatorStats();
    o1hb.getOperatorStatsContainer().add(o1stats);
    o1stats.checkpoint = new Checkpoint(2, 0, 0);
    o1stats.windowId = 3;
    scm.processHeartbeat(c1hb);
    Assert.assertEquals(PTOperator.State.ACTIVE, p1.getState());
    // p2 reports ACTIVE with the same checkpoint and window
    OperatorHeartbeat o2hb = new OperatorHeartbeat();
    c2hb.getContainerStats().addNodeStats(o2hb);
    o2hb.setNodeId(p2.getId());
    o2hb.setState(DeployState.ACTIVE);
    OperatorStats o2stats = new OperatorStats();
    // attach the stats to the heartbeat (as is done for o1); without this the checkpoint and
    // window id assigned below never reach the container manager
    o2hb.getOperatorStatsContainer().add(o2stats);
    o2stats.checkpoint = new Checkpoint(2, 0, 0);
    o2stats.windowId = 3;
    scm.processHeartbeat(c2hb);
    Assert.assertEquals(PTOperator.State.ACTIVE, p1.getState());
    Assert.assertEquals(PTOperator.State.ACTIVE, p2.getState());
    // p1 reports SHUTDOWN with a newer checkpoint; the manager is expected to mark it INACTIVE
    o1hb.setState(DeployState.SHUTDOWN);
    o1stats.checkpoint = new Checkpoint(4, 0, 0);
    o1stats.windowId = 5;
    scm.processHeartbeat(c1hb);
    Assert.assertEquals(PTOperator.State.INACTIVE, p1.getState());
}
Also used : Checkpoint(com.datatorrent.stram.api.Checkpoint) ContainerStats(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats) OperatorHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat) MockOperatorStats(com.datatorrent.stram.MockContainer.MockOperatorStats) OperatorStats(com.datatorrent.api.Stats.OperatorStats) ContainerHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeat)

Example 13 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class StreamingContainerManager method addCheckpoint.

/**
 * Records {@code checkpoint} in {@code node.checkpoints}, keeping the list ordered by
 * ascending window id (largest window id last).
 *
 * <p>A checkpoint whose window id equals that of the current last entry, or that of an entry
 * already present at its sorted position, is ignored. A checkpoint older than the last entry
 * is logged as out of sequence and inserted at its sorted position.
 *
 * @param node the operator whose checkpoint list is updated; the list is used as the lock
 * @param checkpoint the checkpoint to record
 */
@SuppressWarnings("StatementWithEmptyBody")
void addCheckpoint(PTOperator node, Checkpoint checkpoint) {
    synchronized (node.checkpoints) {
        // first checkpoint for this operator: nothing to order against
        if (node.checkpoints.isEmpty()) {
            node.checkpoints.add(checkpoint);
            return;
        }
        Checkpoint newest = node.checkpoints.getLast();
        // skip unless checkpoint moves
        if (newest.windowId == checkpoint.windowId) {
            return;
        }
        // common case: checkpoint is newer than everything recorded so far
        if (newest.windowId < checkpoint.windowId) {
            node.checkpoints.add(checkpoint);
            return;
        }
        // list needs to have max windowId last
        LOG.warn("Out of sequence checkpoint {} last {} (operator {})", checkpoint, newest, node);
        ListIterator<Checkpoint> cursor = node.checkpoints.listIterator();
        // advance just past the first entry whose window id is not smaller than the new one
        while (cursor.hasNext()) {
            if (cursor.next().windowId >= checkpoint.windowId) {
                break;
            }
        }
        // step back onto that entry; insert before it unless it is the same window
        Checkpoint successor = cursor.previous();
        if (successor.windowId != checkpoint.windowId) {
            cursor.add(checkpoint);
        }
    }
}
Also used : Checkpoint(com.datatorrent.stram.api.Checkpoint)

Example 14 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class Node method reportStats.

/**
 * Populates {@code stats} for the given window and hands it to {@code context.report}.
 *
 * <p>Fills in one {@code PortStats} entry per output sink (tuple count minus the control tuple
 * count, plus the end-window emit time), the thread CPU time consumed since the previous
 * sample, and, if available, a checkpoint with its stats. The checkpoint comes either from the
 * pending {@code checkpoint} field or from the head of {@code taskQueue} when that task is done.
 *
 * @param stats the stats object to populate and report
 * @param windowId the window id passed through to {@code context.report}
 */
protected void reportStats(ContainerStats.OperatorStats stats, long windowId) {
    stats.outputPorts = new ArrayList<>();
    for (Entry<String, Sink<Object>> output : outputs.entrySet()) {
        ContainerStats.OperatorStats.PortStats perPort = new ContainerStats.OperatorStats.PortStats(output.getKey());
        // NOTE(review): getCount(true) presumably resets the sink's counter - confirm against Sink
        long emitted = output.getValue().getCount(true);
        perPort.tupleCount = emitted - controlTupleCount;
        perPort.endWindowTimestamp = endWindowEmitTime;
        stats.outputPorts.add(perPort);
    }
    controlTupleCount = 0;
    // CPU time is reported as the delta since the previous sample
    final long cpuTimeNow = tmb.getCurrentThreadCpuTime();
    stats.cpuTimeUsed = cpuTimeNow - lastSampleCpuTime;
    lastSampleCpuTime = cpuTimeNow;
    if (checkpoint == null) {
        // no pending checkpoint: see whether the oldest queued checkpoint task has finished
        Pair<FutureTask<Stats.CheckpointStats>, CheckpointWindowInfo> head = taskQueue.peek();
        if (head != null && head.getFirst().isDone()) {
            taskQueue.poll();
            try {
                CheckpointWindowInfo info = head.getSecond();
                stats.checkpointStats = head.getFirst().get();
                stats.checkpoint = new Checkpoint(info.windowId, info.applicationWindowCount, info.checkpointWindowCount);
                if (operator instanceof Operator.CheckpointListener) {
                    Operator.CheckpointListener listener = (Operator.CheckpointListener) operator;
                    listener.checkpointed(info.windowId);
                }
            } catch (Exception ex) {
                throw Throwables.propagate(ex);
            }
        }
    } else {
        // hand over the pending checkpoint and clear it so it is reported only once
        stats.checkpoint = checkpoint;
        stats.checkpointStats = checkpointStats;
        checkpoint = null;
        checkpointStats = null;
    }
    context.report(stats, windowId);
}
Also used : ContainerStats(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats) IntrospectionException(java.beans.IntrospectionException) InvocationTargetException(java.lang.reflect.InvocationTargetException) IOException(java.io.IOException) Checkpoint(com.datatorrent.stram.api.Checkpoint) MuxSink(com.datatorrent.stram.debug.MuxSink) Sink(com.datatorrent.api.Sink) FutureTask(java.util.concurrent.FutureTask) Stats(com.datatorrent.api.Stats) ContainerStats(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats)

Example 15 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class CheckpointTest method testUpdateCheckpointsProcessingTimeout.

/**
 * Verifies which operators {@code updateRecoveryCheckpoints} records in
 * {@code UpdateCheckpointsContext.blocked} as the mock clock and the operators' window
 * progress are varied.
 *
 * <p>The plan is o1 -> o2 with one partition each. The test repeatedly creates a fresh
 * {@link UpdateCheckpointsContext}, calls {@code updateRecoveryCheckpoints} starting from o1,
 * and asserts the expected blocked set after each change to the clock, the current window ids,
 * or o2's recovery checkpoint.
 */
@Test
public void testUpdateCheckpointsProcessingTimeout() {
    MockClock clock = new MockClock();
    dag.setAttribute(com.datatorrent.api.Context.OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
    GenericTestOperator o2 = dag.addOperator("o2", GenericTestOperator.class);
    dag.addStream("o1.outport1", o1.outport1, o2.inport1);
    StreamingContainerManager dnm = new StreamingContainerManager(dag);
    PhysicalPlan plan = dnm.getPhysicalPlan();
    // set all operators as active to enable recovery window id update
    for (PTOperator oper : plan.getAllOperators().values()) {
        oper.setState(PTOperator.State.ACTIVE);
    }
    // each logical operator is expected to map to exactly one physical partition
    List<PTOperator> partitions = plan.getOperators(dag.getMeta(o1));
    Assert.assertNotNull(partitions);
    Assert.assertEquals(1, partitions.size());
    PTOperator o1p1 = partitions.get(0);
    partitions = plan.getOperators(dag.getMeta(o2));
    Assert.assertNotNull(partitions);
    Assert.assertEquals(1, partitions.size());
    PTOperator o2p1 = partitions.get(0);
    // baseline: nothing has been modified yet, so no operator is expected to be blocked
    UpdateCheckpointsContext ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertTrue("no blocked operators", ctx.blocked.isEmpty());
    // o1 advances to window 1 at time 1; the clock then jumps to timeout + 1
    o1p1.stats.statsRevs.checkout();
    o1p1.stats.currentWindowId.set(1);
    o1p1.stats.lastWindowIdChangeTms = 1;
    o1p1.stats.statsRevs.commit();
    clock.time = o1p1.stats.windowProcessingTimeoutMillis + 1;
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("o2 blocked", Sets.newHashSet(o2p1), ctx.blocked);
    // assign future activation window (state-less or at-most-once).
    Checkpoint cp2 = o2p1.getRecoveryCheckpoint();
    o2p1.setRecoveryCheckpoint(new Checkpoint(o1p1.getRecoveryCheckpoint().windowId + 1, cp2.applicationWindowCount, cp2.checkpointWindowCount));
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("no operators blocked (o2 activation window ahead)", Sets.newHashSet(), ctx.blocked);
    // reset to blocked
    o2p1.setRecoveryCheckpoint(cp2);
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("o2 blocked", Sets.newHashSet(o2p1), ctx.blocked);
    // one more clock unit: o1 is now expected to be reported as blocked as well
    clock.time++;
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("operators blocked", Sets.newHashSet(o1p1, o2p1), ctx.blocked);
    // o2 catches up to o1's current window: only o1 is expected to remain blocked
    o2p1.stats.statsRevs.checkout();
    o2p1.stats.currentWindowId.set(o1p1.stats.getCurrentWindowId());
    o2p1.stats.statsRevs.commit();
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("operators blocked", Sets.newHashSet(o1p1), ctx.blocked);
    // roll the clock back by one unit: no operator is expected to be blocked any more
    clock.time--;
    ctx = new UpdateCheckpointsContext(clock);
    dnm.updateRecoveryCheckpoints(o1p1, ctx, false);
    Assert.assertEquals("operators blocked", Sets.newHashSet(), ctx.blocked);
}
Also used : PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) Checkpoint(com.datatorrent.stram.api.Checkpoint) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) UpdateCheckpointsContext(com.datatorrent.stram.StreamingContainerManager.UpdateCheckpointsContext) Test(org.junit.Test)

Aggregations

Checkpoint (com.datatorrent.stram.api.Checkpoint): 28; PTOperator (com.datatorrent.stram.plan.physical.PTOperator): 15; GenericTestOperator (com.datatorrent.stram.engine.GenericTestOperator): 9; PhysicalPlan (com.datatorrent.stram.plan.physical.PhysicalPlan): 9; Test (org.junit.Test): 9; ContainerStats (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats): 6; OperatorMeta (com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta): 6; MemoryStorageAgent (com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent): 6; OperatorStats (com.datatorrent.api.Stats.OperatorStats): 5; UpdateCheckpointsContext (com.datatorrent.stram.StreamingContainerManager.UpdateCheckpointsContext): 5; HashMap (java.util.HashMap): 5; Map (java.util.Map): 5; OperatorHeartbeat (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat): 4; TestGeneratorInputOperator (com.datatorrent.stram.engine.TestGeneratorInputOperator): 4; IOException (java.io.IOException): 4; LinkedHashMap (java.util.LinkedHashMap): 4; Operator (com.datatorrent.api.Operator): 3; StatsListener (com.datatorrent.api.StatsListener): 3; AsyncFSStorageAgent (com.datatorrent.common.util.AsyncFSStorageAgent): 3; ContainerHeartbeatResponse (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse): 3