Search in sources :

Example 26 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class StreamingContainerManagerTest method testProcessHeartbeat.

@Test
public void testProcessHeartbeat() throws Exception {
    TestGeneratorInputOperator o1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
    dag.setOperatorAttribute(o1, OperatorContext.STATS_LISTENERS, Arrays.asList(new StatsListener[] { new PartitioningTest.PartitionLoadWatch() }));
    dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    StreamingContainerManager scm = new StreamingContainerManager(dag);
    PhysicalPlan plan = scm.getPhysicalPlan();
    Assert.assertEquals("number required containers", 1, plan.getContainers().size());
    PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);
    // assign container
    String containerId = "container1";
    StreamingContainerAgent sca = scm.assignContainer(new ContainerResource(0, containerId, "localhost", 512, 0, null), InetSocketAddress.createUnresolved("localhost", 0));
    Assert.assertNotNull(sca);
    Assert.assertEquals(PTContainer.State.ALLOCATED, o1p1.getContainer().getState());
    Assert.assertEquals(PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    ContainerStats cstats = new ContainerStats(containerId);
    ContainerHeartbeat hb = new ContainerHeartbeat();
    hb.setContainerStats(cstats);
    // get deploy request
    ContainerHeartbeatResponse chr = scm.processHeartbeat(hb);
    Assert.assertNotNull(chr.deployRequest);
    Assert.assertEquals("" + chr.deployRequest, 1, chr.deployRequest.size());
    Assert.assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    Assert.assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    // first operator heartbeat
    OperatorHeartbeat ohb = new OperatorHeartbeat();
    ohb.setNodeId(o1p1.getId());
    ohb.setState(OperatorHeartbeat.DeployState.ACTIVE);
    OperatorStats stats = new OperatorStats();
    stats.checkpoint = new Checkpoint(2, 0, 0);
    stats.windowId = 3;
    stats.outputPorts = Lists.newArrayList();
    PortStats ps = new PortStats(TestGeneratorInputOperator.OUTPUT_PORT);
    ps.bufferServerBytes = 101;
    ps.tupleCount = 1;
    stats.outputPorts.add(ps);
    ohb.windowStats = Lists.newArrayList(stats);
    cstats.operators.add(ohb);
    // activate operator
    scm.processHeartbeat(hb);
    Assert.assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    Assert.assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState());
    Assert.assertEquals("tuples " + o1p1, 1, o1p1.stats.totalTuplesEmitted.get());
    Assert.assertEquals("tuples " + o1p1, 0, o1p1.stats.totalTuplesProcessed.get());
    Assert.assertEquals("window " + o1p1, 3, o1p1.stats.currentWindowId.get());
    Assert.assertEquals("port stats", 1, o1p1.stats.outputPortStatusList.size());
    PortStatus o1p1ps = o1p1.stats.outputPortStatusList.get(TestGeneratorInputOperator.OUTPUT_PORT);
    Assert.assertNotNull("port stats", o1p1ps);
    Assert.assertEquals("port stats", 1, o1p1ps.totalTuples);
    // second operator heartbeat
    stats = new OperatorStats();
    stats.checkpoint = new Checkpoint(2, 0, 0);
    stats.windowId = 4;
    stats.outputPorts = Lists.newArrayList();
    ps = new PortStats(TestGeneratorInputOperator.OUTPUT_PORT);
    ps.bufferServerBytes = 1;
    ps.tupleCount = 1;
    stats.outputPorts.add(ps);
    ohb.windowStats = Lists.newArrayList(stats);
    cstats.operators.clear();
    cstats.operators.add(ohb);
    scm.processHeartbeat(hb);
    Assert.assertEquals("tuples " + o1p1, 2, o1p1.stats.totalTuplesEmitted.get());
    Assert.assertEquals("window " + o1p1, 4, o1p1.stats.currentWindowId.get());
    Assert.assertEquals("statsQueue " + o1p1, 2, o1p1.stats.listenerStats.size());
    scm.processEvents();
    Assert.assertEquals("statsQueue " + o1p1, 0, o1p1.stats.listenerStats.size());
    Assert.assertEquals("lastStats " + o1p1, 2, o1p1.stats.lastWindowedStats.size());
}
Also used : PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) ContainerStats(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) OperatorHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat) ContainerHeartbeatResponse(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse) TestGeneratorInputOperator(com.datatorrent.stram.engine.TestGeneratorInputOperator) MockOperatorStats(com.datatorrent.stram.MockContainer.MockOperatorStats) OperatorStats(com.datatorrent.api.Stats.OperatorStats) StatsListener(com.datatorrent.api.StatsListener) Checkpoint(com.datatorrent.stram.api.Checkpoint) ContainerResource(com.datatorrent.stram.StreamingContainerManager.ContainerResource) PortStatus(com.datatorrent.stram.plan.physical.OperatorStatus.PortStatus) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) PortStats(com.datatorrent.api.Stats.OperatorStats.PortStats) ContainerHeartbeat(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeat) Test(org.junit.Test) PhysicalPlanTest(com.datatorrent.stram.plan.physical.PhysicalPlanTest)

Example 27 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class StramRecoveryTest method testContainerManager.

/**
   * Test serialization of the container manager with mock execution layer.
   * @throws Exception
   */
private void testContainerManager(StorageAgent agent) throws Exception {
    dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
    StatsListeningOperator o1 = dag.addOperator("o1", StatsListeningOperator.class);
    FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false));
    StreamingContainerManager scm = StreamingContainerManager.getInstance(recoveryHandler, dag, false);
    File expFile = new File(recoveryHandler.getDir(), FSRecoveryHandler.FILE_SNAPSHOT);
    Assert.assertTrue("snapshot file " + expFile, expFile.exists());
    PhysicalPlan plan = scm.getPhysicalPlan();
    assertEquals("number required containers", 1, plan.getContainers().size());
    PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);
    @SuppressWarnings("UnusedAssignment") MockContainer /* sneaky: the constructor does some changes to the container */
    mc = new MockContainer(scm, o1p1.getContainer());
    PTContainer originalContainer = o1p1.getContainer();
    Assert.assertNotNull(o1p1.getContainer().bufferServerAddress);
    assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    // test restore initial snapshot + log
    dag = StramTestSupport.createDAG(testMeta);
    scm = StreamingContainerManager.getInstance(new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
    dag = scm.getLogicalPlan();
    plan = scm.getPhysicalPlan();
    o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
    assertEquals("post restore state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
    o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator();
    assertEquals("containerId", originalContainer.getExternalId(), o1p1.getContainer().getExternalId());
    assertEquals("stats listener", 1, o1p1.statsListeners.size());
    // stats are not logged
    assertEquals("number stats calls", 0, o1.processStatsCnt);
    assertEquals("post restore 1", PTContainer.State.ALLOCATED, o1p1.getContainer().getState());
    assertEquals("post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress);
    StreamingContainerAgent sca = scm.getContainerAgent(originalContainer.getExternalId());
    Assert.assertNotNull("allocated container restored " + originalContainer, sca);
    assertEquals("memory usage allocated container", (int) OperatorContext.MEMORY_MB.defaultValue, sca.container.getAllocatedMemoryMB());
    // YARN-1490 - simulate container terminated on AM recovery
    scm.scheduleContainerRestart(originalContainer.getExternalId());
    assertEquals("memory usage of failed container", 0, sca.container.getAllocatedMemoryMB());
    Checkpoint firstCheckpoint = new Checkpoint(3, 0, 0);
    mc = new MockContainer(scm, o1p1.getContainer());
    checkpoint(scm, o1p1, firstCheckpoint);
    mc.stats(o1p1.getId()).deployState(OperatorHeartbeat.DeployState.ACTIVE).currentWindowId(3).checkpointWindowId(3);
    mc.sendHeartbeat();
    assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState());
    // logical plan modification triggers snapshot
    CreateOperatorRequest cor = new CreateOperatorRequest();
    cor.setOperatorFQCN(GenericTestOperator.class.getName());
    cor.setOperatorName("o2");
    CreateStreamRequest csr = new CreateStreamRequest();
    csr.setSourceOperatorName("o1");
    csr.setSourceOperatorPortName("outport");
    csr.setSinkOperatorName("o2");
    csr.setSinkOperatorPortName("inport1");
    FutureTask<?> lpmf = scm.logicalPlanModification(Lists.newArrayList(cor, csr));
    while (!lpmf.isDone()) {
        scm.monitorHeartbeat(false);
    }
    // unmask exception, if any
    Assert.assertNull(lpmf.get());
    Assert.assertSame("dag references", dag, scm.getLogicalPlan());
    assertEquals("number operators after plan modification", 2, dag.getAllOperators().size());
    // set operator state triggers journal write
    o1p1.setState(PTOperator.State.INACTIVE);
    Checkpoint offlineCheckpoint = new Checkpoint(10, 0, 0);
    // write checkpoint while AM is out,
    // it needs to be picked up as part of restore
    checkpoint(scm, o1p1, offlineCheckpoint);
    // test restore
    dag = StramTestSupport.createDAG(testMeta);
    scm = StreamingContainerManager.getInstance(new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
    Assert.assertNotSame("dag references", dag, scm.getLogicalPlan());
    assertEquals("number operators after restore", 2, scm.getLogicalPlan().getAllOperators().size());
    dag = scm.getLogicalPlan();
    plan = scm.getPhysicalPlan();
    o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
    assertEquals("post restore state " + o1p1, PTOperator.State.INACTIVE, o1p1.getState());
    o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator();
    assertEquals("stats listener", 1, o1p1.statsListeners.size());
    assertEquals("number stats calls post restore", 1, o1.processStatsCnt);
    assertEquals("post restore 1", PTContainer.State.ACTIVE, o1p1.getContainer().getState());
    assertEquals("post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress);
    // offline checkpoint detection
    assertEquals("checkpoints after recovery", Lists.newArrayList(firstCheckpoint, offlineCheckpoint), o1p1.checkpoints);
}
Also used : PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) Configuration(org.apache.hadoop.conf.Configuration) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) Checkpoint(com.datatorrent.stram.api.Checkpoint) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) CreateStreamRequest(com.datatorrent.stram.plan.logical.requests.CreateStreamRequest) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) File(java.io.File) CreateOperatorRequest(com.datatorrent.stram.plan.logical.requests.CreateOperatorRequest)

Example 28 with Checkpoint

use of com.datatorrent.stram.api.Checkpoint in project apex-core by apache.

the class DelayOperatorTest method testCheckpointUpdate.

@Test
public void testCheckpointUpdate() {
    LogicalPlan dag = StramTestSupport.createDAG(testMeta);
    TestGeneratorInputOperator opA = dag.addOperator("A", TestGeneratorInputOperator.class);
    GenericTestOperator opB = dag.addOperator("B", GenericTestOperator.class);
    GenericTestOperator opC = dag.addOperator("C", GenericTestOperator.class);
    GenericTestOperator opD = dag.addOperator("D", GenericTestOperator.class);
    DefaultDelayOperator<Object> opDelay = dag.addOperator("opDelay", new DefaultDelayOperator<>());
    dag.addStream("AtoB", opA.outport, opB.inport1);
    dag.addStream("BtoC", opB.outport1, opC.inport1);
    dag.addStream("CtoD", opC.outport1, opD.inport1);
    dag.addStream("CtoDelay", opC.outport2, opDelay.input);
    dag.addStream("DelayToB", opDelay.output, opB.inport2);
    dag.validate();
    dag.setAttribute(com.datatorrent.api.Context.OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
    StreamingContainerManager scm = new StreamingContainerManager(dag);
    PhysicalPlan plan = scm.getPhysicalPlan();
    // set all operators as active to enable recovery window id update
    for (PTOperator oper : plan.getAllOperators().values()) {
        oper.setState(PTOperator.State.ACTIVE);
    }
    Clock clock = new SystemClock();
    PTOperator opA1 = plan.getOperators(dag.getMeta(opA)).get(0);
    PTOperator opB1 = plan.getOperators(dag.getMeta(opB)).get(0);
    PTOperator opC1 = plan.getOperators(dag.getMeta(opC)).get(0);
    PTOperator opDelay1 = plan.getOperators(dag.getMeta(opDelay)).get(0);
    PTOperator opD1 = plan.getOperators(dag.getMeta(opD)).get(0);
    Checkpoint cp3 = new Checkpoint(3L, 0, 0);
    Checkpoint cp5 = new Checkpoint(5L, 0, 0);
    Checkpoint cp4 = new Checkpoint(4L, 0, 0);
    opB1.checkpoints.add(cp3);
    opC1.checkpoints.add(cp3);
    opC1.checkpoints.add(cp4);
    opDelay1.checkpoints.add(cp3);
    opDelay1.checkpoints.add(cp5);
    opD1.checkpoints.add(cp5);
    // construct grouping that would be supplied through LogicalPlan
    Set<OperatorMeta> stronglyConnected = Sets.newHashSet(dag.getMeta(opB), dag.getMeta(opC), dag.getMeta(opDelay));
    Map<OperatorMeta, Set<OperatorMeta>> groups = new HashMap<>();
    for (OperatorMeta om : stronglyConnected) {
        groups.put(om, stronglyConnected);
    }
    UpdateCheckpointsContext ctx = new UpdateCheckpointsContext(clock, false, groups);
    scm.updateRecoveryCheckpoints(opB1, ctx, false);
    Assert.assertEquals("checkpoint " + opA1, Checkpoint.INITIAL_CHECKPOINT, opA1.getRecoveryCheckpoint());
    Assert.assertEquals("checkpoint " + opB1, cp3, opC1.getRecoveryCheckpoint());
    Assert.assertEquals("checkpoint " + opC1, cp3, opC1.getRecoveryCheckpoint());
    Assert.assertEquals("checkpoint " + opD1, cp5, opD1.getRecoveryCheckpoint());
}
Also used : StreamingContainerManager(com.datatorrent.stram.StreamingContainerManager) PhysicalPlan(com.datatorrent.stram.plan.physical.PhysicalPlan) TreeSet(java.util.TreeSet) Set(java.util.Set) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) SystemClock(org.apache.hadoop.yarn.util.SystemClock) OperatorMeta(com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta) HashMap(java.util.HashMap) TestGeneratorInputOperator(com.datatorrent.stram.engine.TestGeneratorInputOperator) Clock(org.apache.hadoop.yarn.util.Clock) SystemClock(org.apache.hadoop.yarn.util.SystemClock) Checkpoint(com.datatorrent.stram.api.Checkpoint) GenericTestOperator(com.datatorrent.stram.engine.GenericTestOperator) MemoryStorageAgent(com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent) UpdateCheckpointsContext(com.datatorrent.stram.StreamingContainerManager.UpdateCheckpointsContext) Test(org.junit.Test)

Aggregations

Checkpoint (com.datatorrent.stram.api.Checkpoint)28 PTOperator (com.datatorrent.stram.plan.physical.PTOperator)15 GenericTestOperator (com.datatorrent.stram.engine.GenericTestOperator)9 PhysicalPlan (com.datatorrent.stram.plan.physical.PhysicalPlan)9 Test (org.junit.Test)9 ContainerStats (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerStats)6 OperatorMeta (com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta)6 MemoryStorageAgent (com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent)6 OperatorStats (com.datatorrent.api.Stats.OperatorStats)5 UpdateCheckpointsContext (com.datatorrent.stram.StreamingContainerManager.UpdateCheckpointsContext)5 HashMap (java.util.HashMap)5 Map (java.util.Map)5 OperatorHeartbeat (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.OperatorHeartbeat)4 TestGeneratorInputOperator (com.datatorrent.stram.engine.TestGeneratorInputOperator)4 IOException (java.io.IOException)4 LinkedHashMap (java.util.LinkedHashMap)4 Operator (com.datatorrent.api.Operator)3 StatsListener (com.datatorrent.api.StatsListener)3 AsyncFSStorageAgent (com.datatorrent.common.util.AsyncFSStorageAgent)3 ContainerHeartbeatResponse (com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.ContainerHeartbeatResponse)3