Use of com.datatorrent.stram.api.Checkpoint in project apex-core by Apache.
Class StreamingContainerManagerTest, method testProcessHeartbeat.
/**
 * Walks a single input operator through container assignment, the deploy request and two
 * operator heartbeats, asserting the container/operator states and the statistics the
 * container manager keeps for the operator after each step.
 *
 * @throws Exception if any step of the scenario fails unexpectedly
 */
@Test
public void testProcessHeartbeat() throws Exception {
  final String containerId = "container1";

  // logical plan: one generator operator with a stats listener attached
  TestGeneratorInputOperator generator = dag.addOperator("o1", TestGeneratorInputOperator.class);
  StatsListener[] listeners = {new PartitioningTest.PartitionLoadWatch()};
  dag.setOperatorAttribute(generator, OperatorContext.STATS_LISTENERS, Arrays.asList(listeners));
  dag.setAttribute(OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());

  StreamingContainerManager manager = new StreamingContainerManager(dag);
  PhysicalPlan physicalPlan = manager.getPhysicalPlan();
  Assert.assertEquals("number required containers", 1, physicalPlan.getContainers().size());
  PTOperator oper = physicalPlan.getOperators(dag.getMeta(generator)).get(0);

  // hand a container resource to the manager
  ContainerResource resource = new ContainerResource(0, containerId, "localhost", 512, 0, null);
  InetSocketAddress address = InetSocketAddress.createUnresolved("localhost", 0);
  StreamingContainerAgent agent = manager.assignContainer(resource, address);
  Assert.assertNotNull(agent);
  Assert.assertEquals(PTContainer.State.ALLOCATED, oper.getContainer().getState());
  Assert.assertEquals(PTOperator.State.PENDING_DEPLOY, oper.getState());

  // container heartbeat without operator entries: the response carries the deploy request
  ContainerStats containerStats = new ContainerStats(containerId);
  ContainerHeartbeat heartbeat = new ContainerHeartbeat();
  heartbeat.setContainerStats(containerStats);
  ContainerHeartbeatResponse response = manager.processHeartbeat(heartbeat);
  Assert.assertNotNull(response.deployRequest);
  Assert.assertEquals("" + response.deployRequest, 1, response.deployRequest.size());
  Assert.assertEquals(PTContainer.State.ACTIVE, oper.getContainer().getState());
  Assert.assertEquals("state " + oper, PTOperator.State.PENDING_DEPLOY, oper.getState());

  // heartbeat #1: operator reports ACTIVE, window 3, one tuple on its output port
  PortStats firstPortStats = new PortStats(TestGeneratorInputOperator.OUTPUT_PORT);
  firstPortStats.bufferServerBytes = 101;
  firstPortStats.tupleCount = 1;
  OperatorStats firstWindow = new OperatorStats();
  firstWindow.checkpoint = new Checkpoint(2, 0, 0);
  firstWindow.windowId = 3;
  firstWindow.outputPorts = Lists.newArrayList();
  firstWindow.outputPorts.add(firstPortStats);
  OperatorHeartbeat operatorHeartbeat = new OperatorHeartbeat();
  operatorHeartbeat.setNodeId(oper.getId());
  operatorHeartbeat.setState(OperatorHeartbeat.DeployState.ACTIVE);
  operatorHeartbeat.windowStats = Lists.newArrayList(firstWindow);
  containerStats.operators.add(operatorHeartbeat);

  // processing it moves the operator to ACTIVE and records the reported stats
  manager.processHeartbeat(heartbeat);
  Assert.assertEquals(PTContainer.State.ACTIVE, oper.getContainer().getState());
  Assert.assertEquals("state " + oper, PTOperator.State.ACTIVE, oper.getState());
  Assert.assertEquals("tuples " + oper, 1, oper.stats.totalTuplesEmitted.get());
  Assert.assertEquals("tuples " + oper, 0, oper.stats.totalTuplesProcessed.get());
  Assert.assertEquals("window " + oper, 3, oper.stats.currentWindowId.get());
  Assert.assertEquals("port stats", 1, oper.stats.outputPortStatusList.size());
  PortStatus portStatus = oper.stats.outputPortStatusList.get(TestGeneratorInputOperator.OUTPUT_PORT);
  Assert.assertNotNull("port stats", portStatus);
  Assert.assertEquals("port stats", 1, portStatus.totalTuples);

  // heartbeat #2: same operator heartbeat object, new stats for window 4
  PortStats secondPortStats = new PortStats(TestGeneratorInputOperator.OUTPUT_PORT);
  secondPortStats.bufferServerBytes = 1;
  secondPortStats.tupleCount = 1;
  OperatorStats secondWindow = new OperatorStats();
  secondWindow.checkpoint = new Checkpoint(2, 0, 0);
  secondWindow.windowId = 4;
  secondWindow.outputPorts = Lists.newArrayList();
  secondWindow.outputPorts.add(secondPortStats);
  operatorHeartbeat.windowStats = Lists.newArrayList(secondWindow);
  containerStats.operators.clear();
  containerStats.operators.add(operatorHeartbeat);
  manager.processHeartbeat(heartbeat);
  Assert.assertEquals("tuples " + oper, 2, oper.stats.totalTuplesEmitted.get());
  Assert.assertEquals("window " + oper, 4, oper.stats.currentWindowId.get());

  // both heartbeats are queued for the listener until events are processed
  Assert.assertEquals("statsQueue " + oper, 2, oper.stats.listenerStats.size());
  manager.processEvents();
  Assert.assertEquals("statsQueue " + oper, 0, oper.stats.listenerStats.size());
  Assert.assertEquals("lastStats " + oper, 2, oper.stats.lastWindowedStats.size());
}
Use of com.datatorrent.stram.api.Checkpoint in project apex-core by Apache.
Class StramRecoveryTest, method testContainerManager.
/**
 * Test serialization of the container manager with mock execution layer.
 * <p>
 * The scenario runs two restore cycles with the given storage agent. The first re-creates the
 * manager right after the initial snapshot was written. The second re-creates it after a
 * logical plan modification, an operator state change and a checkpoint written without a
 * following heartbeat. After each restore the operator/container state is asserted.
 * <p>
 * Statement order matters throughout: the {@code MockContainer} constructor and the manager
 * calls mutate state that later assertions depend on.
 *
 * @param agent storage agent installed as the DAG's {@code STORAGE_AGENT} attribute
 * @throws Exception if any step of the scenario fails unexpectedly
 */
private void testContainerManager(StorageAgent agent) throws Exception {
dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
StatsListeningOperator o1 = dag.addOperator("o1", StatsListeningOperator.class);
FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false));
StreamingContainerManager scm = StreamingContainerManager.getInstance(recoveryHandler, dag, false);
// obtaining the manager is expected to have produced the snapshot file in the handler's directory
File expFile = new File(recoveryHandler.getDir(), FSRecoveryHandler.FILE_SNAPSHOT);
Assert.assertTrue("snapshot file " + expFile, expFile.exists());
PhysicalPlan plan = scm.getPhysicalPlan();
assertEquals("number required containers", 1, plan.getContainers().size());
PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);
@SuppressWarnings("UnusedAssignment") MockContainer /* sneaky: the constructor does some changes to the container */
mc = new MockContainer(scm, o1p1.getContainer());
// keep a handle on the container of this first incarnation for comparison after restore
PTContainer originalContainer = o1p1.getContainer();
Assert.assertNotNull(o1p1.getContainer().bufferServerAddress);
assertEquals(PTContainer.State.ACTIVE, o1p1.getContainer().getState());
assertEquals("state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
// test restore initial snapshot + log
// NOTE(review): presumably the new DAG resolves to the same application path via testMeta - confirm
dag = StramTestSupport.createDAG(testMeta);
scm = StreamingContainerManager.getInstance(new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
// from here on work with the logical and physical plan held by the re-created manager
dag = scm.getLogicalPlan();
plan = scm.getPhysicalPlan();
o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
assertEquals("post restore state " + o1p1, PTOperator.State.PENDING_DEPLOY, o1p1.getState());
o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator();
assertEquals("containerId", originalContainer.getExternalId(), o1p1.getContainer().getExternalId());
assertEquals("stats listener", 1, o1p1.statsListeners.size());
// stats are not logged
assertEquals("number stats calls", 0, o1.processStatsCnt);
// the container is expected back in ALLOCATED state (it was ACTIVE before), with the same buffer server address
assertEquals("post restore 1", PTContainer.State.ALLOCATED, o1p1.getContainer().getState());
assertEquals("post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress);
StreamingContainerAgent sca = scm.getContainerAgent(originalContainer.getExternalId());
Assert.assertNotNull("allocated container restored " + originalContainer, sca);
assertEquals("memory usage allocated container", (int) OperatorContext.MEMORY_MB.defaultValue, sca.container.getAllocatedMemoryMB());
// YARN-1490 - simulate container terminated on AM recovery
scm.scheduleContainerRestart(originalContainer.getExternalId());
assertEquals("memory usage of failed container", 0, sca.container.getAllocatedMemoryMB());
// write a checkpoint for window 3, then report the operator as ACTIVE at window 3 through a mock heartbeat
Checkpoint firstCheckpoint = new Checkpoint(3, 0, 0);
mc = new MockContainer(scm, o1p1.getContainer());
checkpoint(scm, o1p1, firstCheckpoint);
mc.stats(o1p1.getId()).deployState(OperatorHeartbeat.DeployState.ACTIVE).currentWindowId(3).checkpointWindowId(3);
mc.sendHeartbeat();
assertEquals("state " + o1p1, PTOperator.State.ACTIVE, o1p1.getState());
// logical plan modification triggers snapshot
// request: add operator "o2" and connect o1.outport to o2.inport1
CreateOperatorRequest cor = new CreateOperatorRequest();
cor.setOperatorFQCN(GenericTestOperator.class.getName());
cor.setOperatorName("o2");
CreateStreamRequest csr = new CreateStreamRequest();
csr.setSourceOperatorName("o1");
csr.setSourceOperatorPortName("outport");
csr.setSinkOperatorName("o2");
csr.setSinkOperatorPortName("inport1");
FutureTask<?> lpmf = scm.logicalPlanModification(Lists.newArrayList(cor, csr));
// keep calling monitorHeartbeat until the modification task reports completion
// NOTE(review): presumably monitorHeartbeat is what executes the pending request - confirm
while (!lpmf.isDone()) {
scm.monitorHeartbeat(false);
}
// unmask exception, if any
Assert.assertNull(lpmf.get());
Assert.assertSame("dag references", dag, scm.getLogicalPlan());
assertEquals("number operators after plan modification", 2, dag.getAllOperators().size());
// set operator state triggers journal write
o1p1.setState(PTOperator.State.INACTIVE);
Checkpoint offlineCheckpoint = new Checkpoint(10, 0, 0);
// write checkpoint while AM is out,
// it needs to be picked up as part of restore
checkpoint(scm, o1p1, offlineCheckpoint);
// test restore
dag = StramTestSupport.createDAG(testMeta);
scm = StreamingContainerManager.getInstance(new FSRecoveryHandler(dag.assertAppPath(), new Configuration(false)), dag, false);
// the re-created manager must hold its own logical plan instance (with o2), not the fresh DAG passed in
Assert.assertNotSame("dag references", dag, scm.getLogicalPlan());
assertEquals("number operators after restore", 2, scm.getLogicalPlan().getAllOperators().size());
dag = scm.getLogicalPlan();
plan = scm.getPhysicalPlan();
o1p1 = plan.getOperators(dag.getOperatorMeta("o1")).get(0);
// the INACTIVE state set before the restore must have survived
assertEquals("post restore state " + o1p1, PTOperator.State.INACTIVE, o1p1.getState());
o1 = (StatsListeningOperator) o1p1.getOperatorMeta().getOperator();
assertEquals("stats listener", 1, o1p1.statsListeners.size());
// one stats call is expected on the restored operator instance (the first restore expected zero)
assertEquals("number stats calls post restore", 1, o1.processStatsCnt);
assertEquals("post restore 1", PTContainer.State.ACTIVE, o1p1.getContainer().getState());
assertEquals("post restore 1", originalContainer.bufferServerAddress, o1p1.getContainer().bufferServerAddress);
// offline checkpoint detection
// both the heartbeat-reported checkpoint and the one written afterwards must be listed, in order
assertEquals("checkpoints after recovery", Lists.newArrayList(firstCheckpoint, offlineCheckpoint), o1p1.checkpoints);
}
Use of com.datatorrent.stram.api.Checkpoint in project apex-core by Apache.
Class DelayOperatorTest, method testCheckpointUpdate.
/**
 * Verifies the recovery checkpoints computed for a DAG whose operators B, C and the delay
 * operator form a loop ({@code A -> B -> C -> D}, {@code C -> opDelay -> B}).
 * <p>
 * Checkpoints are planted directly on the physical operators, the loop members are passed to
 * the update context as one group, and {@code updateRecoveryCheckpoints} is invoked on B.
 * The test then checks that A keeps the initial checkpoint, that B and C end up on the
 * checkpoint they share (window 3) and that D ends up on its own checkpoint (window 5).
 */
@Test
public void testCheckpointUpdate() {
  LogicalPlan dag = StramTestSupport.createDAG(testMeta);

  TestGeneratorInputOperator opA = dag.addOperator("A", TestGeneratorInputOperator.class);
  GenericTestOperator opB = dag.addOperator("B", GenericTestOperator.class);
  GenericTestOperator opC = dag.addOperator("C", GenericTestOperator.class);
  GenericTestOperator opD = dag.addOperator("D", GenericTestOperator.class);
  DefaultDelayOperator<Object> opDelay = dag.addOperator("opDelay", new DefaultDelayOperator<>());

  dag.addStream("AtoB", opA.outport, opB.inport1);
  dag.addStream("BtoC", opB.outport1, opC.inport1);
  dag.addStream("CtoD", opC.outport1, opD.inport1);
  // the delay operator closes the loop C -> opDelay -> B
  dag.addStream("CtoDelay", opC.outport2, opDelay.input);
  dag.addStream("DelayToB", opDelay.output, opB.inport2);
  dag.validate();

  dag.setAttribute(com.datatorrent.api.Context.OperatorContext.STORAGE_AGENT, new MemoryStorageAgent());
  StreamingContainerManager scm = new StreamingContainerManager(dag);
  PhysicalPlan plan = scm.getPhysicalPlan();
  // set all operators as active to enable recovery window id update
  for (PTOperator oper : plan.getAllOperators().values()) {
    oper.setState(PTOperator.State.ACTIVE);
  }

  Clock clock = new SystemClock();

  PTOperator opA1 = plan.getOperators(dag.getMeta(opA)).get(0);
  PTOperator opB1 = plan.getOperators(dag.getMeta(opB)).get(0);
  PTOperator opC1 = plan.getOperators(dag.getMeta(opC)).get(0);
  PTOperator opDelay1 = plan.getOperators(dag.getMeta(opDelay)).get(0);
  PTOperator opD1 = plan.getOperators(dag.getMeta(opD)).get(0);

  Checkpoint cp3 = new Checkpoint(3L, 0, 0);
  Checkpoint cp5 = new Checkpoint(5L, 0, 0);
  Checkpoint cp4 = new Checkpoint(4L, 0, 0);

  // window 3 is the only checkpoint present on every loop member (B, C, opDelay)
  opB1.checkpoints.add(cp3);
  opC1.checkpoints.add(cp3);
  opC1.checkpoints.add(cp4);
  opDelay1.checkpoints.add(cp3);
  opDelay1.checkpoints.add(cp5);
  opD1.checkpoints.add(cp5);

  // construct grouping that would be supplied through LogicalPlan
  Set<OperatorMeta> stronglyConnected = Sets.newHashSet(dag.getMeta(opB), dag.getMeta(opC), dag.getMeta(opDelay));
  Map<OperatorMeta, Set<OperatorMeta>> groups = new HashMap<>();
  for (OperatorMeta om : stronglyConnected) {
    groups.put(om, stronglyConnected);
  }

  UpdateCheckpointsContext ctx = new UpdateCheckpointsContext(clock, false, groups);
  scm.updateRecoveryCheckpoints(opB1, ctx, false);

  Assert.assertEquals("checkpoint " + opA1, Checkpoint.INITIAL_CHECKPOINT, opA1.getRecoveryCheckpoint());
  // fixed: this assertion used to read opC1 again, so B's recovery checkpoint was never checked
  Assert.assertEquals("checkpoint " + opB1, cp3, opB1.getRecoveryCheckpoint());
  Assert.assertEquals("checkpoint " + opC1, cp3, opC1.getRecoveryCheckpoint());
  Assert.assertEquals("checkpoint " + opD1, cp5, opD1.getRecoveryCheckpoint());
}
Aggregations