Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by apache.
The class StreamingContainerManager, method scheduleContainerRestart.
/**
 * Schedule container restart. Called by Stram after a container was terminated
 * and requires recovery (killed externally or lost after heartbeat timeout). <br>
 * Recovery resolves the affected operators: those within the container and
 * everything downstream, together with their respective recovery checkpoints.
 * Dependent operators are undeployed and their buffer server connections reset
 * before redeployment from the recovery checkpoint.
 *
 * @param containerId the id of the terminated container
 */
public void scheduleContainerRestart(String containerId) {
  StreamingContainerAgent cs = this.getContainerAgent(containerId);
  if (cs == null || cs.isShutdownRequested()) {
    // the container is no longer used / was released by us
    return;
  }
  LOG.info("Initiating recovery for {}@{}", containerId, cs.container.host);
  cs.container.setState(PTContainer.State.KILLED);
  cs.container.bufferServerAddress = null;
  cs.container.setResourceRequestPriority(-1);
  cs.container.setAllocatedMemoryMB(0);
  cs.container.setAllocatedVCores(0);
  // resolve dependencies
  UpdateCheckpointsContext ctx = new UpdateCheckpointsContext(clock, false, getCheckpointGroups());
  for (PTOperator oper : cs.container.getOperators()) {
    updateRecoveryCheckpoints(oper, ctx, false);
  }
  includeLocalUpstreamOperators(ctx);
  // redeploy cycle for all affected operators
  LOG.info("Affected operators {}", ctx.visited);
  deploy(Collections.<PTContainer>emptySet(), ctx.visited, Sets.newHashSet(cs.container), ctx.visited);
}
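In apex-core this method is invoked by the master when it learns that a container has died. As a hedged illustration only, a minimal sketch of such a call site, assuming a hypothetical onContainerCompleted callback and a dnmgr reference to the StreamingContainerManager (neither name is apex-core API):

// Hypothetical sketch: triggering recovery from a container-completion
// callback. The handler signature and the dnmgr field are assumptions
// made for illustration, not apex-core API.
public void onContainerCompleted(String containerId, int exitStatus) {
  if (exitStatus != 0) {
    // container died unexpectedly; redeploy affected operators
    // from their recovery checkpoints
    dnmgr.scheduleContainerRestart(containerId);
  }
}

Note the early return in the method above: if the agent is gone or a shutdown was requested, the container was released deliberately and no recovery is scheduled.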
Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by apache.
The class PartitioningTest, method testDefaultPartitioning.
@Test
public void testDefaultPartitioning() throws Exception {
  LogicalPlan dag = new LogicalPlan();
  File checkpointDir = new File(TEST_OUTPUT_DIR, "testDefaultPartitioning");
  dag.setAttribute(Context.OperatorContext.STORAGE_AGENT, new AsyncFSStorageAgent(checkpointDir.getPath(), null));
  Integer[][] testData = { { 4, 5 } };
  CollectorOperator.receivedTuples.clear();
  TestInputOperator<Integer> input = dag.addOperator("input", new TestInputOperator<Integer>());
  input.testTuples = new ArrayList<>();
  for (Integer[] tuples : testData) {
    input.testTuples.add(new ArrayList<>(Arrays.asList(tuples)));
  }
  CollectorOperator collector = dag.addOperator("collector", new CollectorOperator());
  collector.prefix = "" + System.identityHashCode(collector);
  dag.getMeta(collector).getAttributes().put(OperatorContext.PARTITIONER, new StatelessPartitioner<CollectorOperator>(2));
  dag.addStream("fromInput", input.output, collector.input);
  CollectorOperator merged = dag.addOperator("merged", new CollectorOperator());
  merged.prefix = "" + System.identityHashCode(merged);
  dag.addStream("toMerged", collector.output, merged.input);
  StramLocalCluster lc = new StramLocalCluster(dag);
  lc.setHeartbeatMonitoringEnabled(false);
  // terminates on end of stream
  lc.run();
  List<PTOperator> operators = lc.getPlanOperators(dag.getMeta(collector));
  Assert.assertEquals("number operator instances " + operators, 2, operators.size());
  // one entry for each partition + merged output
  Assert.assertEquals("received tuples " + CollectorOperator.receivedTuples, 3, CollectorOperator.receivedTuples.size());
  //Assert.assertEquals("received tuples " + operators.get(0), Arrays.asList(4), CollectorOperator.receivedTuples.get(collector.prefix + operators.get(0).getId()));
  Assert.assertEquals("received tuples " + operators.get(1), Arrays.asList(5), CollectorOperator.receivedTuples.get(collector.prefix + operators.get(1).getId()));
  PTOperator pmerged = lc.findByLogicalNode(dag.getMeta(merged));
  List<Object> tuples = CollectorOperator.receivedTuples.get(merged.prefix + pmerged.getId());
  Assert.assertNotNull("merged tuples " + pmerged, tuples);
  Assert.assertEquals("merged tuples " + pmerged, Sets.newHashSet(testData[0]), Sets.newHashSet(tuples));
}
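The test's pivotal line is the PARTITIONER attribute on the collector. A minimal sketch of that pattern in isolation, reusing the test's own operator classes:

// Sketch: request two static partitions of the collector via the
// PARTITIONER attribute, as the test above does.
LogicalPlan dag = new LogicalPlan();
TestInputOperator<Integer> input = dag.addOperator("input", new TestInputOperator<Integer>());
CollectorOperator collector = dag.addOperator("collector", new CollectorOperator());
dag.getMeta(collector).getAttributes().put(OperatorContext.PARTITIONER, new StatelessPartitioner<CollectorOperator>(2));
dag.addStream("fromInput", input.output, collector.input);

At plan time the single logical collector becomes two physical PTOperator instances, which is why the test expects two plan operators and three receivedTuples entries: one per partition plus the downstream merged collector.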
Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by apache.
The class StramLocalClusterTest, method testRecovery.
@Test
@SuppressWarnings("SleepWhileInLoop")
public void testRecovery() throws Exception {
  AsyncFSStorageAgent agent = new AsyncFSStorageAgent(testMeta.getPath(), null);
  agent.setSyncCheckpoint(true);
  dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
  TestGeneratorInputOperator node1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
  // data will be added externally from test
  node1.setMaxTuples(0);
  GenericTestOperator node2 = dag.addOperator("o2", GenericTestOperator.class);
  dag.addStream("o1o2", node1.outport, node2.inport1);
  dag.validate();
  dag.getAttributes().put(LogicalPlan.CHECKPOINT_WINDOW_COUNT, 2);
  final ManualScheduledExecutorService wclock = new ManualScheduledExecutorService(1);
  MockComponentFactory mcf = new MockComponentFactory() {
    @Override
    public WindowGenerator setupWindowGenerator() {
      WindowGenerator wingen = StramTestSupport.setupWindowGenerator(wclock);
      wingen.setCheckpointCount(2, 0);
      return wingen;
    }
  };
  StramLocalCluster localCluster = new StramLocalCluster(dag, mcf);
  localCluster.setPerContainerBufferServer(true);
  // driven by test
  localCluster.setHeartbeatMonitoringEnabled(false);
  localCluster.runAsync();
  PTOperator ptNode1 = localCluster.findByLogicalNode(dag.getMeta(node1));
  PTOperator ptNode2 = localCluster.findByLogicalNode(dag.getMeta(node2));
  LocalStreamingContainer c0 = StramTestSupport.waitForActivation(localCluster, ptNode1);
  Map<Integer, Node<?>> nodeMap = c0.getNodes();
  Assert.assertEquals("number operators", 1, nodeMap.size());
  TestGeneratorInputOperator n1 = (TestGeneratorInputOperator) nodeMap.get(ptNode1.getId()).getOperator();
  Assert.assertNotNull(n1);
  LocalStreamingContainer c2 = StramTestSupport.waitForActivation(localCluster, ptNode2);
  Map<Integer, Node<?>> c2NodeMap = c2.getNodes();
  Assert.assertEquals("number operators downstream", 1, c2NodeMap.size());
  GenericTestOperator n2 = (GenericTestOperator) c2NodeMap.get(localCluster.findByLogicalNode(dag.getMeta(node2)).getId()).getOperator();
  Assert.assertNotNull(n2);
  // input data
  String tuple1 = "tuple1";
  n1.addTuple(tuple1);
  OperatorContext n1Context = c0.getNodeContext(ptNode1.getId());
  Assert.assertEquals("initial window id", -1, n1Context.getLastProcessedWindowId());
  // checkpoint window
  wclock.tick(1);
  wclock.tick(1);
  Assert.assertEquals("current window", 2, wclock.getCurrentTimeMillis());
  OperatorContext o2Context = c2.getNodeContext(ptNode2.getId());
  Assert.assertNotNull("context ", o2Context);
  StramTestSupport.waitForWindowComplete(o2Context, 1);
  Assert.assertEquals("o2 received ", tuple1, n2.inport1Tuple);
  wclock.tick(1);
  Assert.assertEquals("current window", 3, wclock.getCurrentTimeMillis());
  // checkpoint between window 1 and 2
  StramTestSupport.waitForWindowComplete(o2Context, 2);
  // propagate checkpoints to master
  c0.triggerHeartbeat();
  // wait for heartbeat cycle to complete
  c0.waitForHeartbeat(5000);
  Assert.assertEquals("checkpoint " + ptNode1, 1, ptNode1.getRecentCheckpoint().windowId);
  c2.triggerHeartbeat();
  // yield briefly so the heartbeat cycle can complete
  Thread.sleep(1);
  c2.waitForHeartbeat(5000);
  Assert.assertEquals("checkpoint " + ptNode2, 1, ptNode2.getRecentCheckpoint().windowId);
  Assert.assertEquals("checkpoints " + ptNode1, Arrays.asList(new Checkpoint[] { new Checkpoint(1L, 0, 0) }), ptNode1.checkpoints);
  Assert.assertEquals("checkpoints " + ptNode2, Arrays.asList(new Checkpoint[] { new Checkpoint(1L, 0, 0) }), ptNode2.checkpoints);

  // simulate container failure (operator o1)
  localCluster.failContainer(c0);
  // replacement container starts empty
  // operators will deploy after downstream operator was removed
  LocalStreamingContainer c0Replaced = StramTestSupport.waitForActivation(localCluster, ptNode1);
  c0Replaced.triggerHeartbeat();
  // next heartbeat after setup
  c0Replaced.waitForHeartbeat(5000);
  Assert.assertNotSame("old container", c0, c0Replaced);
  Assert.assertNotSame("old container", c0.getContainerId(), c0Replaced.getContainerId());
  // verify change in downstream container
  LOG.debug("triggering c2 heartbeat processing");
  StreamingContainerAgent c2Agent = localCluster.getContainerAgent(c2);
  // wait for downstream re-deploy to complete
  long startTms = System.currentTimeMillis();
  while (c2Agent.hasPendingWork() && StramTestSupport.DEFAULT_TIMEOUT_MILLIS > System.currentTimeMillis() - startTms) {
    Thread.sleep(200);
    c2.triggerHeartbeat();
    LOG.debug("Waiting for {} to complete pending work.", c2.getContainerId());
  }
  Assert.assertEquals(c2.getContainerId() + " operators after redeploy " + c2.getNodes(), 1, c2.getNodes().size());
  // verify downstream operator re-deployed in existing container
  Assert.assertEquals("active " + ptNode2, c2, StramTestSupport.waitForActivation(localCluster, ptNode2));
  GenericTestOperator o2Recovered = (GenericTestOperator) c2NodeMap.get(localCluster.findByLogicalNode(dag.getMeta(node2)).getId()).getOperator();
  Assert.assertNotNull("redeployed " + ptNode2, o2Recovered);
  Assert.assertNotSame("new instance " + ptNode2, n2, o2Recovered);
  Assert.assertEquals("restored state " + ptNode2, tuple1, o2Recovered.inport1Tuple);
  TestGeneratorInputOperator o1Recovered = (TestGeneratorInputOperator) c0Replaced.getNodes().get(ptNode1.getId()).getOperator();
  Assert.assertNotNull(o1Recovered);
  OperatorContext o1RecoveredContext = c0Replaced.getNodeContext(ptNode1.getId());
  Assert.assertNotNull("active " + ptNode1, o1RecoveredContext);
  wclock.tick(1);
  Assert.assertEquals("current window", 4, wclock.getCurrentTimeMillis());
  // refresh context after operator re-deploy
  o2Context = c2.getNodeContext(ptNode2.getId());
  Assert.assertNotNull("active " + ptNode2, o2Context);
  StramTestSupport.waitForWindowComplete(o1RecoveredContext, 3);
  StramTestSupport.waitForWindowComplete(o2Context, 3);
  // checkpoint window
  wclock.tick(1);
  Assert.assertEquals("current window", 5, wclock.getCurrentTimeMillis());
  String tuple2 = "tuple2";
  o1Recovered.addTuple(tuple2);
  StramTestSupport.waitForWindowComplete(o1RecoveredContext, 4);
  StramTestSupport.waitForWindowComplete(o2Context, 4);
  // check data flow after recovery
  Assert.assertEquals("retrieved tuple (after recovery) " + ptNode2, tuple2, o2Recovered.inport1Tuple);
  // propagate checkpoints to master
  c0Replaced.triggerHeartbeat();
  c0Replaced.waitForHeartbeat(5000);
  c2.triggerHeartbeat();
  c2.waitForHeartbeat(5000);
  // purge checkpoints
  localCluster.dnmgr.monitorHeartbeat(false);
  Assert.assertEquals("checkpoints " + ptNode1, Arrays.asList(new Checkpoint[] { new Checkpoint(3L, 0, 0) }), ptNode1.checkpoints);
  Assert.assertEquals("checkpoints " + ptNode2, Arrays.asList(new Checkpoint[] { new Checkpoint(3L, 0, 0) }), ptNode2.checkpoints);
  localCluster.shutdown();
}
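Two settings drive the checkpoint cadence in this test and must agree: the plan attribute CHECKPOINT_WINDOW_COUNT and the window generator's checkpoint count. Extracted from the test above as a minimal sketch:

// Sketch: checkpoint every second streaming window, configured both on
// the logical plan and on the test-driven window generator.
dag.getAttributes().put(LogicalPlan.CHECKPOINT_WINDOW_COUNT, 2);
WindowGenerator wingen = StramTestSupport.setupWindowGenerator(wclock);
wingen.setCheckpointCount(2, 0);

With this cadence the first checkpoint lands after window 1, which is what the assertions on getRecentCheckpoint() and the checkpoints lists verify before the container failure is injected.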
Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by apache.
The class StramRecoveryTest, method testPhysicalPlanSerialization.
private void testPhysicalPlanSerialization(StorageAgent agent) throws Exception {
  GenericTestOperator o1 = dag.addOperator("o1", GenericTestOperator.class);
  PartitioningTestOperator o2 = dag.addOperator("o2", PartitioningTestOperator.class);
  o2.setPartitionCount(3);
  GenericTestOperator o3 = dag.addOperator("o3", GenericTestOperator.class);
  dag.addStream("o1.outport1", o1.outport1, o2.inport1, o2.inportWithCodec);
  dag.addStream("mergeStream", o2.outport1, o3.inport1);
  dag.getAttributes().put(LogicalPlan.CONTAINERS_MAX_COUNT, 2);
  TestPlanContext ctx = new TestPlanContext();
  dag.setAttribute(OperatorContext.STORAGE_AGENT, agent);
  PhysicalPlan plan = new PhysicalPlan(dag, ctx);
  ByteArrayOutputStream bos = new ByteArrayOutputStream();
  LogicalPlan.write(dag, bos);
  LOG.debug("logicalPlan size: " + bos.toByteArray().length);
  bos = new ByteArrayOutputStream();
  ObjectOutputStream oos = new ObjectOutputStream(bos);
  oos.writeObject(plan);
  LOG.debug("physicalPlan size: " + bos.toByteArray().length);
  ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray());
  plan = (PhysicalPlan) new ObjectInputStream(bis).readObject();
  dag = plan.getLogicalPlan();
  // restore the plan context, which does not survive serialization, via reflection
  Field f = PhysicalPlan.class.getDeclaredField("ctx");
  f.setAccessible(true);
  f.set(plan, ctx);
  f.setAccessible(false);
  OperatorMeta o2Meta = dag.getOperatorMeta("o2");
  List<PTOperator> o2Partitions = plan.getOperators(o2Meta);
  assertEquals(3, o2Partitions.size());
  for (PTOperator o : o2Partitions) {
    Assert.assertNotNull("partition null " + o, o.getPartitionKeys());
    assertEquals("partition keys " + o + " " + o.getPartitionKeys(), 2, o.getPartitionKeys().size());
    PartitioningTestOperator partitionedInstance = (PartitioningTestOperator) plan.loadOperator(o);
    assertEquals("instance per partition", o.getPartitionKeys().values().toString(), partitionedInstance.pks);
    Assert.assertNotNull("partition stats null " + o, o.stats);
  }
}
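The round-trip at the heart of the test is plain Java serialization of the PhysicalPlan. Reduced to its core, a sketch assuming a plan built as above:

// Sketch: serialize the physical plan and read it back, as the test
// does before restoring the plan context via reflection.
ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(bos);
oos.writeObject(plan);
PhysicalPlan restored = (PhysicalPlan) new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray())).readObject();

The reflection step suggests the ctx field is not part of the serialized state, so a deserialized plan needs its context re-injected before it is usable.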
Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by apache.
The class StramRecoveryTest, method testWriteAheadLog.
@Test
public void testWriteAheadLog() throws Exception {
  final MutableInt flushCount = new MutableInt();
  final MutableBoolean isClosed = new MutableBoolean(false);
  dag.setAttribute(OperatorContext.STORAGE_AGENT, new FSStorageAgent(testMeta.getPath(), null));
  TestGeneratorInputOperator o1 = dag.addOperator("o1", TestGeneratorInputOperator.class);
  StreamingContainerManager scm = new StreamingContainerManager(dag);
  PhysicalPlan plan = scm.getPhysicalPlan();
  Journal j = scm.getJournal();
  ByteArrayOutputStream bos = new ByteArrayOutputStream() {
    @Override
    public void flush() throws IOException {
      super.flush();
      flushCount.increment();
    }

    @Override
    public void close() throws IOException {
      super.close();
      isClosed.setValue(true);
    }
  };
  j.setOutputStream(new DataOutputStream(bos));
  PTOperator o1p1 = plan.getOperators(dag.getMeta(o1)).get(0);
  assertEquals(PTOperator.State.PENDING_DEPLOY, o1p1.getState());
  String externalId = new MockContainer(scm, o1p1.getContainer()).container.getExternalId();
  assertEquals("flush count", 1, flushCount.intValue());
  o1p1.setState(PTOperator.State.ACTIVE);
  assertEquals(PTOperator.State.ACTIVE, o1p1.getState());
  assertEquals("flush count", 2, flushCount.intValue());
  assertEquals("is closed", false, isClosed.booleanValue());
  // this will close the stream. There are 2 calls to flush() during the close() -
  // one in Kryo Output and one in FilterOutputStream
  j.setOutputStream(null);
  assertEquals("flush count", 4, flushCount.intValue());
  assertEquals("is closed", true, isClosed.booleanValue());
  // output stream is closed, so state will be changed without recording it in the journal
  o1p1.setState(PTOperator.State.INACTIVE);
  assertEquals(PTOperator.State.INACTIVE, o1p1.getState());
  assertEquals("flush count", 4, flushCount.intValue());
  ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray());
  j.replay(new DataInputStream(bis));
  assertEquals(PTOperator.State.ACTIVE, o1p1.getState());
  InetSocketAddress addr1 = InetSocketAddress.createUnresolved("host1", 1);
  PTContainer c1 = plan.getContainers().get(0);
  c1.setState(PTContainer.State.ALLOCATED);
  c1.host = "host1";
  c1.bufferServerAddress = addr1;
  c1.setAllocatedMemoryMB(2);
  c1.setRequiredMemoryMB(1);
  c1.setAllocatedVCores(3);
  c1.setRequiredVCores(4);
  j.setOutputStream(new DataOutputStream(bos));
  j.write(c1.getSetContainerState());
  c1.setExternalId(null);
  c1.setState(PTContainer.State.NEW);
  c1.setExternalId(null);
  c1.host = null;
  c1.bufferServerAddress = null;
  bis = new ByteArrayInputStream(bos.toByteArray());
  j.replay(new DataInputStream(bis));
  assertEquals(externalId, c1.getExternalId());
  assertEquals(PTContainer.State.ALLOCATED, c1.getState());
  assertEquals("host1", c1.host);
  assertEquals(addr1, c1.bufferServerAddress);
  assertEquals(1, c1.getRequiredMemoryMB());
  assertEquals(2, c1.getAllocatedMemoryMB());
  assertEquals(3, c1.getAllocatedVCores());
  assertEquals(4, c1.getRequiredVCores());
  j.write(scm.getSetOperatorProperty("o1", "maxTuples", "100"));
  o1.setMaxTuples(10);
  j.setOutputStream(null);
  bis = new ByteArrayInputStream(bos.toByteArray());
  j.replay(new DataInputStream(bis));
  assertEquals(100, o1.getMaxTuples());
  j.setOutputStream(new DataOutputStream(bos));
  scm.setOperatorProperty("o1", "maxTuples", "10");
  assertEquals(10, o1.getMaxTuples());
  o1.setMaxTuples(100);
  assertEquals(100, o1.getMaxTuples());
  j.setOutputStream(null);
  bis = new ByteArrayInputStream(bos.toByteArray());
  j.replay(new DataInputStream(bis));
  assertEquals(10, o1.getMaxTuples());
  j.setOutputStream(new DataOutputStream(bos));
  scm.setPhysicalOperatorProperty(o1p1.getId(), "maxTuples", "50");
}
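Every scenario in this test follows the same journal cycle: attach an output stream, mutate state (which appends a journal entry), detach, then replay against reset state. A condensed sketch using the test's own objects:

// Sketch of one journal cycle from the test: record a state change,
// detach the stream (which flushes and closes it), then replay.
j.setOutputStream(new DataOutputStream(bos));
o1p1.setState(PTOperator.State.ACTIVE); // recorded in the journal
j.setOutputStream(null); // no stream: subsequent changes are not recorded
o1p1.setState(PTOperator.State.INACTIVE); // not journaled
j.replay(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
assertEquals(PTOperator.State.ACTIVE, o1p1.getState()); // replay restores the journaled state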