use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.
the class DataPipelineTest method testMacroEvaluationActionPipeline.
/**
 * Runs a pipeline made of a single {@code MockAction} whose value is the macro {@code ${value}},
 * and checks that the action wrote the value supplied through the workflow's runtime arguments.
 *
 * @param engine the execution engine the pipeline is configured with
 * @throws Exception if deploying, running, or verifying the pipeline fails
 */
public void testMacroEvaluationActionPipeline(Engine engine) throws Exception {
  // the last plugin argument is a macro; it is expected to be replaced with the runtime argument "value"
  ETLStage macroAction = new ETLStage(
    "action1",
    MockAction.getPlugin("actionTable", "action1.row", "action1.column", "${value}"));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(macroAction)
    .setEngine(engine)
    .build();

  // runtime arguments used for macro substitution
  Map<String, String> macroArgs = ImmutableMap.of("value", "macroValue");

  AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("macroActionTest-" + engine);
  ApplicationManager pipeline = deployApplication(pipelineId, deployRequest);

  WorkflowManager workflow = pipeline.getWorkflowManager(SmartWorkflow.NAME);
  workflow.setRuntimeArgs(macroArgs);
  workflow.start(ImmutableMap.of("logical.start.time", "0"));
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);

  // the action table should now hold the substituted value rather than the macro text
  DataSetManager<Table> actionOutput = getDataset("actionTable");
  Assert.assertEquals("macroValue", MockAction.readOutput(actionOutput, "action1.row", "action1.column"));
}
use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.
the class DataPipelineTest method testKVTableLookup.
/**
 * Runs a source -> transform -> sink pipeline whose transform is a {@code LookupTransform}
 * configured with the "person" and "age" fields and the "ageTable" dataset, and checks that
 * every record reaching the sink carries the age stored in that table for the person.
 *
 * @throws Exception if deploying, running, or verifying the pipeline fails
 */
@Test
public void testKVTableLookup() throws Exception {
  // person -> age entries, written to the lookup table and reused below to build the expected output
  String[][] agesByPerson = {
    {"samuel", "12"},
    {"bob", "36"},
    {"jane", "25"}
  };

  addDatasetInstance(KeyValueTable.class.getName(), "ageTable");
  DataSetManager<KeyValueTable> ageTable = getDataset("ageTable");
  for (String[] entry : agesByPerson) {
    ageTable.get().write(entry[0].getBytes(Charsets.UTF_8), entry[1].getBytes(Charsets.UTF_8));
  }
  ageTable.flush();

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source", MockSource.getPlugin("inputTable")))
    .addStage(new ETLStage("transform", LookupTransform.getPlugin("person", "age", "ageTable")))
    .addStage(new ETLStage("sink", MockSink.getPlugin("outputTable")))
    .addConnection("source", "transform")
    .addConnection("transform", "sink")
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app("testKVTableLookup");
  AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationManager pipeline = deployApplication(appId, deployRequest);

  // set up input data: one record per person, containing only the name
  Schema inputSchema = Schema.recordOf("person", Schema.Field.of("person", Schema.of(Schema.Type.STRING)));
  StructuredRecord samuel = StructuredRecord.builder(inputSchema).set("person", "samuel").build();
  StructuredRecord bob = StructuredRecord.builder(inputSchema).set("person", "bob").build();
  StructuredRecord jane = StructuredRecord.builder(inputSchema).set("person", "jane").build();
  DataSetManager<Table> inputTable = getDataset("inputTable");
  MockSource.writeInput(inputTable, ImmutableList.of(samuel, bob, jane));

  // run the pipeline once and wait for it to complete
  WorkflowManager workflow = pipeline.getWorkflowManager(SmartWorkflow.NAME).start();
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // expected output: the input records with an added string "age" field
  Schema outputSchema = Schema.recordOf(
    "person",
    Schema.Field.of("person", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age", Schema.of(Schema.Type.STRING)));
  Set<StructuredRecord> expected = new HashSet<>();
  for (String[] entry : agesByPerson) {
    expected.add(StructuredRecord.builder(outputSchema).set("person", entry[0]).set("age", entry[1]).build());
  }

  DataSetManager<Table> outputTable = getDataset("outputTable");
  Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(outputTable));
  Assert.assertEquals(expected, actual);

  validateMetric(3, appId, "source.records.out");
  validateMetric(3, appId, "sink.records.in");

  // remove the source and sink datasets created for this test
  deleteDatasetInstance(NamespaceId.DEFAULT.dataset("inputTable"));
  deleteDatasetInstance(NamespaceId.DEFAULT.dataset("outputTable"));
}
use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.
the class DataPipelineTest method testSplitterToConnector.
/**
 * Runs a pipeline in which one splitter feeds an aggregator that in turn feeds a second splitter,
 * then checks the contents of both sinks and the per-stage record metrics.
 *
 * @param engine the execution engine the pipeline is configured with; its name is also used to
 *               make the dataset and application names unique
 * @throws Exception if deploying, running, or verifying the pipeline fails
 */
private void testSplitterToConnector(Engine engine) throws Exception {
  Schema userSchema = Schema.recordOf(
    "user",
    Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

  // four users covering every combination of name / email being set or left unset
  StructuredRecord user0 = StructuredRecord.builder(userSchema)
    .set("id", 0L)
    .build();
  StructuredRecord user1 = StructuredRecord.builder(userSchema)
    .set("id", 1L)
    .set("email", "one@example.com")
    .build();
  StructuredRecord user2 = StructuredRecord.builder(userSchema)
    .set("id", 2L)
    .set("name", "two")
    .build();
  StructuredRecord user3 = StructuredRecord.builder(userSchema)
    .set("id", 3L)
    .set("name", "three")
    .set("email", "three@example.com")
    .build();

  String engineSuffix = engine.name();
  String sourceName = "splitconSource" + engineSuffix;
  String sink1Name = "splitconSink1" + engineSuffix;
  String sink2Name = "splitconSink2" + engineSuffix;

  /*
   *
   *                                                             |null --> sink1
   *                       |null--> identity-agg --> splitter2 --|
   * source --> splitter1--|                                     |non-null --|
   *                       |                                                 |--> sink2
   *                       |non-null-----------------------------------------|
   */
  ETLStage source = new ETLStage("source", MockSource.getPlugin(sourceName));
  ETLStage splitter1 = new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name"));
  ETLStage splitter2 = new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email"));
  ETLStage identity = new ETLStage("identity", IdentityAggregator.getPlugin());
  ETLStage sink1 = new ETLStage("sink1", MockSink.getPlugin(sink1Name));
  ETLStage sink2 = new ETLStage("sink2", MockSink.getPlugin(sink2Name));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .setEngine(engine)
    .addStage(source)
    .addStage(splitter1)
    .addStage(splitter2)
    .addStage(identity)
    .addStage(sink1)
    .addStage(sink2)
    .addConnection("source", "splitter1")
    .addConnection("splitter1", "identity", "null")
    .addConnection("splitter1", "sink2", "non-null")
    .addConnection("identity", "splitter2")
    .addConnection("splitter2", "sink1", "null")
    .addConnection("splitter2", "sink2", "non-null")
    .build();

  AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("SplitConTest-" + engine);
  ApplicationManager pipeline = deployApplication(appId, deployRequest);

  // write input data
  DataSetManager<Table> sourceTable = getDataset(sourceName);
  MockSource.writeInput(sourceTable, ImmutableList.of(user0, user1, user2, user3));

  // run pipeline
  WorkflowManager workflow = pipeline.getWorkflowManager(SmartWorkflow.NAME);
  workflow.start();
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // sink1 should only have records where both name and email are null (user0)
  DataSetManager<Table> sink1Table = getDataset(sink1Name);
  Set<StructuredRecord> expectedInSink1 = ImmutableSet.of(user0);
  Set<StructuredRecord> actualInSink1 = Sets.newHashSet(MockSink.readOutput(sink1Table));
  Assert.assertEquals(expectedInSink1, actualInSink1);

  // sink2 should have anything with a non-null name or non-null email
  DataSetManager<Table> sink2Table = getDataset(sink2Name);
  Set<StructuredRecord> expectedInSink2 = ImmutableSet.of(user1, user2, user3);
  Set<StructuredRecord> actualInSink2 = Sets.newHashSet(MockSink.readOutput(sink2Table));
  Assert.assertEquals(expectedInSink2, actualInSink2);

  // record counts, stage by stage in pipeline order
  validateMetric(4, appId, "source.records.out");

  validateMetric(4, appId, "splitter1.records.in");
  validateMetric(2, appId, "splitter1.records.out.null");
  validateMetric(2, appId, "splitter1.records.out.non-null");

  validateMetric(2, appId, "identity.records.in");
  validateMetric(2, appId, "identity.records.out");

  validateMetric(2, appId, "splitter2.records.in");
  validateMetric(1, appId, "splitter2.records.out.null");
  validateMetric(1, appId, "splitter2.records.out.non-null");

  validateMetric(1, appId, "sink1.records.in");
  validateMetric(3, appId, "sink2.records.in");
}
use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.
the class DataPipelineTest method testSinglePhaseWithSparkSink.
/**
 * Runs a pipeline with two sources feeding a single Spark sink ({@code NaiveBayesTrainer}),
 * then checks the predictions written for a handful of texts and the record metrics.
 *
 * @throws Exception if deploying, running, or verifying the pipeline fails
 */
private void testSinglePhaseWithSparkSink() throws Exception {
  /*
   * source1 ---|
   *            |--> sparksink
   * source2 ---|
   */
  ETLPlugin trainerPlugin = new ETLPlugin(
    NaiveBayesTrainer.PLUGIN_NAME,
    SparkSink.PLUGIN_TYPE,
    ImmutableMap.of("fileSetName", "modelFileSet",
                    "path", "output",
                    "fieldToClassify", SpamMessage.TEXT_FIELD,
                    "predictionField", SpamMessage.SPAM_PREDICTION_FIELD),
    null);
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA)))
    .addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA)))
    .addStage(new ETLStage("customsink", trainerPlugin))
    .addConnection("source1", "customsink")
    .addConnection("source2", "customsink")
    .build();

  AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
  ApplicationManager pipeline = deployApplication(appId, deployRequest);

  // five messages labeled as spam (1.0) go to source1, five labeled as non-spam (0.0) go to source2
  String[] spamTexts = {
    "buy our clothes",
    "sell your used books to us",
    "earn money for free",
    "this is definitely not spam",
    "you won the lottery"
  };
  String[] nonSpamTexts = {
    "how was your day",
    "what are you up to",
    "this is a genuine message",
    "this is an even more genuine message",
    "could you send me the report"
  };

  List<StructuredRecord> trainingBatch = new ArrayList<>();
  for (String text : spamTexts) {
    trainingBatch.add(new SpamMessage(text, 1.0).toStructuredRecord());
  }
  // write records to source1
  DataSetManager<Table> sourceTable = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
  MockSource.writeInput(sourceTable, trainingBatch);

  // the same list is emptied and refilled for the second source
  trainingBatch.clear();
  for (String text : nonSpamTexts) {
    trainingBatch.add(new SpamMessage(text, 0.0).toStructuredRecord());
  }
  // write records to source2
  sourceTable = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
  MockSource.writeInput(sourceTable, trainingBatch);

  // ingest some messages to be classified
  StreamManager classificationStream = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
  String[] textsToClassify = {
    "how are you doing today",
    "free money money",
    "what are you doing today",
    "genuine report"
  };
  for (String text : textsToClassify) {
    classificationStream.send(text);
  }

  // manually trigger the pipeline
  WorkflowManager workflow = pipeline.getWorkflowManager(SmartWorkflow.NAME);
  workflow.start();
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // only 'free money money' should be predicted as spam
  DataSetManager<KeyValueTable> predictions = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
  Assert.assertEquals(0.0d, Bytes.toDouble(predictions.get().read("how are you doing today")), 0.01d);
  Assert.assertEquals(1.0d, Bytes.toDouble(predictions.get().read("free money money")), 0.01d);
  Assert.assertEquals(0.0d, Bytes.toDouble(predictions.get().read("what are you doing today")), 0.01d);
  Assert.assertEquals(0.0d, Bytes.toDouble(predictions.get().read("genuine report")), 0.01d);

  validateMetric(5, appId, "source1.records.out");
  validateMetric(5, appId, "source2.records.out");
  validateMetric(10, appId, "customsink.records.in");
}
use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.
the class DataPipelineTest method testSimpleConditionWithActions.
/**
 * Deploys a pipeline in which an action is followed by a condition that leads to one of two
 * source -> sink branches, then runs it once per branch (selected through the
 * "condition.branch.to.execute" runtime argument) and checks the sink contents, the record
 * metrics, and the value written by the action.
 *
 * @throws Exception if deploying, running, or verifying the pipeline fails
 */
@Test
public void testSimpleConditionWithActions() throws Exception {
  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

  /*
   * action --> condition --> file ---> trueSink
   *                |
   *                |---file-->----> falseSink
   *
   */
  String appName = "SimpleConditionWithActions";
  String trueSource = "true" + appName + "Source";
  String falseSource = "false" + appName + "Source";
  String trueSink = "true" + appName + "Sink";
  String falseSink = "false" + appName + "Sink";
  String actionTable = "actionTable" + appName;

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(new ETLStage("trueSource", MockSource.getPlugin(trueSource, schema)))
    .addStage(new ETLStage("falseSource", MockSource.getPlugin(falseSource, schema)))
    .addStage(new ETLStage("trueSink", MockSink.getPlugin(trueSink)))
    .addStage(new ETLStage("falseSink", MockSink.getPlugin(falseSink)))
    .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
    .addStage(new ETLStage("action", MockAction.getPlugin(actionTable, "row1", "key1", "val1")))
    .addConnection("action", "condition")
    .addConnection("condition", "trueSource", true)
    .addConnection("condition", "falseSource", false)
    .addConnection("trueSource", "trueSink")
    .addConnection("falseSource", "falseSink")
    .build();

  AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT_RANGE, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app(appName);
  ApplicationManager pipeline = deployApplication(appId, deployRequest);

  StructuredRecord samuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord bob = StructuredRecord.builder(schema).set("name", "bob").build();

  for (String branch : Arrays.asList("true", "false")) {
    boolean onTrueBranch = branch.equals("true");
    String source;
    String sink;
    if (onTrueBranch) {
      source = trueSource;
      sink = trueSink;
    } else {
      source = falseSource;
      sink = falseSink;
    }

    // write records to the source of the branch about to be executed
    DataSetManager<Table> sourceTable = getDataset(NamespaceId.DEFAULT.dataset(source));
    MockSource.writeInput(sourceTable, ImmutableList.of(samuel, bob));

    WorkflowManager workflow = pipeline.getWorkflowManager(SmartWorkflow.NAME);
    workflow.start(ImmutableMap.of("condition.branch.to.execute", branch));
    // the "false" iteration runs second, so it waits for two COMPLETED runs
    // (presumably counting the run from the "true" iteration as well)
    if (onTrueBranch) {
      workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    } else {
      workflow.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    }

    // check sink
    DataSetManager<Table> sinkTable = getDataset(sink);
    Set<StructuredRecord> expected = ImmutableSet.of(samuel, bob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkTable));
    Assert.assertEquals(expected, actual);

    validateMetric(2, appId, branch + "Source.records.out");
    validateMetric(2, appId, branch + "Sink.records.in");

    // check Action is executed correctly
    DataSetManager<Table> actionOutput = getDataset(actionTable);
    Assert.assertEquals("val1", MockAction.readOutput(actionOutput, "row1", "key1"));
  }
}
Aggregations