use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testPostAction.
@Test
public void testPostAction() throws Exception {
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin("actionInput"))).addStage(new ETLStage("sink", MockSink.getPlugin("actionOutput"))).addPostAction(new ETLStage("tokenWriter", NodeStatesAction.getPlugin("tokenTable"))).addConnection("source", "sink").build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ActionApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("actionInput"));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> tokenTableManager = getDataset(NamespaceId.DEFAULT.dataset("tokenTable"));
Table tokenTable = tokenTableManager.get();
NodeStatus status = NodeStatus.valueOf(Bytes.toString(tokenTable.get(Bytes.toBytes("phase-1"), Bytes.toBytes("status"))));
Assert.assertEquals(NodeStatus.COMPLETED, status);
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testSinglePhaseWithSparkSink.
private void testSinglePhaseWithSparkSink() throws Exception {
/*
* source1 ---|
* |--> sparksink
* source2 ---|
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA))).addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA))).addStage(new ETLStage("customsink", new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE, ImmutableMap.of("fileSetName", "modelFileSet", "path", "output", "fieldToClassify", SpamMessage.TEXT_FIELD, "predictionField", SpamMessage.SPAM_PREDICTION_FIELD), null))).addConnection("source1", "customsink").addConnection("source2", "customsink").build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
// set up five spam messages and five non-spam messages to be used for classification
List<StructuredRecord> messagesToWrite = new ArrayList<>();
messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
// write records to source1
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
MockSource.writeInput(inputManager, messagesToWrite);
messagesToWrite.clear();
messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
// write records to source2
inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
MockSource.writeInput(inputManager, messagesToWrite);
// ingest in some messages to be classified
DataSetManager<FileSet> fileSetManager = getDataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
FileSet fileSet = fileSetManager.get();
try (PrintStream out = new PrintStream(fileSet.getLocation("inputTexts").getOutputStream(), true, "UTF-8")) {
out.println("how are you doing today");
out.println("free money money");
out.println("what are you doing today");
out.println("genuine report");
}
// manually trigger the pipeline
Map<String, String> runtimeArgs = new HashMap<>();
FileSetArguments.setInputPath(runtimeArgs, "inputTexts");
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(runtimeArgs);
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
// only 'free money money' should be predicated as spam
Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);
validateMetric(5, appId, "source1.records.out");
validateMetric(5, appId, "source2.records.out");
validateMetric(10, appId, "customsink.records.in");
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method deployPipelineWithSchedule.
private WorkflowManager deployPipelineWithSchedule(String pipelineName, Engine engine, String triggeringPipelineName, ArgumentMapping key1Mapping, String expectedKey1Value, PluginPropertyMapping key2Mapping, String expectedKey2Value) throws Exception {
String tableName = "actionScheduleTable" + pipelineName + engine;
String sourceName = "macroActionWithScheduleInput-" + pipelineName + engine;
String sinkName = "macroActionWithScheduleOutput-" + pipelineName + engine;
String key1 = key1Mapping.getTarget();
String key2 = key2Mapping.getTarget();
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("action1", MockAction.getPlugin(tableName, "row1", "column1", String.format("${%s}", key1)))).addStage(new ETLStage("action2", MockAction.getPlugin(tableName, "row2", "column2", String.format("${%s}", key2)))).addStage(new ETLStage("source", MockSource.getPlugin(sourceName))).addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", String.format("${%s}", key1)))).addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", String.format("${%s}", key2)))).addStage(new ETLStage("sink", MockSink.getPlugin(sinkName))).addConnection("action1", "action2").addConnection("action2", "source").addConnection("source", "filter1").addConnection("filter1", "filter2").addConnection("filter2", "sink").setEngine(engine).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(pipelineName);
ApplicationManager appManager = deployApplication(appId, appRequest);
// there should be only two programs - one workflow and one mapreduce/spark
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
// Use the expectedKey1Value and expectedKey2Value as values for two records, so that Only record "samuel"
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordKey1Value = StructuredRecord.builder(schema).set("name", expectedKey1Value).build();
StructuredRecord recordKey2Value = StructuredRecord.builder(schema).set("name", expectedKey2Value).build();
// write one record to each source
DataSetManager<Table> inputManager = getDataset(sourceName);
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordKey1Value, recordKey2Value));
String defaultNamespace = NamespaceId.DEFAULT.getNamespace();
// Use properties from the triggering pipeline as values for runtime argument key1, key2
TriggeringPropertyMapping propertyMapping = new TriggeringPropertyMapping(ImmutableList.of(key1Mapping), ImmutableList.of(key2Mapping));
ProgramStatusTrigger completeTrigger = new ProgramStatusTrigger(new WorkflowId(defaultNamespace, triggeringPipelineName, SmartWorkflow.NAME), ImmutableSet.of(ProgramStatus.COMPLETED));
ScheduleId scheduleId = appId.schedule("completeSchedule");
appManager.addSchedule(new ScheduleDetail(scheduleId.getNamespace(), scheduleId.getApplication(), scheduleId.getVersion(), scheduleId.getSchedule(), "", new ScheduleProgramInfo(SchedulableProgramType.WORKFLOW, SmartWorkflow.NAME), ImmutableMap.of(SmartWorkflow.TRIGGERING_PROPERTIES_MAPPING, GSON.toJson(propertyMapping)), completeTrigger, ImmutableList.of(), Schedulers.JOB_QUEUE_TIMEOUT_MILLIS, null, null));
appManager.enableSchedule(scheduleId);
return appManager.getWorkflowManager(SmartWorkflow.NAME);
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testKVTableLookup.
@Test
public void testKVTableLookup() throws Exception {
addDatasetInstance(KeyValueTable.class.getName(), "ageTable");
DataSetManager<KeyValueTable> lookupTable = getDataset("ageTable");
lookupTable.get().write("samuel".getBytes(Charsets.UTF_8), "12".getBytes(Charsets.UTF_8));
lookupTable.get().write("bob".getBytes(Charsets.UTF_8), "36".getBytes(Charsets.UTF_8));
lookupTable.get().write("jane".getBytes(Charsets.UTF_8), "25".getBytes(Charsets.UTF_8));
lookupTable.flush();
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin("inputTable"))).addStage(new ETLStage("transform", LookupTransform.getPlugin("person", "age", "ageTable"))).addStage(new ETLStage("sink", MockSink.getPlugin("outputTable"))).addConnection("source", "transform").addConnection("transform", "sink").build();
ApplicationId appId = NamespaceId.DEFAULT.app("testKVTableLookup");
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
// set up input data
Schema inputSchema = Schema.recordOf("person", Schema.Field.of("person", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema).set("person", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema).set("person", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema).set("person", "jane").build();
DataSetManager<Table> inputTable = getDataset("inputTable");
MockSource.writeInput(inputTable, ImmutableList.of(recordSamuel, recordBob, recordJane));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME).start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
Schema schema = Schema.recordOf("person", Schema.Field.of("person", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.of(Schema.Type.STRING)));
Set<StructuredRecord> expected = new HashSet<>();
expected.add(StructuredRecord.builder(schema).set("person", "samuel").set("age", "12").build());
expected.add(StructuredRecord.builder(schema).set("person", "bob").set("age", "36").build());
expected.add(StructuredRecord.builder(schema).set("person", "jane").set("age", "25").build());
DataSetManager<Table> outputTable = getDataset("outputTable");
Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(outputTable));
Assert.assertEquals(expected, actual);
validateMetric(3, appId, "source.records.out");
validateMetric(3, appId, "sink.records.in");
deleteDatasetInstance(NamespaceId.DEFAULT.dataset("inputTable"));
deleteDatasetInstance(NamespaceId.DEFAULT.dataset("outputTable"));
}
use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
the class DataPipelineTest method testNestedCondition.
private void testNestedCondition(Engine engine) throws Exception {
Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
/*
* action1----->|
* |
* action2----->|
* |
* V
* source --> condition1 --> sink1
* |
* |------->condition2 --> sink2
* |
* |-------> condition3----> sink3----->action3----->condition4---->action4
* |
* |------>action5
*/
String appName = "NestedCondition-" + engine;
String source = appName + "Source-" + engine;
String sink1 = appName + "Sink1-" + engine;
String sink2 = appName + "Sink2-" + engine;
String sink3 = appName + "Sink3-" + engine;
String actionTable = "actionTable" + appName + "-" + engine;
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(source, schema))).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2))).addStage(new ETLStage("sink3", MockSink.getPlugin(sink3))).addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, "row1", "key1", "val1"))).addStage(new ETLStage("action2", MockAction.getPlugin(actionTable, "row2", "key2", "val2"))).addStage(new ETLStage("action3", MockAction.getPlugin(actionTable, "row3", "key3", "val3"))).addStage(new ETLStage("action4", MockAction.getPlugin(actionTable, "row4", "key4", "val4"))).addStage(new ETLStage("action5", MockAction.getPlugin(actionTable, "row5", "key5", "val5"))).addStage(new ETLStage("condition1", MockCondition.getPlugin("condition1"))).addStage(new ETLStage("condition2", MockCondition.getPlugin("condition2"))).addStage(new ETLStage("condition3", MockCondition.getPlugin("condition3"))).addStage(new ETLStage("condition4", MockCondition.getPlugin("condition4"))).addConnection("action1", "condition1").addConnection("action2", "condition1").addConnection("source", "condition1").addConnection("condition1", "sink1", true).addConnection("condition1", "condition2", false).addConnection("condition2", "sink2", true).addConnection("condition2", "condition3", false).addConnection("condition3", "sink3", true).addConnection("sink3", "action3").addConnection("action3", "condition4").addConnection("condition4", "action4", true).addConnection("condition4", "action5", false).setEngine(engine).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(appName);
ApplicationManager appManager = deployApplication(appId, appRequest);
StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
// write records to source
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(source));
MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(ImmutableMap.of("condition3.branch.to.execute", "true", "condition4.branch.to.execute", "true"));
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
// check sink
DataSetManager<Table> sinkManager = getDataset(sink3);
Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
Assert.assertEquals(expected, actual);
// other sinks should not have any data in them
sinkManager = getDataset(sink1);
Assert.assertTrue(MockSink.readOutput(sinkManager).isEmpty());
sinkManager = getDataset(sink2);
Assert.assertTrue(MockSink.readOutput(sinkManager).isEmpty());
// check actions
DataSetManager<Table> actionTableDS = getDataset(actionTable);
// action1 is executed
Assert.assertEquals("val1", MockAction.readOutput(actionTableDS, "row1", "key1"));
// action2 is executed
Assert.assertEquals("val2", MockAction.readOutput(actionTableDS, "row2", "key2"));
// action3 is executed
Assert.assertEquals("val3", MockAction.readOutput(actionTableDS, "row3", "key3"));
// action4 is executed
Assert.assertEquals("val4", MockAction.readOutput(actionTableDS, "row4", "key4"));
// action5 should not get executed.
Assert.assertNull(MockAction.readOutput(actionTableDS, "row5", "key5"));
}
Aggregations