use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
the class DataPipelineTest method testMacrosMapReducePipeline.
/**
 * Deploys a trivial source-to-sink batch pipeline whose dataset names are given as macros,
 * then verifies that the datasets are absent before the run and present after the workflow
 * has completed with the runtime arguments that resolve those macros.
 */
@Test
public void testMacrosMapReducePipeline() throws Exception {
  // Pipeline shape: source --------- sink
  // The source dataset name uses a nested macro, the sink name concatenates two macros.
  ETLStage sourceStage =
    new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "${runtime${source}}"));
  ETLStage sinkStage =
    new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "${runtime}${sink}"));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(sourceStage)
    .addStage(sinkStage)
    .addConnection("source", "sink")
    .build();

  AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("MRApp");
  ApplicationManager pipelineManager = deployApplication(pipelineId, request);

  // Dataset names the assertions below look up before and after the run.
  String sourceDataset = "mockRuntimeMRSourceDataset";
  String sinkDataset = "mockRuntimeMRSinkDataset";

  // Neither dataset may exist until the workflow has actually run.
  Assert.assertNull(getDataset(sourceDataset).get());
  Assert.assertNull(getDataset(sinkDataset).get());

  // Runtime arguments used for macro substitution.
  Map<String, String> macroValues = ImmutableMap.of(
    "runtime", "mockRuntime",
    "sink", "MRSinkDataset",
    "source", "Source",
    "runtimeSource", "mockRuntimeMRSourceDataset");

  WorkflowManager workflow = pipelineManager.getWorkflowManager(SmartWorkflow.NAME);
  workflow.setRuntimeArgs(macroValues);
  workflow.start();
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // After a completed run both datasets must be there.
  Assert.assertNotNull(getDataset(sourceDataset).get());
  Assert.assertNotNull(getDataset(sinkDataset).get());
}
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
the class DataPipelineTest method testPipelineWithActions.
/**
 * Deploys and runs the pipeline {@code action1 -> action2 -> source -> sink -> action3} on the
 * given engine, then checks the sink contents, the values written by each action, the record
 * metrics, and that no workflow token key carries the stage-statistics prefix.
 *
 * @param engine execution engine to configure on the pipeline; also used to make the
 *               application, stage and table names unique
 * @throws Exception if deployment, execution or any lookup fails
 */
private void testPipelineWithActions(Engine engine) throws Exception {
  // Names are suffixed with the engine so runs on different engines do not share state.
  String actionTable = "actionTable-" + engine;
  String sourceName = "actionSource-" + engine;
  String sinkName = "actionSink-" + engine;
  String sourceTableName = "actionSourceTable-" + engine;
  String sinkTableName = "actionSinkTable-" + engine;

  // Row key / column key / value written by each of the three actions.
  String action1RowKey = "action1.row";
  String action1ColumnKey = "action1.column";
  String action1Value = "action1.value";
  String action2RowKey = "action2.row";
  String action2ColumnKey = "action2.column";
  String action2Value = "action2.value";
  String action3RowKey = "action3.row";
  String action3ColumnKey = "action3.column";
  String action3Value = "action3.value";

  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(new ETLStage("action1",
                           MockAction.getPlugin(actionTable, action1RowKey, action1ColumnKey, action1Value)))
    .addStage(new ETLStage("action2",
                           MockAction.getPlugin(actionTable, action2RowKey, action2ColumnKey, action2Value)))
    .addStage(new ETLStage("action3",
                           MockAction.getPlugin(actionTable, action3RowKey, action3ColumnKey, action3Value)))
    .addStage(new ETLStage(sourceName, MockSource.getPlugin(sourceTableName, schema)))
    .addStage(new ETLStage(sinkName, MockSink.getPlugin(sinkTableName)))
    .addConnection(sourceName, sinkName)
    .addConnection("action1", "action2")
    .addConnection("action2", sourceName)
    .addConnection(sinkName, "action3")
    .setEngine(engine)
    .build();

  AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("MyApp-" + engine);
  ApplicationManager pipelineManager = deployApplication(appId, request);

  StructuredRecord samuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord bob = StructuredRecord.builder(schema).set("name", "bob").build();

  // Seed the source table with the two input records.
  DataSetManager<Table> sourceTable = getDataset(NamespaceId.DEFAULT.dataset(sourceTableName));
  MockSource.writeInput(sourceTable, ImmutableList.of(samuel, bob));

  WorkflowManager workflow = pipelineManager.getWorkflowManager(SmartWorkflow.NAME);
  workflow.start();
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // The sink must contain exactly the records that were written to the source.
  DataSetManager<Table> sinkTable = getDataset(sinkTableName);
  Set<StructuredRecord> expectedRecords = ImmutableSet.of(samuel, bob);
  Set<StructuredRecord> writtenRecords = Sets.newHashSet(MockSink.readOutput(sinkTable));
  Assert.assertEquals(expectedRecords, writtenRecords);

  // Each action must have written its value into the shared action table.
  DataSetManager<Table> actionTableManager = getDataset(actionTable);
  Assert.assertEquals(action1Value, MockAction.readOutput(actionTableManager, action1RowKey, action1ColumnKey));
  Assert.assertEquals(action2Value, MockAction.readOutput(actionTableManager, action2RowKey, action2ColumnKey));
  Assert.assertEquals(action3Value, MockAction.readOutput(actionTableManager, action3RowKey, action3ColumnKey));

  validateMetric(2, appId, sourceName + ".records.out");
  validateMetric(2, appId, sinkName + ".records.in");

  List<RunRecord> completedRuns = workflow.getHistory(ProgramRunStatus.COMPLETED);
  Assert.assertEquals(1, completedRuns.size());
  String runId = completedRuns.get(0).getPid();

  // In both token scopes, no key may start with the stage-statistics prefix.
  String statisticsPrefix = io.cdap.cdap.etl.common.Constants.StageStatistics.PREFIX;
  for (WorkflowToken.Scope scope : Arrays.asList(WorkflowToken.Scope.SYSTEM, WorkflowToken.Scope.USER)) {
    WorkflowTokenDetail tokenDetail = workflow.getToken(runId, scope, null);
    for (String tokenKey : tokenDetail.getTokenData().keySet()) {
      Assert.assertFalse(tokenKey.startsWith(statisticsPrefix));
    }
  }
}
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
the class DataPipelineTest method testMultipleOrderedInputActions.
/**
 * Deploys a pipeline in which two ordered action chains feed a condition that selects one of
 * two source-to-sink branches, then runs it once per branch and verifies the sink contents,
 * the record metrics and the values written by all four actions.
 */
@Test
public void testMultipleOrderedInputActions() throws Exception {
  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  /*
   * action1 ---> action2 ---|
   *                         |--> condition --(true)---> trueSource ----> trueSink
   * action3 ---> action4 ---|        |
   *                                  |------(false)--> falseSource ---> falseSink
   */
  String appName = "MultipleOrderedInputActions";
  String trueSourceTable = "true" + appName + "Source";
  String falseSourceTable = "false" + appName + "Source";
  String trueSinkTable = "true" + appName + "Sink";
  String falseSinkTable = "false" + appName + "Sink";
  String actionTable = "actionTable" + appName;

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(new ETLStage("trueSource", MockSource.getPlugin(trueSourceTable, schema)))
    .addStage(new ETLStage("falseSource", MockSource.getPlugin(falseSourceTable, schema)))
    .addStage(new ETLStage("trueSink", MockSink.getPlugin(trueSinkTable)))
    .addStage(new ETLStage("falseSink", MockSink.getPlugin(falseSinkTable)))
    .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
    .addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, "row1", "key1", "val1")))
    .addStage(new ETLStage("action2",
                           MockAction.getPlugin(actionTable, "row2", "key2", "val2", "row1key1", "val1")))
    .addStage(new ETLStage("action3", MockAction.getPlugin(actionTable, "row3", "key3", "val3")))
    .addStage(new ETLStage("action4",
                           MockAction.getPlugin(actionTable, "row4", "key4", "val4", "row3key3", "val3")))
    .addConnection("action1", "action2")
    .addConnection("action3", "action4")
    .addConnection("action2", "condition")
    .addConnection("action4", "condition")
    .addConnection("condition", "trueSource", true)
    .addConnection("condition", "falseSource", false)
    .addConnection("trueSource", "trueSink")
    .addConnection("falseSource", "falseSink")
    .build();

  AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT_RANGE, pipelineConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app(appName);
  ApplicationManager pipelineManager = deployApplication(appId, request);

  StructuredRecord samuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord bob = StructuredRecord.builder(schema).set("name", "bob").build();

  for (String branch : Arrays.asList("true", "false")) {
    boolean onTrueBranch = branch.equals("true");
    String sourceTable;
    String sinkTable;
    if (onTrueBranch) {
      sourceTable = trueSourceTable;
      sinkTable = trueSinkTable;
    } else {
      sourceTable = falseSourceTable;
      sinkTable = falseSinkTable;
    }

    // Seed the source of the branch that is about to be executed.
    DataSetManager<Table> sourceManager = getDataset(NamespaceId.DEFAULT.dataset(sourceTable));
    MockSource.writeInput(sourceManager, ImmutableList.of(samuel, bob));

    WorkflowManager workflow = pipelineManager.getWorkflowManager(SmartWorkflow.NAME);
    workflow.start(ImmutableMap.of("condition.branch.to.execute", branch));
    if (onTrueBranch) {
      workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    } else {
      // Second iteration of the loop: wait until two runs are in COMPLETED state.
      workflow.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    }

    // The sink of the executed branch must hold exactly the input records.
    DataSetManager<Table> sinkManager = getDataset(sinkTable);
    Set<StructuredRecord> expectedRecords = ImmutableSet.of(samuel, bob);
    Set<StructuredRecord> writtenRecords = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expectedRecords, writtenRecords);

    validateMetric(2, appId, branch + "Source.records.out");
    validateMetric(2, appId, branch + "Sink.records.in");

    // All four actions must have written their values to the action table.
    DataSetManager<Table> actionTableManager = getDataset(actionTable);
    Assert.assertEquals("val1", MockAction.readOutput(actionTableManager, "row1", "key1"));
    Assert.assertEquals("val2", MockAction.readOutput(actionTableManager, "row2", "key2"));
    Assert.assertEquals("val3", MockAction.readOutput(actionTableManager, "row3", "key3"));
    Assert.assertEquals("val4", MockAction.readOutput(actionTableManager, "row4", "key4"));
  }
}
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
the class DataPipelineTest method testExternalSparkProgramPipelines.
/**
 * Runs a pipeline made of two external Spark programs (a word count followed by a filter)
 * over a small text file and verifies the files each program writes to its output directory.
 */
@Test
public void testExternalSparkProgramPipelines() throws Exception {
  File testDir = TMP_FOLDER.newFolder("sparkProgramTest");
  File input = new File(testDir, "poem.txt");
  try (PrintWriter writer = new PrintWriter(input.getAbsolutePath())) {
    writer.println("this");
    writer.println("is");
    writer.println("a");
    writer.println("poem");
    writer.println("it");
    writer.println("is");
    writer.println("a");
    writer.println("bad");
    writer.println("poem");
  }
  File wordCountOutput = new File(testDir, "poem_counts");
  File filterOutput = new File(testDir, "poem_filtered");
  String args = String.format("%s %s", input.getAbsolutePath(), wordCountOutput.getAbsolutePath());
  Map<String, String> wordCountProperties = ImmutableMap.of("program.args", args);
  Map<String, String> filterProperties = ImmutableMap.of(
    "inputPath", input.getAbsolutePath(),
    "outputPath", filterOutput.getAbsolutePath(),
    "filterStr", "bad");

  ETLBatchConfig etlConfig = ETLBatchConfig.builder()
    .addStage(new ETLStage("wordcount", new ETLPlugin(WORDCOUNT_PLUGIN, SPARK_TYPE, wordCountProperties, null)))
    .addStage(new ETLStage("filter", new ETLPlugin(FILTER_PLUGIN, SPARK_TYPE, filterProperties, null)))
    .addConnection("wordcount", "filter")
    .build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("sparkProgramTest");
  ApplicationManager appManager = deployApplication(appId, appRequest);
  WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  manager.start();
  manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);

  // check wordcount output
  /*
      this is a poem
      it is a bad poem
   */
  Map<String, Integer> expected = new HashMap<>();
  expected.put("this", 1);
  expected.put("is", 2);
  expected.put("a", 2);
  expected.put("poem", 2);
  expected.put("it", 1);
  expected.put("bad", 1);

  File[] files = wordCountOutput.listFiles();
  Assert.assertNotNull("No output files for wordcount found.", files);
  Map<String, Integer> counts = new HashMap<>();
  for (String line : readSparkOutputLines(files)) {
    String[] fields = line.split(" ");
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    Assert.assertTrue("Malformed wordcount output line: " + line, fields.length >= 2);
    counts.put(fields[0], Integer.parseInt(fields[1]));
  }
  Assert.assertEquals(expected, counts);

  // check filter output
  files = filterOutput.listFiles();
  Assert.assertNotNull("No output files for filter program found.", files);
  // Note: we are only interested in the word "bad" being filtered out for the assertion hence it is okay to use a
  // set here even though it will not assert for the cardinality.
  Set<String> expectedLines = ImmutableSet.of("this", "is", "a", "poem", "it");
  Set<String> actualLines = new HashSet<>(readSparkOutputLines(files));
  Assert.assertEquals(expectedLines, actualLines);
}

/**
 * Reads all lines, in file order, from the data files among {@code files}. Files whose name
 * starts with "." and the "_SUCCESS" marker file are skipped.
 *
 * @param files non-null listing of an output directory
 * @return the lines of every data file, concatenated in the order the files were given
 * @throws Exception if a file cannot be opened or read
 */
private static List<String> readSparkOutputLines(File[] files) throws Exception {
  List<String> lines = new java.util.ArrayList<>();
  for (File file : files) {
    String fileName = file.getName();
    if (fileName.startsWith(".") || fileName.equals("_SUCCESS")) {
      continue;
    }
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
      String line;
      while ((line = reader.readLine()) != null) {
        lines.add(line);
      }
    }
  }
  return lines;
}
use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.
the class DataPipelineTest method testNoMacroMapReduce.
/**
 * Verifies that when the dataset name properties are plain strings rather than macros, the
 * datasets already exist once the application is deployed, i.e. before the workflow runs.
 */
@Test
public void testNoMacroMapReduce() throws Exception {
  // Pipeline shape: source --------- sink
  ETLStage sourceStage =
    new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "configTimeMockSourceDataset"));
  ETLStage sinkStage =
    new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "configTimeMockSinkDataset"));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(sourceStage)
    .addStage(sinkStage)
    .addConnection("source", "sink")
    .build();

  AppRequest<ETLBatchConfig> request = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("MRApp");
  ApplicationManager pipelineManager = deployApplication(pipelineId, request);
  WorkflowManager workflow = pipelineManager.getWorkflowManager(SmartWorkflow.NAME);

  // The datasets must already be present: nothing has been started yet.
  Assert.assertNotNull(getDataset("configTimeMockSourceDataset").get());
  Assert.assertNotNull(getDataset("configTimeMockSinkDataset").get());

  // Runtime arguments are still supplied, although this pipeline's config contains no macros.
  Map<String, String> runtimeArgs = ImmutableMap.of(
    "runtime", "mockRuntime",
    "sink", "SinkDataset",
    "source", "Source",
    "runtimeSource", "mockRuntimeSourceDataset");
  workflow.setRuntimeArgs(runtimeArgs);
  workflow.start();
  workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}
Aggregations