Examples with ETLConfig - io.cdap.cdap.etl.proto.v2.ETLConfig

Example 61 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

In class DataPipelineTest, method testMacrosMapReducePipeline.

/**
 * Deploys a two-stage batch pipeline whose dataset names are given as macros, runs it with runtime
 * arguments, and checks that the named datasets are absent before the run and present after it.
 */
@Test
public void testMacrosMapReducePipeline() throws Exception {
    /*
     * Minimal batch pipeline, one source feeding one sink:
     *
     * source --------- sink
     */
    // Dataset names that the macros below are expected to resolve to at runtime.
    String resolvedSourceDataset = "mockRuntimeMRSourceDataset";
    String resolvedSinkDataset = "mockRuntimeMRSinkDataset";

    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", "${runtime${source}}")))
        .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", "${runtime}${sink}")))
        .addConnection("source", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
    ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));

    // Values used for macro substitution: "${runtime${source}}" nests "${source}" inside the key,
    // while "${runtime}${sink}" concatenates two independent macros.
    Map<String, String> macroValues = ImmutableMap.of(
        "runtime", "mockRuntime",
        "sink", "MRSinkDataset",
        "source", "Source",
        "runtimeSource", resolvedSourceDataset);

    // Deployment alone must not have created the datasets.
    Assert.assertNull(getDataset(resolvedSourceDataset).get());
    Assert.assertNull(getDataset(resolvedSinkDataset).get());

    WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflow.setRuntimeArgs(macroValues);
    workflow.start();
    workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    // After a completed run both datasets should exist.
    Assert.assertNotNull(getDataset(resolvedSourceDataset).get());
    Assert.assertNotNull(getDataset(resolvedSinkDataset).get());
}

Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Example 62 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

In class DataPipelineTest, method testPipelineWithActions.

/**
 * Runs a pipeline in which two actions precede a source-to-sink transfer and a third action follows
 * it, then verifies the sink contents, the values written by each action, the record metrics, and
 * that no workflow token key carries the stage-statistics prefix.
 *
 * @param engine the execution engine to configure on the pipeline; also used as a suffix so that
 *               dataset, stage and application names differ per engine
 * @throws Exception if deployment, execution or any verification step fails
 */
private void testPipelineWithActions(Engine engine) throws Exception {
    String actionTable = "actionTable-" + engine;
    String action1RowKey = "action1.row";
    String action1ColumnKey = "action1.column";
    String action1Value = "action1.value";
    String action2RowKey = "action2.row";
    String action2ColumnKey = "action2.column";
    String action2Value = "action2.value";
    String action3RowKey = "action3.row";
    String action3ColumnKey = "action3.column";
    String action3Value = "action3.value";
    String sourceName = "actionSource-" + engine;
    String sinkName = "actionSink-" + engine;
    String sourceTableName = "actionSourceTable-" + engine;
    String sinkTableName = "actionSinkTable-" + engine;
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

    /*
     * action1 --> action2 --> source --> sink --> action3
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, action1RowKey, action1ColumnKey, action1Value)))
        .addStage(new ETLStage("action2", MockAction.getPlugin(actionTable, action2RowKey, action2ColumnKey, action2Value)))
        .addStage(new ETLStage("action3", MockAction.getPlugin(actionTable, action3RowKey, action3ColumnKey, action3Value)))
        .addStage(new ETLStage(sourceName, MockSource.getPlugin(sourceTableName, schema)))
        .addStage(new ETLStage(sinkName, MockSink.getPlugin(sinkTableName)))
        .addConnection(sourceName, sinkName)
        .addConnection("action1", "action2")
        .addConnection("action2", sourceName)
        .addConnection(sinkName, "action3")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("MyApp-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);

    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();

    // write records to source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(sourceTableName));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));

    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    // check sink: compared as sets, so record order is not significant
    DataSetManager<Table> sinkManager = getDataset(sinkTableName);
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);

    // check that each of the three actions wrote its value to the shared action table
    DataSetManager<Table> actionTableDS = getDataset(actionTable);
    Assert.assertEquals(action1Value, MockAction.readOutput(actionTableDS, action1RowKey, action1ColumnKey));
    Assert.assertEquals(action2Value, MockAction.readOutput(actionTableDS, action2RowKey, action2ColumnKey));
    Assert.assertEquals(action3Value, MockAction.readOutput(actionTableDS, action3RowKey, action3ColumnKey));

    validateMetric(2, appId, sourceName + ".records.out");
    validateMetric(2, appId, sinkName + ".records.in");

    List<RunRecord> history = workflowManager.getHistory(ProgramRunStatus.COMPLETED);
    Assert.assertEquals(1, history.size());
    String runId = history.get(0).getPid();

    // Neither token scope may contain a key that starts with the stage-statistics prefix.
    // The failure message names the scope and the offending key to make a regression easy to locate.
    String statisticsPrefix = io.cdap.cdap.etl.common.Constants.StageStatistics.PREFIX;
    for (WorkflowToken.Scope scope : Arrays.asList(WorkflowToken.Scope.SYSTEM, WorkflowToken.Scope.USER)) {
        WorkflowTokenDetail token = workflowManager.getToken(runId, scope, null);
        for (String tokenKey : token.getTokenData().keySet()) {
            Assert.assertFalse("Unexpected stage statistics key '" + tokenKey + "' in " + scope + " workflow token",
                               tokenKey.startsWith(statisticsPrefix));
        }
    }
}

Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) RunRecord(io.cdap.cdap.proto.RunRecord) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) WorkflowTokenDetail(io.cdap.cdap.proto.WorkflowTokenDetail) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap)

Example 63 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

In class DataPipelineTest, method testMultipleOrderedInputActions.

/**
 * Deploys a pipeline in which two chains of actions both feed a condition that selects one of two
 * source-to-sink branches, runs it once per branch, and verifies the selected sink, the record
 * metrics and the values written by all four actions after each run.
 */
@Test
public void testMultipleOrderedInputActions() throws Exception {
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    /*
     * action1--->action2---|
     *                      |--> condition --(true)---> trueSource ----> trueSink
     * action3--->action4---|        |
     *                               |-----(false)--> falseSource ---> falseSink
     */
    String appName = "MultipleOrderedInputActions";
    String trueSourceTable = "true" + appName + "Source";
    String falseSourceTable = "false" + appName + "Source";
    String trueSinkTable = "true" + appName + "Sink";
    String falseSinkTable = "false" + appName + "Sink";
    String actionTable = "actionTable" + appName;

    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("trueSource", MockSource.getPlugin(trueSourceTable, schema)))
        .addStage(new ETLStage("falseSource", MockSource.getPlugin(falseSourceTable, schema)))
        .addStage(new ETLStage("trueSink", MockSink.getPlugin(trueSinkTable)))
        .addStage(new ETLStage("falseSink", MockSink.getPlugin(falseSinkTable)))
        .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
        .addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, "row1", "key1", "val1")))
        .addStage(new ETLStage("action2", MockAction.getPlugin(actionTable, "row2", "key2", "val2", "row1key1", "val1")))
        .addStage(new ETLStage("action3", MockAction.getPlugin(actionTable, "row3", "key3", "val3")))
        .addStage(new ETLStage("action4", MockAction.getPlugin(actionTable, "row4", "key4", "val4", "row3key3", "val3")))
        .addConnection("action1", "action2")
        .addConnection("action3", "action4")
        .addConnection("action2", "condition")
        .addConnection("action4", "condition")
        .addConnection("condition", "trueSource", true)
        .addConnection("condition", "falseSource", false)
        .addConnection("trueSource", "trueSink")
        .addConnection("falseSource", "falseSink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app(appName);
    ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT_RANGE, pipelineConfig));

    StructuredRecord samuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord bob = StructuredRecord.builder(schema).set("name", "bob").build();

    for (String branch : Arrays.asList("true", "false")) {
        boolean trueBranch = branch.equals("true");
        String sourceTable;
        String sinkTable;
        if (trueBranch) {
            sourceTable = trueSourceTable;
            sinkTable = trueSinkTable;
        } else {
            sourceTable = falseSourceTable;
            sinkTable = falseSinkTable;
        }

        // feed the source of the branch that is about to run
        DataSetManager<Table> sourceManager = getDataset(NamespaceId.DEFAULT.dataset(sourceTable));
        MockSource.writeInput(sourceManager, ImmutableList.of(samuel, bob));

        WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
        workflow.start(ImmutableMap.of("condition.branch.to.execute", branch));
        if (trueBranch) {
            // first run of the workflow
            workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
        } else {
            // second run: wait until two completed runs are recorded
            workflow.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
        }

        // the sink of the executed branch must hold exactly the two input records
        DataSetManager<Table> sinkManager = getDataset(sinkTable);
        Set<StructuredRecord> writtenRecords = Sets.newHashSet(MockSink.readOutput(sinkManager));
        Assert.assertEquals(ImmutableSet.of(samuel, bob), writtenRecords);
        validateMetric(2, appId, branch + "Source.records.out");
        validateMetric(2, appId, branch + "Sink.records.in");

        // all four actions must have written their values to the action table
        DataSetManager<Table> actionTableManager = getDataset(actionTable);
        Assert.assertEquals("val1", MockAction.readOutput(actionTableManager, "row1", "key1"));
        Assert.assertEquals("val2", MockAction.readOutput(actionTableManager, "row2", "key2"));
        Assert.assertEquals("val3", MockAction.readOutput(actionTableManager, "row3", "key3"));
        Assert.assertEquals("val4", MockAction.readOutput(actionTableManager, "row4", "key4"));
    }
}

Example 64 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

In class DataPipelineTest, method testExternalSparkProgramPipelines.

/**
 * Runs a pipeline made of two external Spark program stages (a word count followed by a line
 * filter) over a small text file and verifies the output files written by each program.
 *
 * <p>All file I/O uses UTF-8 explicitly so the test does not depend on the platform default charset.
 *
 * @throws Exception if the input cannot be written, the pipeline fails, or an output file cannot be read
 */
@Test
public void testExternalSparkProgramPipelines() throws Exception {
    File testDir = TMP_FOLDER.newFolder("sparkProgramTest");
    File input = new File(testDir, "poem.txt");
    try (PrintWriter writer = new PrintWriter(input, java.nio.charset.StandardCharsets.UTF_8.name())) {
        writer.println("this");
        writer.println("is");
        writer.println("a");
        writer.println("poem");
        writer.println("it");
        writer.println("is");
        writer.println("a");
        writer.println("bad");
        writer.println("poem");
    }
    File wordCountOutput = new File(testDir, "poem_counts");
    File filterOutput = new File(testDir, "poem_filtered");
    String args = String.format("%s %s", input.getAbsolutePath(), wordCountOutput.getAbsolutePath());
    Map<String, String> wordCountProperties = ImmutableMap.of("program.args", args);
    Map<String, String> filterProperties = ImmutableMap.of("inputPath", input.getAbsolutePath(), "outputPath", filterOutput.getAbsolutePath(), "filterStr", "bad");
    ETLBatchConfig etlConfig = io.cdap.cdap.etl.proto.v2.ETLBatchConfig.builder()
        .addStage(new ETLStage("wordcount", new ETLPlugin(WORDCOUNT_PLUGIN, SPARK_TYPE, wordCountProperties, null)))
        .addStage(new ETLStage("filter", new ETLPlugin(FILTER_PLUGIN, SPARK_TYPE, filterProperties, null)))
        .addConnection("wordcount", "filter")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("sparkProgramTest");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    manager.start();
    manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);

    // check wordcount output
    /*
        this is a poem
        it is a bad poem
     */
    Map<String, Integer> expected = new HashMap<>();
    expected.put("this", 1);
    expected.put("is", 2);
    expected.put("a", 2);
    expected.put("poem", 2);
    expected.put("it", 1);
    expected.put("bad", 1);
    Map<String, Integer> counts = new HashMap<>();
    File[] files = wordCountOutput.listFiles();
    Assert.assertNotNull("No output files for wordcount found.", files);
    for (File file : files) {
        if (isNonDataOutputFile(file)) {
            continue;
        }
        // each data line is parsed as "<word> <count>"
        for (String line : java.nio.file.Files.readAllLines(file.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
            String[] fields = line.split(" ");
            counts.put(fields[0], Integer.parseInt(fields[1]));
        }
    }
    Assert.assertEquals(expected, counts);

    // check filter output
    files = filterOutput.listFiles();
    Assert.assertNotNull("No output files for filter program found.", files);
    // Note: we are only interested in the word "bad" being filtered out for the assertion hence it is okay to use a
    // set here even though it will not assert for the cardinality.
    Set<String> expectedLines = ImmutableSet.of("this", "is", "a", "poem", "it", "is", "a", "poem");
    Set<String> actualLines = new HashSet<>();
    for (File file : files) {
        if (isNonDataOutputFile(file)) {
            continue;
        }
        actualLines.addAll(java.nio.file.Files.readAllLines(file.toPath(), java.nio.charset.StandardCharsets.UTF_8));
    }
    Assert.assertEquals(expectedLines, actualLines);
}

/**
 * Tells whether an entry of a program output directory should be skipped when reading results:
 * any file whose name starts with "." and the "_SUCCESS" marker file.
 */
private static boolean isNonDataOutputFile(File file) {
    String fileName = file.getName();
    return fileName.startsWith(".") || fileName.equals("_SUCCESS");
}

Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) HashMap(java.util.HashMap) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) File(java.io.File) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 65 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

In class DataPipelineTest, method testNoMacroMapReduce.

/**
 * Verifies that when the dataset name property is a plain string rather than a macro, the datasets
 * already exist once the application has been deployed, before any run is started.
 */
@Test
public void testNoMacroMapReduce() throws Exception {
    /*
     * Minimal batch pipeline, one source feeding one sink:
     *
     * source --------- sink
     */
    String configTimeSourceDataset = "configTimeMockSourceDataset";
    String configTimeSinkDataset = "configTimeMockSinkDataset";

    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("mrinput", configTimeSourceDataset)))
        .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("mroutput", configTimeSinkDataset)))
        .addConnection("source", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("MRApp");
    ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);

    // The datasets must already be there: they are expected to be created at configure time.
    Assert.assertNotNull(getDataset(configTimeSourceDataset).get());
    Assert.assertNotNull(getDataset(configTimeSinkDataset).get());

    // Runtime arguments are still supplied, although the stage configuration above contains no macros.
    Map<String, String> macroValues = ImmutableMap.of(
        "runtime", "mockRuntime",
        "sink", "SinkDataset",
        "source", "Source",
        "runtimeSource", "mockRuntimeSourceDataset");
    workflow.setRuntimeArgs(macroValues);
    workflow.start();
    workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}

Aggregations

ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 84, ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 75, AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 59, ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 59, Test (org.junit.Test): 54, ApplicationManager (io.cdap.cdap.test.ApplicationManager): 53, Table (io.cdap.cdap.api.dataset.table.Table): 46, StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 45, Schema (io.cdap.cdap.api.data.schema.Schema): 45, WorkflowManager (io.cdap.cdap.test.WorkflowManager): 45, KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 35, HashSet (java.util.HashSet): 15, ArrayList (java.util.ArrayList): 14, HashMap (java.util.HashMap): 11, ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 9, SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage): 8, Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage): 7, Relation (io.cdap.cdap.data2.metadata.lineage.Relation): 7, DatasetFieldLineageSummary (io.cdap.cdap.metadata.DatasetFieldLineageSummary): 7, FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin): 7