Search in sources :

Example 1 with WorkflowManager

use of io.cdap.cdap.test.WorkflowManager in project cdap by caskdata.

From the class DataPipelineTest, method testOuterJoin:

// Deploys and runs a three-input outer-join pipeline (three MockSources -> identity
// transforms -> MockJoiner -> MockSink) on the given engine, then verifies both the
// joined records and the joiner/sink metrics. "t1" is the only required join input,
// so customers with no matching item or transaction still appear in the output with
// null item/transaction fields (see joinRecordMartha below).
private void testOuterJoin(Engine engine) throws Exception {
    // input schemas for the three sources: customers, items, transactions
    Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_name", Schema.of(Schema.Type.STRING)));
    // suffix names with the engine so runs for different engines don't share datasets
    String input1Name = "source1OuterJoinInput-" + engine;
    String input2Name = "source2OuterJoinInput-" + engine;
    String input3Name = "source3OuterJoinInput-" + engine;
    String outputName = "outerJoinOutput-" + engine;
    String joinerName = "outerJoiner-" + engine;
    String sinkName = "outerJoinSink-" + engine;
    // join on customer_id=cust_id=c_id and customer_name=cust_name=c_name;
    // second MockJoiner argument "t1" marks t1 as the required (outer) side
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin(input1Name, inputSchema1))).addStage(new ETLStage("source2", MockSource.getPlugin(input2Name, inputSchema2))).addStage(new ETLStage("source3", MockSource.getPlugin(input3Name, inputSchema3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage(joinerName, MockJoiner.getPlugin("t1.customer_id=t2.cust_id=t3.c_id&" + "t1.customer_name=t2.cust_name=t3.c_name", "t1", ""))).addStage(new ETLStage(sinkName, MockSink.getPlugin(outputName))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", joinerName).addConnection("t2", joinerName).addConnection("t3", joinerName).addConnection(joinerName, sinkName).setEngine(engine).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("OuterJoinApp-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // expected output schema: the non-required sides' fields are nullable
    Schema outSchema = Schema.recordOf("join.output", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
    // martha has no matching item or transaction -> exercises the outer-join null case
    StructuredRecord recordMartha = StructuredRecord.builder(inputSchema1).set("customer_id", "4").set("customer_name", "martha").build();
    StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
    StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
    StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("c_name", "samuel").build();
    StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "2").set("c_name", "bob").build();
    StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "3").set("c_name", "jane").build();
    // write the input records to each of the three source datasets
    // (the same manager variable is re-pointed at each dataset in turn)
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(input1Name));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane, recordMartha));
    inputManager = getDataset(NamespaceId.DEFAULT.dataset(input2Name));
    MockSource.writeInput(inputManager, ImmutableList.of(recordCar, recordBike));
    inputManager = getDataset(NamespaceId.DEFAULT.dataset(input3Name));
    MockSource.writeInput(inputManager, ImmutableList.of(recordTrasCar, recordTrasPlane, recordTrasBike));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // expected joined output: fully-matched rows for samuel and jane, partially-matched
    // row for bob (no item), and martha with only the required t1 fields populated
    StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("c_name", "samuel").build();
    StructuredRecord joinRecordBob = StructuredRecord.builder(outSchema).set("customer_id", "2").set("customer_name", "bob").set("t_id", "2").set("c_id", "2").set("c_name", "bob").build();
    StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "3").set("c_id", "3").set("c_name", "jane").build();
    StructuredRecord joinRecordMartha = StructuredRecord.builder(outSchema).set("customer_id", "4").set("customer_name", "martha").build();
    DataSetManager<Table> sinkManager = getDataset(outputName);
    // compare as sets: output record order is not asserted
    Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordBob, joinRecordMartha);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    // one output record per customer -> 4 records through the joiner and into the sink
    validateMetric(4, appId, joinerName + ".records.out");
    validateMetric(4, appId, sinkName + ".records.in");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)

Example 2 with WorkflowManager

use of io.cdap.cdap.test.WorkflowManager in project cdap by caskdata.

From the class DataPipelineTest, method runHeadTriggeringPipeline:

/**
 * Deploys and runs a single-action "head" pipeline on the given engine.
 * {@code expectedValue1} is passed in via the runtime argument "head-arg";
 * {@code expectedValue2} is written to "actionTable" by the MockAction plugin,
 * so callers can later verify both values were propagated.
 */
private void runHeadTriggeringPipeline(Engine engine, String expectedValue1, String expectedValue2) throws Exception {
    // set runtime arguments
    Map<String, String> runtimeArguments = ImmutableMap.of("head-arg", expectedValue1);
    ETLStage action1 = new ETLStage("action1", MockAction.getPlugin("actionTable", "action1.row", "action1.column", expectedValue2));
    // use the imported simple name; the original fully-qualified
    // io.cdap.cdap.etl.proto.v2.ETLBatchConfig was inconsistent with the rest of the file
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action1).setEngine(engine).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("head");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager manager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    manager.setRuntimeArgs(runtimeArguments);
    // pin the logical start time so the run is deterministic
    manager.start(ImmutableMap.of("logical.start.time", "0"));
    manager.waitForRun(ProgramRunStatus.COMPLETED, 3, TimeUnit.MINUTES);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest)

Example 3 with WorkflowManager

use of io.cdap.cdap.test.WorkflowManager in project cdap by caskdata.

From the class DataPipelineTest, method testSimpleControlOnlyDag:

@Test
public void testSimpleControlOnlyDag() throws Exception {
    // 
    // condition-->action1
    // |
    // |------->action2
    // 
    String appName = "SimpleControlOnlyDag";
    String trueActionTable = "trueActionTable" + appName;
    String falseActionTable = "falseActionTable" + appName;
    // condition-only DAG: the true branch writes val1 to trueActionTable,
    // the false branch writes val2 to falseActionTable
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
        .addStage(new ETLStage("action1", MockAction.getPlugin(trueActionTable, "row1", "key1", "val1")))
        .addStage(new ETLStage("action2", MockAction.getPlugin(falseActionTable, "row2", "key2", "val2")))
        .addConnection("condition", "action1", true)
        .addConnection("condition", "action2", false)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(appName);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // run the true branch first, then the false branch, validating each run's output
    for (String branch : Arrays.asList("true", "false")) {
        boolean isTrueBranch = branch.equals("true");
        WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
        workflowManager.start(ImmutableMap.of("condition.branch.to.execute", branch));
        if (isTrueBranch) {
            // first run of the workflow
            workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
        } else {
            // second run: wait until two completed runs are recorded
            workflowManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
        }
        String table = isTrueBranch ? trueActionTable : falseActionTable;
        DataSetManager<Table> actionTableDS = getDataset(table);
        String expectedValue = isTrueBranch ? "val1" : "val2";
        String row = isTrueBranch ? "row1" : "row2";
        String key = isTrueBranch ? "key1" : "key2";
        Assert.assertEquals(expectedValue, MockAction.readOutput(actionTableDS, row, key));
    }
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Example 4 with WorkflowManager

use of io.cdap.cdap.test.WorkflowManager in project cdap by caskdata.

From the class DataPipelineTest, method testActionFieldLineage:

// Runs a single-action pipeline that emits the field-lineage operations drawn below,
// then verifies both field-level lineage (via FieldLineageAdmin, from the read and the
// write endpoints) and dataset-level lineage (via LineageAdmin) for the run.
private void testActionFieldLineage(Engine engine) throws Exception {
    String readDataset = "ActionReadDataset" + engine;
    String writeDataset = "ActionWriteDataset" + engine;
    List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
    Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");
    List<Operation> operations = new ArrayList<>();
    /*
     *          |---------> srcField1 -> destField1----|
     *          |                                      |
     * ActionReadDataset -> srcField2 -> destField2 ---|-> ActionWriteDataset
     *          |                                      |
     *          |---------> srcField3 -> destField3 ---|
     */
    // one read, three per-field transforms, one write — matching the diagram above
    operations.add(new ReadOperation("Read", "1st operation", EndPoint.of("default", readDataset), srcFields));
    operations.add(new TransformOperation("Transform1", "2nd operation", Collections.singletonList(InputField.of("Read", "srcField1")), "destField1"));
    operations.add(new TransformOperation("Transform2", "3rd operation", Collections.singletonList(InputField.of("Read", "srcField2")), "destField2"));
    operations.add(new TransformOperation("Transform3", "4th operation", Collections.singletonList(InputField.of("Read", "srcField3")), "destField3"));
    operations.add(new WriteOperation("Write", "5th operation", EndPoint.of("default", writeDataset), ImmutableList.of(InputField.of("Transform1", "destField1"), InputField.of("Transform2", "destField2"), InputField.of("Transform3", "destField3"))));
    ETLStage action = new ETLStage("action", FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action).setEngine(engine).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    // get field lineage for dest dataset
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", writeDataset), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
    Assert.assertEquals(destFields, summary.getFields());
    // the write dataset is a pure sink: no outgoing lineage, one incoming relation set
    Assert.assertTrue(summary.getOutgoing().isEmpty());
    Assert.assertEquals(1, summary.getIncoming().size());
    Set<FieldRelation> fieldRelations = ImmutableSet.of(new FieldRelation("srcField1", "destField1"), new FieldRelation("srcField2", "destField2"), new FieldRelation("srcField3", "destField3"));
    DatasetFieldLineageSummary.FieldLineageRelations expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(readDataset), 3, fieldRelations);
    Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());
    // get field lineage for src dataset
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", readDataset), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
    Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
    // the read dataset is a pure source: mirror-image of the checks above
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Assert.assertEquals(1, summary.getOutgoing().size());
    expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(writeDataset), 3, fieldRelations);
    Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());
    LineageAdmin lineageAdmin = getLineageAdmin();
    ProgramId programId = appId.workflow(SmartWorkflow.NAME);
    RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());
    // get dataset lineage for src dataset
    // lineage is written asynchronously, so poll until both relations are visible
    Tasks.waitFor(2, () -> {
        Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
        return lineage.getRelations().size();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    // get dataset lineage for dest dataset, in this test they should be same
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset), 0, System.currentTimeMillis(), 1, "workflow");
    Assert.assertEquals(2, lineage.getRelations().size());
    Assert.assertEquals(expectedLineage, lineage.getRelations());
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) RunId(org.apache.twill.api.RunId) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin)

Example 5 with WorkflowManager

use of io.cdap.cdap.test.WorkflowManager in project cdap by caskdata.

From the class DataPipelineTest, method testSecureStorePipeline:

/**
 * Tests the ${secure(...)} macro in a pipeline by resolving dataset names from
 * the secure store: the names are stored under secure keys before the run, and
 * the corresponding datasets must exist only after the pipeline completes.
 */
private void testSecureStorePipeline(Engine engine, String prefix) throws Exception {
    /*
     * Trivial pipeline from batch source to batch sink.
     *
     * source --------- sink
     */
    // secure-store keys referenced by the macros, and the dataset names stored under them
    String sourceKey = prefix + "source";
    String sinkKey = prefix + "sink";
    String sourceDatasetName = prefix + "MockSecureSourceDataset";
    String sinkDatasetName = prefix + "MockSecureSinkDataset";
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("source", MockRuntimeDatasetSource.getPlugin("input", "${secure(" + sourceKey + ")}")))
        .addStage(new ETLStage("sink", MockRuntimeDatasetSink.getPlugin("output", "${secure(" + sinkKey + ")}")))
        .addConnection("source", "sink")
        .setEngine(engine)
        .build();
    // place dataset names into secure storage
    getSecureStoreManager().put("default", sourceKey, sourceDatasetName, "secure source dataset name", new HashMap<>());
    getSecureStoreManager().put("default", sinkKey, sinkDatasetName, "secure dataset name", new HashMap<>());
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("App-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    // the datasets must not exist before the run...
    Assert.assertNull(getDataset(sourceDatasetName).get());
    Assert.assertNull(getDataset(sinkDatasetName).get());
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // ...and must have been created by the pipeline run
    Assert.assertNotNull(getDataset(sourceDatasetName).get());
    Assert.assertNotNull(getDataset(sinkDatasetName).get());
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest)

Aggregations

WorkflowManager (io.cdap.cdap.test.WorkflowManager)80 ApplicationManager (io.cdap.cdap.test.ApplicationManager)78 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)69 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)67 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)67 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)63 Table (io.cdap.cdap.api.dataset.table.Table)59 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)54 Schema (io.cdap.cdap.api.data.schema.Schema)46 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)45 Test (org.junit.Test)40 ArrayList (java.util.ArrayList)23 HashMap (java.util.HashMap)16 HashSet (java.util.HashSet)15 ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin)12 File (java.io.File)12 RunRecord (io.cdap.cdap.proto.RunRecord)9 FileSet (io.cdap.cdap.api.dataset.lib.FileSet)7 IOException (java.io.IOException)6 ServiceManager (io.cdap.cdap.test.ServiceManager)5