Search in sources :

Example 51 with ApplicationId

use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.

the class DataPipelineTest method testSplitterToConnector.

private void testSplitterToConnector(Engine engine) throws Exception {
    Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord user0 = StructuredRecord.builder(schema).set("id", 0L).build();
    StructuredRecord user1 = StructuredRecord.builder(schema).set("id", 1L).set("email", "one@example.com").build();
    StructuredRecord user2 = StructuredRecord.builder(schema).set("id", 2L).set("name", "two").build();
    StructuredRecord user3 = StructuredRecord.builder(schema).set("id", 3L).set("name", "three").set("email", "three@example.com").build();
    String sourceName = "splitconSource" + engine.name();
    String sink1Name = "splitconSink1" + engine.name();
    String sink2Name = "splitconSink2" + engine.name();
    /*
     *
     *                                                             |null --> sink1
     *                       |null--> identity-agg --> splitter2 --|
     * source --> splitter1--|                                     |non-null --|
     *                       |                                                 |--> sink2
     *                       |non-null-----------------------------------------|
     */
    ETLBatchConfig config = ETLBatchConfig.builder().setEngine(engine).addStage(new ETLStage("source", MockSource.getPlugin(sourceName))).addStage(new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name"))).addStage(new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email"))).addStage(new ETLStage("identity", IdentityAggregator.getPlugin())).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name))).addConnection("source", "splitter1").addConnection("splitter1", "identity", "null").addConnection("splitter1", "sink2", "non-null").addConnection("identity", "splitter2").addConnection("splitter2", "sink1", "null").addConnection("splitter2", "sink2", "non-null").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app("SplitConTest-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    DataSetManager<Table> inputManager = getDataset(sourceName);
    MockSource.writeInput(inputManager, ImmutableList.of(user0, user1, user2, user3));
    // run pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // check output
    // sink1 should only have records where both name and email are null (user0)
    DataSetManager<Table> sinkManager = getDataset(sink1Name);
    Set<StructuredRecord> expected = ImmutableSet.of(user0);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    // sink2 should have anything with a non-null name or non-null email
    sinkManager = getDataset(sink2Name);
    expected = ImmutableSet.of(user1, user2, user3);
    actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(4, appId, "source.records.out");
    validateMetric(1, appId, "sink1.records.in");
    validateMetric(3, appId, "sink2.records.in");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)

Example 52 with ApplicationId

use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.

the class DataPipelineTest method testErrorTransform.

private void testErrorTransform(Engine engine) throws Exception {
    String source1TableName = "errTestIn1-" + engine;
    String source2TableName = "errTestIn2-" + engine;
    Schema inputSchema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    File baseOutput = TMP_FOLDER.newFolder();
    String outputDir1 = new File(baseOutput, "output1").getAbsolutePath();
    String outputDir2 = new File(baseOutput, "output2").getAbsolutePath();
    /*
     *
     * source1 --> filter1 --> filter2 --> agg1 --> agg2
     *                |           |         |        |
     *                |-----------|---------|--------|--------|--> errorflatten --> sink1
     *                |                                       |
     *                |                                       |--> errorfilter --> sink2
     *                |
     * source2 --> dropnull
     *
     * arrows coming out the right represent output records
     * arrows coming out the bottom represent error records
     * this will test multiple stages from multiple phases emitting errors to the same stage
     * as well as errors from one stage going to multiple stages
     * and transforms that have an error schema different from their output schema
     */
    ETLBatchConfig config = ETLBatchConfig.builder().setEngine(engine).addStage(new ETLStage("source1", MockSource.getPlugin(source1TableName, inputSchema))).addStage(new ETLStage("source2", MockSource.getPlugin(source2TableName, inputSchema))).addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "Leo"))).addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "Ralph"))).addStage(new ETLStage("agg1", GroupFilterAggregator.getPlugin("name", "Don"))).addStage(new ETLStage("agg2", GroupFilterAggregator.getPlugin("name", "Mike"))).addStage(new ETLStage("errorflatten", FlattenErrorTransform.getPlugin())).addStage(new ETLStage("errorfilter", FilterErrorTransform.getPlugin(3))).addStage(new ETLStage("dropnull", DropNullTransform.getPlugin("name"))).addStage(new ETLStage("sink1", MockExternalSink.getPlugin("sink1", "sink1", outputDir1))).addStage(new ETLStage("sink2", MockExternalSink.getPlugin("sink2", "sink2", outputDir2))).addConnection("source1", "filter1").addConnection("source2", "dropnull").addConnection("filter1", "filter2").addConnection("filter2", "agg1").addConnection("agg1", "agg2").addConnection("filter1", "errorflatten").addConnection("filter1", "errorfilter").addConnection("filter2", "errorflatten").addConnection("filter2", "errorfilter").addConnection("agg1", "errorflatten").addConnection("agg1", "errorfilter").addConnection("agg2", "errorflatten").addConnection("agg2", "errorfilter").addConnection("dropnull", "errorflatten").addConnection("dropnull", "errorfilter").addConnection("errorflatten", "sink1").addConnection("errorfilter", "sink2").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app("ErrTransformTest-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(inputSchema).set("name", "Leo").set("id", 1).build(), StructuredRecord.builder(inputSchema).set("name", "Ralph").set("id", 2).build(), StructuredRecord.builder(inputSchema).set("name", "Don").set("id", 3).build(), StructuredRecord.builder(inputSchema).set("name", "Mike").set("id", 4).build());
    DataSetManager<Table> source1Table = getDataset(source1TableName);
    MockSource.writeInput(source1Table, input);
    input = ImmutableList.of(StructuredRecord.builder(inputSchema).set("name", "April").set("id", 5).build());
    DataSetManager<Table> source2Table = getDataset(source2TableName);
    MockSource.writeInput(source2Table, input);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    Map<String, String> args = Collections.singletonMap(io.cdap.cdap.etl.common.Constants.CONSOLIDATE_STAGES, "true");
    workflowManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    Schema flattenSchema = Schema.recordOf("erroruser", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("id", Schema.of(Schema.Type.INT)), Schema.Field.of("errMsg", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("errCode", Schema.nullableOf(Schema.of(Schema.Type.INT))), Schema.Field.of("errStage", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Set<StructuredRecord> expected = ImmutableSet.of(StructuredRecord.builder(flattenSchema).set("name", "Leo").set("id", 1).set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter1").build(), StructuredRecord.builder(flattenSchema).set("name", "Ralph").set("id", 2).set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter2").build(), StructuredRecord.builder(flattenSchema).set("name", "Don").set("id", 3).set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg1").build(), StructuredRecord.builder(flattenSchema).set("name", "Mike").set("id", 4).set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg2").build(), StructuredRecord.builder(flattenSchema).set("name", "April").set("id", 5).set("errMsg", "Field name was not null").set("errCode", 5).set("errStage", "dropnull").build());
    Set<StructuredRecord> actual = new HashSet<>(MockExternalSink.readOutput(outputDir1, flattenSchema));
    Assert.assertEquals(expected, actual);
    expected = ImmutableSet.of(StructuredRecord.builder(inputSchema).set("name", "Leo").set("id", 1).build(), StructuredRecord.builder(inputSchema).set("name", "Ralph").set("id", 2).build(), StructuredRecord.builder(inputSchema).set("name", "April").set("id", 5).build());
    actual = new HashSet<>(MockExternalSink.readOutput(outputDir2, inputSchema));
    Assert.assertEquals(expected, actual);
    /*
     *
     * source1 (4) --> filter1 (3) --> filter2 (2) --> agg1 (1) --> agg2
     *                   |                |              |            |
     *                  (1)              (1)            (1)          (1)
     *                   |----------------|--------------|------------|--------|--> errorflatten (5) --> sink1
     *                   |                                                     |
     *                  (1)                                                    |--> errorfilter (3) --> sink2
     *                   |
     * source2 --> dropnull
     */
    validateMetric(4, appId, "source1.records.out");
    validateMetric(1, appId, "source2.records.out");
    validateMetric(5, appId, "errorflatten.records.out");
    validateMetric(3, appId, "errorfilter.records.out");
    validateMetric(5, appId, "sink1.records.in");
    validateMetric(3, appId, "sink2.records.in");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) File(java.io.File) HashSet(java.util.HashSet)

Example 53 with ApplicationId

use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.

the class DataPipelineTest method testSinglePhase.

@Test
public void testSinglePhase() throws Exception {
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    /*
     * source --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin("singleInput", schema))).addStage(new ETLStage("sink", MockSink.getPlugin("singleOutput"))).addConnection("source", "sink").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SinglePhaseApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    // write records to source
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("singleInput"));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    // check sink
    DataSetManager<Table> sinkManager = getDataset("singleOutput");
    Set<StructuredRecord> expected = ImmutableSet.of(recordSamuel, recordBob);
    Set<StructuredRecord> actual = Sets.newHashSet(MockSink.readOutput(sinkManager));
    Assert.assertEquals(expected, actual);
    validateMetric(2, appId, "source.records.out");
    validateMetric(2, appId, "sink.records.in");
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Example 54 with ApplicationId

use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.

the class DataPipelineTest method testPostAction.

@Test
public void testPostAction() throws Exception {
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin("actionInput"))).addStage(new ETLStage("sink", MockSink.getPlugin("actionOutput"))).addPostAction(new ETLStage("tokenWriter", NodeStatesAction.getPlugin("tokenTable"))).addConnection("source", "sink").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ActionApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(schema).set("name", "jane").build();
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("actionInput"));
    MockSource.writeInput(inputManager, ImmutableList.of(recordSamuel, recordBob, recordJane));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> tokenTableManager = getDataset(NamespaceId.DEFAULT.dataset("tokenTable"));
    Table tokenTable = tokenTableManager.get();
    NodeStatus status = NodeStatus.valueOf(Bytes.toString(tokenTable.get(Bytes.toBytes("phase-1"), Bytes.toBytes("status"))));
    Assert.assertEquals(NodeStatus.COMPLETED, status);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) NodeStatus(io.cdap.cdap.api.workflow.NodeStatus) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Example 55 with ApplicationId

use of io.cdap.cdap.proto.id.ApplicationId in project cdap by caskdata.

the class DataPipelineTest method testMultiConditionControlOnlyDag.

@Test
public void testMultiConditionControlOnlyDag() throws Exception {
    // 
    // 
    // action1---
    // |---condition1---action3
    // action2---      |
    // action4-----condition2-----condition3----action5
    // 
    String appName = "MultiConditionControlOnlyDag";
    String actionTable = "actionTable" + appName;
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("condition1", MockCondition.getPlugin("condition1"))).addStage(new ETLStage("condition2", MockCondition.getPlugin("condition2"))).addStage(new ETLStage("condition3", MockCondition.getPlugin("condition3"))).addStage(new ETLStage("action1", MockAction.getPlugin(actionTable, "row1", "key1", "val1"))).addStage(new ETLStage("action2", MockAction.getPlugin(actionTable, "row2", "key2", "val2"))).addStage(new ETLStage("action3", MockAction.getPlugin(actionTable, "row3", "key3", "val3"))).addStage(new ETLStage("action4", MockAction.getPlugin(actionTable, "row4", "key4", "val4"))).addStage(new ETLStage("action5", MockAction.getPlugin(actionTable, "row5", "key5", "val5"))).addConnection("action1", "condition1").addConnection("action2", "condition1").addConnection("condition1", "action3", true).addConnection("condition1", "action4", false).addConnection("action4", "condition2").addConnection("condition2", "condition3", true).addConnection("condition3", "action5", true).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(appName);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(ImmutableMap.of("condition2.branch.to.execute", "true", "condition3.branch.to.execute", "true"));
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> actionTableDS = getDataset(actionTable);
    Assert.assertEquals("val1", MockAction.readOutput(actionTableDS, "row1", "key1"));
    Assert.assertEquals("val2", MockAction.readOutput(actionTableDS, "row2", "key2"));
    Assert.assertEquals("val4", MockAction.readOutput(actionTableDS, "row4", "key4"));
    Assert.assertEquals("val5", MockAction.readOutput(actionTableDS, "row5", "key5"));
    Assert.assertNull(MockAction.readOutput(actionTableDS, "row3", "key3"));
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Aggregations

ApplicationId (io.cdap.cdap.proto.id.ApplicationId)789 Test (org.junit.Test)410 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)279 ProgramId (io.cdap.cdap.proto.id.ProgramId)263 ApplicationManager (io.cdap.cdap.test.ApplicationManager)225 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)223 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)196 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)180 Table (io.cdap.cdap.api.dataset.table.Table)178 WorkflowManager (io.cdap.cdap.test.WorkflowManager)169 NamespaceId (io.cdap.cdap.proto.id.NamespaceId)154 Schema (io.cdap.cdap.api.data.schema.Schema)147 ArrayList (java.util.ArrayList)129 HashSet (java.util.HashSet)126 ApplicationSpecification (io.cdap.cdap.api.app.ApplicationSpecification)124 HashMap (java.util.HashMap)109 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)107 ArtifactSummary (io.cdap.cdap.api.artifact.ArtifactSummary)88 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)88 Path (javax.ws.rs.Path)75