Search in sources :

Example 26 with ETLConfig

use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class DataStreamsTest method testJoin.

@Test
public void testJoin() throws Exception {
    /*
     * source1 ----> t1 ------
     *                        | --> innerjoin ----> t4 ------
     * source2 ----> t2 ------                                 |
     *                                                         | ---> outerjoin --> sink1
     *                                                         |
     * source3 -------------------- t3 ------------------------
     */
    Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
    Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
    Schema outSchema2 = Schema.recordOf("join.output", Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
    StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
    StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
    StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
    StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
    StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
    List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
    List<StructuredRecord> input2 = ImmutableList.of(recordCar, recordBike);
    List<StructuredRecord> input3 = ImmutableList.of(recordTrasCar, recordTrasBike, recordTrasPlane);
    String outputName = "multiJoinOutputSink";
    DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema1, input1))).addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema2, input2))).addStage(new ETLStage("source3", MockSource.getPlugin(inputSchema3, input3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage("t4", IdentityTransform.getPlugin())).addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", ""))).addStage(new ETLStage("outerjoin", MockJoiner.getPlugin("t4.item_id=t3.i_id", "", ""))).addStage(new ETLStage("multijoinSink", MockSink.getPlugin(outputName))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", "innerjoin").addConnection("t2", "innerjoin").addConnection("innerjoin", "t4").addConnection("t3", "outerjoin").addConnection("t4", "outerjoin").addConnection("outerjoin", "multijoinSink").setBatchInterval("5s").build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("JoinerApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);
    StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema2).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
    StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema2).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
    StructuredRecord joinRecordPlane = StructuredRecord.builder(outSchema2).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
    final Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordPlane);
    final DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            outputManager.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(outputManager));
            return expected.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);
    validateMetric(appId, "source1.records.out", 3);
    validateMetric(appId, "source2.records.out", 2);
    validateMetric(appId, "source3.records.out", 3);
    validateMetric(appId, "t1.records.in", 3);
    validateMetric(appId, "t1.records.out", 3);
    validateMetric(appId, "t2.records.in", 2);
    validateMetric(appId, "t2.records.out", 2);
    validateMetric(appId, "t3.records.in", 3);
    validateMetric(appId, "t3.records.out", 3);
    validateMetric(appId, "t4.records.in", 2);
    validateMetric(appId, "t4.records.out", 2);
    validateMetric(appId, "innerjoin.records.in", 5);
    validateMetric(appId, "innerjoin.records.out", 2);
    validateMetric(appId, "outerjoin.records.in", 5);
    validateMetric(appId, "outerjoin.records.out", 3);
    validateMetric(appId, "multijoinSink.records.in", 3);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) Table(co.cask.cdap.api.dataset.table.Table) HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 27 with ETLConfig

use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class ETLWorkerTest method testDAG.

@Test
public void testDAG() throws Exception {
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
    StructuredRecord record1 = StructuredRecord.builder(schema).set("x", 1).build();
    StructuredRecord record2 = StructuredRecord.builder(schema).set("x", 2).build();
    StructuredRecord record3 = StructuredRecord.builder(schema).set("x", 3).build();
    List<StructuredRecord> input = ImmutableList.of(record1, record2, record3);
    /*
     *            ----- value filter ------- sink1
     *           |
     * source --------- double --------
     *           |                     |---- sink2
     *            ----- identity ------
     */
    File sink1Out = TMP_FOLDER.newFolder();
    File sink2Out = TMP_FOLDER.newFolder();
    ETLRealtimeConfig etlConfig = ETLRealtimeConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(input))).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Out))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Out))).addStage(new ETLStage("valueFilter", IntValueFilterTransform.getPlugin("x", 2))).addStage(new ETLStage("double", DoubleTransform.getPlugin())).addStage(new ETLStage("identity", IdentityTransform.getPlugin())).addConnection("source", "valueFilter").addConnection("source", "double").addConnection("source", "identity").addConnection("valueFilter", "sink1").addConnection("double", "sink2").addConnection("identity", "sink2").build();
    ApplicationId appId = NamespaceId.DEFAULT.app("dagTest");
    AppRequest<ETLRealtimeConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    Assert.assertNotNull(appManager);
    WorkerManager workerManager = appManager.getWorkerManager(ETLWorker.NAME);
    workerManager.start();
    workerManager.waitForStatus(true, 10, 1);
    try {
        List<StructuredRecord> sink1output = MockSink.getRecords(sink1Out, 0, 10, TimeUnit.SECONDS);
        List<StructuredRecord> sink1expected = ImmutableList.of(record1, record3);
        Assert.assertEquals(sink1expected, sink1output);
        List<StructuredRecord> sink2output = MockSink.getRecords(sink2Out, 0, 10, TimeUnit.SECONDS);
        Assert.assertEquals(9, sink2output.size());
    } finally {
        stopWorker(workerManager);
    }
    validateMetric(3, appId, "source.records.out");
    validateMetric(3, appId, "valueFilter.records.in");
    validateMetric(2, appId, "valueFilter.records.out");
    validateMetric(3, appId, "double.records.in");
    validateMetric(6, appId, "double.records.out");
    validateMetric(3, appId, "identity.records.in");
    validateMetric(3, appId, "identity.records.out");
    validateMetric(2, appId, "sink1.records.in");
    validateMetric(9, appId, "sink2.records.in");
}
Also used : WorkerManager(co.cask.cdap.test.WorkerManager) ApplicationManager(co.cask.cdap.test.ApplicationManager) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Schema(co.cask.cdap.api.data.schema.Schema) ETLRealtimeConfig(co.cask.cdap.etl.proto.v2.ETLRealtimeConfig) ApplicationId(co.cask.cdap.proto.id.ApplicationId) File(java.io.File) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Example 28 with ETLConfig

use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testUniqueStageNames.

@Test(expected = IllegalArgumentException.class)
public void testUniqueStageNames() {
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source", MOCK_SOURCE)).addStage(new ETLStage("t1", MOCK_TRANSFORM_A)).addStage(new ETLStage("t1", MOCK_TRANSFORM_B)).addStage(new ETLStage("sink", MOCK_SINK)).addConnection("source", "sink").build();
    specGenerator.generateSpec(etlConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Example 29 with ETLConfig

use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testConnectionIntoSource.

@Test(expected = IllegalArgumentException.class)
public void testConnectionIntoSource() {
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source", MOCK_SOURCE)).addStage(new ETLStage("sink", MOCK_SINK)).addStage(new ETLStage("transform", MOCK_TRANSFORM_A)).addConnection("source", "sink").addConnection("transform", "source").build();
    specGenerator.generateSpec(etlConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Example 30 with ETLConfig

use of co.cask.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testGenerateSpec.

@Test
public void testGenerateSpec() {
    /*
     *           ---- t1 ------------
     *           |            |      |
     * source ---             |      |--- t3 --- sink1
     *           |            |      |
     *           ------------ t2 --------------- sink2
     *           |                        |
     *           |                        |
     *           -------------------------
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source", MOCK_SOURCE)).addStage(new ETLStage("sink1", MOCK_SINK)).addStage(new ETLStage("sink2", MOCK_SINK)).addStage(new ETLStage("t1", MOCK_TRANSFORM_A)).addStage(new ETLStage("t2", MOCK_TRANSFORM_A)).addStage(new ETLStage("t3", MOCK_TRANSFORM_B)).addConnection("source", "t1").addConnection("source", "t2").addConnection("source", "sink2").addConnection("t1", "t2").addConnection("t1", "t3").addConnection("t1", "sink2").addConnection("t2", "sink2").addConnection("t2", "t3").addConnection("t3", "sink1").build();
    // test the spec generated is correct, with the right input and output schemas and artifact information.
    BatchPipelineSpec actual = specGenerator.generateSpec(etlConfig);
    Map<String, String> emptyMap = ImmutableMap.of();
    PipelineSpec expected = BatchPipelineSpec.builder().addStage(StageSpec.builder("source", new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", emptyMap, ARTIFACT_ID)).setOutputSchema(SCHEMA_A).addOutputs("t1", "t2", "sink2").build()).addStage(StageSpec.builder("sink1", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", emptyMap, ARTIFACT_ID)).addInputSchema("t3", SCHEMA_B).addInputs("t3").setErrorSchema(SCHEMA_B).build()).addStage(StageSpec.builder("sink2", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", emptyMap, ARTIFACT_ID)).addInputSchemas(ImmutableMap.of("t1", SCHEMA_A, "t2", SCHEMA_A, "source", SCHEMA_A)).addInputs("t1", "t2", "source").setErrorSchema(SCHEMA_A).build()).addStage(StageSpec.builder("t1", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", emptyMap, ARTIFACT_ID)).addInputSchema("source", SCHEMA_A).setOutputSchema(SCHEMA_A).addInputs("source").addOutputs("t2", "t3", "sink2").setErrorSchema(SCHEMA_B).build()).addStage(StageSpec.builder("t2", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", emptyMap, ARTIFACT_ID)).addInputSchemas(ImmutableMap.of("source", SCHEMA_A, "t1", SCHEMA_A)).setOutputSchema(SCHEMA_A).addInputs("source", "t1").addOutputs("t3", "sink2").setErrorSchema(SCHEMA_B).build()).addStage(StageSpec.builder("t3", new PluginSpec(Transform.PLUGIN_TYPE, "mockB", emptyMap, ARTIFACT_ID)).addInputSchemas(ImmutableMap.of("t1", SCHEMA_A, "t2", SCHEMA_A)).setOutputSchema(SCHEMA_B).addInputs("t1", "t2").addOutputs("sink1").setErrorSchema(SCHEMA_A).build()).addConnections(etlConfig.getConnections()).setResources(etlConfig.getResources()).setDriverResources(new Resources(1024, 1)).setClientResources(new Resources(1024, 1)).setStageLoggingEnabled(etlConfig.isStageLoggingEnabled()).build();
    Assert.assertEquals(expected, actual);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) BatchPipelineSpec(co.cask.cdap.etl.batch.BatchPipelineSpec) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) BatchPipelineSpec(co.cask.cdap.etl.batch.BatchPipelineSpec) Resources(co.cask.cdap.api.Resources) Test(org.junit.Test)

Aggregations

ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)50 ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig)37 AppRequest (co.cask.cdap.proto.artifact.AppRequest)35 ApplicationId (co.cask.cdap.proto.id.ApplicationId)35 ApplicationManager (co.cask.cdap.test.ApplicationManager)32 Test (org.junit.Test)31 StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)27 Schema (co.cask.cdap.api.data.schema.Schema)26 WorkflowManager (co.cask.cdap.test.WorkflowManager)24 Table (co.cask.cdap.api.dataset.table.Table)23 KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)19 ArrayList (java.util.ArrayList)9 HashSet (java.util.HashSet)6 DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig)5 ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin)5 File (java.io.File)5 TimeoutException (java.util.concurrent.TimeoutException)5 ETLRealtimeConfig (co.cask.cdap.etl.proto.v2.ETLRealtimeConfig)4 WorkerManager (co.cask.cdap.test.WorkerManager)4 PreviewManager (co.cask.cdap.app.preview.PreviewManager)3