Example 31 with ETLStage

Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

From class DataStreamsTest, method testSplitterTransform.

@Test
public void testSplitterTransform() throws Exception {
    Schema schema = Schema.recordOf(
        "user",
        Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    StructuredRecord user0 = StructuredRecord.builder(schema).set("id", 0L).build();
    StructuredRecord user1 = StructuredRecord.builder(schema).set("id", 1L).set("email", "one@example.com").build();
    StructuredRecord user2 = StructuredRecord.builder(schema).set("id", 2L).set("name", "two").build();
    StructuredRecord user3 = StructuredRecord.builder(schema).set("id", 3L).set("name", "three").set("email", "three@example.com").build();
    String sink1Name = "splitSink1";
    String sink2Name = "splitSink2";
    /*
     *
     *                                            |null --> sink1
     *                       |null--> splitter2 --|
     * source --> splitter1--|                    |non-null --|
     *                       |                                |--> sink2
     *                       |non-null------------------------|
     */
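    // Routing recap (derived from the connections below):
    //   user0 (name null, email null) -> splitter1 "null" port -> splitter2 "null" port -> sink1
    //   user1 (name null, email set)  -> splitter1 "null" port -> splitter2 "non-null" port -> sink2
    //   user2 (name set, email null)  -> splitter1 "non-null" port -> sink2
    //   user3 (name set, email set)   -> splitter1 "non-null" port -> sink2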
    DataStreamsConfig config = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, ImmutableList.of(user0, user1, user2, user3))))
        .addStage(new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name")))
        .addStage(new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email")))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
        .addConnection("source", "splitter1")
        .addConnection("splitter1", "splitter2", "null")
        .addConnection("splitter1", "sink2", "non-null")
        .addConnection("splitter2", "sink1", "null")
        .addConnection("splitter2", "sink2", "non-null")
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app("SplitterTest");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // run pipeline
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);
    // check output
    // sink1 should only have records where both name and email are null (user0)
    final DataSetManager<Table> sink1Manager = getDataset(sink1Name);
    final Set<StructuredRecord> expected1 = ImmutableSet.of(user0);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sink1Manager.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(sink1Manager));
            return expected1.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);
    // sink2 should have anything with a non-null name or non-null email
    final DataSetManager<Table> sink2Manager = getDataset(sink2Name);
    final Set<StructuredRecord> expected2 = ImmutableSet.of(user1, user2, user3);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sink2Manager.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(sink2Manager));
            return expected2.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);
    validateMetric(appId, "source.records.out", 4);
    validateMetric(appId, "splitter1.records.in", 4);
    validateMetric(appId, "splitter1.records.out.non-null", 2);
    validateMetric(appId, "splitter1.records.out.null", 2);
    validateMetric(appId, "splitter2.records.in", 2);
    validateMetric(appId, "splitter2.records.out.non-null", 1);
    validateMetric(appId, "splitter2.records.out.null", 1);
    validateMetric(appId, "sink1.records.in", 1);
    validateMetric(appId, "sink2.records.in", 3);
}
Also used: ApplicationManager (co.cask.cdap.test.ApplicationManager), SparkManager (co.cask.cdap.test.SparkManager), Table (co.cask.cdap.api.dataset.table.Table), ImmutableSet (com.google.common.collect.ImmutableSet), Set (java.util.Set), HashSet (java.util.HashSet), Schema (co.cask.cdap.api.data.schema.Schema), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), TimeoutException (java.util.concurrent.TimeoutException), TopicNotFoundException (co.cask.cdap.api.messaging.TopicNotFoundException), DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (co.cask.cdap.proto.artifact.AppRequest), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), ApplicationId (co.cask.cdap.proto.id.ApplicationId), Test (org.junit.Test)

Example 32 with ETLStage

Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

From class DataStreamsTest, method testTransformComputeWithMacros.

@Test
public void testTransformComputeWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = new ArrayList<>();
    StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
    StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
    StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
    StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
    input.add(samuelRecord);
    input.add(jacksonRecord);
    input.add(dwayneRecord);
    input.add(johnsonRecord);
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
        .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
        .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
        .addConnection("source", "sleep")
        .addConnection("sleep", "filter1")
        .addConnection("filter1", "filter2")
        .addConnection("filter2", "sink")
        .setBatchInterval("1s")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    final Set<StructuredRecord> expected = new HashSet<>();
    expected.add(samuelRecord);
    expected.add(jacksonRecord);
    testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
    validateMetric(appId, "source.records.out", 4);
    validateMetric(appId, "sleep.records.in", 4);
    validateMetric(appId, "sleep.records.out", 4);
    validateMetric(appId, "filter1.records.in", 4);
    validateMetric(appId, "filter1.records.out", 3);
    validateMetric(appId, "filter2.records.in", 3);
    validateMetric(appId, "filter2.records.out", 2);
    validateMetric(appId, "sink.records.in", 2);
    Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
    expected.clear();
    expected.add(dwayneRecord);
    expected.add(johnsonRecord);
    testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
Also used: ApplicationManager (co.cask.cdap.test.ApplicationManager), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), ApplicationId (co.cask.cdap.proto.id.ApplicationId), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (co.cask.cdap.proto.artifact.AppRequest), HashSet (java.util.HashSet), Test (org.junit.Test)
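
The helper testTransformComputeRun is not shown in this snippet. Judging from the macros (${output}, ${field}, ${val1}, ${val2}) and the arguments passed to it, it presumably starts the Spark program with runtime arguments that resolve those macros, waits for the sink to match the expected records, and stops the run. A minimal sketch under those assumptions; the real helper in DataStreamsTest may differ, and ImmutableMap (com.google.common.collect.ImmutableMap) is assumed:

private void testTransformComputeRun(ApplicationManager appManager, final Set<StructuredRecord> expected,
                                     String val1, String val2, final String outputName) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    // Runtime arguments substitute the ${field}, ${val1}, ${val2}, and ${output} macros at startup.
    sparkManager.start(ImmutableMap.of("field", "name", "val1", val1, "val2", val2, "output", outputName));
    sparkManager.waitForStatus(true, 10, 1);
    // Poll the sink dataset until it contains exactly the expected records.
    final DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            outputManager.flush();
            return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
        }
    }, 4, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);
}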

Example 33 with ETLStage

Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

From class PipelineTest, method testWordCount.

public void testWordCount(String pluginType) throws Exception {
    String inputName = "wcInput-" + pluginType;
    String outputName = "wcOutput-" + pluginType;
    // create the pipeline config
    ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName));
    ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName));
    Map<String, String> aggProperties = new HashMap<>();
    aggProperties.put("field", "text");
    ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addStage(agg)
        .addConnection(source.getName(), agg.getName())
        .addConnection(agg.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("wcTestPipeline-" + pluginType);
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
    MockSource.writeInput(inputManager, inputRecords);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Set<StructuredRecord> outputRecords = new HashSet<>();
    outputRecords.addAll(MockSink.readOutput(outputManager));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hello").set("count", 3L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "World").set("count", 1L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "my").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "name").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "is").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hal").set("count", 1L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Sam").set("count", 1L).build());
    Assert.assertEquals(expected, outputRecords);
}
Also used: ApplicationManager (co.cask.cdap.test.ApplicationManager), Table (co.cask.cdap.api.dataset.table.Table), KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable), HashMap (java.util.HashMap), Schema (co.cask.cdap.api.data.schema.Schema), WorkflowManager (co.cask.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), ApplicationId (co.cask.cdap.proto.id.ApplicationId), HashSet (java.util.HashSet)
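
The WordCountAggregator plugin itself is not part of this snippet; only its OUTPUT_SCHEMA is referenced. As a hedged sketch of what such a plugin could look like against the cdap-etl-api BatchAggregator contract; the actual class in the cdap repo may differ, and the plugin annotations plus the config handling for the "field" property are simplified away here:

// Hypothetical sketch, not the actual cdap test plugin.
public class WordCountAggregator extends BatchAggregator<String, StructuredRecord, StructuredRecord> {

    public static final Schema OUTPUT_SCHEMA = Schema.recordOf(
        "wordCount",
        Schema.Field.of("word", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("count", Schema.of(Schema.Type.LONG)));

    // In the real plugin this would come from the "field" property (see aggProperties in the test).
    private final String field = "text";

    @Override
    public void groupBy(StructuredRecord input, Emitter<String> emitter) {
        // Emit one group key per whitespace-separated word in the configured field.
        for (String word : ((String) input.get(field)).split("\\s+")) {
            emitter.emit(word);
        }
    }

    @Override
    public void aggregate(String word, Iterator<StructuredRecord> records, Emitter<StructuredRecord> emitter) {
        // Count the records that fell into this word's group and emit a single output record.
        long count = 0;
        while (records.hasNext()) {
            records.next();
            count++;
        }
        emitter.emit(StructuredRecord.builder(OUTPUT_SCHEMA).set("word", word).set("count", count).build());
    }
}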

Example 34 with ETLStage

Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

From class ETLBatchConfigTest, method testUpgrade.

@Test
public void testUpgrade() throws Exception {
    final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
    ETLStage source = new ETLStage("source", new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, BatchSource.PLUGIN_TYPE);
    ETLStage transform1 = new ETLStage("transform1", new Plugin("Script", ImmutableMap.of("script", "something"), null));
    co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
    ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
    co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
    ETLStage transform3 = new ETLStage("transform3", new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), null);
    co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
    ETLStage sink1 = new ETLStage("sink1", new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, BatchSink.PLUGIN_TYPE);
    ETLStage sink2 = new ETLStage("sink2", new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, BatchSink.PLUGIN_TYPE);
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
    connections.add(new Connection(transform1New.getName(), transform2New.getName()));
    connections.add(new Connection(transform2New.getName(), transform3New.getName()));
    connections.add(new Connection(transform3New.getName(), sink1New.getName()));
    connections.add(new Connection(transform3New.getName(), sink2New.getName()));
    String schedule = "*/5 * * * *";
    Resources resources = new Resources(1024, 1);
    ETLBatchConfig config = ETLBatchConfig.builder(schedule)
        .setSource(source)
        .addSink(sink1)
        .addSink(sink2)
        .addTransform(transform1)
        .addTransform(transform2)
        .addTransform(transform3)
        .addConnections(connections)
        .setResources(resources)
        .setDriverResources(resources)
        .build();
    co.cask.cdap.etl.proto.v2.ETLBatchConfig configNew = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder(schedule)
        .addStage(sourceNew)
        .addStage(sink1New)
        .addStage(sink2New)
        .addStage(transform1New)
        .addStage(transform2New)
        .addStage(transform3New)
        .addConnections(connections)
        .setResources(resources)
        .setDriverResources(resources)
        .build();
    Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {

        @Nullable
        @Override
        public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
            return null;
        }
    }));
}
Also used: ArtifactSelectorConfig (co.cask.cdap.etl.proto.ArtifactSelectorConfig), Connection (co.cask.cdap.etl.proto.Connection), UpgradeContext (co.cask.cdap.etl.proto.UpgradeContext), Resources (co.cask.cdap.api.Resources), ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin), HashSet (java.util.HashSet), Test (org.junit.Test)
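
The from(...) helper used above is not included in the snippet. It presumably wraps each v1 stage's Plugin in a v2 ETLPlugin of the given type, which is what upgrade() must produce for the equality assertion to hold. A rough sketch under that assumption; the getter names on the v1 ETLStage and Plugin classes are assumed:

// Assumed helper: converts a v1 ETLStage to its expected v2 representation.
private static co.cask.cdap.etl.proto.v2.ETLStage from(ETLStage v1Stage, String pluginType) {
    Plugin plugin = v1Stage.getPlugin();
    // Carry the name, properties, and artifact selection over into the v2 representation.
    return new co.cask.cdap.etl.proto.v2.ETLStage(
        v1Stage.getName(),
        new ETLPlugin(plugin.getName(), pluginType, plugin.getProperties(), plugin.getArtifact()));
}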

Example 35 with ETLStage

Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

From class Upgrader, method convertStreamsConfig.

private DataStreamsConfig convertStreamsConfig(String configStr) {
    DataStreamsConfig config = GSON.fromJson(configStr, DataStreamsConfig.class);
    DataStreamsConfig.Builder builder = DataStreamsConfig.builder()
        .addConnections(config.getConnections())
        .setResources(config.getResources())
        .setDriverResources(config.getDriverResources())
        .setClientResources(config.getClientResources())
        .setBatchInterval(config.getBatchInterval())
        .setCheckpointDir(config.getCheckpointDir())
        .setNumOfRecordsPreview(config.getNumOfRecordsPreview());
    for (ETLStage stage : config.getStages()) {
        builder.addStage(stage.upgradeStage(dataStreamsContext));
    }
    return builder.build();
}
Also used: ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig)
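
The dataStreamsContext passed to upgradeStage is an UpgradeContext, the same interface implemented anonymously in Example 34, where returning null leaves a stage's artifact selection unchanged. A hedged sketch of such a context, with a hypothetical artifact name and version, just to show the shape:

UpgradeContext dataStreamsContext = new UpgradeContext() {

    @Nullable
    @Override
    public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
        // Hypothetical: re-pin every plugin to a fixed system artifact during the upgrade.
        return new ArtifactSelectorConfig("SYSTEM", "cdap-data-streams", "4.0.0");
    }
};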

Aggregations

ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 94
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 75
Test (org.junit.Test): 64
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 62
ApplicationManager (co.cask.cdap.test.ApplicationManager): 58
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 57
Schema (co.cask.cdap.api.data.schema.Schema): 51
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 50
Table (co.cask.cdap.api.dataset.table.Table): 49
WorkflowManager (co.cask.cdap.test.WorkflowManager): 44
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 39
HashSet (java.util.HashSet): 16
ArrayList (java.util.ArrayList): 15
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 14
HashMap (java.util.HashMap): 14
DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig): 11
TimeoutException (java.util.concurrent.TimeoutException): 11
TopicNotFoundException (co.cask.cdap.api.messaging.TopicNotFoundException): 7
SparkManager (co.cask.cdap.test.SparkManager): 7
BatchPipelineSpec (co.cask.cdap.etl.batch.BatchPipelineSpec): 6