Search in sources :

Example 6 with DataStreamsConfig

use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

the class Upgrader method convertStreamsConfig.

/**
 * Parses a serialized {@link DataStreamsConfig} and rebuilds it with every stage upgraded.
 *
 * <p>The connections, resources, driver resources, client resources, batch interval, checkpoint
 * directory and preview record count are copied from the parsed config. Each stage is replaced by
 * the result of calling {@code upgradeStage(dataStreamsContext)} on it.
 *
 * @param configStr JSON text that {@code GSON} deserializes into a {@link DataStreamsConfig}
 * @return a newly built config carrying the copied settings and the upgraded stages
 */
private DataStreamsConfig convertStreamsConfig(String configStr) {
    DataStreamsConfig parsed = GSON.fromJson(configStr, DataStreamsConfig.class);

    // Copy the pipeline-level settings first; stages are added separately once upgraded.
    DataStreamsConfig.Builder upgraded = DataStreamsConfig.builder()
        .addConnections(parsed.getConnections())
        .setResources(parsed.getResources())
        .setDriverResources(parsed.getDriverResources())
        .setClientResources(parsed.getClientResources())
        .setBatchInterval(parsed.getBatchInterval())
        .setCheckpointDir(parsed.getCheckpointDir())
        .setNumOfRecordsPreview(parsed.getNumOfRecordsPreview());

    for (ETLStage oldStage : parsed.getStages()) {
        upgraded.addStage(oldStage.upgradeStage(dataStreamsContext));
    }

    return upgraded.build();
}
Also used : ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig)

Example 7 with DataStreamsConfig

use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

the class PreviewDataStreamsTest method testDataStreamsPreviewRun.

/**
 * Runs a source -> transform -> sink DataStreams pipeline in preview mode and verifies the
 * previewed data of each stage, the final preview status, the per-stage record metrics, and that
 * looking up the sink table through {@code getDataset} yields no dataset.
 */
@Test
public void testDataStreamsPreviewRun() throws Exception {
    PreviewManager previewManager = getPreviewManager();
    String sinkTableName = "singleOutput";
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

    // The three input records handed to the mock source.
    List<StructuredRecord> records = new ArrayList<>();
    records.add(StructuredRecord.builder(schema).set("name", "samuel").build());
    records.add(StructuredRecord.builder(schema).set("name", "bob").build());
    records.add(StructuredRecord.builder(schema).set("name", "test").build());

    /*
     * source --> transform -> sink
     */
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
        .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
        .addConnection("source", "transform")
        .addConnection("transform", "sink")
        .setNumOfRecordsPreview(100)
        .setBatchInterval("1s")
        .build();

    // The preview config names the program to run and its program type.
    // NOTE(review): the trailing 1 is presumably the preview timeout in minutes - confirm against PreviewConfig.
    PreviewConfig previewConfig = new PreviewConfig(
        DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);

    // Kick off the preview and grab the runner that exposes its data and status.
    ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
    final PreviewRunner previewRunner = previewManager.getRunner(previewId);

    // Poll until the sink stage has traced all three records.
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            Map<String, List<JsonElement>> sinkData = previewRunner.getData("sink");
            if (sinkData == null) {
                return false;
            }
            List<JsonElement> traced = sinkData.get(DATA_TRACER_PROPERTY);
            return traced != null && traced.size() == 3;
        }
    }, 1, TimeUnit.MINUTES);

    // The upstream stages must have traced the same number of records.
    checkPreviewStore(previewRunner, "source", 3);
    checkPreviewStore(previewRunner, "transform", 3);

    // Wait for the pipeline to be shutdown by timer.
    TimeUnit.MINUTES.sleep(1);
    Tasks.waitFor(PreviewStatus.Status.KILLED_BY_TIMER, new Callable<PreviewStatus.Status>() {

        @Override
        public PreviewStatus.Status call() throws Exception {
            return previewRunner.getStatus().getStatus();
        }
    }, 1, TimeUnit.MINUTES);

    // Every stage boundary is expected to report exactly three records.
    String[] metricNames = {
        "source.records.out",
        "transform.records.in",
        "transform.records.out",
        "sink.records.in",
        "sink.records.out"
    };
    for (String metricName : metricNames) {
        validateMetric(3, previewId, metricName, previewRunner);
    }

    // Check the sink table is not created in the real space.
    DataSetManager<Table> sinkManager = getDataset(sinkTableName);
    Assert.assertNull(sinkManager.get());
}
Also used : PreviewStatus(co.cask.cdap.app.preview.PreviewStatus) Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) PreviewManager(co.cask.cdap.app.preview.PreviewManager) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) JsonElement(com.google.gson.JsonElement) PreviewRunner(co.cask.cdap.app.preview.PreviewRunner) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) PreviewConfig(co.cask.cdap.proto.artifact.preview.PreviewConfig) Test(org.junit.Test)

Example 8 with DataStreamsConfig

use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

the class DataStreamsTest method testWindower.

/**
 * Deploys a source -> window -> aggregator -> filter -> sink pipeline and waits until the sink
 * holds a record whose {@code ct} field is 3, failing if any record ever reports a count above 3.
 */
@Test
public void testWindower() throws Exception {
    /*
     * source --> window (Window.getPlugin(30, 1)) --> aggregator --> filter --> sink
     */
    Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build());
    String sinkName = "windowOut";

    // source sleeps 1 second between outputs
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
        .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
        .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
        .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addConnection("source", "window")
        .addConnection("window", "agg")
        .addConnection("agg", "filter")
        .addConnection("filter", "sink")
        .setBatchInterval("1s")
        .build();

    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);

    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);

    // The sink should eventually hold at least one record with a count of 3 and never one above 3.
    // Counts below 3 are fine: they come from windows that do not yet contain all three records.
    final DataSetManager<Table> outputManager = getDataset(sinkName);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            outputManager.flush();
            boolean foundFullWindow = false;
            for (StructuredRecord output : MockSink.readOutput(outputManager)) {
                long count = output.get("ct");
                // A count above 3 means records were counted more often than they were emitted.
                Assert.assertTrue(count <= 3L);
                foundFullWindow = foundFullWindow || count == 3L;
            }
            return foundFullWindow;
        }
    }, 2, TimeUnit.MINUTES);

    sparkManager.stop();
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 9 with DataStreamsConfig

use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

the class DataStreamsTest method testErrorTransform.

/**
 * Verifies error-record routing: several stages, spread over multiple phases, emit errors into the
 * same error transforms, and each stage's errors fan out to more than one error transform.
 *
 * <p>The Spark program is stopped once both sinks hold the expected records, so that it does not
 * keep running after the test returns.
 */
@Test
public void testErrorTransform() throws Exception {
    String sink1TableName = "errTestOut1";
    String sink2TableName = "errTestOut2";
    Schema inputSchema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
        StructuredRecord.builder(inputSchema).set("name", "Ralph").build(),
        StructuredRecord.builder(inputSchema).set("name", "Don").build(),
        StructuredRecord.builder(inputSchema).set("name", "Mike").build(),
        StructuredRecord.builder(inputSchema).set("name", "April").build());
    /*
     *
     * source--> filter1 --> filter2 --> agg1 --> agg2
     *              |           |         |        |
     *              |-----------|---------|--------|--------|--> flatten errors --> sink1
     *                                                      |
     *                                                      |--> filter errors --> sink2
     * arrows coming out the right represent output records
     * arrows coming out the bottom represent error records
     * this will test multiple stages from multiple phases emitting errors to the same stage
     * as well as errors from one stage going to multiple stages
     */
    DataStreamsConfig config = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("source", MockSource.getPlugin(inputSchema, input)))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "Leo")))
        .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "Ralph")))
        .addStage(new ETLStage("agg1", GroupFilterAggregator.getPlugin("name", "Don")))
        .addStage(new ETLStage("agg2", GroupFilterAggregator.getPlugin("name", "Mike")))
        .addStage(new ETLStage("errorflatten", FlattenErrorTransform.getPlugin()))
        .addStage(new ETLStage("errorfilter", FilterErrorTransform.getPlugin(3)))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1TableName)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2TableName)))
        .addConnection("source", "filter1")
        .addConnection("filter1", "filter2")
        .addConnection("filter2", "agg1")
        .addConnection("agg1", "agg2")
        .addConnection("filter1", "errorflatten")
        .addConnection("filter1", "errorfilter")
        .addConnection("filter2", "errorflatten")
        .addConnection("filter2", "errorfilter")
        .addConnection("agg1", "errorflatten")
        .addConnection("agg1", "errorfilter")
        .addConnection("agg2", "errorflatten")
        .addConnection("agg2", "errorfilter")
        .addConnection("errorflatten", "sink1")
        .addConnection("errorfilter", "sink2")
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app("ErrTransformTest");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);

    // sink1 receives the flattened errors: the rejected record plus error message, code and stage.
    Schema flattenSchema = Schema.recordOf("erroruser",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("errMsg", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("errCode", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("errStage", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    final Set<StructuredRecord> expected = ImmutableSet.of(
        StructuredRecord.builder(flattenSchema).set("name", "Leo").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter1").build(),
        StructuredRecord.builder(flattenSchema).set("name", "Ralph").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter2").build(),
        StructuredRecord.builder(flattenSchema).set("name", "Don").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg1").build(),
        StructuredRecord.builder(flattenSchema).set("name", "Mike").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg2").build());
    final DataSetManager<Table> sink1Table = getDataset(sink1TableName);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sink1Table.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(sink1Table));
            return expected.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);

    // sink2 is expected to end up with only the Leo and Ralph records, in the input schema.
    final Set<StructuredRecord> expected2 = ImmutableSet.of(
        StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
        StructuredRecord.builder(inputSchema).set("name", "Ralph").build());
    final DataSetManager<Table> sink2Table = getDataset(sink2TableName);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sink2Table.flush();
            Set<StructuredRecord> outputRecords = new HashSet<>();
            outputRecords.addAll(MockSink.readOutput(sink2Table));
            return expected2.equals(outputRecords);
        }
    }, 4, TimeUnit.MINUTES);

    // Stop the streaming program and wait for it to report stopped, as the sibling tests do,
    // so it is not left running once this test has finished.
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) Table(co.cask.cdap.api.dataset.table.Table) HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 10 with DataStreamsConfig

use of co.cask.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

the class DataStreamsTest method testParallelAggregators.

/**
 * Feeds two mock sources into two aggregators that run side by side, each writing to its own
 * sink, then checks both sinks' contents and the record metrics of every stage.
 */
@Test
public void testParallelAggregators() throws Exception {
    String sink1Name = "pAggOutput1";
    String sink2Name = "pAggOutput2";

    Schema inputSchema = Schema.recordOf("testRecord",
        Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
    List<StructuredRecord> input1 = ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(),
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build());
    List<StructuredRecord> input2 = ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(),
        StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(),
        StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build());

    /*
       source1 --|--> agg1 --> sink1
                 |
       source2 --|--> agg2 --> sink2
     */
    DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema, input1)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema, input2)))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
        .addStage(new ETLStage("agg1", FieldCountAggregator.getPlugin("user", "string")))
        .addStage(new ETLStage("agg2", FieldCountAggregator.getPlugin("item", "long")))
        .addConnection("source1", "agg1")
        .addConnection("source1", "agg2")
        .addConnection("source2", "agg1")
        .addConnection("source2", "agg2")
        .addConnection("agg1", "sink1")
        .addConnection("agg2", "sink2")
        .disableCheckpoints()
        .build();

    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);

    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForStatus(true, 10, 1);

    Schema outputSchema1 = Schema.recordOf("user.count",
        Schema.Field.of("user", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    Schema outputSchema2 = Schema.recordOf("item.count",
        Schema.Field.of("item", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));

    // sink1: counts per user; the expected set also contains an "all" row with count 5.
    final Set<StructuredRecord> expected1 = ImmutableSet.of(
        StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
    final DataSetManager<Table> sinkManager1 = getDataset(sink1Name);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sinkManager1.flush();
            Set<StructuredRecord> written = new HashSet<StructuredRecord>(MockSink.readOutput(sinkManager1));
            return expected1.equals(written);
        }
    }, 1, TimeUnit.MINUTES);

    // sink2: counts per item; the expected set also contains an item 0 row with count 5.
    final Set<StructuredRecord> expected2 = ImmutableSet.of(
        StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
    final DataSetManager<Table> sinkManager2 = getDataset(sink2Name);
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            sinkManager2.flush();
            Set<StructuredRecord> written = new HashSet<StructuredRecord>(MockSink.readOutput(sinkManager2));
            return expected2.equals(written);
        }
    }, 1, TimeUnit.MINUTES);

    // Stop the program and wait for it to report stopped before the metrics are validated.
    sparkManager.stop();
    sparkManager.waitForStatus(false, 10, 1);

    // Sources emit 2 and 3 records; both aggregators therefore see all 5.
    validateMetric(appId, "source1.records.out", 2);
    validateMetric(appId, "source2.records.out", 3);
    validateMetric(appId, "agg1.records.in", 5);
    validateMetric(appId, "agg1.records.out", 3);
    validateMetric(appId, "agg2.records.in", 5);
    validateMetric(appId, "agg2.records.out", 5);
    validateMetric(appId, "sink1.records.in", 3);
    validateMetric(appId, "sink2.records.in", 5);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) SparkManager(co.cask.cdap.test.SparkManager) Table(co.cask.cdap.api.dataset.table.Table) HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) Schema(co.cask.cdap.api.data.schema.Schema) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) DataStreamsConfig(co.cask.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Aggregations

DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig)10 ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)9 StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)8 Schema (co.cask.cdap.api.data.schema.Schema)8 AppRequest (co.cask.cdap.proto.artifact.AppRequest)8 ApplicationId (co.cask.cdap.proto.id.ApplicationId)8 Test (org.junit.Test)8 ApplicationManager (co.cask.cdap.test.ApplicationManager)7 Table (co.cask.cdap.api.dataset.table.Table)6 TimeoutException (java.util.concurrent.TimeoutException)6 SparkManager (co.cask.cdap.test.SparkManager)5 HashSet (java.util.HashSet)5 ImmutableSet (com.google.common.collect.ImmutableSet)4 Set (java.util.Set)4 ArrayList (java.util.ArrayList)3 PreviewManager (co.cask.cdap.app.preview.PreviewManager)1 PreviewRunner (co.cask.cdap.app.preview.PreviewRunner)1 PreviewStatus (co.cask.cdap.app.preview.PreviewStatus)1 PreviewConfig (co.cask.cdap.proto.artifact.preview.PreviewConfig)1 ImmutableMap (com.google.common.collect.ImmutableMap)1