
Example 26 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

Class Upgrader, method convertStreamsConfig.

private DataStreamsConfig convertStreamsConfig(String configStr) {
    DataStreamsConfig config = GSON.fromJson(configStr, DataStreamsConfig.class);
    DataStreamsConfig.Builder builder = DataStreamsConfig.builder()
        .addConnections(config.getConnections())
        .setResources(config.getResources())
        .setDriverResources(config.getDriverResources())
        .setClientResources(config.getClientResources())
        .setBatchInterval(config.getBatchInterval())
        .setCheckpointDir(config.getCheckpointDir())
        .setNumOfRecordsPreview(config.getNumOfRecordsPreview());
    for (ETLStage stage : config.getStages()) {
        builder.addStage(stage.upgradeStage(dataStreamsContext));
    }
    return builder.build();
}
Also used: ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig)
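
For orientation, here is how a caller might exercise this method. This is a minimal sketch, assuming the surrounding Upgrader instance and a hypothetical v2 config payload; real stored configs carry more fields than shown:

// Hypothetical v2 pipeline config serialized as JSON; only batchInterval,
// stages, and connections are shown here.
String oldConfigJson = "{"
    + "\"batchInterval\":\"10s\","
    + "\"stages\":[],"
    + "\"connections\":[]"
    + "}";
// Every stage in the result has been passed through upgradeStage(dataStreamsContext),
// so plugin artifacts are re-resolved against the upgraded instance.
DataStreamsConfig upgraded = convertStreamsConfig(oldConfigJson);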

Example 27 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

Class DataStreamsTest, method testTransformComputeWithMacros.

@Test
public void testTransformComputeWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = new ArrayList<>();
    StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
    StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
    StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
    StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
    input.add(samuelRecord);
    input.add(jacksonRecord);
    input.add(dwayneRecord);
    input.add(johnsonRecord);
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
        .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
        .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
        .addConnection("source", "sleep")
        .addConnection("sleep", "filter1")
        .addConnection("filter1", "filter2")
        .addConnection("filter2", "sink")
        .setBatchInterval("1s")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(samuelRecord);
    expected.add(jacksonRecord);
    testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
    validateMetric(appId, "source.records.out", 4);
    validateMetric(appId, "sleep.records.in", 4);
    validateMetric(appId, "sleep.records.out", 4);
    validateMetric(appId, "filter1.records.in", 4);
    validateMetric(appId, "filter1.records.out", 3);
    validateMetric(appId, "filter2.records.in", 3);
    validateMetric(appId, "filter2.records.out", 2);
    validateMetric(appId, "sink.records.in", 2);
    Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
    expected.clear();
    expected.add(dwayneRecord);
    expected.add(johnsonRecord);
    testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), Schema (io.cdap.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), HashSet (java.util.HashSet), Test (org.junit.Test)
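
Every ${...} token in the config above is a macro, which is what lets the same deployed app run twice with different filter values and output datasets. The testTransformComputeRun helper is not part of this excerpt; a sketch of the runtime arguments it presumably passes when starting the program, inferred from the macro names and the assertions above:

// Assumed macro bindings for the first run; the actual helper is not shown here.
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("field", "name");          // ${field}: the field both filters inspect
runtimeArgs.put("val1", "dwayne");         // ${val1}: value dropped by filter1
runtimeArgs.put("val2", "johnson");        // ${val2}: value dropped by filter2
runtimeArgs.put("output", "macroOutput1"); // ${output}: the MockSink dataset name
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start(runtimeArgs);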

Example 28 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

Class DataStreamsTest, method testParallelAggregators.

private void testParallelAggregators(boolean isReducibleAggregator) throws Exception {
    String sink1Name = "pAggOutput1-" + isReducibleAggregator;
    String sink2Name = "pAggOutput2-" + isReducibleAggregator;
    Schema inputSchema = Schema.recordOf("testRecord", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
    List<StructuredRecord> input1 = ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(),
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build());
    List<StructuredRecord> input2 = ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(),
        StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(),
        StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build());
    /*
       source1 --|--> agg1 --> sink1
                 |
       source2 --|--> agg2 --> sink2
     */
    DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema, input1)))
        .addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema, input2)))
        .addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name)))
        .addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name)))
        .addStage(new ETLStage("agg1", isReducibleAggregator
            ? FieldCountReducibleAggregator.getPlugin("user", "string")
            : FieldCountAggregator.getPlugin("user", "string")))
        .addStage(new ETLStage("agg2", isReducibleAggregator
            ? FieldCountReducibleAggregator.getPlugin("item", "long")
            : FieldCountAggregator.getPlugin("item", "long")))
        .addConnection("source1", "agg1")
        .addConnection("source1", "agg2")
        .addConnection("source2", "agg1")
        .addConnection("source2", "agg2")
        .addConnection("agg1", "sink1")
        .addConnection("agg2", "sink2")
        .setCheckpointDir(checkpointDir)
        .disableCheckpoints()
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp" + isReducibleAggregator);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    Schema outputSchema1 = Schema.recordOf("user.count", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    Schema outputSchema2 = Schema.recordOf("item.count", Schema.Field.of("item", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
    // check output
    DataSetManager<Table> sinkManager1 = getDataset(sink1Name);
    Set<StructuredRecord> expected1 = ImmutableSet.of(
        StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(),
        StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
    Tasks.waitFor(true, () -> {
        sinkManager1.flush();
        Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(sinkManager1));
        return expected1.equals(outputRecords);
    }, 1, TimeUnit.MINUTES);
    DataSetManager<Table> sinkManager2 = getDataset(sink2Name);
    Set<StructuredRecord> expected2 = ImmutableSet.of(
        StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(),
        StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
    Tasks.waitFor(true, () -> {
        sinkManager2.flush();
        Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(sinkManager2));
        return expected2.equals(outputRecords);
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
    validateMetric(appId, "source1.records.out", 2);
    validateMetric(appId, "source2.records.out", 3);
    validateMetric(appId, "agg1.records.in", 5);
    validateMetric(appId, "agg1.records.out", 3);
    validateMetric(appId, "agg2.records.in", 5);
    validateMetric(appId, "agg2.records.out", 5);
    validateMetric(appId, "sink1.records.in", 3);
    validateMetric(appId, "sink2.records.in", 5);
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), SparkManager (io.cdap.cdap.test.SparkManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), HashSet (java.util.HashSet)
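
Because this is a private helper keyed on isReducibleAggregator, the same topology and metric assertions cover both aggregator flavors. The @Test entry points are not in this excerpt; presumably they look something like the following (the wrapper names are assumptions):

// Assumed @Test wrappers; only the boolean flag differs between the two runs.
@Test
public void testParallelAggregators() throws Exception {
    testParallelAggregators(false); // classic Aggregator plugins
}

@Test
public void testParallelReducibleAggregators() throws Exception {
    testParallelAggregators(true);  // ReducibleAggregator plugins
}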

Example 29 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

Class DataStreamsTest, method testLineageWithMacros.

@Test
public void testLineageWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(),
        StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
    String srcName = "lineageSource";
    String sinkName1 = "lineageOutput1";
    String sinkName2 = "lineageOutput2";
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
        .addConnection("source", "identity")
        .addConnection("identity", "sink")
        .setCheckpointDir(checkpointDir)
        .setBatchInterval("1s")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
    RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    LineageAdmin lineageAdmin = getLineageAdmin();
    // wait for the lineage to be populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedLineage = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId),
        new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(sinkName1), 2,
            ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
    // sleep for 1 second before starting the second run, because dataset lineage is stored with one-second granularity
    TimeUnit.SECONDS.sleep(1);
    long startTimeMillis = System.currentTimeMillis();
    runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
    // wait for the lineage to be populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
        long end = System.currentTimeMillis();
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
    expectedLineage = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId),
        new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    expectedRelations = Collections.singleton(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(sinkName2), 2,
            ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), FieldRelation (io.cdap.cdap.metadata.FieldRelation), Schema (io.cdap.cdap.api.data.schema.Schema), LineageAdmin (io.cdap.cdap.metadata.LineageAdmin), FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), Relation (io.cdap.cdap.data2.metadata.lineage.Relation), RunId (org.apache.twill.api.RunId), HashSet (java.util.HashSet), Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage), ProgramId (io.cdap.cdap.proto.id.ProgramId), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), DatasetFieldLineageSummary (io.cdap.cdap.metadata.DatasetFieldLineageSummary), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), Test (org.junit.Test)
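
The testLineageWithMacro helper is not included in this excerpt. Judging from the call sites, it binds the ${output} macro to a per-run sink dataset, waits for the records to arrive, and returns the completed run's RunId. A sketch under those assumptions (the run-history lookup at the end is also an assumption):

// Assumed shape of the helper; the real implementation is not shown here.
private RunId testLineageWithMacro(ApplicationManager appManager,
                                   Set<StructuredRecord> expected,
                                   String outputName) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    // Bind the ${output} macro to this run's sink dataset name.
    sparkManager.start(Collections.singletonMap("output", outputName));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    // ... poll the dataset named outputName until it contains `expected` ...
    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
    // Assumes the latest history entry is the run that just finished.
    return RunIds.fromString(sparkManager.getHistory().get(0).getPid());
}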

Example 30 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.

Class DataStreamsTest, method testWindower.

@Test
public void testWindower() throws Exception {
    /*
     * source --> window(width=30,interval=1) --> aggregator --> filter --> sink
     */
    Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build());
    String sinkName = "windowOut";
    // source sleeps 1 second between outputs
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
        .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
        .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
        .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addConnection("source", "window")
        .addConnection("window", "agg")
        .addConnection("agg", "filter")
        .addConnection("filter", "sink")
        .setBatchInterval("1s")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    // the sink should contain at least one record with count of 3, and no records with more than 3.
    // less than 3 if the window doesn't contain all 3 records yet, but there should eventually be a window
    // that contains all 3.
    final DataSetManager<Table> outputManager = getDataset(sinkName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        boolean sawThree = false;
        for (StructuredRecord record : MockSink.readOutput(outputManager)) {
            long count = record.get("ct");
            if (count == 3L) {
                sawThree = true;
            }
            Assert.assertTrue(count <= 3L);
        }
        return sawThree;
    }, 2, TimeUnit.MINUTES);
    sparkManager.stop();
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), SparkManager (io.cdap.cdap.test.SparkManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), TimeoutException (java.util.concurrent.TimeoutException), TopicNotFoundException (io.cdap.cdap.api.messaging.TopicNotFoundException), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), Test (org.junit.Test)
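
With a 30-second window sliding every second and the source emitting one record per second, early windows hold only a prefix of the input, which is why the assertion tolerates counts below 3 but never above it. A small self-contained model of that progression (idealized timing, no CDAP classes involved):

// Idealized sliding-window counts: one "abc" record arrives per second;
// the window is 30s wide and slides every 1s, so no record ages out during
// the test and each window simply sees everything emitted so far.
int totalRecords = 3;
for (int second = 1; second <= totalRecords; second++) {
    System.out.println("window ending t=" + second + "s: ct(abc) = " + second);
}
// The filter stage drops the aggregator's catch-all "all" row, so the sink
// only ever sees the per-value count, which tops out at 3.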

Aggregations

DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig): 36
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 34
Test (org.junit.Test): 34
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 30
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 30
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 30
HashSet (java.util.HashSet): 30
Schema (io.cdap.cdap.api.data.schema.Schema): 29
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 28
SparkManager (io.cdap.cdap.test.SparkManager): 22
Table (io.cdap.cdap.api.dataset.table.Table): 18
DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig): 12
TimeoutException (java.util.concurrent.TimeoutException): 12
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 11
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 10
Schema (co.cask.cdap.api.data.schema.Schema): 10
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 10
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 10
ApplicationManager (co.cask.cdap.test.ApplicationManager): 9
ArrayList (java.util.ArrayList): 9