Example 51 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From class DataPipelineTest, method testSinglePhaseWithSparkCompute.

private void testSinglePhaseWithSparkCompute() throws Exception {
    /*
     * source --> sparkcompute --> sink
     */
    String classifiedTextsTable = "classifiedTextTable";
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("source",
            MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY_SOURCE, SpamMessage.SCHEMA)))
        .addStage(new ETLStage("sparkcompute",
            new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE,
                ImmutableMap.of("fileSetName", "modelFileSet",
                                "path", "output",
                                "fieldToClassify", SpamMessage.TEXT_FIELD,
                                "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD),
                null)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable)))
        .addConnection("source", "sparkcompute")
        .addConnection("sparkcompute", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkComputeApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write some messages to be classified
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY_SOURCE));
    MockSource.writeInput(inputManager, messagesToWrite);
    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
    List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);
    Set<SpamMessage> results = new HashSet<>();
    for (StructuredRecord structuredRecord : structuredRecords) {
        results.add(SpamMessage.fromStructuredRecord(structuredRecord));
    }
    Set<SpamMessage> expected = new HashSet<>();
    expected.add(new SpamMessage("how are you doing today", 0.0));
    // only 'free money money' should be predicted as spam
    expected.add(new SpamMessage("free money money", 1.0));
    expected.add(new SpamMessage("what are you doing today", 0.0));
    expected.add(new SpamMessage("genuine report", 0.0));
    Assert.assertEquals(expected, results);
    validateMetric(4, appId, "source.records.out");
    validateMetric(4, appId, "sparkcompute.records.in");
    validateMetric(4, appId, "sink.records.in");
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable), SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), HashSet (java.util.HashSet)
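
The validateMetric helper called above is private to DataPipelineTest and not shown on this page. Below is a minimal sketch of what it plausibly looks like, assuming the CDAP test base class exposes getMetricsManager() and that stage metrics are emitted under the "user." prefix with namespace/app/workflow tags; all of these details are assumptions, not confirmed by the excerpt:

private void validateMetric(long expected, ApplicationId appId, String metric) throws Exception {
    // hypothetical tag set; the exact tag constants are an assumption
    Map<String, String> tags = ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, appId.getNamespace(),
                                               Constants.Metrics.Tag.APP, appId.getEntityName(),
                                               Constants.Metrics.Tag.WORKFLOW, SmartWorkflow.NAME);
    // wait up to 20 seconds for the metric total to reach the expected count
    getMetricsManager().waitForTotalMetricCount(tags, "user." + metric, expected, 20, TimeUnit.SECONDS);
}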

Example 52 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From class DataPipelineTest, method runPipelineForMetadata.

private void runPipelineForMetadata(MetadataAdmin metadataAdmin, Set<MetadataOperation> operations) throws Exception {
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    /*
     * source --> sink
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin("singleInput", schema, operations)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("singleOutput")))
        .addConnection("source", "sink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT_RANGE, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("MetadataTestApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // wait for the system metadata for the app and the dataset to show up - the pipeline validates them
    Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, appId.toMetadataEntity()).isEmpty(), 10, TimeUnit.SECONDS);
    Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, NamespaceId.DEFAULT.dataset("singleInput").toMetadataEntity()).isEmpty(), 10, TimeUnit.SECONDS);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    int numRuns = workflowManager.getHistory().size();
    workflowManager.start();
    workflowManager.waitForRuns(ProgramRunStatus.COMPLETED, numRuns + 1, 5, TimeUnit.MINUTES);
}
Also used: ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ApplicationManager (io.cdap.cdap.test.ApplicationManager), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), Schema (io.cdap.cdap.api.data.schema.Schema), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), EndPoint (io.cdap.cdap.api.lineage.field.EndPoint), AppRequest (io.cdap.cdap.proto.artifact.AppRequest)
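
The metadata written by the MetadataOperations passed to the mock source is not asserted in this excerpt. Reusing the same metadataAdmin.getProperties call the example already polls with, a follow-up check could look like the sketch below; whether anything appears in the USER scope depends entirely on the operations supplied, so this is an illustration rather than part of the real test:

// hypothetical follow-up: poll until user-scoped metadata written by the pipeline shows up
Tasks.waitFor(true, () -> !metadataAdmin.getProperties(MetadataScope.USER,
    NamespaceId.DEFAULT.dataset("singleInput").toMetadataEntity()).isEmpty(), 10, TimeUnit.SECONDS);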

Example 53 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From class DataStreamsTest, method testTransformComputeWithMacros.

@Test
public void testTransformComputeWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = new ArrayList<>();
    StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
    StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
    StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
    StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
    input.add(samuelRecord);
    input.add(jacksonRecord);
    input.add(dwayneRecord);
    input.add(johnsonRecord);
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}")))
        .addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}")))
        .addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L)))
        .addConnection("source", "sleep")
        .addConnection("sleep", "filter1")
        .addConnection("filter1", "filter2")
        .addConnection("filter2", "sink")
        .setBatchInterval("1s")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(samuelRecord);
    expected.add(jacksonRecord);
    testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
    validateMetric(appId, "source.records.out", 4);
    validateMetric(appId, "sleep.records.in", 4);
    validateMetric(appId, "sleep.records.out", 4);
    validateMetric(appId, "filter1.records.in", 4);
    validateMetric(appId, "filter1.records.out", 3);
    validateMetric(appId, "filter2.records.in", 3);
    validateMetric(appId, "filter2.records.out", 2);
    validateMetric(appId, "sink.records.in", 2);
    Assert.assertTrue(getMetric(appId, "sleep." + io.cdap.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
    expected.clear();
    expected.add(dwayneRecord);
    expected.add(johnsonRecord);
    testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), Schema (io.cdap.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), HashSet (java.util.HashSet), Test (org.junit.Test)
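
The testTransformComputeRun helper is not shown on this page. Here is a plausible sketch, assuming the macros in the config above (${output}, ${field}, ${val1}, ${val2}) are resolved from runtime arguments when the streaming program starts, and that the "field" macro is bound to the "name" column; both are assumptions:

private void testTransformComputeRun(ApplicationManager appManager, Set<StructuredRecord> expected,
                                     String val1, String val2, String outputName) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    // runtime arguments supply the macro values; the key names mirror the macros in the config
    sparkManager.start(ImmutableMap.of("output", outputName, "field", "name", "val1", val1, "val2", val2));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    // poll the sink dataset until it holds exactly the expected set of records
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
}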

Example 54 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From class DataStreamsTest, method testLineageWithMacros.

@Test
public void testLineageWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(),
        StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
    String srcName = "lineageSource";
    String sinkName1 = "lineageOutput1";
    String sinkName2 = "lineageOutput2";
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
        .addConnection("source", "identity")
        .addConnection("identity", "sink")
        .setCheckpointDir(checkpointDir)
        .setBatchInterval("1s")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
    RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    LineageAdmin lineageAdmin = getLineageAdmin();
    // wait for the lineage to be populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedLineage = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId),
        new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(sinkName1), 2,
            ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
    // sleep for 1 second before starting the second run, since dataset lineage is stored at second granularity
    TimeUnit.SECONDS.sleep(1);
    long startTimeMillis = System.currentTimeMillis();
    runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
    // wait for the lineage to be populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
        long end = System.currentTimeMillis();
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
    expectedLineage = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId),
        new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    expectedRelations = Collections.singleton(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(sinkName2), 2,
            ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), FieldRelation (io.cdap.cdap.metadata.FieldRelation), Schema (io.cdap.cdap.api.data.schema.Schema), LineageAdmin (io.cdap.cdap.metadata.LineageAdmin), FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), Relation (io.cdap.cdap.data2.metadata.lineage.Relation), RunId (org.apache.twill.api.RunId), HashSet (java.util.HashSet), Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage), ProgramId (io.cdap.cdap.proto.id.ProgramId), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), DatasetFieldLineageSummary (io.cdap.cdap.metadata.DatasetFieldLineageSummary), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), Test (org.junit.Test)
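
testLineageWithMacro is likewise a helper this page does not show. Below is a sketch under the assumption that it runs the stream once with the ${output} macro bound to the given sink, waits for the records to arrive, stops the program, and returns the RunId that the lineage assertions compare against; the run-record bookkeeping at the end is an assumption:

private RunId testLineageWithMacro(ApplicationManager appManager, Set<StructuredRecord> expected,
                                   String outputName) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start(ImmutableMap.of("output", outputName));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
    }, 1, TimeUnit.MINUTES);
    sparkManager.stop();
    // assumed: the most recent entry in the run history is the run that just finished
    List<RunRecord> history = sparkManager.getHistory();
    return RunIds.fromString(history.get(history.size() - 1).getPid());
}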

Example 55 with ETLConfig

Use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

From class DataStreamsTest, method testWindower.

@Test
public void testWindower() throws Exception {
    /*
     * source --> window(width=30,interval=1) --> aggregator --> filter --> sink
     */
    Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build(),
        StructuredRecord.builder(schema).set("x", "abc").build());
    String sinkName = "windowOut";
    // source sleeps 1 second between outputs
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
        .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
        .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
        .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
        .addConnection("source", "window")
        .addConnection("window", "agg")
        .addConnection("agg", "filter")
        .addConnection("filter", "sink")
        .setBatchInterval("1s")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start();
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    // The sink should eventually contain at least one record with a count of 3 and no record with a
    // count greater than 3. Counts below 3 can appear while a window does not yet hold all three
    // records, but some window must eventually contain all of them.
    final DataSetManager<Table> outputManager = getDataset(sinkName);
    Tasks.waitFor(true, () -> {
        outputManager.flush();
        boolean sawThree = false;
        for (StructuredRecord record : MockSink.readOutput(outputManager)) {
            long count = record.get("ct");
            if (count == 3L) {
                sawThree = true;
            }
            Assert.assertTrue(count <= 3L);
        }
        return sawThree;
    }, 2, TimeUnit.MINUTES);
    sparkManager.stop();
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), SparkManager (io.cdap.cdap.test.SparkManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), TimeoutException (java.util.concurrent.TimeoutException), TopicNotFoundException (io.cdap.cdap.api.messaging.TopicNotFoundException), DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), Test (org.junit.Test)
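
The "ct" field read inside the waitFor loop is produced by the FieldCountAggregator stage. Reconstructed from the fields the test reads (the record name here is hypothetical), the aggregator's per-window output is assumed to look like:

// one record per distinct value of "x" in each 30-second window, plus an "all" total
// that the "filter" stage drops before records reach the sink
Schema countSchema = Schema.recordOf("x.fieldcount",
                                     Schema.Field.of("x", Schema.of(Schema.Type.STRING)),
                                     Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));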

Aggregations

ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 84 usages
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 75 usages
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 59 usages
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 59 usages
Test (org.junit.Test): 54 usages
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 53 usages
Table (io.cdap.cdap.api.dataset.table.Table): 46 usages
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 45 usages
Schema (io.cdap.cdap.api.data.schema.Schema): 45 usages
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 45 usages
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 35 usages
HashSet (java.util.HashSet): 15 usages
ArrayList (java.util.ArrayList): 14 usages
HashMap (java.util.HashMap): 11 usages
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 9 usages
SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage): 8 usages
Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage): 7 usages
Relation (io.cdap.cdap.data2.metadata.lineage.Relation): 7 usages
DatasetFieldLineageSummary (io.cdap.cdap.metadata.DatasetFieldLineageSummary): 7 usages
FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin): 7 usages