Search in sources :

Example 1 with DatasetFieldLineageSummary

Use of io.cdap.cdap.metadata.DatasetFieldLineageSummary in project cdap by caskdata.

From the class DataPipelineTest, method testActionFieldLineage.

/**
 * Deploys a one-stage pipeline whose only stage is a {@code FieldLineageAction} plugin configured with a
 * read operation, three one-to-one transform operations and a write operation, starts its workflow and
 * waits (up to five minutes) for a COMPLETED run, then verifies the field-level lineage summary and the
 * dataset-level lineage relations for both the read dataset and the write dataset.
 *
 * @param engine engine set on the pipeline config; also appended to the dataset names and the app name
 * @throws Exception if deploying, running, or querying lineage fails
 */
private void testActionFieldLineage(Engine engine) throws Exception {
    String sourceName = "ActionReadDataset" + engine;
    String sinkName = "ActionWriteDataset" + engine;
    List<String> inputFieldNames = ImmutableList.of("srcField1", "srcField2", "srcField3");
    Set<String> outputFieldNames = ImmutableSet.of("destField1", "destField2", "destField3");

    /*
     * Field operations handed to the action plugin:
     *
     *   ActionReadDataset --+--> srcField1 -> destField1 --+
     *                       +--> srcField2 -> destField2 --+--> ActionWriteDataset
     *                       +--> srcField3 -> destField3 --+
     */
    List<Operation> lineageOps = new ArrayList<>();
    lineageOps.add(new ReadOperation("Read", "1st operation", EndPoint.of("default", sourceName), inputFieldNames));
    lineageOps.add(new TransformOperation(
        "Transform1", "2nd operation",
        Collections.singletonList(InputField.of("Read", "srcField1")), "destField1"));
    lineageOps.add(new TransformOperation(
        "Transform2", "3rd operation",
        Collections.singletonList(InputField.of("Read", "srcField2")), "destField2"));
    lineageOps.add(new TransformOperation(
        "Transform3", "4th operation",
        Collections.singletonList(InputField.of("Read", "srcField3")), "destField3"));
    lineageOps.add(new WriteOperation(
        "Write", "5th operation", EndPoint.of("default", sinkName),
        ImmutableList.of(
            InputField.of("Transform1", "destField1"),
            InputField.of("Transform2", "destField2"),
            InputField.of("Transform3", "destField3"))));

    // Build, deploy and run the pipeline that consists solely of the action stage.
    ETLStage actionStage = new ETLStage("action", FieldLineageAction.getPlugin(sourceName, sinkName, lineageOps));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
        .addStage(actionStage)
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
    ApplicationManager appManager = deployApplication(appId, deployRequest);
    WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflow.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    FieldLineageAdmin fieldLineageAdmin = getFieldLineageAdmin();

    // Field lineage seen from the write dataset: nothing outgoing, one incoming entry from the read dataset.
    EndPoint sinkEndPoint = EndPoint.of("default", sinkName);
    DatasetFieldLineageSummary sinkSummary = fieldLineageAdmin.getDatasetFieldLineage(
        Constants.FieldLineage.Direction.BOTH, sinkEndPoint, 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(sinkName), sinkSummary.getDatasetId());
    Assert.assertEquals(outputFieldNames, sinkSummary.getFields());
    Assert.assertTrue(sinkSummary.getOutgoing().isEmpty());
    Assert.assertEquals(1, sinkSummary.getIncoming().size());
    Set<FieldRelation> oneToOneRelations = ImmutableSet.of(
        new FieldRelation("srcField1", "destField1"),
        new FieldRelation("srcField2", "destField2"),
        new FieldRelation("srcField3", "destField3"));
    DatasetFieldLineageSummary.FieldLineageRelations expectedIncoming =
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(sourceName), 3, oneToOneRelations);
    Assert.assertEquals(expectedIncoming, sinkSummary.getIncoming().iterator().next());

    // Field lineage seen from the read dataset: nothing incoming, one outgoing entry to the write dataset.
    EndPoint sourceEndPoint = EndPoint.of("default", sourceName);
    DatasetFieldLineageSummary sourceSummary = fieldLineageAdmin.getDatasetFieldLineage(
        Constants.FieldLineage.Direction.BOTH, sourceEndPoint, 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(sourceName), sourceSummary.getDatasetId());
    Assert.assertEquals(new HashSet<>(inputFieldNames), sourceSummary.getFields());
    Assert.assertTrue(sourceSummary.getIncoming().isEmpty());
    Assert.assertEquals(1, sourceSummary.getOutgoing().size());
    DatasetFieldLineageSummary.FieldLineageRelations expectedOutgoing =
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(sinkName), 3, oneToOneRelations);
    Assert.assertEquals(expectedOutgoing, sourceSummary.getOutgoing().iterator().next());

    LineageAdmin datasetLineageAdmin = getLineageAdmin();
    ProgramId workflowId = appId.workflow(SmartWorkflow.NAME);
    // The run id is taken from the first record in the workflow's history.
    RunId workflowRunId = RunIds.fromString(workflow.getHistory().iterator().next().getPid());

    // Dataset lineage for the read dataset: poll (up to 10 seconds) until two relations are reported.
    Tasks.waitFor(
        2,
        () -> datasetLineageAdmin
            .computeLineage(NamespaceId.DEFAULT.dataset(sourceName), 0, System.currentTimeMillis(), 1, "workflow")
            .getRelations()
            .size(),
        10, TimeUnit.SECONDS);
    Lineage sourceLineage = datasetLineageAdmin.computeLineage(
        NamespaceId.DEFAULT.dataset(sourceName), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedRelations = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(sourceName), workflowId, AccessType.READ, workflowRunId),
        new Relation(NamespaceId.DEFAULT.dataset(sinkName), workflowId, AccessType.WRITE, workflowRunId));
    Assert.assertEquals(expectedRelations, sourceLineage.getRelations());

    // Dataset lineage for the write dataset is expected to contain the very same two relations.
    Lineage sinkLineage = datasetLineageAdmin.computeLineage(
        NamespaceId.DEFAULT.dataset(sinkName), 0, System.currentTimeMillis(), 1, "workflow");
    Assert.assertEquals(2, sinkLineage.getRelations().size());
    Assert.assertEquals(expectedRelations, sinkLineage.getRelations());
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) RunId(org.apache.twill.api.RunId) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin)

Example 2 with DatasetFieldLineageSummary

Use of io.cdap.cdap.metadata.DatasetFieldLineageSummary in project cdap by caskdata.

From the class DataStreamsTest, method testLineageWithMacros.

/**
 * Deploys a source -> identity -> sink streaming pipeline whose sink plugin is configured with the
 * {@code ${output}} macro, runs it twice through {@code testLineageWithMacro} (once per sink name), and
 * after each run verifies the dataset lineage and the field lineage reported for the source dataset.
 * The second set of checks only queries lineage from the second run's start timestamp onward.
 *
 * NOTE(review): {@code testLineageWithMacro} is defined outside this snippet; it presumably resolves the
 * macro to the given sink name and returns the id of the run it performed — confirm against that helper.
 *
 * @throws Exception if deployment, either run, or any lineage lookup fails
 */
@Test
public void testLineageWithMacros() throws Exception {
    Schema recordSchema = Schema.recordOf(
        "test",
        Schema.Field.of("key", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> records = ImmutableList.of(
        StructuredRecord.builder(recordSchema).set("key", "key1").set("value", "value1").build(),
        StructuredRecord.builder(recordSchema).set("key", "key2").set("value", "value2").build());

    String srcName = "lineageSource";
    String firstSink = "lineageOutput1";
    String secondSink = "lineageOutput2";

    DataStreamsConfig streamsConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(recordSchema, records, 0L, srcName)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
        .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
        .addConnection("source", "identity")
        .addConnection("identity", "sink")
        .setCheckpointDir(checkpointDir)
        .setBatchInterval("1s")
        .build();

    ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
    AppRequest<DataStreamsConfig> deployRequest = new AppRequest<>(APP_ARTIFACT, streamsConfig);
    ApplicationManager appManager = deployApplication(appId, deployRequest);
    ProgramId sparkProgram = appId.spark(DataStreamsSparkLauncher.NAME);

    // ---- first run, writing to the first sink name ----
    RunId firstRunId = testLineageWithMacro(appManager, new HashSet<>(records), firstSink);
    FieldLineageAdmin fieldLineageAdmin = getFieldLineageAdmin();
    LineageAdmin datasetLineageAdmin = getLineageAdmin();

    // Poll (up to 10 seconds) until both the dataset lineage and the field lineage are available.
    Tasks.waitFor(true, () -> {
        Lineage polledLineage = datasetLineageAdmin.computeLineage(
            NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
        DatasetFieldLineageSummary polledSummary = fieldLineageAdmin.getDatasetFieldLineage(
            Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
        if (polledLineage.getRelations().size() != 2) {
            return false;
        }
        return !polledSummary.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);

    Lineage firstLineage = datasetLineageAdmin.computeLineage(
        NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> firstExpectedLineage = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(srcName), sparkProgram, AccessType.READ, firstRunId),
        new Relation(NamespaceId.DEFAULT.dataset(firstSink), sparkProgram, AccessType.WRITE, firstRunId));
    Assert.assertEquals(firstExpectedLineage, firstLineage.getRelations());

    DatasetFieldLineageSummary firstSummary = fieldLineageAdmin.getDatasetFieldLineage(
        Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), firstSummary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), firstSummary.getFields());
    Assert.assertTrue(firstSummary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> firstOutgoing = firstSummary.getOutgoing();
    Assert.assertEquals(1, firstOutgoing.size());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> firstExpectedOutgoing = Collections.singleton(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(firstSink), 2,
            ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(firstExpectedOutgoing, firstOutgoing);

    // ---- second run, writing to the second sink name ----
    // Pause one second before the next run; per the original author's note, dataset lineage is stored
    // with one-second granularity, so the two runs must not land in the same second.
    TimeUnit.SECONDS.sleep(1);
    long secondRunStart = System.currentTimeMillis();
    RunId secondRunId = testLineageWithMacro(appManager, new HashSet<>(records), secondSink);

    // Poll again, this time only considering lineage recorded since the second run started.
    Tasks.waitFor(true, () -> {
        Lineage polledLineage = datasetLineageAdmin.computeLineage(
            NamespaceId.DEFAULT.dataset(srcName), secondRunStart, System.currentTimeMillis(), 1, "workflow");
        long queryEnd = System.currentTimeMillis();
        DatasetFieldLineageSummary polledSummary = fieldLineageAdmin.getDatasetFieldLineage(
            Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), secondRunStart, queryEnd);
        if (polledLineage.getRelations().size() != 2) {
            return false;
        }
        return !polledSummary.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);

    Lineage secondLineage = datasetLineageAdmin.computeLineage(
        NamespaceId.DEFAULT.dataset(srcName), secondRunStart, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> secondExpectedLineage = ImmutableSet.of(
        new Relation(NamespaceId.DEFAULT.dataset(srcName), sparkProgram, AccessType.READ, secondRunId),
        new Relation(NamespaceId.DEFAULT.dataset(secondSink), sparkProgram, AccessType.WRITE, secondRunId));
    Assert.assertEquals(secondExpectedLineage, secondLineage.getRelations());

    DatasetFieldLineageSummary secondSummary = fieldLineageAdmin.getDatasetFieldLineage(
        Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName),
        secondRunStart, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), secondSummary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), secondSummary.getFields());
    Assert.assertTrue(secondSummary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> secondOutgoing = secondSummary.getOutgoing();
    Assert.assertEquals(1, secondOutgoing.size());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> secondExpectedOutgoing = Collections.singleton(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            NamespaceId.DEFAULT.dataset(secondSink), 2,
            ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(secondExpectedOutgoing, secondOutgoing);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) Schema(io.cdap.cdap.api.data.schema.Schema) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) Test(org.junit.Test)

Aggregations

Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage)2 Relation (io.cdap.cdap.data2.metadata.lineage.Relation)2 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)2 DatasetFieldLineageSummary (io.cdap.cdap.metadata.DatasetFieldLineageSummary)2 FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin)2 FieldRelation (io.cdap.cdap.metadata.FieldRelation)2 LineageAdmin (io.cdap.cdap.metadata.LineageAdmin)2 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)2 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)2 ProgramId (io.cdap.cdap.proto.id.ProgramId)2 ApplicationManager (io.cdap.cdap.test.ApplicationManager)2 RunId (org.apache.twill.api.RunId)2 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)1 Schema (io.cdap.cdap.api.data.schema.Schema)1 Operation (io.cdap.cdap.api.lineage.field.Operation)1 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)1 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)1 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)1 MetadataOperation (io.cdap.cdap.data2.metadata.writer.MetadataOperation)1 DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig)1