Search in sources :

Example 51 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageCollapserTest method testCollapseMulti.

/**
 * Checks that collapsing on {@code CollapseType.ACCESS} yields one collapsed relation per
 * (data, program) pair, carrying every access type seen for that pair.
 */
@Test
public void testCollapseMulti() {
    // Some relations are listed more than once; ImmutableSet.of keeps a single copy of equal elements.
    Set<Relation> input = ImmutableSet.of(
        new Relation(data1, service1, AccessType.READ, runId1),
        new Relation(data1, service1, AccessType.WRITE, runId1),
        new Relation(data1, service1, AccessType.READ, runId1),
        new Relation(data1, service2, AccessType.READ, runId1),
        new Relation(data1, service2, AccessType.READ, runId1),
        new Relation(data2, service1, AccessType.READ, runId1),
        new Relation(data2, service1, AccessType.READ, runId1));

    // Expected result when collapsing on access: READ and WRITE on data1/service1 merge into one entry.
    CollapsedRelation data1ViaService1 = new CollapsedRelation(
        data1, service1, toSet(AccessType.READ, AccessType.WRITE), toSet(runId1), Collections.emptySet());
    CollapsedRelation data1ViaService2 = new CollapsedRelation(
        data1, service2, toSet(AccessType.READ), toSet(runId1), Collections.emptySet());
    CollapsedRelation data2ViaService1 = new CollapsedRelation(
        data2, service1, toSet(AccessType.READ), toSet(runId1), Collections.emptySet());

    Assert.assertEquals(
        toSet(data1ViaService1, data1ViaService2, data2ViaService1),
        LineageCollapser.collapseRelations(input, ImmutableSet.of(CollapseType.ACCESS)));
}
Also used : CollapsedRelation(io.cdap.cdap.data2.metadata.lineage.CollapsedRelation) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) CollapsedRelation(io.cdap.cdap.data2.metadata.lineage.CollapsedRelation) Test(org.junit.Test)

Example 52 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by cdapio.

the class DataPipelineTest method testActionFieldLineage.

/**
 * Verifies field-level and dataset-level lineage for a pipeline made of a single action stage.
 *
 * <p>The action plugin is configured with a read operation on one dataset, three one-to-one
 * transform operations, and a write operation on a second dataset. After the workflow run the
 * test checks the field lineage summary seen from the destination dataset and from the source
 * dataset, then checks that dataset lineage computed from either dataset contains the same
 * READ and WRITE relations.
 *
 * @param engine execution engine set on the pipeline config; it is also appended to the dataset
 *               and application names used by this test
 * @throws Exception propagated from deployment, the workflow run, or the lineage lookups
 */
private void testActionFieldLineage(Engine engine) throws Exception {
    String readDataset = "ActionReadDataset" + engine;
    String writeDataset = "ActionWriteDataset" + engine;
    List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
    Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");
    List<Operation> operations = new ArrayList<>();
    /*
     *          |---------> srcField1 -> destField1----|
     *          |                                      |
     * ActionReadDataset -> srcField2 -> destField2 ---|-> ActionWriteDataset
     *          |                                      |
     *          |---------> srcField3 -> destField3 ---|
     */
    operations.add(new ReadOperation("Read", "1st operation", EndPoint.of("default", readDataset), srcFields));
    operations.add(new TransformOperation("Transform1", "2nd operation", Collections.singletonList(InputField.of("Read", "srcField1")), "destField1"));
    operations.add(new TransformOperation("Transform2", "3rd operation", Collections.singletonList(InputField.of("Read", "srcField2")), "destField2"));
    operations.add(new TransformOperation("Transform3", "4th operation", Collections.singletonList(InputField.of("Read", "srcField3")), "destField3"));
    operations.add(new WriteOperation("Write", "5th operation", EndPoint.of("default", writeDataset), ImmutableList.of(InputField.of("Transform1", "destField1"), InputField.of("Transform2", "destField2"), InputField.of("Transform3", "destField3"))));
    // The pipeline has exactly one stage: the FieldLineageAction plugin configured with the operations above.
    ETLStage action = new ETLStage("action", FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action).setEngine(engine).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    // Start the workflow; presumably blocks until the run reaches COMPLETED or 5 minutes pass — confirm against WorkflowManager.
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    // get field lineage for dest dataset
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", writeDataset), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
    Assert.assertEquals(destFields, summary.getFields());
    // Seen from the destination: nothing outgoing, and a single incoming entry.
    Assert.assertTrue(summary.getOutgoing().isEmpty());
    Assert.assertEquals(1, summary.getIncoming().size());
    // The same field-to-field pairs are expected from both directions, so this set is reused below.
    Set<FieldRelation> fieldRelations = ImmutableSet.of(new FieldRelation("srcField1", "destField1"), new FieldRelation("srcField2", "destField2"), new FieldRelation("srcField3", "destField3"));
    // NOTE(review): the literal 3 is presumably the field count of the related dataset — confirm against FieldLineageRelations.
    DatasetFieldLineageSummary.FieldLineageRelations expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(readDataset), 3, fieldRelations);
    Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());
    // get field lineage for src dataset
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", readDataset), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
    // srcFields is a List, so it is copied into a set for comparison with summary.getFields().
    Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
    // Seen from the source: nothing incoming, and a single outgoing entry.
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Assert.assertEquals(1, summary.getOutgoing().size());
    expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(writeDataset), 3, fieldRelations);
    Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());
    LineageAdmin lineageAdmin = getLineageAdmin();
    ProgramId programId = appId.workflow(SmartWorkflow.NAME);
    // Run id of the first record in the workflow history, used to build the expected relations.
    RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());
    // get dataset lineage for src dataset
    // Presumably polls until the relation count equals 2 or the 10-second timeout elapses — confirm against Tasks.waitFor.
    Tasks.waitFor(2, () -> {
        Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
        return lineage.getRelations().size();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
    // Expected: the workflow reads the source dataset and writes the destination dataset in the same run.
    Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    // get dataset lineage for dest dataset, in this test they should be same
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset), 0, System.currentTimeMillis(), 1, "workflow");
    Assert.assertEquals(2, lineage.getRelations().size());
    Assert.assertEquals(expectedLineage, lineage.getRelations());
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) RunId(org.apache.twill.api.RunId) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin)

Example 53 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by cdapio.

the class DataStreamsTest method testLineageWithMacros.

/**
 * Verifies dataset and field lineage for a streaming pipeline whose sink name is supplied
 * through the {@code ${output}} macro.
 *
 * <p>The pipeline (source -> identity -> sink) is deployed once and then exercised twice through
 * {@code testLineageWithMacro}, first with {@code lineageOutput1} and then with
 * {@code lineageOutput2}. After each call the test waits for lineage to appear and asserts that
 * the dataset lineage and the outgoing field lineage of the source point at the sink name used
 * for that call. The second round restricts its queries to a time window starting after the
 * first round.
 *
 * @throws Exception propagated from deployment, the runs, or the lineage lookups
 */
@Test
public void testLineageWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(), StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
    String srcName = "lineageSource";
    String sinkName1 = "lineageOutput1";
    String sinkName2 = "lineageOutput2";
    // The sink plugin is given the macro "${output}" rather than a fixed dataset name.
    DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName))).addStage(new ETLStage("sink", MockSink.getPlugin("${output}"))).addStage(new ETLStage("identity", IdentityTransform.getPlugin())).addConnection("source", "identity").addConnection("identity", "sink").setCheckpointDir(checkpointDir).setBatchInterval("1s").build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
    // NOTE(review): presumably runs the pipeline with ${output} resolved to sinkName1 and returns that run's id — confirm against testLineageWithMacro.
    RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    LineageAdmin lineageAdmin = getLineageAdmin();
    // wait for the lineage to be populated
    // Presumably polls until the lambda returns true or the 10-second timeout elapses — confirm against Tasks.waitFor.
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
        // Ready once both the dataset lineage (two relations) and some outgoing field lineage are visible.
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
    // First round: the Spark program reads the source and writes the first sink.
    Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    // NOTE(review): the literal 2 is presumably the field count of the related dataset — confirm against FieldLineageRelations.
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName1), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
    // sleep for 1 second before starting the second run because dataset lineage is stored at one-second granularity
    TimeUnit.SECONDS.sleep(1);
    // Lower bound of the query window for the second round, taken after the sleep above.
    long startTimeMillis = System.currentTimeMillis();
    runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
    // wait for the lineage to be populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
        long end = System.currentTimeMillis();
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
    // Second round: same source, but the write relation now targets the second sink.
    expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName2), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) Schema(io.cdap.cdap.api.data.schema.Schema) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) Test(org.junit.Test)

Example 54 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by cdapio.

the class LineageCollapserTest method testCollapseCombinations.

/**
 * Checks the output of {@code LineageCollapser.collapseRelations} for every combination of two
 * collapse types, and for all three together, over three runs of the same data/program pair.
 */
@Test
public void testCollapseCombinations() {
    Set<Relation> relations = ImmutableSet.of(
        // First run
        new Relation(data1, service1, AccessType.READ, runId1),
        new Relation(data1, service1, AccessType.WRITE, runId1),
        new Relation(data1, service1, AccessType.READ, runId1),
        // Second run
        new Relation(data1, service1, AccessType.READ, runId2),
        new Relation(data1, service1, AccessType.WRITE, runId2),
        new Relation(data1, service1, AccessType.READ, runId2),
        new Relation(data1, service1, AccessType.UNKNOWN, runId2),
        // Third run
        new Relation(data1, service1, AccessType.READ, runId3),
        new Relation(data1, service1, AccessType.UNKNOWN, runId3));

    // Collapse on access type, run: a single entry with every access type and every run
    CollapsedRelation accessAndRun = new CollapsedRelation(
        data1, service1,
        toSet(AccessType.READ, AccessType.WRITE, AccessType.UNKNOWN),
        toSet(runId1, runId2, runId3),
        Collections.emptySet());
    Assert.assertEquals(
        toSet(accessAndRun),
        LineageCollapser.collapseRelations(relations, toSet(CollapseType.ACCESS, CollapseType.RUN)));

    // Collapse on access type, component: one entry per run, each with that run's access types
    CollapsedRelation firstRun = new CollapsedRelation(
        data1, service1, toSet(AccessType.READ, AccessType.WRITE), toSet(runId1), Collections.emptySet());
    CollapsedRelation secondRun = new CollapsedRelation(
        data1, service1,
        toSet(AccessType.READ, AccessType.WRITE, AccessType.UNKNOWN),
        toSet(runId2),
        Collections.emptySet());
    CollapsedRelation thirdRun = new CollapsedRelation(
        data1, service1, toSet(AccessType.READ, AccessType.UNKNOWN), toSet(runId3), Collections.emptySet());
    Assert.assertEquals(
        toSet(firstRun, secondRun, thirdRun),
        LineageCollapser.collapseRelations(relations, toSet(CollapseType.ACCESS, CollapseType.COMPONENT)));

    // Collapse on component, run: one entry per access type, each with the runs that used it
    CollapsedRelation reads = new CollapsedRelation(
        data1, service1, toSet(AccessType.READ), toSet(runId1, runId2, runId3), Collections.emptySet());
    CollapsedRelation writes = new CollapsedRelation(
        data1, service1, toSet(AccessType.WRITE), toSet(runId1, runId2), Collections.emptySet());
    CollapsedRelation unknowns = new CollapsedRelation(
        data1, service1, toSet(AccessType.UNKNOWN), toSet(runId2, runId3), Collections.emptySet());
    Assert.assertEquals(
        toSet(reads, writes, unknowns),
        LineageCollapser.collapseRelations(relations, toSet(CollapseType.COMPONENT, CollapseType.RUN)));

    // Collapse on all three: same single entry as collapsing on access type and run
    CollapsedRelation allThree = new CollapsedRelation(
        data1, service1,
        toSet(AccessType.READ, AccessType.WRITE, AccessType.UNKNOWN),
        toSet(runId1, runId2, runId3),
        Collections.emptySet());
    Assert.assertEquals(
        toSet(allThree),
        LineageCollapser.collapseRelations(
            relations, toSet(CollapseType.COMPONENT, CollapseType.RUN, CollapseType.ACCESS)));
}
Also used : CollapsedRelation(io.cdap.cdap.data2.metadata.lineage.CollapsedRelation) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) CollapsedRelation(io.cdap.cdap.data2.metadata.lineage.CollapsedRelation) Test(org.junit.Test)

Example 55 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by cdapio.

the class LineageAdminTest method testDirectCycleTwoRuns.

/**
 * Checks lineage when the same program reads a dataset in one run and writes it in another:
 *
 * <pre>
 *   D1 -> P1 (run1)
 *   D1 <- P1 (run2)
 * </pre>
 *
 * Both relations are expected in the computed lineage of the dataset.
 */
@Test
public void testDirectCycleTwoRuns() {
    TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
    LineageStoreReader reader = new DefaultLineageStoreReader(txRunner);
    LineageWriter writer = new BasicLineageWriter(txRunner);
    Store store = getInjector().getInstance(Store.class);
    LineageAdmin lineageAdmin = new LineageAdmin(reader, store);

    // Add accesses
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignored during assertions
    writer.addAccess(run1, dataset1, AccessType.READ);

    // Write is in a different run: same program coordinates as run1, but with run2's entity name
    ProgramRunId writeRun = new ProgramRunId(
        run1.getNamespace(),
        run1.getApplication(),
        run1.getParent().getType(),
        run1.getProgram(),
        run2.getEntityName());
    writer.addAccess(writeRun, dataset1, AccessType.WRITE);

    Relation readInRun1 = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1));
    Relation writeInRun2 = new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2));
    Lineage expectedLineage = new Lineage(ImmutableSet.of(readInRun1, writeInRun2));

    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
}
Also used : Relation(io.cdap.cdap.data2.metadata.lineage.Relation) BasicLineageWriter(io.cdap.cdap.data2.metadata.writer.BasicLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) TransactionRunner(io.cdap.cdap.spi.data.transaction.TransactionRunner) DefaultLineageStoreReader(io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) DefaultLineageStoreReader(io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader) Store(io.cdap.cdap.app.store.Store) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) BasicLineageWriter(io.cdap.cdap.data2.metadata.writer.BasicLineageWriter) Test(org.junit.Test)

Aggregations

Test (org.junit.Test): 45; Relation (io.cdap.cdap.data2.metadata.lineage.Relation): 38; Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage): 26; Relation (co.cask.cdap.data2.metadata.lineage.Relation): 20; Store (io.cdap.cdap.app.store.Store): 20; DefaultLineageStoreReader (io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader): 20; LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader): 20; BasicLineageWriter (io.cdap.cdap.data2.metadata.writer.BasicLineageWriter): 16; LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter): 16; TransactionRunner (io.cdap.cdap.spi.data.transaction.TransactionRunner): 16; RunId (org.apache.twill.api.RunId): 15; ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 14; DatasetId (io.cdap.cdap.proto.id.DatasetId): 14; ProgramId (io.cdap.cdap.proto.id.ProgramId): 14; ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId): 14; HashSet (java.util.HashSet): 12; Lineage (co.cask.cdap.data2.metadata.lineage.Lineage): 10; CollapsedRelation (io.cdap.cdap.data2.metadata.lineage.CollapsedRelation): 10; ProgramRunId (co.cask.cdap.proto.id.ProgramRunId): 9; ApplicationSpecification (io.cdap.cdap.api.app.ApplicationSpecification): 8