Search in sources :

Example 1 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class RemoteLineageWriterTest method testSimpleCase.

@Test
public void testSimpleCase() {
    long now = System.currentTimeMillis();
    ApplicationId appId = NamespaceId.DEFAULT.app("test_app");
    ProgramId flowId = appId.flow("test_flow");
    ProgramRunId runId = flowId.run(RunIds.generate(now).getId());
    RunId twillRunId = RunIds.fromString(runId.getRun());
    DatasetId datasetId = NamespaceId.DEFAULT.dataset("test_dataset");
    StreamId streamId = NamespaceId.DEFAULT.stream("test_stream");
    Set<Relation> expectedRelations = new HashSet<>();
    // test null serialization
    remoteLineageWriter.addAccess(runId, datasetId, AccessType.READ, null);
    expectedRelations.add(new Relation(datasetId, flowId, AccessType.READ, twillRunId));
    Assert.assertEquals(ImmutableSet.of(flowId, datasetId), lineageStore.getEntitiesForRun(runId));
    Assert.assertEquals(expectedRelations, lineageStore.getRelations(flowId, now, now + 1, Predicates.<Relation>alwaysTrue()));
    remoteLineageWriter.addAccess(runId, streamId, AccessType.READ);
    expectedRelations.add(new Relation(streamId, flowId, AccessType.READ, twillRunId));
    Assert.assertEquals(expectedRelations, lineageStore.getRelations(flowId, now, now + 1, Predicates.<Relation>alwaysTrue()));
    remoteLineageWriter.addAccess(runId, streamId, AccessType.WRITE);
    expectedRelations.add(new Relation(streamId, flowId, AccessType.WRITE, twillRunId));
    Assert.assertEquals(expectedRelations, lineageStore.getRelations(flowId, now, now + 1, Predicates.<Relation>alwaysTrue()));
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) ProgramId(co.cask.cdap.proto.id.ProgramId) RunId(org.apache.twill.api.RunId) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) DatasetId(co.cask.cdap.proto.id.DatasetId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 2 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdminTest method testSimpleLoopLineage.

@Test
public void testSimpleLoopLineage() throws Exception {
    // Lineage for D1 -> P1 -> D2 -> P2 -> D3 -> P3 -> D4
    // |                 |
    // |                 V
    // |<-----------------
    // 
    LineageStore lineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testSimpleLoopLineage"));
    Store store = getInjector().getInstance(Store.class);
    MetadataStore metadataStore = getInjector().getInstance(MetadataStore.class);
    LineageAdmin lineageAdmin = new LineageAdmin(lineageStore, store, metadataStore, new NoOpEntityExistenceVerifier());
    // Add access
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignore during assertions
    lineageStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run3, dataset3, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run3, dataset4, AccessType.WRITE, System.currentTimeMillis());
    Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset4, program3, AccessType.WRITE, twillRunId(run3), emptySet()), new Relation(dataset3, program3, AccessType.READ, twillRunId(run3), emptySet())));
    // Lineage for D1
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
    // Lineage for D2
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset2, 500, 20000, 100));
    // Lineage for D1 for one level D1 -> P1 -> D2 -> P2 -> D3
    // |                 |
    // |                 V
    // |<-----------------
    // 
    Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset1, 500, 20000, 1);
    Assert.assertEquals(ImmutableSet.of(new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2))), oneLevelLineage.getRelations());
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Example 3 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdminTest method testDirectCycleTwoRuns.

@Test
public void testDirectCycleTwoRuns() throws Exception {
    // Lineage for:
    // 
    // D1 -> P1 (run1)
    // 
    // D1 <- P1 (run2)
    // 
    LineageStore lineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testDirectCycleTwoRuns"));
    Store store = getInjector().getInstance(Store.class);
    MetadataStore metadataStore = getInjector().getInstance(MetadataStore.class);
    LineageAdmin lineageAdmin = new LineageAdmin(lineageStore, store, metadataStore, new NoOpEntityExistenceVerifier());
    // Add accesses
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignore during assertions
    lineageStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    // Write is in a different run
    lineageStore.addAccess(new ProgramRunId(run1.getNamespace(), run1.getApplication(), run1.getParent().getType(), run1.getProgram(), run2.getEntityName()), dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2), toSet(flowlet1))));
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) Test(org.junit.Test)

Example 4 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdminTest method testBranchLoopLineage.

@Test
public void testBranchLoopLineage() throws Exception {
    // Lineage for:
    // 
    // |-------------------------------------|
    // |                                     |
    // |                                     |
    // |    -> D4       -> D5 -> P3 -> D6 -> P5
    // |    |           |                    ^
    // V    |           |                    |
    // D1 -> P1 -> D2 -> P2 -> D3 ----------->|
    // |     |           |
    // |     |           |
    // S1 -->|     ---------------> P4 -> D7
    LineageStore lineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testBranchLoopLineage"));
    Store store = getInjector().getInstance(Store.class);
    MetadataStore metadataStore = getInjector().getInstance(MetadataStore.class);
    LineageAdmin lineageAdmin = new LineageAdmin(lineageStore, store, metadataStore, new NoOpEntityExistenceVerifier());
    // Add accesses
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignore during assertions
    lineageStore.addAccess(run1, stream1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset4, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset5, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run3, dataset5, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run3, dataset6, AccessType.WRITE, System.currentTimeMillis());
    lineageStore.addAccess(run4, dataset2, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run4, dataset3, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run4, dataset7, AccessType.WRITE, System.currentTimeMillis());
    lineageStore.addAccess(run5, dataset3, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run5, dataset6, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run5, dataset1, AccessType.WRITE, System.currentTimeMillis());
    Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(stream1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program3, AccessType.READ, twillRunId(run3), emptySet()), new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3), emptySet()), new Relation(dataset2, program4, AccessType.READ, twillRunId(run4), emptySet()), new Relation(dataset3, program4, AccessType.READ, twillRunId(run4), emptySet()), new Relation(dataset7, program4, AccessType.WRITE, twillRunId(run4), emptySet()), new Relation(dataset3, program5, AccessType.READ, twillRunId(run5), emptySet()), new Relation(dataset6, program5, AccessType.READ, twillRunId(run5), emptySet()), new Relation(dataset1, program5, AccessType.WRITE, twillRunId(run5), emptySet())));
    // Lineage for D1
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
    // Lineage for D5
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset5, 500, 20000, 100));
    // Lineage for D7
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset7, 500, 20000, 100));
    // Lineage for S1
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(stream1, 500, 20000, 100));
    // Lineage for D5 for one level
    // -> D5 -> P3 -> D6
    // |
    // |
    // D2 -> P2 -> D3
    Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset5, 500, 20000, 1);
    Assert.assertEquals(ImmutableSet.of(new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program3, AccessType.READ, twillRunId(run3), emptySet()), new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3), emptySet())), oneLevelLineage.getRelations());
    // Lineage for S1 for one level
    // 
    // -> D4
    // |
    // |
    // D1 -> P1 -> D2
    // |
    // |
    // S1 -->|
    oneLevelLineage = lineageAdmin.computeLineage(stream1, 500, 20000, 1);
    Assert.assertEquals(ImmutableSet.of(new Relation(stream1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1))), oneLevelLineage.getRelations());
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Example 5 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageCollapserTest method testCollapseComponent.

@Test
public void testCollapseComponent() throws Exception {
    Set<Relation> relations = ImmutableSet.of(new Relation(data1, flow1, AccessType.READ, runId1, ImmutableSet.of(flowlet11)), new Relation(data1, flow1, AccessType.WRITE, runId1, ImmutableSet.of(flowlet11)), new Relation(data1, flow1, AccessType.READ, runId1, ImmutableSet.of(flowlet12)));
    // Collapse on component
    Assert.assertEquals(toSet(new CollapsedRelation(data1, flow1, toSet(AccessType.READ), toSet(runId1), toSet(flowlet11, flowlet12)), new CollapsedRelation(data1, flow1, toSet(AccessType.WRITE), toSet(runId1), toSet(flowlet11))), LineageCollapser.collapseRelations(relations, ImmutableSet.of(CollapseType.COMPONENT)));
}
Also used : CollapsedRelation(co.cask.cdap.data2.metadata.lineage.CollapsedRelation) CollapsedRelation(co.cask.cdap.data2.metadata.lineage.CollapsedRelation) Relation(co.cask.cdap.data2.metadata.lineage.Relation) Test(org.junit.Test)

Aggregations

Relation (co.cask.cdap.data2.metadata.lineage.Relation)20 Test (org.junit.Test)15 Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)10 ProgramRunId (co.cask.cdap.proto.id.ProgramRunId)10 Store (co.cask.cdap.app.store.Store)7 LineageStore (co.cask.cdap.data2.metadata.lineage.LineageStore)7 MetadataStore (co.cask.cdap.data2.metadata.store.MetadataStore)7 DatasetId (co.cask.cdap.proto.id.DatasetId)7 ProgramId (co.cask.cdap.proto.id.ProgramId)6 RunId (org.apache.twill.api.RunId)6 CollapsedRelation (co.cask.cdap.data2.metadata.lineage.CollapsedRelation)5 StreamId (co.cask.cdap.proto.id.StreamId)5 MetadataRecord (co.cask.cdap.common.metadata.MetadataRecord)4 NamespaceId (co.cask.cdap.proto.id.NamespaceId)4 HashSet (java.util.HashSet)4 ApplicationId (co.cask.cdap.proto.id.ApplicationId)3 NamespacedEntityId (co.cask.cdap.proto.id.NamespacedEntityId)3 HashMap (java.util.HashMap)3 AllProgramsApp (co.cask.cdap.client.app.AllProgramsApp)2 RunRecordMeta (co.cask.cdap.internal.app.store.RunRecordMeta)2