Search in sources :

Example 1 with LineageStore

use of co.cask.cdap.data2.metadata.lineage.LineageStore in project cdap by caskdata.

the class LineageAdminTest method testDirectCycleTwoRuns.

@Test
public void testDirectCycleTwoRuns() throws Exception {
    // Scenario: one dataset and one program, connected in both directions,
    // but each direction is recorded under a different run id.
    //
    //   D1 -> P1   (READ,  run1)
    //   D1 <- P1   (WRITE, run2)
    //
    LineageStore accessStore = new LineageStore(
        getTxExecFactory(),
        getDatasetFramework(),
        NamespaceId.DEFAULT.dataset("testDirectCycleTwoRuns"));
    Store programStore = getInjector().getInstance(Store.class);
    LineageAdmin admin = new LineageAdmin(
        accessStore,
        programStore,
        getInjector().getInstance(MetadataStore.class),
        new NoOpEntityExistenceVerifier());

    // Register the runs before recording any access against them.
    addRuns(programStore, run1, run2, run3, run4, run5);

    // Wall-clock timestamps are fine here: access time is ignored during assertions.
    accessStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);

    // The write belongs to a different run: same program coordinates as run1,
    // but carrying run2's run id.
    ProgramRunId writeRun = new ProgramRunId(
        run1.getNamespace(),
        run1.getApplication(),
        run1.getParent().getType(),
        run1.getProgram(),
        run2.getEntityName());
    accessStore.addAccess(writeRun, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);

    // Both accesses are expected back, each attributed to its own run.
    Relation readInRun1 = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1));
    Relation writeInRun2 = new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2), toSet(flowlet1));
    Lineage expected = new Lineage(ImmutableSet.of(readInRun1, writeInRun2));

    Assert.assertEquals(expected, admin.computeLineage(dataset1, 500, 20000, 100));
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) Test(org.junit.Test)

Example 2 with LineageStore

use of co.cask.cdap.data2.metadata.lineage.LineageStore in project cdap by caskdata.

the class LineageAdminTest method testBranchLoopLineage.

@Test
public void testBranchLoopLineage() throws Exception {
    // Graph under test (branches plus a loop from P5 back to D1):
    //
    //  |-------------------------------------|
    //  |                                     |
    //  |                                     |
    //  |    -> D4       -> D5 -> P3 -> D6 -> P5
    //  |    |           |                    ^
    //  V    |           |                    |
    // D1 -> P1 -> D2 -> P2 -> D3 ----------->|
    //       |     |           |
    //       |     |           |
    // S1 -->|     ---------------> P4 -> D7
    LineageStore accessStore = new LineageStore(
        getTxExecFactory(),
        getDatasetFramework(),
        NamespaceId.DEFAULT.dataset("testBranchLoopLineage"));
    Store programStore = getInjector().getInstance(Store.class);
    LineageAdmin admin = new LineageAdmin(
        accessStore,
        programStore,
        getInjector().getInstance(MetadataStore.class),
        new NoOpEntityExistenceVerifier());

    // Register the runs before recording any access against them.
    addRuns(programStore, run1, run2, run3, run4, run5);

    // Wall-clock timestamps are fine here: access time is ignored during assertions.
    // run1 (P1): reads S1 and D1, writes D2 and D4
    accessStore.addAccess(run1, stream1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    accessStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    accessStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    accessStore.addAccess(run1, dataset4, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    // run2 (P2): reads D2, writes D3 and D5
    accessStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
    accessStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    accessStore.addAccess(run2, dataset5, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    // run3 (P3): reads D5, writes D6
    accessStore.addAccess(run3, dataset5, AccessType.READ, System.currentTimeMillis());
    accessStore.addAccess(run3, dataset6, AccessType.WRITE, System.currentTimeMillis());
    // run4 (P4): reads D2 and D3, writes D7
    accessStore.addAccess(run4, dataset2, AccessType.READ, System.currentTimeMillis());
    accessStore.addAccess(run4, dataset3, AccessType.READ, System.currentTimeMillis());
    accessStore.addAccess(run4, dataset7, AccessType.WRITE, System.currentTimeMillis());
    // run5 (P5): reads D3 and D6, writes D1 (this closes the loop)
    accessStore.addAccess(run5, dataset3, AccessType.READ, System.currentTimeMillis());
    accessStore.addAccess(run5, dataset6, AccessType.READ, System.currentTimeMillis());
    accessStore.addAccess(run5, dataset1, AccessType.WRITE, System.currentTimeMillis());

    // One expected relation per recorded access; named so the sub-graphs below can reuse them.
    Relation p1ReadsS1 = new Relation(stream1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1));
    Relation p1ReadsD1 = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1));
    Relation p1WritesD2 = new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1));
    Relation p1WritesD4 = new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1));
    Relation p2ReadsD2 = new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2));
    Relation p2WritesD3 = new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2));
    Relation p2WritesD5 = new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2));
    Relation p3ReadsD5 = new Relation(dataset5, program3, AccessType.READ, twillRunId(run3), emptySet());
    Relation p3WritesD6 = new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3), emptySet());
    Relation p4ReadsD2 = new Relation(dataset2, program4, AccessType.READ, twillRunId(run4), emptySet());
    Relation p4ReadsD3 = new Relation(dataset3, program4, AccessType.READ, twillRunId(run4), emptySet());
    Relation p4WritesD7 = new Relation(dataset7, program4, AccessType.WRITE, twillRunId(run4), emptySet());
    Relation p5ReadsD3 = new Relation(dataset3, program5, AccessType.READ, twillRunId(run5), emptySet());
    Relation p5ReadsD6 = new Relation(dataset6, program5, AccessType.READ, twillRunId(run5), emptySet());
    Relation p5WritesD1 = new Relation(dataset1, program5, AccessType.WRITE, twillRunId(run5), emptySet());

    Lineage fullGraph = new Lineage(ImmutableSet.of(
        p1ReadsS1, p1ReadsD1, p1WritesD2, p1WritesD4,
        p2ReadsD2, p2WritesD3, p2WritesD5,
        p3ReadsD5, p3WritesD6,
        p4ReadsD2, p4ReadsD3, p4WritesD7,
        p5ReadsD3, p5ReadsD6, p5WritesD1));

    // With 100 levels the whole graph is expected no matter which entity is the starting point.
    // Starting from D1
    Assert.assertEquals(fullGraph, admin.computeLineage(dataset1, 500, 20000, 100));
    // Starting from D5
    Assert.assertEquals(fullGraph, admin.computeLineage(dataset5, 500, 20000, 100));
    // Starting from D7
    Assert.assertEquals(fullGraph, admin.computeLineage(dataset7, 500, 20000, 100));
    // Starting from S1
    Assert.assertEquals(fullGraph, admin.computeLineage(stream1, 500, 20000, 100));

    // D5 limited to one level (last argument = 1):
    //                   -> D5 -> P3 -> D6
    //                   |
    //                   |
    //             D2 -> P2 -> D3
    Lineage aroundD5 = admin.computeLineage(dataset5, 500, 20000, 1);
    Assert.assertEquals(
        ImmutableSet.of(p2ReadsD2, p2WritesD3, p2WritesD5, p3ReadsD5, p3WritesD6),
        aroundD5.getRelations());

    // S1 limited to one level (last argument = 1):
    //
    //       -> D4
    //       |
    //       |
    // D1 -> P1 -> D2
    //       |
    //       |
    // S1 -->|
    Lineage aroundS1 = admin.computeLineage(stream1, 500, 20000, 1);
    Assert.assertEquals(
        ImmutableSet.of(p1ReadsS1, p1ReadsD1, p1WritesD2, p1WritesD4),
        aroundS1.getRelations());
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Example 3 with LineageStore

use of co.cask.cdap.data2.metadata.lineage.LineageStore in project cdap by caskdata.

the class LineageAdminTest method testSimpleLoopLineage.

@Test
public void testSimpleLoopLineage() throws Exception {
    // Graph under test (P2 writes back into D1, forming a loop):
    //
    //   D1 -> P1 -> D2 -> P2 -> D3 -> P3 -> D4
    //   |                 |
    //   |                 V
    //   |<-----------------
    //
    LineageStore accessStore = new LineageStore(
        getTxExecFactory(),
        getDatasetFramework(),
        NamespaceId.DEFAULT.dataset("testSimpleLoopLineage"));
    Store programStore = getInjector().getInstance(Store.class);
    LineageAdmin admin = new LineageAdmin(
        accessStore,
        programStore,
        getInjector().getInstance(MetadataStore.class),
        new NoOpEntityExistenceVerifier());

    // Register the runs before recording any access against them.
    addRuns(programStore, run1, run2, run3, run4, run5);

    // Wall-clock timestamps are fine here: access time is ignored during assertions.
    // run1 (P1): reads D1, writes D2
    accessStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    accessStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    // run2 (P2): reads D2, writes D1 (the loop) and D3
    accessStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
    accessStore.addAccess(run2, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    accessStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    // run3 (P3): reads D3, writes D4
    accessStore.addAccess(run3, dataset3, AccessType.READ, System.currentTimeMillis());
    accessStore.addAccess(run3, dataset4, AccessType.WRITE, System.currentTimeMillis());

    // One expected relation per recorded access; named so the one-level check can reuse them.
    Relation p1WritesD2 = new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1));
    Relation p1ReadsD1 = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1));
    Relation p2WritesD1 = new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2));
    Relation p2ReadsD2 = new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2));
    Relation p2WritesD3 = new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2));
    Relation p3WritesD4 = new Relation(dataset4, program3, AccessType.WRITE, twillRunId(run3), emptySet());
    Relation p3ReadsD3 = new Relation(dataset3, program3, AccessType.READ, twillRunId(run3), emptySet());

    Lineage fullGraph = new Lineage(ImmutableSet.of(
        p1WritesD2, p1ReadsD1,
        p2WritesD1, p2ReadsD2, p2WritesD3,
        p3WritesD4, p3ReadsD3));

    // Starting from D1
    Assert.assertEquals(fullGraph, admin.computeLineage(dataset1, 500, 20000, 100));
    // Starting from D2
    Assert.assertEquals(fullGraph, admin.computeLineage(dataset2, 500, 20000, 100));

    // D1 limited to one level (last argument = 1); P3 and D4 drop out:
    //
    //   D1 -> P1 -> D2 -> P2 -> D3
    //   |                 |
    //   |                 V
    //   |<-----------------
    //
    Lineage aroundD1 = admin.computeLineage(dataset1, 500, 20000, 1);
    Assert.assertEquals(
        ImmutableSet.of(p1WritesD2, p1ReadsD1, p2WritesD1, p2ReadsD2, p2WritesD3),
        aroundD1.getRelations());
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Example 4 with LineageStore

use of co.cask.cdap.data2.metadata.lineage.LineageStore in project cdap by caskdata.

the class BasicLineageWriterTest method testWrites.

@Test
public void testWrites() throws Exception {
    // Verifies that BasicLineageWriter records an access once per run: a repeated
    // addAccess for the same run must not refresh the stored access time, while an
    // access for a different run is written with a fresh time.
    Injector injector = getInjector();
    MetadataStore metadataStore = injector.getInstance(MetadataStore.class);
    LineageStore lineageStore = injector.getInstance(LineageStore.class);
    LineageWriter lineageWriter = new BasicLineageWriter(lineageStore);
    // Define entities
    ProgramId program = new ProgramId(NamespaceId.DEFAULT.getNamespace(), "app", ProgramType.FLOW, "flow");
    StreamId stream = new StreamId(NamespaceId.DEFAULT.getNamespace(), "stream");
    ProgramRunId run1 = new ProgramRunId(program.getNamespace(), program.getApplication(), program.getType(), program.getEntityName(), RunIds.generate(10000).getId());
    ProgramRunId run2 = new ProgramRunId(program.getNamespace(), program.getApplication(), program.getType(), program.getEntityName(), RunIds.generate(20000).getId());
    // Tag stream
    metadataStore.addTags(MetadataScope.USER, stream, "stag1", "stag2");
    // Write access for run1
    lineageWriter.addAccess(run1, stream, AccessType.READ);
    Assert.assertEquals(ImmutableSet.of(program, stream), lineageStore.getEntitiesForRun(run1));
    // Let the clock move past run1's access time before taking the marker. Without this,
    // the first access and the marker can land in the same millisecond, and the strict
    // '<' assertion below would then fail even though nothing is wrong.
    TimeUnit.MILLISECONDS.sleep(1);
    // Record time to verify duplicate writes.
    long beforeSecondTag = System.currentTimeMillis();
    // Wait for next millisecond, since access time is stored in milliseconds.
    // This separates the marker from any access written after it.
    TimeUnit.MILLISECONDS.sleep(1);
    // Add another tag to stream
    metadataStore.addTags(MetadataScope.USER, stream, "stag3");
    // Write access for run1 again
    lineageWriter.addAccess(run1, stream, AccessType.READ);
    // The write should be no-op, and access time for run1 should not be updated
    Assert.assertTrue(lineageStore.getAccessTimesForRun(run1).get(0) < beforeSecondTag);
    // However, you can write access for another run
    lineageWriter.addAccess(run2, stream, AccessType.READ);
    // Assert new access time is written
    Assert.assertTrue(lineageStore.getAccessTimesForRun(run2).get(0) >= beforeSecondTag);
}
Also used : DefaultMetadataStore(co.cask.cdap.data2.metadata.store.DefaultMetadataStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) StreamId(co.cask.cdap.proto.id.StreamId) Injector(com.google.inject.Injector) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) ProgramId(co.cask.cdap.proto.id.ProgramId) Test(org.junit.Test)

Example 5 with LineageStore

use of co.cask.cdap.data2.metadata.lineage.LineageStore in project cdap by caskdata.

the class LineageAdminTest method testDirectCycle.

@Test
public void testDirectCycle() throws Exception {
    // Scenario: a single run of one program both reads and writes the same dataset.
    //
    //   D1 <-> P1
    //
    LineageStore accessStore = new LineageStore(
        getTxExecFactory(),
        getDatasetFramework(),
        NamespaceId.DEFAULT.dataset("testDirectCycle"));
    Store programStore = getInjector().getInstance(Store.class);
    LineageAdmin admin = new LineageAdmin(
        accessStore,
        programStore,
        getInjector().getInstance(MetadataStore.class),
        new NoOpEntityExistenceVerifier());

    // Register the runs before recording any access against them.
    addRuns(programStore, run1, run2, run3, run4, run5);

    // Wall-clock timestamps are fine here: access time is ignored during assertions.
    accessStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    accessStore.addAccess(run1, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);

    // Both directions are expected back, attributed to the same run.
    Relation writeInRun1 = new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1));
    Relation readInRun1 = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1));
    Lineage expected = new Lineage(ImmutableSet.of(writeInRun1, readInRun1));

    Assert.assertEquals(expected, admin.computeLineage(dataset1, 500, 20000, 100));
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Aggregations

LineageStore (co.cask.cdap.data2.metadata.lineage.LineageStore)9 MetadataStore (co.cask.cdap.data2.metadata.store.MetadataStore)8 Test (org.junit.Test)8 Store (co.cask.cdap.app.store.Store)7 Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)7 Relation (co.cask.cdap.data2.metadata.lineage.Relation)7 ProgramRunId (co.cask.cdap.proto.id.ProgramRunId)4 DatasetId (co.cask.cdap.proto.id.DatasetId)2 NamespaceId (co.cask.cdap.proto.id.NamespaceId)2 MetadataRecord (co.cask.cdap.proto.metadata.MetadataRecord)2 Injector (com.google.inject.Injector)2 DefaultMetadataStore (co.cask.cdap.data2.metadata.store.DefaultMetadataStore)1 ProgramId (co.cask.cdap.proto.id.ProgramId)1 StreamId (co.cask.cdap.proto.id.StreamId)1 BeforeClass (org.junit.BeforeClass)1