Search in sources :

Example 1 with Lineage

use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageAdminTest method testSimpleLoopLineage.

@Test
public void testSimpleLoopLineage() throws Exception {
    // Lineage for D1 -> P1 -> D2 -> P2 -> D3 -> P3 -> D4
    // |                 |
    // |                 V
    // |<-----------------
    // 
    LineageStore lineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testSimpleLoopLineage"));
    Store store = getInjector().getInstance(Store.class);
    MetadataStore metadataStore = getInjector().getInstance(MetadataStore.class);
    LineageAdmin lineageAdmin = new LineageAdmin(lineageStore, store, metadataStore, new NoOpEntityExistenceVerifier());
    // Add access
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignore during assertions
    lineageStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run3, dataset3, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run3, dataset4, AccessType.WRITE, System.currentTimeMillis());
    Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset4, program3, AccessType.WRITE, twillRunId(run3), emptySet()), new Relation(dataset3, program3, AccessType.READ, twillRunId(run3), emptySet())));
    // Lineage for D1
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
    // Lineage for D2
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset2, 500, 20000, 100));
    // Lineage for D1 for one level D1 -> P1 -> D2 -> P2 -> D3
    // |                 |
    // |                 V
    // |<-----------------
    // 
    Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset1, 500, 20000, 1);
    Assert.assertEquals(ImmutableSet.of(new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2))), oneLevelLineage.getRelations());
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Example 2 with Lineage

use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageAdminTest method testDirectCycleTwoRuns.

@Test
public void testDirectCycleTwoRuns() throws Exception {
    // Lineage for:
    // 
    // D1 -> P1 (run1)
    // 
    // D1 <- P1 (run2)
    // 
    LineageStore lineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testDirectCycleTwoRuns"));
    Store store = getInjector().getInstance(Store.class);
    MetadataStore metadataStore = getInjector().getInstance(MetadataStore.class);
    LineageAdmin lineageAdmin = new LineageAdmin(lineageStore, store, metadataStore, new NoOpEntityExistenceVerifier());
    // Add accesses
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignore during assertions
    lineageStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    // Write is in a different run
    lineageStore.addAccess(new ProgramRunId(run1.getNamespace(), run1.getApplication(), run1.getParent().getType(), run1.getProgram(), run2.getEntityName()), dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2), toSet(flowlet1))));
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) Test(org.junit.Test)

Example 3 with Lineage

use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageAdminTest method testBranchLoopLineage.

@Test
public void testBranchLoopLineage() throws Exception {
    // Lineage for:
    // 
    // |-------------------------------------|
    // |                                     |
    // |                                     |
    // |    -> D4       -> D5 -> P3 -> D6 -> P5
    // |    |           |                    ^
    // V    |           |                    |
    // D1 -> P1 -> D2 -> P2 -> D3 ----------->|
    // |     |           |
    // |     |           |
    // S1 -->|     ---------------> P4 -> D7
    LineageStore lineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testBranchLoopLineage"));
    Store store = getInjector().getInstance(Store.class);
    MetadataStore metadataStore = getInjector().getInstance(MetadataStore.class);
    LineageAdmin lineageAdmin = new LineageAdmin(lineageStore, store, metadataStore, new NoOpEntityExistenceVerifier());
    // Add accesses
    addRuns(store, run1, run2, run3, run4, run5);
    // It is okay to use current time here since access time is ignore during assertions
    lineageStore.addAccess(run1, stream1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run1, dataset4, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
    lineageStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run2, dataset5, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
    lineageStore.addAccess(run3, dataset5, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run3, dataset6, AccessType.WRITE, System.currentTimeMillis());
    lineageStore.addAccess(run4, dataset2, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run4, dataset3, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run4, dataset7, AccessType.WRITE, System.currentTimeMillis());
    lineageStore.addAccess(run5, dataset3, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run5, dataset6, AccessType.READ, System.currentTimeMillis());
    lineageStore.addAccess(run5, dataset1, AccessType.WRITE, System.currentTimeMillis());
    Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(stream1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program3, AccessType.READ, twillRunId(run3), emptySet()), new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3), emptySet()), new Relation(dataset2, program4, AccessType.READ, twillRunId(run4), emptySet()), new Relation(dataset3, program4, AccessType.READ, twillRunId(run4), emptySet()), new Relation(dataset7, program4, AccessType.WRITE, twillRunId(run4), emptySet()), new Relation(dataset3, program5, AccessType.READ, twillRunId(run5), emptySet()), new Relation(dataset6, program5, AccessType.READ, twillRunId(run5), emptySet()), new Relation(dataset1, program5, AccessType.WRITE, twillRunId(run5), emptySet())));
    // Lineage for D1
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
    // Lineage for D5
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset5, 500, 20000, 100));
    // Lineage for D7
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset7, 500, 20000, 100));
    // Lineage for S1
    Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(stream1, 500, 20000, 100));
    // Lineage for D5 for one level
    // -> D5 -> P3 -> D6
    // |
    // |
    // D2 -> P2 -> D3
    Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset5, 500, 20000, 1);
    Assert.assertEquals(ImmutableSet.of(new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)), new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)), new Relation(dataset5, program3, AccessType.READ, twillRunId(run3), emptySet()), new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3), emptySet())), oneLevelLineage.getRelations());
    // Lineage for S1 for one level
    // 
    // -> D4
    // |
    // |
    // D1 -> P1 -> D2
    // |
    // |
    // S1 -->|
    oneLevelLineage = lineageAdmin.computeLineage(stream1, 500, 20000, 1);
    Assert.assertEquals(ImmutableSet.of(new Relation(stream1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)), new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)), new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1))), oneLevelLineage.getRelations());
}
Also used : MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Store(co.cask.cdap.app.store.Store) LineageStore(co.cask.cdap.data2.metadata.lineage.LineageStore) MetadataStore(co.cask.cdap.data2.metadata.store.MetadataStore) Test(org.junit.Test)

Example 4 with Lineage

use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageHandler method streamLineage.

@GET
@Path("/namespaces/{namespace-id}/streams/{stream-id}/lineage")
public void streamLineage(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("stream-id") String stream, @QueryParam("start") String startStr, @QueryParam("end") String endStr, @QueryParam("levels") @DefaultValue("10") int levels, @QueryParam("collapse") List<String> collapse, @QueryParam("rollup") String rollup) throws Exception {
    checkLevels(levels);
    TimeRange range = parseRange(startStr, endStr);
    StreamId streamId = new StreamId(namespaceId, stream);
    Lineage lineage = lineageAdmin.computeLineage(streamId, range.getStart(), range.getEnd(), levels, rollup);
    responder.sendJson(HttpResponseStatus.OK, GSON.toJson(LineageSerializer.toLineageRecord(TimeUnit.MILLISECONDS.toSeconds(range.getStart()), TimeUnit.MILLISECONDS.toSeconds(range.getEnd()), lineage, getCollapseTypes(collapse)), LineageRecord.class));
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Example 5 with Lineage

use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageTestRun method testFlowLineage.

@Test
public void testFlowLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testFlowLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata to applicaton
        ImmutableMap<String, String> appProperties = ImmutableMap.of("app-key1", "app-value1");
        addProperties(app, appProperties);
        Assert.assertEquals(appProperties, getProperties(app, MetadataScope.USER));
        ImmutableSet<String> appTags = ImmutableSet.of("app-tag1");
        addTags(app, appTags);
        Assert.assertEquals(appTags, getTags(app, MetadataScope.USER));
        // Add metadata to flow
        ImmutableMap<String, String> flowProperties = ImmutableMap.of("flow-key1", "flow-value1");
        addProperties(flow, flowProperties);
        Assert.assertEquals(flowProperties, getProperties(flow, MetadataScope.USER));
        ImmutableSet<String> flowTags = ImmutableSet.of("flow-tag1", "flow-tag2");
        addTags(flow, flowTags);
        Assert.assertEquals(flowTags, getTags(flow, MetadataScope.USER));
        // Add metadata to dataset
        ImmutableMap<String, String> dataProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, dataProperties);
        Assert.assertEquals(dataProperties, getProperties(dataset, MetadataScope.USER));
        ImmutableSet<String> dataTags = ImmutableSet.of("data-tag1", "data-tag2");
        addTags(dataset, dataTags);
        Assert.assertEquals(dataTags, getTags(dataset, MetadataScope.USER));
        // Add metadata to stream
        ImmutableMap<String, String> streamProperties = ImmutableMap.of("stream-key1", "stream-value1");
        addProperties(stream, streamProperties);
        Assert.assertEquals(streamProperties, getProperties(stream, MetadataScope.USER));
        ImmutableSet<String> streamTags = ImmutableSet.of("stream-tag1", "stream-tag2");
        addTags(stream, streamTags);
        Assert.assertEquals(streamTags, getTags(stream, MetadataScope.USER));
        long startTime = TimeMathParser.nowInSeconds();
        RunId flowRunId = runAndWait(flow);
        // Wait for few seconds so that the stop time secs is more than start time secs.
        TimeUnit.SECONDS.sleep(2);
        waitForStop(flow, true);
        long stopTime = TimeMathParser.nowInSeconds();
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, startTime, stopTime, 10);
        LineageRecord expected = LineageSerializer.toLineageRecord(startTime, stopTime, new Lineage(ImmutableSet.of(new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))))), Collections.<CollapseType>emptySet());
        Assert.assertEquals(expected, lineage);
        // Fetch dataset lineage with time strings
        lineage = fetchLineage(dataset, "now-1h", "now+1h", 10);
        Assert.assertEquals(expected.getRelations(), lineage.getRelations());
        // Fetch stream lineage
        lineage = fetchLineage(stream, startTime, stopTime, 10);
        // same as dataset's lineage
        Assert.assertEquals(expected, lineage);
        // Fetch stream lineage with time strings
        lineage = fetchLineage(stream, "now-1h", "now+1h", 10);
        // same as dataset's lineage
        Assert.assertEquals(expected.getRelations(), lineage.getRelations());
        // Assert metadata
        // Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, appProperties, appTags), new MetadataRecord(flow, MetadataScope.USER, flowProperties, flowTags), new MetadataRecord(dataset, MetadataScope.USER, dataProperties, dataTags), new MetadataRecord(stream, MetadataScope.USER, streamProperties, streamTags)), fetchRunMetadata(flow.run(flowRunId.getId())));
        // Assert with a time range after the flow run should return no results
        long laterStartTime = stopTime + 1000;
        long laterEndTime = stopTime + 5000;
        // Fetch stream lineage
        lineage = fetchLineage(stream, laterStartTime, laterEndTime, 10);
        Assert.assertEquals(LineageSerializer.toLineageRecord(laterStartTime, laterEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
        // Assert with a time range before the flow run should return no results
        long earlierStartTime = startTime - 5000;
        long earlierEndTime = startTime - 1000;
        // Fetch stream lineage
        lineage = fetchLineage(stream, earlierStartTime, earlierEndTime, 10);
        Assert.assertEquals(LineageSerializer.toLineageRecord(earlierStartTime, earlierEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
        // Test bad time ranges
        fetchLineage(dataset, "sometime", "sometime", 10, BadRequestException.class);
        fetchLineage(dataset, "now+1h", "now-1h", 10, BadRequestException.class);
        // Test non-existent run
        assertRunMetadataNotFound(flow.run(RunIds.generate(1000).getId()));
    } finally {
        namespaceClient.delete(namespace);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) CollapseType(co.cask.cdap.proto.metadata.lineage.CollapseType) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(co.cask.cdap.client.app.AllProgramsApp) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) NamespaceId(co.cask.cdap.proto.id.NamespaceId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) MetadataRecord(co.cask.cdap.common.metadata.MetadataRecord) Test(org.junit.Test)

Aggregations

Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)12 Relation (co.cask.cdap.data2.metadata.lineage.Relation)11 Test (org.junit.Test)9 Store (co.cask.cdap.app.store.Store)7 LineageStore (co.cask.cdap.data2.metadata.lineage.LineageStore)7 MetadataStore (co.cask.cdap.data2.metadata.store.MetadataStore)7 DatasetId (co.cask.cdap.proto.id.DatasetId)5 ProgramRunId (co.cask.cdap.proto.id.ProgramRunId)5 MetadataRecord (co.cask.cdap.common.metadata.MetadataRecord)4 NamespaceId (co.cask.cdap.proto.id.NamespaceId)4 LineageRecord (co.cask.cdap.proto.metadata.lineage.LineageRecord)4 RunId (org.apache.twill.api.RunId)4 ProgramId (co.cask.cdap.proto.id.ProgramId)3 StreamId (co.cask.cdap.proto.id.StreamId)3 AllProgramsApp (co.cask.cdap.client.app.AllProgramsApp)2 NamespaceMeta (co.cask.cdap.proto.NamespaceMeta)2 ApplicationId (co.cask.cdap.proto.id.ApplicationId)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)2 DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException)1 MetricsCollectionService (co.cask.cdap.api.metrics.MetricsCollectionService)1