Search in sources:

Example 1 with LineageRecord

use of co.cask.cdap.proto.metadata.lineage.LineageRecord in project cdap by caskdata.

The method testAllProgramsLineage of the class LineageTestRun.

/**
 * Deploys {@code AllProgramsApp} into a dedicated namespace, runs each of its programs
 * (flow, two MapReduce jobs, Spark, workflow, service and worker), and then verifies:
 * <ul>
 *   <li>the lineage fetched for the dataset and for the stream, both requested with
 *       {@code CollapseType.ACCESS}, equals the same expected record;</li>
 *   <li>the run metadata fetched for the flow, worker and Spark runs contains the user-scope
 *       tags and properties added before the programs were started.</li>
 * </ul>
 * The namespace is deleted in a {@code finally} block, whether or not the assertions pass.
 */
@Test
public void testAllProgramsLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
    ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
    ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
    ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
    ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
    ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
    ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
    DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
    StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add user metadata to the spark program, the worker and the dataset, reading each back
        // immediately to confirm it was stored
        ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
        addTags(spark, sparkTags);
        Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));
        ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
        addTags(worker, workerTags);
        Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));
        ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, datasetProperties);
        Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));
        // Start all programs, keeping the run id of each run that appears in the expected lineage
        RunId flowRunId = runAndWait(flow);
        RunId mrRunId = runAndWait(mapreduce);
        RunId mrRunId2 = runAndWait(mapreduce2);
        RunId sparkRunId = runAndWait(spark);
        runAndWait(workflow);
        // NOTE(review): getRunId(mapreduce, mrRunId) presumably returns the run of the same MapReduce
        // program that was launched by the workflow, i.e. a run other than mrRunId - confirm against
        // the helper
        RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
        RunId serviceRunId = runAndWait(service);
        // Worker makes a call to service to make it access datasets,
        // hence need to make sure service starts before worker, and stops after it.
        RunId workerRunId = runAndWait(worker);
        // Wait for programs to finish
        // NOTE(review): the boolean passed to waitForStop presumably selects whether the program is
        // stopped explicitly (flow, service) or left to complete on its own - confirm against the helper
        waitForStop(flow, true);
        waitForStop(mapreduce, false);
        waitForStop(mapreduce2, false);
        waitForStop(spark, false);
        waitForStop(workflow, false);
        waitForStop(worker, false);
        waitForStop(service, true);
        // Both values are in seconds; the lineage is queried for one hour either side of "now"
        long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
        long oneHour = TimeUnit.HOURS.toSeconds(1);
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // dataset is accessed by all programs
        // The expected record is built from the raw relations with the same time range and the same
        // collapse types as the request, so it can be compared with the fetched record directly.
        LineageRecord expected = LineageSerializer.toLineageRecord(now - oneHour, now + oneHour, new Lineage(ImmutableSet.of(// Dataset access
        new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, toSet(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId), new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2), new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2), new Relation(dataset, spark, AccessType.READ, sparkRunId), new Relation(dataset2, spark, AccessType.WRITE, sparkRunId), new Relation(dataset3, spark, AccessType.READ, sparkRunId), new Relation(dataset3, spark, AccessType.WRITE, sparkRunId), new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId), new Relation(dataset, service, AccessType.WRITE, serviceRunId), new Relation(dataset, worker, AccessType.WRITE, workerRunId), // Stream access
        new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, mapreduce, AccessType.READ, mrRunId), new Relation(stream, spark, AccessType.READ, sparkRunId), new Relation(stream, mapreduce, AccessType.READ, workflowMrRunId), new Relation(stream, worker, AccessType.WRITE, workerRunId))), toSet(CollapseType.ACCESS));
        Assert.assertEquals(expected, lineage);
        // Fetch stream lineage
        lineage = fetchLineage(stream, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // stream too is accessed by all programs
        Assert.assertEquals(expected, lineage);
        // Assert metadata
        // Each run's metadata is expected to contain one USER-scope record for the application, the
        // program itself and every dataset/stream listed below; only the entities that were given
        // tags or properties above carry non-empty values.
        // Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(flow, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(flow.run(flowRunId.getId())));
        // Id.Worker needs conversion to Id.Program JIRA - CDAP-3658
        ProgramId programForWorker = new ProgramId(worker.getNamespace(), worker.getApplication(), worker.getType(), worker.getEntityName());
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForWorker, MetadataScope.USER, emptyMap(), workerTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(worker.run(workerRunId.getId())));
        // Id.Spark needs conversion to Id.Program JIRA - CDAP-3658
        ProgramId programForSpark = new ProgramId(spark.getNamespace(), spark.getApplication(), spark.getType(), spark.getEntityName());
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForSpark, MetadataScope.USER, emptyMap(), sparkTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(dataset2, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset3, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(spark.run(sparkRunId.getId())));
    } finally {
        // Always remove the test namespace so later tests start from a clean state
        namespaceClient.delete(namespace);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(co.cask.cdap.client.app.AllProgramsApp) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) NamespaceId(co.cask.cdap.proto.id.NamespaceId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) MetadataRecord(co.cask.cdap.proto.metadata.MetadataRecord) Test(org.junit.Test)

Example 2 with LineageRecord

use of co.cask.cdap.proto.metadata.lineage.LineageRecord in project cdap by caskdata.

The method testFlowLineage of the class LineageTestRun.

/**
 * Deploys {@code AllProgramsApp} into a dedicated namespace, attaches user-scope properties and
 * tags to the application, the flow, the dataset and the stream, runs the flow once, and then
 * verifies:
 * <ul>
 *   <li>dataset and stream lineage for the run window, queried both with numeric timestamps and
 *       with relative time strings;</li>
 *   <li>the metadata recorded for the flow run;</li>
 *   <li>that time ranges entirely after or entirely before the run yield a lineage with no
 *       relations;</li>
 *   <li>the handling of invalid time ranges and of a run id that was never started.</li>
 * </ul>
 * The namespace is deleted in a {@code finally} block, whether or not the assertions pass.
 */
@Test
public void testFlowLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testFlowLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.toId()).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata to application
        ImmutableMap<String, String> appProperties = ImmutableMap.of("app-key1", "app-value1");
        addProperties(app, appProperties);
        Assert.assertEquals(appProperties, getProperties(app, MetadataScope.USER));
        ImmutableSet<String> appTags = ImmutableSet.of("app-tag1");
        addTags(app, appTags);
        Assert.assertEquals(appTags, getTags(app, MetadataScope.USER));
        // Add metadata to flow
        ImmutableMap<String, String> flowProperties = ImmutableMap.of("flow-key1", "flow-value1");
        addProperties(flow, flowProperties);
        Assert.assertEquals(flowProperties, getProperties(flow, MetadataScope.USER));
        ImmutableSet<String> flowTags = ImmutableSet.of("flow-tag1", "flow-tag2");
        addTags(flow, flowTags);
        Assert.assertEquals(flowTags, getTags(flow, MetadataScope.USER));
        // Add metadata to dataset
        ImmutableMap<String, String> dataProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, dataProperties);
        Assert.assertEquals(dataProperties, getProperties(dataset, MetadataScope.USER));
        ImmutableSet<String> dataTags = ImmutableSet.of("data-tag1", "data-tag2");
        addTags(dataset, dataTags);
        Assert.assertEquals(dataTags, getTags(dataset, MetadataScope.USER));
        // Add metadata to stream
        ImmutableMap<String, String> streamProperties = ImmutableMap.of("stream-key1", "stream-value1");
        addProperties(stream, streamProperties);
        Assert.assertEquals(streamProperties, getProperties(stream, MetadataScope.USER));
        ImmutableSet<String> streamTags = ImmutableSet.of("stream-tag1", "stream-tag2");
        addTags(stream, streamTags);
        Assert.assertEquals(streamTags, getTags(stream, MetadataScope.USER));
        // Bracket the flow run with startTime/stopTime; these bound the lineage queries below
        long startTime = TimeMathParser.nowInSeconds();
        RunId flowRunId = runAndWait(flow);
        // Wait for few seconds so that the stop time secs is more than start time secs.
        TimeUnit.SECONDS.sleep(2);
        waitForStop(flow, true);
        long stopTime = TimeMathParser.nowInSeconds();
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, startTime, stopTime, 10);
        // Expected: the flow's single run touching the dataset and reading the stream, with no
        // collapsing applied
        LineageRecord expected = LineageSerializer.toLineageRecord(startTime, stopTime, new Lineage(ImmutableSet.of(new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))))), Collections.<CollapseType>emptySet());
        Assert.assertEquals(expected, lineage);
        // Fetch dataset lineage with time strings
        lineage = fetchLineage(dataset, "now-1h", "now+1h", 10);
        // Only the relations are compared here.
        // NOTE(review): presumably because the start/end resolved from the relative strings will not
        // equal startTime/stopTime - confirm
        Assert.assertEquals(expected.getRelations(), lineage.getRelations());
        // Fetch stream lineage
        lineage = fetchLineage(stream, startTime, stopTime, 10);
        // same as dataset's lineage
        Assert.assertEquals(expected, lineage);
        // Fetch stream lineage with time strings
        lineage = fetchLineage(stream, "now-1h", "now+1h", 10);
        // same as dataset's lineage
        Assert.assertEquals(expected.getRelations(), lineage.getRelations());
        // Assert metadata
        // Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, appProperties, appTags), new MetadataRecord(flow, MetadataScope.USER, flowProperties, flowTags), new MetadataRecord(dataset, MetadataScope.USER, dataProperties, dataTags), new MetadataRecord(stream, MetadataScope.USER, streamProperties, streamTags)), fetchRunMetadata(flow.run(flowRunId.getId())));
        // Assert with a time range after the flow run should return no results
        long laterStartTime = stopTime + 1000;
        long laterEndTime = stopTime + 5000;
        // Fetch stream lineage
        lineage = fetchLineage(stream, laterStartTime, laterEndTime, 10);
        Assert.assertEquals(LineageSerializer.toLineageRecord(laterStartTime, laterEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
        // Assert with a time range before the flow run should return no results
        long earlierStartTime = startTime - 5000;
        long earlierEndTime = startTime - 1000;
        // Fetch stream lineage
        lineage = fetchLineage(stream, earlierStartTime, earlierEndTime, 10);
        Assert.assertEquals(LineageSerializer.toLineageRecord(earlierStartTime, earlierEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
        // Test bad time ranges
        // NOTE(review): the fetchLineage overload taking an exception class presumably asserts that the
        // request fails with that exception - confirm against the helper
        fetchLineage(dataset, "sometime", "sometime", 10, BadRequestException.class);
        fetchLineage(dataset, "now+1h", "now-1h", 10, BadRequestException.class);
        // Test non-existent run
        // The run id is freshly generated here and was never used to start the flow
        assertRunMetadataNotFound(flow.run(RunIds.generate(1000).getId()));
    } finally {
        // Always remove the test namespace so later tests start from a clean state
        namespaceClient.delete(namespace);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) CollapseType(co.cask.cdap.proto.metadata.lineage.CollapseType) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(co.cask.cdap.client.app.AllProgramsApp) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) NamespaceId(co.cask.cdap.proto.id.NamespaceId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) MetadataRecord(co.cask.cdap.proto.metadata.MetadataRecord) Test(org.junit.Test)

Example 3 with LineageRecord

use of co.cask.cdap.proto.metadata.lineage.LineageRecord in project cdap by caskdata.

The method perform of the class GetDatasetLineageCommand.

/**
 * Fetches the lineage of a dataset in the CLI's current namespace and renders it as a table
 * with a single row.
 *
 * <p>The dataset name is read from the {@code DATASET} argument. The optional {@code start} and
 * {@code end} arguments fall back to {@code "min"} and {@code "max"} and are converted with
 * {@code getTimestamp} relative to the time this command started executing. The optional
 * {@code levels} argument falls back to {@code null}.
 *
 * @param arguments the parsed command arguments
 * @param output the stream the resulting table is rendered to
 * @throws Exception if argument parsing, the lineage request or rendering fails
 */
@Override
public void perform(Arguments arguments, PrintStream output) throws Exception {
    long now = System.currentTimeMillis();
    DatasetId target = cliConfig.getCurrentNamespace().dataset(arguments.get(ArgumentName.DATASET.toString()));
    long startTime = getTimestamp(arguments.getOptional("start", "min"), now);
    long endTime = getTimestamp(arguments.getOptional("end", "max"), now);
    Integer maxLevels = arguments.getIntOptional("levels", null);

    LineageRecord record = client.getLineage(target, startTime, endTime, maxLevels);

    // One row: the time range as plain numbers, the three collections serialized as JSON.
    List<String> row = Lists.newArrayList(
        Long.toString(record.getStart()),
        Long.toString(record.getEnd()),
        GSON.toJson(record.getRelations()),
        GSON.toJson(record.getPrograms()),
        GSON.toJson(record.getData()));

    Table table = Table.builder()
        .setHeader("start", "end", "relations", "programs", "data")
        .setRows(Collections.<List<String>>singletonList(row))
        .build();
    cliConfig.getTableRenderer().render(cliConfig, output, table);
}
Also used : Table(co.cask.cdap.cli.util.table.Table) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) List(java.util.List) DatasetId(co.cask.cdap.proto.id.DatasetId)

Example 4 with LineageRecord

use of co.cask.cdap.proto.metadata.lineage.LineageRecord in project cdap by caskdata.

The method perform of the class GetStreamLineageCommand.

/**
 * Fetches the lineage of a stream in the CLI's current namespace and renders it as a table
 * with a single row.
 *
 * <p>The stream name is read from the {@code STREAM} argument. The optional {@code start} and
 * {@code end} arguments fall back to {@code "min"} and {@code "max"} and are converted with
 * {@code getTimestamp} relative to the time this command started executing. The optional
 * {@code levels} argument falls back to {@code null}.
 *
 * @param arguments the parsed command arguments
 * @param output the stream the resulting table is rendered to
 * @throws Exception if argument parsing, the lineage request or rendering fails
 */
@Override
public void perform(Arguments arguments, PrintStream output) throws Exception {
    long now = System.currentTimeMillis();
    StreamId target = cliConfig.getCurrentNamespace().stream(arguments.get(ArgumentName.STREAM.toString()));
    long startTime = getTimestamp(arguments.getOptional("start", "min"), now);
    long endTime = getTimestamp(arguments.getOptional("end", "max"), now);
    Integer maxLevels = arguments.getIntOptional("levels", null);

    LineageRecord record = client.getLineage(target, startTime, endTime, maxLevels);

    // One row: the time range as plain numbers, the three collections serialized as JSON.
    List<String> row = Lists.newArrayList(
        Long.toString(record.getStart()),
        Long.toString(record.getEnd()),
        GSON.toJson(record.getRelations()),
        GSON.toJson(record.getPrograms()),
        GSON.toJson(record.getData()));

    Table table = Table.builder()
        .setHeader("start", "end", "relations", "programs", "data")
        .setRows(Collections.<List<String>>singletonList(row))
        .build();
    cliConfig.getTableRenderer().render(cliConfig, output, table);
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) Table(co.cask.cdap.cli.util.table.Table) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) List(java.util.List)

Example 5 with LineageRecord

use of co.cask.cdap.proto.metadata.lineage.LineageRecord in project cdap by caskdata.

The method toLineageRecord of the class LineageSerializer.

/**
 * Converts a {@link Lineage} into a {@link LineageRecord} covering the given time range.
 *
 * <p>The lineage's relations are first collapsed according to {@code collapseTypes}. Every
 * collapsed relation yields one {@link RelationRecord}; in addition, its program and its data
 * entity are indexed under the same keys the relation record refers to, so an entity that takes
 * part in several relations ends up with a single map entry.
 *
 * @param start start of the time range, passed through to the record unchanged
 * @param end end of the time range, passed through to the record unchanged
 * @param lineage the lineage whose relations are converted
 * @param collapseTypes the collapse types handed to {@code LineageCollapser.collapseRelations}
 * @return the record holding the converted relations plus the program and data lookups
 */
public static LineageRecord toLineageRecord(long start, long end, Lineage lineage, Set<CollapseType> collapseTypes) {
    Set<RelationRecord> relationRecords = new HashSet<>();
    Map<String, ProgramRecord> programsByKey = new HashMap<>();
    Map<String, DataRecord> dataByKey = new HashMap<>();

    for (CollapsedRelation collapsed : LineageCollapser.collapseRelations(lineage.getRelations(), collapseTypes)) {
        String dataKey = makeDataKey(collapsed.getData());
        String programKey = makeProgramKey(collapsed.getProgram());

        relationRecords.add(new RelationRecord(
            dataKey,
            programKey,
            convertAccessType(collapsed.getAccess()),
            convertRuns(collapsed.getRuns()),
            convertComponents(collapsed.getComponents())));

        // Later relations with the same key simply overwrite the earlier entry.
        programsByKey.put(programKey, new ProgramRecord(collapsed.getProgram()));
        dataByKey.put(dataKey, new DataRecord(collapsed.getData()));
    }

    return new LineageRecord(start, end, relationRecords, programsByKey, dataByKey);
}
Also used : RelationRecord(co.cask.cdap.proto.metadata.lineage.RelationRecord) ProgramRecord(co.cask.cdap.proto.metadata.lineage.ProgramRecord) HashMap(java.util.HashMap) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) DataRecord(co.cask.cdap.proto.metadata.lineage.DataRecord) HashSet(java.util.HashSet)

Aggregations

LineageRecord (co.cask.cdap.proto.metadata.lineage.LineageRecord)5 DatasetId (co.cask.cdap.proto.id.DatasetId)3 StreamId (co.cask.cdap.proto.id.StreamId)3 Table (co.cask.cdap.cli.util.table.Table)2 AllProgramsApp (co.cask.cdap.client.app.AllProgramsApp)2 Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)2 Relation (co.cask.cdap.data2.metadata.lineage.Relation)2 NamespaceMeta (co.cask.cdap.proto.NamespaceMeta)2 ApplicationId (co.cask.cdap.proto.id.ApplicationId)2 NamespaceId (co.cask.cdap.proto.id.NamespaceId)2 ProgramId (co.cask.cdap.proto.id.ProgramId)2 MetadataRecord (co.cask.cdap.proto.metadata.MetadataRecord)2 List (java.util.List)2 RunId (org.apache.twill.api.RunId)2 Test (org.junit.Test)2 CollapseType (co.cask.cdap.proto.metadata.lineage.CollapseType)1 DataRecord (co.cask.cdap.proto.metadata.lineage.DataRecord)1 ProgramRecord (co.cask.cdap.proto.metadata.lineage.ProgramRecord)1 RelationRecord (co.cask.cdap.proto.metadata.lineage.RelationRecord)1 HashMap (java.util.HashMap)1