
Example 1 with LineageRecord

Use of io.cdap.cdap.proto.metadata.lineage.LineageRecord in project cdap by cdapio.

In class LineageSerializer, method toLineageRecord:

public static LineageRecord toLineageRecord(long start, long end, Lineage lineage, Set<CollapseType> collapseTypes) {
    Set<RelationRecord> relationBuilder = new HashSet<>();
    Map<String, ProgramRecord> programBuilder = new HashMap<>();
    Map<String, DataRecord> dataBuilder = new HashMap<>();
    Set<CollapsedRelation> collapsedRelations = LineageCollapser.collapseRelations(lineage.getRelations(), collapseTypes);
    for (CollapsedRelation relation : collapsedRelations) {
        String dataKey = makeDataKey(relation.getData());
        String programKey = makeProgramKey(relation.getProgram());
        RelationRecord relationRecord = new RelationRecord(dataKey, programKey, convertAccessType(relation.getAccess()), convertRuns(relation.getRuns()), convertComponents(relation.getComponents()));
        relationBuilder.add(relationRecord);
        programBuilder.put(programKey, new ProgramRecord(relation.getProgram()));
        dataBuilder.put(dataKey, new DataRecord(relation.getData()));
    }
    return new LineageRecord(start, end, relationBuilder, programBuilder, dataBuilder);
}
Also used : RelationRecord(io.cdap.cdap.proto.metadata.lineage.RelationRecord) ProgramRecord(io.cdap.cdap.proto.metadata.lineage.ProgramRecord) HashMap(java.util.HashMap) LineageRecord(io.cdap.cdap.proto.metadata.lineage.LineageRecord) DataRecord(io.cdap.cdap.proto.metadata.lineage.DataRecord) HashSet(java.util.HashSet)
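
For orientation, here is a minimal sketch (not taken from the project) of what a direct call to toLineageRecord can look like. The Relation, Lineage, NamespaceId, DatasetId and ProgramId usage mirrors Example 3 below; the namespace, dataset and program names, the RunIds.generate() helper, and the package locations of LineageSerializer, CollapseType and AccessType are assumptions.

static LineageRecord buildSampleLineageRecord() {
    // Hypothetical identifiers, for illustration only.
    NamespaceId ns = new NamespaceId("demo");
    DatasetId input = ns.dataset("purchases");
    ProgramId program = ns.app("PurchaseApp").mr("PurchaseMR");
    // Assumed helper for creating a RunId; any org.apache.twill.api.RunId works here.
    RunId run = org.apache.twill.internal.RunIds.generate();
    // One READ relation between the dataset and the MapReduce run.
    Lineage lineage = new Lineage(ImmutableSet.of(new Relation(input, program, AccessType.READ, run)));
    long nowSecs = System.currentTimeMillis() / 1000;
    // Collapse by access type, as in the examples below.
    return LineageSerializer.toLineageRecord(nowSecs - 3600, nowSecs + 3600, lineage,
        Collections.singleton(CollapseType.ACCESS));
}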

Example 2 with LineageRecord

Use of io.cdap.cdap.proto.metadata.lineage.LineageRecord in project cdap by cdapio.

In class GetDatasetLineageCommand, method perform:

@Override
public void perform(Arguments arguments, PrintStream output) throws Exception {
    long currentTime = System.currentTimeMillis();
    DatasetId dataset = cliConfig.getCurrentNamespace().dataset(arguments.get(ArgumentName.DATASET.toString()));
    long start = getTimestamp(arguments.getOptional("start", "min"), currentTime);
    long end = getTimestamp(arguments.getOptional("end", "max"), currentTime);
    Integer levels = arguments.getIntOptional("levels", null);
    LineageRecord lineage = client.getLineage(dataset, start, end, levels);
    Table table = Table.builder()
        .setHeader("start", "end", "relations", "programs", "data")
        .setRows(Collections.<List<String>>singletonList(Lists.newArrayList(
            Long.toString(lineage.getStart()),
            Long.toString(lineage.getEnd()),
            GSON.toJson(lineage.getRelations()),
            GSON.toJson(lineage.getPrograms()),
            GSON.toJson(lineage.getData()))))
        .build();
    cliConfig.getTableRenderer().render(cliConfig, output, table);
}
Also used : Table(io.cdap.cdap.cli.util.table.Table) LineageRecord(io.cdap.cdap.proto.metadata.lineage.LineageRecord) List(java.util.List) DatasetId(io.cdap.cdap.proto.id.DatasetId)
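
The same lookup can be done programmatically. Below is a minimal sketch assuming the caller already has a configured lineage client of the kind the command uses (the type name LineageClient and its construction are assumptions, as they are not shown above); only the getLineage(dataset, start, end, levels) call shape and the LineageRecord getters are taken from the command, and the epoch-second timestamps follow Example 3.

static void printRecentLineage(LineageClient client, DatasetId dataset) throws Exception {
    // Last hour, expressed as epoch seconds (matching the test in Example 3).
    long nowSecs = System.currentTimeMillis() / 1000;
    long oneHour = 3600;
    // Same call shape as in GetDatasetLineageCommand: start, end, optional level limit.
    LineageRecord lineage = client.getLineage(dataset, nowSecs - oneHour, nowSecs, 10);
    System.out.println("relations: " + lineage.getRelations().size());
    System.out.println("programs: " + lineage.getPrograms().size());
}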

Example 3 with LineageRecord

Use of io.cdap.cdap.proto.metadata.lineage.LineageRecord in project cdap by cdapio.

In class LineageHttpHandlerTestRun, method testAllProgramsLineage:

@Test
public void testAllProgramsLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
    ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
    ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
    ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
    ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
    ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
    DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata
        ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
        addTags(spark, sparkTags);
        Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));
        ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
        addTags(worker, workerTags);
        Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));
        ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, datasetProperties);
        Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));
        // Start all programs
        RunId mrRunId = runAndWait(mapreduce);
        RunId mrRunId2 = runAndWait(mapreduce2);
        RunId sparkRunId = runAndWait(spark);
        runAndWait(workflow);
        RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
        RunId serviceRunId = runAndWait(service);
        // The worker calls the service to make it access datasets, so the service
        // must start before the worker and stop after it.
        RunId workerRunId = runAndWait(worker);
        // Wait for programs to finish
        waitForStop(mapreduce, false);
        waitForStop(mapreduce2, false);
        waitForStop(spark, false);
        waitForStop(workflow, false);
        waitForStop(worker, false);
        waitForStop(service, true);
        long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
        long oneHour = TimeUnit.HOURS.toSeconds(1);
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // dataset is accessed by all programs
        LineageRecord expected = LineageSerializer.toLineageRecord(
            now - oneHour, now + oneHour,
            new Lineage(ImmutableSet.of(
                // Dataset access
                new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId),
                new Relation(dataset3, mapreduce, AccessType.READ, mrRunId),
                new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2),
                new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2),
                new Relation(dataset, spark, AccessType.READ, sparkRunId),
                new Relation(dataset2, spark, AccessType.WRITE, sparkRunId),
                new Relation(dataset3, spark, AccessType.READ, sparkRunId),
                new Relation(dataset3, spark, AccessType.WRITE, sparkRunId),
                new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId),
                new Relation(dataset3, mapreduce, AccessType.READ, workflowMrRunId),
                new Relation(dataset, service, AccessType.WRITE, serviceRunId),
                new Relation(dataset, worker, AccessType.WRITE, workerRunId))),
            toSet(CollapseType.ACCESS));
        Assert.assertEquals(expected, lineage);
    } finally {
        namespaceClient.delete(namespace);
    }
}
Also used : Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(io.cdap.cdap.client.app.AllProgramsApp) ProgramId(io.cdap.cdap.proto.id.ProgramId) DatasetId(io.cdap.cdap.proto.id.DatasetId) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) LineageRecord(io.cdap.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(io.cdap.cdap.proto.NamespaceMeta) NamespaceId(io.cdap.cdap.proto.id.NamespaceId) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) Test(org.junit.Test)
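
The toSet(CollapseType.ACCESS) argument is what lets, for example, the READ and WRITE relations between dataset3 and the Spark run match a single entry in the expected record. A minimal sketch of that collapsing step is shown below; it reuses the LineageCollapser.collapseRelations call from Example 1, while the package locations (and public accessibility) of LineageCollapser, CollapsedRelation, CollapseType and AccessType are assumptions.

static void collapseByAccess(DatasetId dataset, ProgramId program, RunId run) {
    // A READ and a WRITE by the same program run on the same dataset...
    Set<Relation> relations = ImmutableSet.of(
        new Relation(dataset, program, AccessType.READ, run),
        new Relation(dataset, program, AccessType.WRITE, run));
    // ...which ACCESS collapsing should merge into a single relation carrying both access types.
    Set<CollapsedRelation> collapsed =
        LineageCollapser.collapseRelations(relations, Collections.singleton(CollapseType.ACCESS));
    System.out.println(collapsed.size() + " collapsed relation(s)");
}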

Aggregations

LineageRecord (io.cdap.cdap.proto.metadata.lineage.LineageRecord): 6 usages
DatasetId (io.cdap.cdap.proto.id.DatasetId): 4 usages
Table (io.cdap.cdap.cli.util.table.Table): 2 usages
AllProgramsApp (io.cdap.cdap.client.app.AllProgramsApp): 2 usages
Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage): 2 usages
Relation (io.cdap.cdap.data2.metadata.lineage.Relation): 2 usages
NamespaceMeta (io.cdap.cdap.proto.NamespaceMeta): 2 usages
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 2 usages
NamespaceId (io.cdap.cdap.proto.id.NamespaceId): 2 usages
ProgramId (io.cdap.cdap.proto.id.ProgramId): 2 usages
DataRecord (io.cdap.cdap.proto.metadata.lineage.DataRecord): 2 usages
ProgramRecord (io.cdap.cdap.proto.metadata.lineage.ProgramRecord): 2 usages
RelationRecord (io.cdap.cdap.proto.metadata.lineage.RelationRecord): 2 usages
HashMap (java.util.HashMap): 2 usages
HashSet (java.util.HashSet): 2 usages
List (java.util.List): 2 usages
RunId (org.apache.twill.api.RunId): 2 usages
Test (org.junit.Test): 2 usages