Search in sources :

Example 31 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageTestRun method testFlowLineage.

@Test
public void testFlowLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testFlowLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata to applicaton
        ImmutableMap<String, String> appProperties = ImmutableMap.of("app-key1", "app-value1");
        addProperties(app, appProperties);
        Assert.assertEquals(appProperties, getProperties(app, MetadataScope.USER));
        ImmutableSet<String> appTags = ImmutableSet.of("app-tag1");
        addTags(app, appTags);
        Assert.assertEquals(appTags, getTags(app, MetadataScope.USER));
        // Add metadata to flow
        ImmutableMap<String, String> flowProperties = ImmutableMap.of("flow-key1", "flow-value1");
        addProperties(flow, flowProperties);
        Assert.assertEquals(flowProperties, getProperties(flow, MetadataScope.USER));
        ImmutableSet<String> flowTags = ImmutableSet.of("flow-tag1", "flow-tag2");
        addTags(flow, flowTags);
        Assert.assertEquals(flowTags, getTags(flow, MetadataScope.USER));
        // Add metadata to dataset
        ImmutableMap<String, String> dataProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, dataProperties);
        Assert.assertEquals(dataProperties, getProperties(dataset, MetadataScope.USER));
        ImmutableSet<String> dataTags = ImmutableSet.of("data-tag1", "data-tag2");
        addTags(dataset, dataTags);
        Assert.assertEquals(dataTags, getTags(dataset, MetadataScope.USER));
        // Add metadata to stream
        ImmutableMap<String, String> streamProperties = ImmutableMap.of("stream-key1", "stream-value1");
        addProperties(stream, streamProperties);
        Assert.assertEquals(streamProperties, getProperties(stream, MetadataScope.USER));
        ImmutableSet<String> streamTags = ImmutableSet.of("stream-tag1", "stream-tag2");
        addTags(stream, streamTags);
        Assert.assertEquals(streamTags, getTags(stream, MetadataScope.USER));
        long startTime = TimeMathParser.nowInSeconds();
        RunId flowRunId = runAndWait(flow);
        // Wait for few seconds so that the stop time secs is more than start time secs.
        TimeUnit.SECONDS.sleep(2);
        waitForStop(flow, true);
        long stopTime = TimeMathParser.nowInSeconds();
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, startTime, stopTime, 10);
        LineageRecord expected = LineageSerializer.toLineageRecord(startTime, stopTime, new Lineage(ImmutableSet.of(new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))))), Collections.<CollapseType>emptySet());
        Assert.assertEquals(expected, lineage);
        // Fetch dataset lineage with time strings
        lineage = fetchLineage(dataset, "now-1h", "now+1h", 10);
        Assert.assertEquals(expected.getRelations(), lineage.getRelations());
        // Fetch stream lineage
        lineage = fetchLineage(stream, startTime, stopTime, 10);
        // same as dataset's lineage
        Assert.assertEquals(expected, lineage);
        // Fetch stream lineage with time strings
        lineage = fetchLineage(stream, "now-1h", "now+1h", 10);
        // same as dataset's lineage
        Assert.assertEquals(expected.getRelations(), lineage.getRelations());
        // Assert metadata
        // Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, appProperties, appTags), new MetadataRecord(flow, MetadataScope.USER, flowProperties, flowTags), new MetadataRecord(dataset, MetadataScope.USER, dataProperties, dataTags), new MetadataRecord(stream, MetadataScope.USER, streamProperties, streamTags)), fetchRunMetadata(flow.run(flowRunId.getId())));
        // Assert with a time range after the flow run should return no results
        long laterStartTime = stopTime + 1000;
        long laterEndTime = stopTime + 5000;
        // Fetch stream lineage
        lineage = fetchLineage(stream, laterStartTime, laterEndTime, 10);
        Assert.assertEquals(LineageSerializer.toLineageRecord(laterStartTime, laterEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
        // Assert with a time range before the flow run should return no results
        long earlierStartTime = startTime - 5000;
        long earlierEndTime = startTime - 1000;
        // Fetch stream lineage
        lineage = fetchLineage(stream, earlierStartTime, earlierEndTime, 10);
        Assert.assertEquals(LineageSerializer.toLineageRecord(earlierStartTime, earlierEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
        // Test bad time ranges
        fetchLineage(dataset, "sometime", "sometime", 10, BadRequestException.class);
        fetchLineage(dataset, "now+1h", "now-1h", 10, BadRequestException.class);
        // Test non-existent run
        assertRunMetadataNotFound(flow.run(RunIds.generate(1000).getId()));
    } finally {
        namespaceClient.delete(namespace);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) CollapseType(co.cask.cdap.proto.metadata.lineage.CollapseType) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(co.cask.cdap.client.app.AllProgramsApp) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) NamespaceId(co.cask.cdap.proto.id.NamespaceId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) MetadataRecord(co.cask.cdap.common.metadata.MetadataRecord) Test(org.junit.Test)

Example 32 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageTestRun method testAllProgramsLineage.

@Test
public void testAllProgramsLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
    ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
    ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
    ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
    ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
    ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
    ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
    DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
    StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata
        ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
        addTags(spark, sparkTags);
        Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));
        ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
        addTags(worker, workerTags);
        Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));
        ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, datasetProperties);
        Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));
        // Start all programs
        RunId flowRunId = runAndWait(flow);
        RunId mrRunId = runAndWait(mapreduce);
        RunId mrRunId2 = runAndWait(mapreduce2);
        RunId sparkRunId = runAndWait(spark);
        runAndWait(workflow);
        RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
        RunId serviceRunId = runAndWait(service);
        // Worker makes a call to service to make it access datasets,
        // hence need to make sure service starts before worker, and stops after it.
        RunId workerRunId = runAndWait(worker);
        // Wait for programs to finish
        waitForStop(flow, true);
        waitForStop(mapreduce, false);
        waitForStop(mapreduce2, false);
        waitForStop(spark, false);
        waitForStop(workflow, false);
        waitForStop(worker, false);
        waitForStop(service, true);
        long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
        long oneHour = TimeUnit.HOURS.toSeconds(1);
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // dataset is accessed by all programs
        LineageRecord expected = LineageSerializer.toLineageRecord(now - oneHour, now + oneHour, new Lineage(ImmutableSet.of(// Dataset access
        new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, toSet(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId), new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2), new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2), new Relation(dataset, spark, AccessType.READ, sparkRunId), new Relation(dataset2, spark, AccessType.WRITE, sparkRunId), new Relation(dataset3, spark, AccessType.READ, sparkRunId), new Relation(dataset3, spark, AccessType.WRITE, sparkRunId), new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId), new Relation(dataset, service, AccessType.WRITE, serviceRunId), new Relation(dataset, worker, AccessType.WRITE, workerRunId), // Stream access
        new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, mapreduce, AccessType.READ, mrRunId), new Relation(stream, spark, AccessType.READ, sparkRunId), new Relation(stream, mapreduce, AccessType.READ, workflowMrRunId), new Relation(stream, worker, AccessType.WRITE, workerRunId))), toSet(CollapseType.ACCESS));
        Assert.assertEquals(expected, lineage);
        // Fetch stream lineage
        lineage = fetchLineage(stream, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // stream too is accessed by all programs
        Assert.assertEquals(expected, lineage);
        // Assert metadata
        // Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(flow, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(flow.run(flowRunId.getId())));
        // Id.Worker needs conversion to Id.Program JIRA - CDAP-3658
        ProgramId programForWorker = new ProgramId(worker.getNamespace(), worker.getApplication(), worker.getType(), worker.getEntityName());
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForWorker, MetadataScope.USER, emptyMap(), workerTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(worker.run(workerRunId.getId())));
        // Id.Spark needs conversion to Id.Program JIRA - CDAP-3658
        ProgramId programForSpark = new ProgramId(spark.getNamespace(), spark.getApplication(), spark.getType(), spark.getEntityName());
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForSpark, MetadataScope.USER, emptyMap(), sparkTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(dataset2, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset3, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(spark.run(sparkRunId.getId())));
    } finally {
        namespaceClient.delete(namespace);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(co.cask.cdap.client.app.AllProgramsApp) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) NamespaceId(co.cask.cdap.proto.id.NamespaceId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) MetadataRecord(co.cask.cdap.common.metadata.MetadataRecord) Test(org.junit.Test)

Example 33 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdmin method getScanRange.

/**
 * Convert a set of runIds into a scan range based on earliest runtime and latest runtime of runIds.
 * Also, add a scan filter to include only runIds in the given set.
 * @param runIds input runIds set
 * @return scan range
 */
@VisibleForTesting
static ScanRangeWithFilter getScanRange(final Set<RunId> runIds) {
    if (runIds.isEmpty()) {
        return new ScanRangeWithFilter(0, 0, Predicates.<Relation>alwaysFalse());
    }
    // Pick the earliest start time and latest start time for lineage range
    long earliest = Long.MAX_VALUE;
    long latest = 0;
    for (RunId runId : runIds) {
        long runStartTime = RunIds.getTime(runId, TimeUnit.MILLISECONDS);
        if (runStartTime < earliest) {
            earliest = runStartTime;
        }
        if (runStartTime > latest) {
            latest = runStartTime;
        }
    }
    // scan end key is exclusive, so need to add 1 to  to include the last runid
    return new ScanRangeWithFilter(earliest, latest + 1, new Predicate<Relation>() {

        @Override
        public boolean apply(Relation input) {
            return runIds.contains(input.getRun());
        }
    });
}
Also used : Relation(co.cask.cdap.data2.metadata.lineage.Relation) RunId(org.apache.twill.api.RunId) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 34 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdmin method doComputeRollupLineage.

private Multimap<RelationKey, Relation> doComputeRollupLineage(Multimap<RelationKey, Relation> relations) throws NotFoundException {
    // Make a set of all ProgramIDs in the relations
    Set<ProgramRunId> programRunIdSet = new HashSet<>();
    for (Relation relation : Iterables.concat(relations.values())) {
        programRunIdSet.add(new ProgramRunId(relation.getProgram().getNamespace(), relation.getProgram().getApplication(), relation.getProgram().getType(), relation.getProgram().getProgram(), relation.getRun().getId()));
    }
    // Get RunRecordMeta for all these ProgramRunIDs
    final Map<ProgramRunId, RunRecordMeta> runRecordMap = store.getRuns(programRunIdSet);
    // Get workflow Run IDs for all the programs in the relations
    final Set<String> workflowIDs = getWorkflowIds(relations, runRecordMap);
    // Get Program IDs for workflow Run IDs
    // TODO: These scans could be expensive. CDAP-7571.
    Map<ProgramRunId, RunRecordMeta> workflowRunRecordMap = store.getRuns(ProgramRunStatus.ALL, input -> workflowIDs.contains(input.getPid()));
    // Create a map from RunId to ProgramId for all workflows
    Map<String, ProgramRunId> workflowIdMap = new HashMap<>();
    for (Map.Entry<ProgramRunId, RunRecordMeta> entry : workflowRunRecordMap.entrySet()) {
        workflowIdMap.put(entry.getValue().getPid(), entry.getKey());
    }
    // For all relations, replace ProgramIds with workflow ProgramIds
    return getRollupRelations(relations, runRecordMap, workflowIdMap);
}
Also used : Relation(co.cask.cdap.data2.metadata.lineage.Relation) HashMap(java.util.HashMap) RunRecordMeta(co.cask.cdap.internal.app.store.RunRecordMeta) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet)

Example 35 with Relation

use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdmin method doComputeLineage.

private Lineage doComputeLineage(final NamespacedEntityId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) throws NotFoundException {
    LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
    // Convert start time and end time period into scan keys in terms of program start times.
    Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
    if (LOG.isTraceEnabled()) {
        LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
    }
    ScanRangeWithFilter scanRange = getScanRange(runningInRange);
    LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
    Multimap<RelationKey, Relation> relations = HashMultimap.create();
    Set<NamespacedEntityId> visitedDatasets = new HashSet<>();
    Set<NamespacedEntityId> toVisitDatasets = new HashSet<>();
    Set<ProgramId> visitedPrograms = new HashSet<>();
    Set<ProgramId> toVisitPrograms = new HashSet<>();
    toVisitDatasets.add(sourceData);
    for (int i = 0; i < levels; ++i) {
        LOG.trace("Level {}", i);
        toVisitPrograms.clear();
        for (NamespacedEntityId d : toVisitDatasets) {
            if (visitedDatasets.add(d)) {
                LOG.trace("Visiting dataset {}", d);
                // Fetch related programs
                Iterable<Relation> programRelations = getProgramRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
                LOG.trace("Got program relations {}", programRelations);
                for (Relation relation : programRelations) {
                    relations.put(new RelationKey(relation), relation);
                }
                Iterables.addAll(toVisitPrograms, Iterables.transform(programRelations, RELATION_TO_PROGRAM_FUNCTION));
            }
        }
        toVisitDatasets.clear();
        for (ProgramId p : toVisitPrograms) {
            if (visitedPrograms.add(p)) {
                LOG.trace("Visiting program {}", p);
                // Fetch related datasets
                Iterable<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
                LOG.trace("Got data relations {}", datasetRelations);
                for (Relation relation : datasetRelations) {
                    relations.put(new RelationKey(relation), relation);
                }
                Iterables.addAll(toVisitDatasets, Iterables.transform(datasetRelations, RELATION_TO_DATA_FUNCTION));
            }
        }
    }
    if (rollup != null && rollup.contains("workflow")) {
        relations = doComputeRollupLineage(relations);
    }
    Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION).values()));
    LOG.trace("Got lineage {}", lineage);
    return lineage;
}
Also used : Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) ProgramId(co.cask.cdap.proto.id.ProgramId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) NamespacedEntityId(co.cask.cdap.proto.id.NamespacedEntityId) RunId(org.apache.twill.api.RunId) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) HashSet(java.util.HashSet)

Aggregations

Test (org.junit.Test)45 Relation (io.cdap.cdap.data2.metadata.lineage.Relation)38 Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage)26 Relation (co.cask.cdap.data2.metadata.lineage.Relation)20 Store (io.cdap.cdap.app.store.Store)20 DefaultLineageStoreReader (io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader)20 LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader)20 BasicLineageWriter (io.cdap.cdap.data2.metadata.writer.BasicLineageWriter)16 LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter)16 TransactionRunner (io.cdap.cdap.spi.data.transaction.TransactionRunner)16 RunId (org.apache.twill.api.RunId)15 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)14 DatasetId (io.cdap.cdap.proto.id.DatasetId)14 ProgramId (io.cdap.cdap.proto.id.ProgramId)14 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)14 HashSet (java.util.HashSet)12 Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)10 CollapsedRelation (io.cdap.cdap.data2.metadata.lineage.CollapsedRelation)10 ProgramRunId (co.cask.cdap.proto.id.ProgramRunId)9 ApplicationSpecification (io.cdap.cdap.api.app.ApplicationSpecification)8