Search in sources :

Example 11 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageTestRun method testAllProgramsLineage.

@Test
public void testAllProgramsLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
    ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
    ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
    ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
    ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
    ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
    ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
    DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
    StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata
        ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
        addTags(spark, sparkTags);
        Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));
        ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
        addTags(worker, workerTags);
        Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));
        ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, datasetProperties);
        Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));
        // Start all programs
        RunId flowRunId = runAndWait(flow);
        RunId mrRunId = runAndWait(mapreduce);
        RunId mrRunId2 = runAndWait(mapreduce2);
        RunId sparkRunId = runAndWait(spark);
        runAndWait(workflow);
        RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
        RunId serviceRunId = runAndWait(service);
        // Worker makes a call to service to make it access datasets,
        // hence need to make sure service starts before worker, and stops after it.
        RunId workerRunId = runAndWait(worker);
        // Wait for programs to finish
        waitForStop(flow, true);
        waitForStop(mapreduce, false);
        waitForStop(mapreduce2, false);
        waitForStop(spark, false);
        waitForStop(workflow, false);
        waitForStop(worker, false);
        waitForStop(service, true);
        long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
        long oneHour = TimeUnit.HOURS.toSeconds(1);
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // dataset is accessed by all programs
        LineageRecord expected = LineageSerializer.toLineageRecord(now - oneHour, now + oneHour, new Lineage(ImmutableSet.of(// Dataset access
        new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, toSet(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId), new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2), new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2), new Relation(dataset, spark, AccessType.READ, sparkRunId), new Relation(dataset2, spark, AccessType.WRITE, sparkRunId), new Relation(dataset3, spark, AccessType.READ, sparkRunId), new Relation(dataset3, spark, AccessType.WRITE, sparkRunId), new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId), new Relation(dataset, service, AccessType.WRITE, serviceRunId), new Relation(dataset, worker, AccessType.WRITE, workerRunId), // Stream access
        new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, mapreduce, AccessType.READ, mrRunId), new Relation(stream, spark, AccessType.READ, sparkRunId), new Relation(stream, mapreduce, AccessType.READ, workflowMrRunId), new Relation(stream, worker, AccessType.WRITE, workerRunId))), toSet(CollapseType.ACCESS));
        Assert.assertEquals(expected, lineage);
        // Fetch stream lineage
        lineage = fetchLineage(stream, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // stream too is accessed by all programs
        Assert.assertEquals(expected, lineage);
        // Assert metadata
        // Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(flow, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(flow.run(flowRunId.getId())));
        // Id.Worker needs conversion to Id.Program JIRA - CDAP-3658
        ProgramId programForWorker = new ProgramId(worker.getNamespace(), worker.getApplication(), worker.getType(), worker.getEntityName());
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForWorker, MetadataScope.USER, emptyMap(), workerTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(worker.run(workerRunId.getId())));
        // Id.Spark needs conversion to Id.Program JIRA - CDAP-3658
        ProgramId programForSpark = new ProgramId(spark.getNamespace(), spark.getApplication(), spark.getType(), spark.getEntityName());
        Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForSpark, MetadataScope.USER, emptyMap(), sparkTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(dataset2, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset3, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(spark.run(sparkRunId.getId())));
    } finally {
        namespaceClient.delete(namespace);
    }
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(co.cask.cdap.client.app.AllProgramsApp) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) LineageRecord(co.cask.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) NamespaceId(co.cask.cdap.proto.id.NamespaceId) ApplicationId(co.cask.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) MetadataRecord(co.cask.cdap.common.metadata.MetadataRecord) Test(org.junit.Test)

Example 12 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageDataset method toRelation.

private Relation toRelation(Row row) {
    Map<Character, EntityId> rowInfo = new HashMap<>(4);
    MDSKey.Splitter splitter = new MDSKey(row.getRow()).split();
    char marker = (char) splitter.getInt();
    LOG.trace("Got marker {}", marker);
    EntityId id1 = toEntityId(splitter, marker);
    LOG.trace("Got id1 {}", id1);
    rowInfo.put(marker, id1);
    // inverted time - not required for relation
    splitter.skipLong();
    marker = (char) splitter.getInt();
    LOG.trace("Got marker {}", marker);
    EntityId id2 = toEntityId(splitter, marker);
    LOG.trace("Got id2 {}", id1);
    rowInfo.put(marker, id2);
    RunId runId = RunIds.fromString(splitter.getString());
    LOG.trace("Got runId {}", runId);
    AccessType accessType = AccessType.fromType((char) splitter.getInt());
    LOG.trace("Got access type {}", accessType);
    DatasetId datasetInstance = (DatasetId) rowInfo.get(DATASET_MARKER);
    LOG.trace("Got datasetInstance {}", datasetInstance);
    StreamId stream = (StreamId) rowInfo.get(STREAM_MARKER);
    LOG.trace("Got stream {}", stream);
    ProgramId program = (ProgramId) rowInfo.get(PROGRAM_MARKER);
    LOG.trace("Got program {}", program);
    NamespacedEntityId component = toComponent(splitter, program);
    LOG.trace("Got component {}", component);
    if (stream == null) {
        return new Relation(datasetInstance, program, accessType, runId, component == null ? ImmutableSet.<NamespacedEntityId>of() : ImmutableSet.of((NamespacedEntityId) component));
    }
    return new Relation(stream, program, accessType, runId, component == null ? ImmutableSet.<NamespacedEntityId>of() : ImmutableSet.of((NamespacedEntityId) component));
}
Also used : StreamId(co.cask.cdap.proto.id.StreamId) HashMap(java.util.HashMap) MDSKey(co.cask.cdap.data2.dataset2.lib.table.MDSKey) ProgramId(co.cask.cdap.proto.id.ProgramId) DatasetId(co.cask.cdap.proto.id.DatasetId) NamespacedEntityId(co.cask.cdap.proto.id.NamespacedEntityId) EntityId(co.cask.cdap.proto.id.EntityId) NamespacedEntityId(co.cask.cdap.proto.id.NamespacedEntityId) RunId(org.apache.twill.api.RunId) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId)

Example 13 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdmin method getScanRange.

/**
 * Convert a set of runIds into a scan range based on earliest runtime and latest runtime of runIds.
 * Also, add a scan filter to include only runIds in the given set.
 * @param runIds input runIds set
 * @return scan range
 */
@VisibleForTesting
static ScanRangeWithFilter getScanRange(final Set<RunId> runIds) {
    if (runIds.isEmpty()) {
        return new ScanRangeWithFilter(0, 0, Predicates.<Relation>alwaysFalse());
    }
    // Pick the earliest start time and latest start time for lineage range
    long earliest = Long.MAX_VALUE;
    long latest = 0;
    for (RunId runId : runIds) {
        long runStartTime = RunIds.getTime(runId, TimeUnit.MILLISECONDS);
        if (runStartTime < earliest) {
            earliest = runStartTime;
        }
        if (runStartTime > latest) {
            latest = runStartTime;
        }
    }
    // scan end key is exclusive, so need to add 1 to  to include the last runid
    return new ScanRangeWithFilter(earliest, latest + 1, new Predicate<Relation>() {

        @Override
        public boolean apply(Relation input) {
            return runIds.contains(input.getRun());
        }
    });
}
Also used : Relation(co.cask.cdap.data2.metadata.lineage.Relation) RunId(org.apache.twill.api.RunId) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 14 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdmin method doComputeRollupLineage.

private Multimap<RelationKey, Relation> doComputeRollupLineage(Multimap<RelationKey, Relation> relations) throws NotFoundException {
    // Make a set of all ProgramIDs in the relations
    Set<ProgramRunId> programRunIdSet = new HashSet<>();
    for (Relation relation : Iterables.concat(relations.values())) {
        programRunIdSet.add(new ProgramRunId(relation.getProgram().getNamespace(), relation.getProgram().getApplication(), relation.getProgram().getType(), relation.getProgram().getProgram(), relation.getRun().getId()));
    }
    // Get RunRecordMeta for all these ProgramRunIDs
    final Map<ProgramRunId, RunRecordMeta> runRecordMap = store.getRuns(programRunIdSet);
    // Get workflow Run IDs for all the programs in the relations
    final Set<String> workflowIDs = getWorkflowIds(relations, runRecordMap);
    // Get Program IDs for workflow Run IDs
    // TODO: These scans could be expensive. CDAP-7571.
    Map<ProgramRunId, RunRecordMeta> workflowRunRecordMap = store.getRuns(ProgramRunStatus.ALL, input -> workflowIDs.contains(input.getPid()));
    // Create a map from RunId to ProgramId for all workflows
    Map<String, ProgramRunId> workflowIdMap = new HashMap<>();
    for (Map.Entry<ProgramRunId, RunRecordMeta> entry : workflowRunRecordMap.entrySet()) {
        workflowIdMap.put(entry.getValue().getPid(), entry.getKey());
    }
    // For all relations, replace ProgramIds with workflow ProgramIds
    return getRollupRelations(relations, runRecordMap, workflowIdMap);
}
Also used : Relation(co.cask.cdap.data2.metadata.lineage.Relation) HashMap(java.util.HashMap) RunRecordMeta(co.cask.cdap.internal.app.store.RunRecordMeta) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet)

Example 15 with Relation

use of co.cask.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.

the class LineageAdmin method doComputeLineage.

private Lineage doComputeLineage(final NamespacedEntityId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) throws NotFoundException {
    LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
    // Convert start time and end time period into scan keys in terms of program start times.
    Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
    if (LOG.isTraceEnabled()) {
        LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
    }
    ScanRangeWithFilter scanRange = getScanRange(runningInRange);
    LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
    Multimap<RelationKey, Relation> relations = HashMultimap.create();
    Set<NamespacedEntityId> visitedDatasets = new HashSet<>();
    Set<NamespacedEntityId> toVisitDatasets = new HashSet<>();
    Set<ProgramId> visitedPrograms = new HashSet<>();
    Set<ProgramId> toVisitPrograms = new HashSet<>();
    toVisitDatasets.add(sourceData);
    for (int i = 0; i < levels; ++i) {
        LOG.trace("Level {}", i);
        toVisitPrograms.clear();
        for (NamespacedEntityId d : toVisitDatasets) {
            if (visitedDatasets.add(d)) {
                LOG.trace("Visiting dataset {}", d);
                // Fetch related programs
                Iterable<Relation> programRelations = getProgramRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
                LOG.trace("Got program relations {}", programRelations);
                for (Relation relation : programRelations) {
                    relations.put(new RelationKey(relation), relation);
                }
                Iterables.addAll(toVisitPrograms, Iterables.transform(programRelations, RELATION_TO_PROGRAM_FUNCTION));
            }
        }
        toVisitDatasets.clear();
        for (ProgramId p : toVisitPrograms) {
            if (visitedPrograms.add(p)) {
                LOG.trace("Visiting program {}", p);
                // Fetch related datasets
                Iterable<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
                LOG.trace("Got data relations {}", datasetRelations);
                for (Relation relation : datasetRelations) {
                    relations.put(new RelationKey(relation), relation);
                }
                Iterables.addAll(toVisitDatasets, Iterables.transform(datasetRelations, RELATION_TO_DATA_FUNCTION));
            }
        }
    }
    if (rollup != null && rollup.contains("workflow")) {
        relations = doComputeRollupLineage(relations);
    }
    Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION).values()));
    LOG.trace("Got lineage {}", lineage);
    return lineage;
}
Also used : Lineage(co.cask.cdap.data2.metadata.lineage.Lineage) ProgramId(co.cask.cdap.proto.id.ProgramId) Relation(co.cask.cdap.data2.metadata.lineage.Relation) NamespacedEntityId(co.cask.cdap.proto.id.NamespacedEntityId) RunId(org.apache.twill.api.RunId) ProgramRunId(co.cask.cdap.proto.id.ProgramRunId) HashSet(java.util.HashSet)

Aggregations

Relation (co.cask.cdap.data2.metadata.lineage.Relation)20 Test (org.junit.Test)15 Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)10 ProgramRunId (co.cask.cdap.proto.id.ProgramRunId)10 Store (co.cask.cdap.app.store.Store)7 LineageStore (co.cask.cdap.data2.metadata.lineage.LineageStore)7 MetadataStore (co.cask.cdap.data2.metadata.store.MetadataStore)7 DatasetId (co.cask.cdap.proto.id.DatasetId)7 ProgramId (co.cask.cdap.proto.id.ProgramId)6 RunId (org.apache.twill.api.RunId)6 CollapsedRelation (co.cask.cdap.data2.metadata.lineage.CollapsedRelation)5 StreamId (co.cask.cdap.proto.id.StreamId)5 MetadataRecord (co.cask.cdap.common.metadata.MetadataRecord)4 NamespaceId (co.cask.cdap.proto.id.NamespaceId)4 HashSet (java.util.HashSet)4 ApplicationId (co.cask.cdap.proto.id.ApplicationId)3 NamespacedEntityId (co.cask.cdap.proto.id.NamespacedEntityId)3 HashMap (java.util.HashMap)3 AllProgramsApp (co.cask.cdap.client.app.AllProgramsApp)2 RunRecordMeta (co.cask.cdap.internal.app.store.RunRecordMeta)2