use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdmin method doComputeLineage.
/**
 * Computes the lineage reachable from {@code sourceData} by alternately expanding datasets into the
 * programs related to them and programs into the data related to them.
 *
 * <p>The traversal performs at most {@code levels} rounds and stops early as soon as a round
 * discovers no further data to visit, since every later round would be a no-op.
 *
 * @param sourceData the entity at which the traversal starts
 * @param startMillis start of the time range in milliseconds; converted to seconds to look up runs
 * @param endMillis end of the time range in milliseconds; converted to seconds to look up runs
 * @param levels maximum number of dataset-to-program-to-dataset expansion rounds
 * @param rollup when non-null and containing {@code "workflow"}, the collected relations are passed
 *               through {@code doComputeRollupLineage} before the lineage is built
 * @return the lineage built from every relation collected during the traversal
 * @throws NotFoundException declared by the signature; NOTE(review): presumably propagated from one
 *                           of the store/reader calls - confirm against the callees
 */
private Lineage doComputeLineage(final NamespacedEntityId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) throws NotFoundException {
  LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
  // Convert start time and end time period into scan keys in terms of program start times.
  Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
  if (LOG.isTraceEnabled()) {
    LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
  }
  ScanRangeWithFilter scanRange = getScanRange(runningInRange);
  LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());

  Multimap<RelationKey, Relation> relations = HashMultimap.create();
  Set<NamespacedEntityId> visitedDatasets = new HashSet<>();
  Set<NamespacedEntityId> toVisitDatasets = new HashSet<>();
  Set<ProgramId> visitedPrograms = new HashSet<>();
  Set<ProgramId> toVisitPrograms = new HashSet<>();
  toVisitDatasets.add(sourceData);

  // Stop as soon as the frontier is empty: with nothing left to visit, every remaining level
  // would only clear already-empty sets.
  for (int i = 0; i < levels && !toVisitDatasets.isEmpty(); ++i) {
    LOG.trace("Level {}", i);
    toVisitPrograms.clear();
    for (NamespacedEntityId d : toVisitDatasets) {
      // Set.add returns false for entities expanded in an earlier round, which also breaks cycles.
      if (visitedDatasets.add(d)) {
        LOG.trace("Visiting dataset {}", d);
        // Fetch related programs
        Iterable<Relation> programRelations = getProgramRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
        LOG.trace("Got program relations {}", programRelations);
        // Record the relation and queue its program in a single pass over the result.
        for (Relation relation : programRelations) {
          relations.put(new RelationKey(relation), relation);
          toVisitPrograms.add(RELATION_TO_PROGRAM_FUNCTION.apply(relation));
        }
      }
    }
    toVisitDatasets.clear();
    for (ProgramId p : toVisitPrograms) {
      if (visitedPrograms.add(p)) {
        LOG.trace("Visiting program {}", p);
        // Fetch related datasets
        Iterable<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
        LOG.trace("Got data relations {}", datasetRelations);
        // Record the relation and queue its data entity for the next level in a single pass.
        for (Relation relation : datasetRelations) {
          relations.put(new RelationKey(relation), relation);
          toVisitDatasets.add(RELATION_TO_DATA_FUNCTION.apply(relation));
        }
      }
    }
  }

  if (rollup != null && rollup.contains("workflow")) {
    relations = doComputeRollupLineage(relations);
  }
  // Each group of relations sharing a RelationKey is passed through COLLAPSE_UNKNOWN_TYPE_FUNCTION
  // before the groups are flattened into the final lineage.
  Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION).values()));
  LOG.trace("Got lineage {}", lineage);
  return lineage;
}
use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageHandler method datasetLineage.
/**
 * Handles GET requests for the lineage of a dataset and responds with the lineage record as JSON.
 *
 * @param request the incoming HTTP request
 * @param responder used to send the JSON response with status OK
 * @param namespaceId namespace of the dataset, taken from the path
 * @param datasetId name of the dataset, taken from the path
 * @param startStr start of the time range as given in the {@code start} query parameter
 * @param endStr end of the time range as given in the {@code end} query parameter
 * @param levels number of lineage levels to compute; defaults to 10 and is checked by {@code checkLevels}
 * @param collapse values of the {@code collapse} query parameter, translated by {@code getCollapseTypes}
 * @param rollup value of the {@code rollup} query parameter, forwarded to the lineage computation
 * @throws Exception if validation, lineage computation or serialization fails
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage")
public void datasetLineage(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @QueryParam("start") String startStr, @QueryParam("end") String endStr, @QueryParam("levels") @DefaultValue("10") int levels, @QueryParam("collapse") List<String> collapse, @QueryParam("rollup") String rollup) throws Exception {
  // Validate the inputs before doing any work.
  checkLevels(levels);
  TimeRange timeRange = parseRange(startStr, endStr);

  DatasetId target = new DatasetId(namespaceId, datasetId);
  Lineage computed = lineageAdmin.computeLineage(target, timeRange.getStart(), timeRange.getEnd(), levels, rollup);

  // The computation takes the range in milliseconds; the serialized record carries it in seconds.
  long startSecs = TimeUnit.MILLISECONDS.toSeconds(timeRange.getStart());
  long endSecs = TimeUnit.MILLISECONDS.toSeconds(timeRange.getEnd());
  String json = GSON.toJson(LineageSerializer.toLineageRecord(startSecs, endSecs, computed, getCollapseTypes(collapse)), LineageRecord.class);
  responder.sendJson(HttpResponseStatus.OK, json);
}
use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdminTest method testBranchLineage.
/**
 * Verifies that a branching access graph yields the same complete lineage no matter which of the
 * leaf datasets (D7, D6, D3) the computation starts from.
 */
@Test
public void testBranchLineage() throws Exception {
  // Lineage for:
  //
  // ->D4 -> D5 -> P3 -> D6
  // | |
  // | |
  // D1 -> P1 -> D2 -> P2 -> D3
  // | | |
  // | | |
  // S1 -->| ---------------> P4 -> D7
  LineageStore branchStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testBranchLineage"));
  Store runStore = getInjector().getInstance(Store.class);
  MetadataStore metaStore = getInjector().getInstance(MetadataStore.class);
  LineageAdmin admin = new LineageAdmin(branchStore, runStore, metaStore, new NoOpEntityExistenceVerifier());

  // Register the runs, then record every access edge of the graph above.
  addRuns(runStore, run1, run2, run3, run4, run5);
  // Using the current time is fine here since access time is ignored during assertions
  // P1 reads S1 and D1, writes D2 and D4
  branchStore.addAccess(run1, stream1, AccessType.READ, System.currentTimeMillis(), flowlet1);
  branchStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
  branchStore.addAccess(run1, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
  branchStore.addAccess(run1, dataset4, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
  // P2 reads D2, writes D3 and D5
  branchStore.addAccess(run2, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet2);
  branchStore.addAccess(run2, dataset3, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
  branchStore.addAccess(run2, dataset5, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
  // P3 reads D5, writes D6
  branchStore.addAccess(run3, dataset5, AccessType.READ, System.currentTimeMillis());
  branchStore.addAccess(run3, dataset6, AccessType.WRITE, System.currentTimeMillis());
  // P4 reads D2 and D3, writes D7
  branchStore.addAccess(run4, dataset2, AccessType.READ, System.currentTimeMillis());
  branchStore.addAccess(run4, dataset3, AccessType.READ, System.currentTimeMillis());
  branchStore.addAccess(run4, dataset7, AccessType.WRITE, System.currentTimeMillis());

  // One expected relation per access recorded above, in the same order.
  ImmutableSet.Builder<Relation> expectedRelations = ImmutableSet.builder();
  expectedRelations.add(new Relation(stream1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)));
  expectedRelations.add(new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)));
  expectedRelations.add(new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)));
  expectedRelations.add(new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)));
  expectedRelations.add(new Relation(dataset2, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)));
  expectedRelations.add(new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)));
  expectedRelations.add(new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)));
  expectedRelations.add(new Relation(dataset5, program3, AccessType.READ, twillRunId(run3), emptySet()));
  expectedRelations.add(new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3), emptySet()));
  expectedRelations.add(new Relation(dataset2, program4, AccessType.READ, twillRunId(run4), emptySet()));
  expectedRelations.add(new Relation(dataset3, program4, AccessType.READ, twillRunId(run4), emptySet()));
  expectedRelations.add(new Relation(dataset7, program4, AccessType.WRITE, twillRunId(run4), emptySet()));
  Lineage expected = new Lineage(expectedRelations.build());

  // Lineage for D7
  Assert.assertEquals(expected, admin.computeLineage(dataset7, 500, 20000, 100));
  // Lineage for D6
  Assert.assertEquals(expected, admin.computeLineage(dataset6, 500, 20000, 100));
  // Lineage for D3
  Assert.assertEquals(expected, admin.computeLineage(dataset3, 500, 20000, 100));
}
use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdminTest method testDirectCycle.
/**
 * Verifies lineage computation for a program that both reads and writes the same dataset: both
 * relations must be reported.
 */
@Test
public void testDirectCycle() throws Exception {
  // Lineage for:
  //
  // D1 <-> P1
  //
  LineageStore cycleStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testDirectCycle"));
  Store runStore = getInjector().getInstance(Store.class);
  MetadataStore metaStore = getInjector().getInstance(MetadataStore.class);
  LineageAdmin admin = new LineageAdmin(cycleStore, runStore, metaStore, new NoOpEntityExistenceVerifier());

  // Register the runs, then record a read and a write of D1 by the same run.
  addRuns(runStore, run1, run2, run3, run4, run5);
  // Using the current time is fine here since access time is ignored during assertions
  cycleStore.addAccess(run1, dataset1, AccessType.READ, System.currentTimeMillis(), flowlet1);
  cycleStore.addAccess(run1, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);

  Relation writeRelation = new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1));
  Relation readRelation = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1));
  Lineage expected = new Lineage(ImmutableSet.of(writeRelation, readRelation));

  Lineage actual = admin.computeLineage(dataset1, 500, 20000, 100);
  Assert.assertEquals(expected, actual);
}
use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdminTest method testWorkflowLineage.
/**
 * Verifies lineage computation both with and without the "workflow" rollup, at full depth and at a
 * single level, and checks the metadata returned for a run as well as the empty results expected
 * in an unrelated namespace.
 */
@Test
public void testWorkflowLineage() throws Exception {
  // Lineage for D3 -> P2 -> D2 -> P1 -> D1
  LineageStore workflowLineageStore = new LineageStore(getTxExecFactory(), getDatasetFramework(), NamespaceId.DEFAULT.dataset("testWorkflowLineage"));
  Store runStore = getInjector().getInstance(Store.class);
  MetadataStore metaStore = getInjector().getInstance(MetadataStore.class);
  LineageAdmin admin = new LineageAdmin(workflowLineageStore, runStore, metaStore, new NoOpEntityExistenceVerifier());

  // Define metadata
  MetadataRecord appMeta = new MetadataRecord(program1.getParent(), MetadataScope.USER, toMap("pk1", "pk1"), toSet("pt1"));
  MetadataRecord programMeta = new MetadataRecord(program1, MetadataScope.USER, toMap("pk1", "pk1"), toSet("pt1"));
  MetadataRecord data1Meta = new MetadataRecord(dataset1, MetadataScope.USER, toMap("dk1", "dk1"), toSet("dt1"));
  MetadataRecord data2Meta = new MetadataRecord(dataset2, MetadataScope.USER, toMap("dk2", "dk2"), toSet("dt2"));

  // Add metadata: properties first, then tags, for the application, the program and both datasets
  metaStore.setProperties(MetadataScope.USER, program1.getParent(), appMeta.getProperties());
  // noinspection ToArrayCallWithZeroLengthArrayArgument
  metaStore.addTags(MetadataScope.USER, program1.getParent(), appMeta.getTags().toArray(new String[0]));
  metaStore.setProperties(MetadataScope.USER, program1, programMeta.getProperties());
  // noinspection ToArrayCallWithZeroLengthArrayArgument
  metaStore.addTags(MetadataScope.USER, program1, programMeta.getTags().toArray(new String[0]));
  metaStore.setProperties(MetadataScope.USER, dataset1, data1Meta.getProperties());
  // noinspection ToArrayCallWithZeroLengthArrayArgument
  metaStore.addTags(MetadataScope.USER, dataset1, data1Meta.getTags().toArray(new String[0]));
  metaStore.setProperties(MetadataScope.USER, dataset2, data2Meta.getProperties());
  // noinspection ToArrayCallWithZeroLengthArrayArgument
  metaStore.addTags(MetadataScope.USER, dataset2, data2Meta.getTags().toArray(new String[0]));

  // Add accesses for D3 -> P2 -> D2 -> P1 -> D1 <-> P3
  // We need to use current time here as metadata store stores access time using current time
  ProgramRunId run1 = program1.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId run2 = program2.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId run3 = program3.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId workflow = program6.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId run5 = program5.run(RunIds.generate(System.currentTimeMillis()).getId());
  // run1..run3 are registered as runs of the workflow; run5 is registered on its own
  addWorkflowRuns(runStore, workflow.getProgram(), workflow.getRun(), run1, run2, run3);
  addRuns(runStore, workflow);
  addRuns(runStore, run5);

  // Using the current time is fine here since access time is ignored during assertions
  workflowLineageStore.addAccess(run1, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
  workflowLineageStore.addAccess(run1, dataset1, AccessType.WRITE, System.currentTimeMillis(), flowlet1);
  workflowLineageStore.addAccess(run1, dataset2, AccessType.READ, System.currentTimeMillis(), flowlet1);
  workflowLineageStore.addAccess(run2, dataset2, AccessType.WRITE, System.currentTimeMillis(), flowlet2);
  workflowLineageStore.addAccess(run2, dataset3, AccessType.READ, System.currentTimeMillis(), flowlet2);
  workflowLineageStore.addAccess(run3, dataset1, AccessType.UNKNOWN, System.currentTimeMillis());
  workflowLineageStore.addAccess(run5, dataset1, AccessType.READ, System.currentTimeMillis());

  // ---- With the "workflow" rollup ----
  // The UNKNOWN access type will get filtered out if there is READ/WRITE. It will be preserved if it is the
  // only access type
  ImmutableSet.Builder<Relation> rolledUp = ImmutableSet.builder();
  rolledUp.add(new Relation(dataset1, program6, AccessType.WRITE, twillRunId(workflow)));
  rolledUp.add(new Relation(dataset2, program6, AccessType.READ, twillRunId(workflow)));
  rolledUp.add(new Relation(dataset2, program6, AccessType.WRITE, twillRunId(workflow)));
  rolledUp.add(new Relation(dataset3, program6, AccessType.READ, twillRunId(workflow)));
  rolledUp.add(new Relation(dataset1, program6, AccessType.UNKNOWN, twillRunId(workflow)));
  rolledUp.add(new Relation(dataset1, program5, AccessType.READ, twillRunId(run5)));
  Lineage expected = new Lineage(rolledUp.build());

  // Lineage for D1
  Lineage actual = admin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100, "workflow");
  Assert.assertEquals(expected, actual);
  // Lineage for D2
  actual = admin.computeLineage(dataset2, 500, System.currentTimeMillis() + 10000, 100, "workflow");
  Assert.assertEquals(expected, actual);

  // Lineage for D1 for one level should be D2 -> P1 -> D1 <-> P3
  Lineage singleLevel = admin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 1, "workflow");
  ImmutableSet<Relation> expectedSingleLevel = ImmutableSet.of(
    new Relation(dataset1, program6, AccessType.WRITE, twillRunId(workflow)),
    new Relation(dataset2, program6, AccessType.READ, twillRunId(workflow)),
    new Relation(dataset1, program5, AccessType.READ, twillRunId(run5)),
    new Relation(dataset1, program6, AccessType.UNKNOWN, twillRunId(workflow)));
  Assert.assertEquals(expectedSingleLevel, singleLevel.getRelations());

  // ---- Run tests without workflow parameter ----
  ImmutableSet.Builder<Relation> perProgram = ImmutableSet.builder();
  perProgram.add(new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)));
  perProgram.add(new Relation(dataset2, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)));
  perProgram.add(new Relation(dataset2, program2, AccessType.WRITE, twillRunId(run2), toSet(flowlet2)));
  perProgram.add(new Relation(dataset3, program2, AccessType.READ, twillRunId(run2), toSet(flowlet2)));
  perProgram.add(new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3)));
  perProgram.add(new Relation(dataset1, program5, AccessType.READ, twillRunId(run5)));
  expected = new Lineage(perProgram.build());

  // Lineage for D1
  actual = admin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100, null);
  Assert.assertEquals(expected, actual);
  // Lineage for D2
  actual = admin.computeLineage(dataset2, 500, System.currentTimeMillis() + 10000, 100, null);
  Assert.assertEquals(expected, actual);

  // Lineage for D1 for one level should be D2 -> P1 -> D1 <-> P3
  singleLevel = admin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 1, null);
  expectedSingleLevel = ImmutableSet.of(
    new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1), toSet(flowlet1)),
    new Relation(dataset2, program1, AccessType.READ, twillRunId(run1), toSet(flowlet1)),
    new Relation(dataset1, program5, AccessType.READ, twillRunId(run5)),
    new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3)));
  Assert.assertEquals(expectedSingleLevel, singleLevel.getRelations());

  // Assert metadata
  Assert.assertEquals(toSet(appMeta, programMeta, data1Meta, data2Meta), admin.getMetadataForRun(run1));

  // Assert that in a different namespace both lineage and metadata should be empty
  NamespaceId otherNamespace = new NamespaceId("custom_namespace");
  DatasetId otherDataset1 = otherNamespace.dataset(dataset1.getEntityName());
  ProgramRunId otherRun1 = otherNamespace.app(program1.getApplication()).program(program1.getType(), program1.getEntityName()).run(run1.getEntityName());
  Assert.assertEquals(new Lineage(ImmutableSet.<Relation>of()), admin.computeLineage(otherDataset1, 500, System.currentTimeMillis() + 10000, 100));
  Assert.assertEquals(ImmutableSet.<MetadataRecord>of(), admin.getMetadataForRun(otherRun1));
}
Aggregations