use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.
the class LineageAdmin method doComputeLineage.
private Lineage doComputeLineage(DatasetId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) {
LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
boolean rollUpWorkflow = rollup != null && rollup.contains("workflow");
// Convert start time and end time period into scan keys in terms of program start times.
Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
ScanRangeWithFilter scanRange = getScanRange(runningInRange);
LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
Multimap<RelationKey, Relation> relations = HashMultimap.create();
Set<DatasetId> visitedDatasets = new HashSet<>();
Set<DatasetId> toVisitDatasets = new HashSet<>();
Set<ProgramId> visitedPrograms = new HashSet<>();
Set<ProgramId> toVisitPrograms = new HashSet<>();
// this map is to map the inner program run id to the workflow run id, this is needed to collapse the inner
// program and local datasets
Map<ProgramRunId, ProgramRunId> programWorkflowMap = new HashMap<>();
toVisitDatasets.add(sourceData);
for (int i = 0; i < levels; ++i) {
LOG.trace("Level {}", i);
toVisitPrograms.clear();
for (DatasetId d : toVisitDatasets) {
if (visitedDatasets.add(d)) {
LOG.trace("Visiting dataset {}", d);
// Fetch related programs, the programs will be the inner programs which access the datasets. For example,
// mapreduce or spark program in a workflow
Set<Relation> programRelations = lineageStoreReader.getRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
LOG.trace("Got program relations {}", programRelations);
// determine if a dataset is local dataset. The local dataset always ends with the workflow run id
if (rollUpWorkflow) {
computeWorkflowInnerPrograms(toVisitPrograms, programWorkflowMap, programRelations);
}
// add to the relations, replace the inner program with the workflow using the map, ignore the
// local datasets relations, the local dataset always ends with the run id of the workflow
filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, programRelations);
toVisitPrograms.addAll(programRelations.stream().map(Relation::getProgram).collect(Collectors.toSet()));
}
}
toVisitDatasets.clear();
for (ProgramId p : toVisitPrograms) {
if (visitedPrograms.add(p)) {
LOG.trace("Visiting program {}", p);
// Fetch related datasets
Set<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
LOG.trace("Got data relations {}", datasetRelations);
Set<DatasetId> localDatasets = filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, datasetRelations);
toVisitDatasets.addAll(datasetRelations.stream().map(relation -> (DatasetId) relation.getData()).filter(datasetId -> !localDatasets.contains(datasetId)).collect(Collectors.toSet()));
}
}
}
Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION::apply).values()));
LOG.trace("Got lineage {}", lineage);
return lineage;
}
use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.
the class DataPipelineTest method testActionFieldLineage.
private void testActionFieldLineage(Engine engine) throws Exception {
String readDataset = "ActionReadDataset" + engine;
String writeDataset = "ActionWriteDataset" + engine;
List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");
List<Operation> operations = new ArrayList<>();
/*
* |---------> srcField1 -> destField1----|
* | |
* ActionReadDataset -> srcField2 -> destField2 ---|-> ActionWriteDataset
* | |
* |---------> srcField3 -> destField3 ---|
*/
operations.add(new ReadOperation("Read", "1st operation", EndPoint.of("default", readDataset), srcFields));
operations.add(new TransformOperation("Transform1", "2nd operation", Collections.singletonList(InputField.of("Read", "srcField1")), "destField1"));
operations.add(new TransformOperation("Transform2", "3rd operation", Collections.singletonList(InputField.of("Read", "srcField2")), "destField2"));
operations.add(new TransformOperation("Transform3", "4th operation", Collections.singletonList(InputField.of("Read", "srcField3")), "destField3"));
operations.add(new WriteOperation("Write", "5th operation", EndPoint.of("default", writeDataset), ImmutableList.of(InputField.of("Transform1", "destField1"), InputField.of("Transform2", "destField2"), InputField.of("Transform3", "destField3"))));
ETLStage action = new ETLStage("action", FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action).setEngine(engine).build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
ApplicationManager appManager = deployApplication(appId, appRequest);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
// get field lineage for dest dataset
DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", writeDataset), 0, System.currentTimeMillis());
Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
Assert.assertEquals(destFields, summary.getFields());
Assert.assertTrue(summary.getOutgoing().isEmpty());
Assert.assertEquals(1, summary.getIncoming().size());
Set<FieldRelation> fieldRelations = ImmutableSet.of(new FieldRelation("srcField1", "destField1"), new FieldRelation("srcField2", "destField2"), new FieldRelation("srcField3", "destField3"));
DatasetFieldLineageSummary.FieldLineageRelations expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(readDataset), 3, fieldRelations);
Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());
// get field lineage for src dataset
summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", readDataset), 0, System.currentTimeMillis());
Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
Assert.assertTrue(summary.getIncoming().isEmpty());
Assert.assertEquals(1, summary.getOutgoing().size());
expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(writeDataset), 3, fieldRelations);
Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());
LineageAdmin lineageAdmin = getLineageAdmin();
ProgramId programId = appId.workflow(SmartWorkflow.NAME);
RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());
// get dataset lineage for src dataset
Tasks.waitFor(2, () -> {
Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
return lineage.getRelations().size();
}, 10, TimeUnit.SECONDS);
Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId, AccessType.WRITE, runId));
Assert.assertEquals(expectedLineage, lineage.getRelations());
// get dataset lineage for dest dataset, in this test they should be same
lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset), 0, System.currentTimeMillis(), 1, "workflow");
Assert.assertEquals(2, lineage.getRelations().size());
Assert.assertEquals(expectedLineage, lineage.getRelations());
}
use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.
the class LineageAdminTest method testSimpleLineage.
@Test
public void testSimpleLineage() {
// Lineage for D3 -> P2 -> D2 -> P1 -> D1
TransactionRunner transactionRunner = getInjector().getInstance(TransactionRunner.class);
LineageStoreReader lineageReader = new DefaultLineageStoreReader(transactionRunner);
LineageWriter lineageWriter = new BasicLineageWriter(transactionRunner);
Store store = getInjector().getInstance(Store.class);
LineageAdmin lineageAdmin = new LineageAdmin(lineageReader, store);
// Add accesses for D3 -> P2 -> D2 -> P1 -> D1 <-> P3
// We need to use current time here as metadata store stores access time using current time
ProgramRunId run1 = program1.run(RunIds.generate(System.currentTimeMillis()).getId());
ProgramRunId run2 = program2.run(RunIds.generate(System.currentTimeMillis()).getId());
ProgramRunId run3 = program3.run(RunIds.generate(System.currentTimeMillis()).getId());
addRuns(store, run1, run2, run3);
// It is okay to use current time here since access time is ignore during assertions
lineageWriter.addAccess(run1, dataset1, AccessType.UNKNOWN);
lineageWriter.addAccess(run1, dataset1, AccessType.WRITE);
lineageWriter.addAccess(run1, dataset2, AccessType.READ);
lineageWriter.addAccess(run2, dataset2, AccessType.WRITE);
lineageWriter.addAccess(run2, dataset3, AccessType.READ);
lineageWriter.addAccess(run3, dataset1, AccessType.UNKNOWN, null);
// The UNKNOWN access type will get filtered out if there is READ/WRITE. It will be preserved if it is the
// only access type
Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1)), new Relation(dataset2, program1, AccessType.READ, twillRunId(run1)), new Relation(dataset2, program2, AccessType.WRITE, twillRunId(run2)), new Relation(dataset3, program2, AccessType.READ, twillRunId(run2)), new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3))));
// Lineage for D1
Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100));
// Lineage for D2
Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset2, 500, System.currentTimeMillis() + 10000, 100));
// Lineage for D1 for one level should be D2 -> P1 -> D1 <-> P3
Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 1);
Assert.assertEquals(ImmutableSet.of(new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1)), new Relation(dataset2, program1, AccessType.READ, twillRunId(run1)), new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3))), oneLevelLineage.getRelations());
// Assert that in a different namespace both lineage and metadata should be empty
NamespaceId customNamespace = new NamespaceId("custom_namespace");
DatasetId customDataset1 = customNamespace.dataset(dataset1.getEntityName());
Assert.assertEquals(new Lineage(ImmutableSet.of()), lineageAdmin.computeLineage(customDataset1, 500, System.currentTimeMillis() + 10000, 100));
}
use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.
the class LineageAdminTest method testDirectCycle.
@Test
public void testDirectCycle() {
// Lineage for:
//
// D1 <-> P1
//
TransactionRunner transactionRunner = getInjector().getInstance(TransactionRunner.class);
LineageStoreReader lineageReader = new DefaultLineageStoreReader(transactionRunner);
LineageWriter lineageWriter = new BasicLineageWriter(transactionRunner);
Store store = getInjector().getInstance(Store.class);
LineageAdmin lineageAdmin = new LineageAdmin(lineageReader, store);
// Add accesses
addRuns(store, run1, run2, run3, run4, run5);
// It is okay to use current time here since access time is ignore during assertions
lineageWriter.addAccess(run1, dataset1, AccessType.READ);
lineageWriter.addAccess(run1, dataset1, AccessType.WRITE);
Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1)), new Relation(dataset1, program1, AccessType.READ, twillRunId(run1))));
Assert.assertEquals(expectedLineage, lineageAdmin.computeLineage(dataset1, 500, 20000, 100));
}
use of io.cdap.cdap.data2.metadata.lineage.Relation in project cdap by caskdata.
the class LineageAdminTest method testWorkflowLineage.
@Test
public void testWorkflowLineage() {
TransactionRunner transactionRunner = getInjector().getInstance(TransactionRunner.class);
LineageStoreReader lineageReader = new DefaultLineageStoreReader(transactionRunner);
LineageWriter lineageWriter = new BasicLineageWriter(transactionRunner);
ApplicationId testApp = NamespaceId.DEFAULT.app("testApp");
ProgramId workflowId = testApp.workflow("wf1");
// if the spark and mr job are inner jobs of workflow, they should be in the same app
ProgramId mrId = testApp.mr("mr1");
ProgramId sparkId = testApp.mr("spark1");
ImmutableList<WorkflowNode> nodes = ImmutableList.of(new WorkflowActionNode("mr1", new ScheduleProgramInfo(SchedulableProgramType.MAPREDUCE, "mr1")), new WorkflowActionNode("spark1", new ScheduleProgramInfo(SchedulableProgramType.SPARK, "spark1")));
WorkflowSpecification wfSpec = new WorkflowSpecification("test", "wf1", "", Collections.emptyMap(), nodes, Collections.emptyMap(), Collections.emptyMap());
ApplicationSpecification appSpec = new DefaultApplicationSpecification("testApp", ProjectInfo.getVersion().toString(), "dummy app", null, NamespaceId.DEFAULT.artifact("testArtifact", "1.0").toApiArtifactId(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), ImmutableMap.of(workflowId.getProgram(), wfSpec), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap());
Store store = getInjector().getInstance(Store.class);
store.addApplication(testApp, appSpec);
LineageAdmin lineageAdmin = new LineageAdmin(lineageReader, store);
// Add accesses for D3 -> P2 -> D2 -> P1 -> D1 <-> P3
// |
// |-> P5,
// P1 and P2 are inner programs of the workflow
// We need to use current time here as metadata store stores access time using current time
ProgramRunId run1 = mrId.run(RunIds.generate(System.currentTimeMillis()).getId());
ProgramRunId run2 = sparkId.run(RunIds.generate(System.currentTimeMillis()).getId());
ProgramRunId run3 = program3.run(RunIds.generate(System.currentTimeMillis()).getId());
ProgramRunId workflow = workflowId.run(RunIds.generate(System.currentTimeMillis()).getId());
ProgramRunId run5 = program5.run(RunIds.generate(System.currentTimeMillis()).getId());
addRuns(store, workflow);
// only mr and spark can be inner programs
addWorkflowRuns(store, workflow.getProgram(), workflow.getRun(), run1, run2);
addRuns(store, run3);
addRuns(store, run5);
// It is okay to use current time here since access time is ignore during assertions
lineageWriter.addAccess(run1, dataset1, AccessType.WRITE);
lineageWriter.addAccess(run1, dataset2, AccessType.READ);
lineageWriter.addAccess(run2, dataset2, AccessType.WRITE);
lineageWriter.addAccess(run2, dataset3, AccessType.READ);
lineageWriter.addAccess(run3, dataset1, AccessType.UNKNOWN, null);
lineageWriter.addAccess(run5, dataset1, AccessType.READ, null);
// The UNKNOWN access type will get filtered out if there is READ/WRITE. It will be preserved if it is the
// only access type
Lineage expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset1, workflowId, AccessType.WRITE, twillRunId(workflow)), new Relation(dataset2, workflowId, AccessType.READ, twillRunId(workflow)), new Relation(dataset2, workflowId, AccessType.WRITE, twillRunId(workflow)), new Relation(dataset3, workflowId, AccessType.READ, twillRunId(workflow)), new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3)), new Relation(dataset1, program5, AccessType.READ, twillRunId(run5))));
Lineage resultLineage = lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100, "workflow");
// Lineage for D1
Assert.assertEquals(expectedLineage, resultLineage);
resultLineage = lineageAdmin.computeLineage(dataset2, 500, System.currentTimeMillis() + 10000, 100, "workflow");
// Lineage for D2
Assert.assertEquals(expectedLineage, resultLineage);
// Lineage for D1 for one level should be D2 -> P1 -> D1 <-> P3
Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 1, "workflow");
Assert.assertEquals(ImmutableSet.of(new Relation(dataset1, workflowId, AccessType.WRITE, twillRunId(workflow)), new Relation(dataset2, workflowId, AccessType.READ, twillRunId(workflow)), new Relation(dataset1, program5, AccessType.READ, twillRunId(run5)), new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3))), oneLevelLineage.getRelations());
// Run tests without workflow parameter
expectedLineage = new Lineage(ImmutableSet.of(new Relation(dataset1, mrId, AccessType.WRITE, twillRunId(run1)), new Relation(dataset2, mrId, AccessType.READ, twillRunId(run1)), new Relation(dataset2, sparkId, AccessType.WRITE, twillRunId(run2)), new Relation(dataset3, sparkId, AccessType.READ, twillRunId(run2)), new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3)), new Relation(dataset1, program5, AccessType.READ, twillRunId(run5))));
resultLineage = lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100, null);
// Lineage for D1
Assert.assertEquals(expectedLineage, resultLineage);
resultLineage = lineageAdmin.computeLineage(dataset2, 500, System.currentTimeMillis() + 10000, 100, null);
// Lineage for D2
Assert.assertEquals(expectedLineage, resultLineage);
// Lineage for D1 for one level should be D2 -> P1 -> D1 <-> P3
oneLevelLineage = lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 1, null);
Assert.assertEquals(ImmutableSet.of(new Relation(dataset1, mrId, AccessType.WRITE, twillRunId(run1)), new Relation(dataset2, mrId, AccessType.READ, twillRunId(run1)), new Relation(dataset1, program5, AccessType.READ, twillRunId(run5)), new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3))), oneLevelLineage.getRelations());
// Assert that in a different namespace both lineage and metadata should be empty
NamespaceId customNamespace = new NamespaceId("custom_namespace");
DatasetId customDataset1 = customNamespace.dataset(dataset1.getEntityName());
Assert.assertEquals(new Lineage(ImmutableSet.of()), lineageAdmin.computeLineage(customDataset1, 500, System.currentTimeMillis() + 10000, 100));
}
Aggregations