Example use of io.cdap.cdap.data2.metadata.lineage.Relation in the cdapio/cdap project:
class LineageAdminTest, method testDirectCycle.
@Test
public void testDirectCycle() {
  // Lineage for:
  //
  // D1 <-> P1
  //
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader reader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);
  Store runStore = getInjector().getInstance(Store.class);
  LineageAdmin admin = new LineageAdmin(reader, runStore);

  // Register the program runs that the lineage relations reference.
  addRuns(runStore, run1, run2, run3, run4, run5);

  // Using current time is fine here: access timestamps are ignored by the assertions.
  writer.addAccess(run1, dataset1, AccessType.READ);
  writer.addAccess(run1, dataset1, AccessType.WRITE);

  Lineage expected = new Lineage(ImmutableSet.of(
      new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1)),
      new Relation(dataset1, program1, AccessType.READ, twillRunId(run1))));
  Assert.assertEquals(expected, admin.computeLineage(dataset1, 500, 20000, 100));
}
Example use of io.cdap.cdap.data2.metadata.lineage.Relation in the cdapio/cdap project:
class LineageAdminTest, method testBranchLoopLineage.
@Test
public void testBranchLoopLineage() {
  // Lineage for:
  //
  // |-------------------------------------|
  // | |
  // | |
  // | -> D4 -> D5 -> P3 -> D6 -> P5
  // | | | ^
  // V | | |
  // D1 -> P1 -> D2 -> P2 -> D3 ----------->|
  // | | |
  // | | |
  // S1 -->| ---------------> P4 -> D7
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader reader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);
  Store runStore = getInjector().getInstance(Store.class);
  LineageAdmin admin = new LineageAdmin(reader, runStore);

  // Register the program runs that the lineage relations reference.
  addRuns(runStore, run1, run2, run3, run4, run5);

  // Using current time is fine here: access timestamps are ignored by the assertions.
  writer.addAccess(run1, dataset1, AccessType.READ);
  writer.addAccess(run1, dataset2, AccessType.WRITE);
  writer.addAccess(run1, dataset4, AccessType.WRITE);
  writer.addAccess(run2, dataset2, AccessType.READ);
  writer.addAccess(run2, dataset3, AccessType.WRITE);
  writer.addAccess(run2, dataset5, AccessType.WRITE);
  writer.addAccess(run3, dataset5, AccessType.READ, null);
  writer.addAccess(run3, dataset6, AccessType.WRITE, null);
  writer.addAccess(run4, dataset2, AccessType.READ, null);
  writer.addAccess(run4, dataset3, AccessType.READ, null);
  writer.addAccess(run4, dataset7, AccessType.WRITE, null);
  writer.addAccess(run5, dataset3, AccessType.READ, null);
  writer.addAccess(run5, dataset6, AccessType.READ, null);
  writer.addAccess(run5, dataset1, AccessType.WRITE, null);

  Lineage expected = new Lineage(ImmutableSet.of(
      new Relation(dataset1, program1, AccessType.READ, twillRunId(run1)),
      new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1)),
      new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1)),
      new Relation(dataset2, program2, AccessType.READ, twillRunId(run2)),
      new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2)),
      new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2)),
      new Relation(dataset5, program3, AccessType.READ, twillRunId(run3)),
      new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3)),
      new Relation(dataset2, program4, AccessType.READ, twillRunId(run4)),
      new Relation(dataset3, program4, AccessType.READ, twillRunId(run4)),
      new Relation(dataset7, program4, AccessType.WRITE, twillRunId(run4)),
      new Relation(dataset3, program5, AccessType.READ, twillRunId(run5)),
      new Relation(dataset6, program5, AccessType.READ, twillRunId(run5)),
      new Relation(dataset1, program5, AccessType.WRITE, twillRunId(run5))));

  // Because of the loop, every dataset in it sees the full lineage.
  // Lineage for D1
  Assert.assertEquals(expected, admin.computeLineage(dataset1, 500, 20000, 100));
  // Lineage for D5
  Assert.assertEquals(expected, admin.computeLineage(dataset5, 500, 20000, 100));
  // Lineage for D7
  Assert.assertEquals(expected, admin.computeLineage(dataset7, 500, 20000, 100));

  // Lineage for D5 restricted to one level:
  // -> D5 -> P3 -> D6
  // |
  // |
  // D2 -> P2 -> D3
  Lineage oneLevel = admin.computeLineage(dataset5, 500, 20000, 1);
  Assert.assertEquals(
      ImmutableSet.of(
          new Relation(dataset2, program2, AccessType.READ, twillRunId(run2)),
          new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2)),
          new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2)),
          new Relation(dataset5, program3, AccessType.READ, twillRunId(run3)),
          new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3))),
      oneLevel.getRelations());
}
Example use of io.cdap.cdap.data2.metadata.lineage.Relation in the cdapio/cdap project:
class LineageAdmin, method computeWorkflowInnerPrograms.
/**
* Compute the inner programs and program runs based on the program relations and add them to the collections.
*
* @param toVisitPrograms the collection of next to visit programs
* @param programWorkflowMap the program workflow run id map
* @param programRelations the program relations of the dataset
*/
/**
 * Compute the inner programs and program runs based on the program relations and add them
 * to the given collections.
 *
 * @param toVisitPrograms the collection of programs to visit next
 * @param programWorkflowMap map from inner program run id to its workflow run id
 * @param programRelations the program relations of the dataset
 */
private void computeWorkflowInnerPrograms(Set<ProgramId> toVisitPrograms,
                                          Map<ProgramRunId, ProgramRunId> programWorkflowMap,
                                          Set<Relation> programRelations) {
  // Step 1: walk the program relations and pick out the MapReduce and Spark runs, since only
  // those program types can execute inside a workflow. Cache the app spec of each parent
  // application so we can later determine which other programs belong to the same workflow.
  Map<ApplicationId, ApplicationSpecification> appSpecs = new HashMap<>();
  Set<ProgramRunId> possibleInnerPrograms = new HashSet<>();
  for (Relation relation : programRelations) {
    ProgramType type = relation.getProgram().getType();
    if (type == ProgramType.MAPREDUCE || type == ProgramType.SPARK) {
      possibleInnerPrograms.add(relation.getProgramRunId());
      appSpecs.computeIfAbsent(relation.getProgram().getParent(), store::getApplication);
    }
  }

  // Step 2: fetch the run record of every possible inner program. A run record carries the
  // workflow information in its system args; extract the workflow run id and record the mapping.
  Map<ProgramRunId, RunRecordDetail> innerRunRecords = store.getRuns(possibleInnerPrograms);
  Set<ProgramRunId> workflowRunIds = new HashSet<>();
  for (Map.Entry<ProgramRunId, RunRecordDetail> entry : innerRunRecords.entrySet()) {
    ProgramRunId programRunId = entry.getKey();
    RunRecordDetail runRecord = entry.getValue();
    if (runRecord == null) {
      continue;
    }
    if (runRecord.getSystemArgs().containsKey(ProgramOptionConstants.WORKFLOW_RUN_ID)) {
      ProgramRunId workflowRunId = extractWorkflowRunId(programRunId, runRecord);
      programWorkflowMap.put(programRunId, workflowRunId);
      workflowRunIds.add(workflowRunId);
    }
  }

  // Step 3: fetch the run records of the workflows themselves. The properties of a workflow
  // run record list all its inner program run ids; compare them against the cached app specs
  // to resolve each inner program's type.
  for (Map.Entry<ProgramRunId, RunRecordDetail> entry : store.getRuns(workflowRunIds).entrySet()) {
    if (entry.getValue() != null) {
      extractAndAddInnerPrograms(toVisitPrograms, programWorkflowMap, appSpecs,
                                 entry.getKey(), entry.getValue());
    }
  }
}
Example use of io.cdap.cdap.data2.metadata.lineage.Relation in the cdapio/cdap project:
class LineageAdmin, method doComputeLineage.
/**
 * Computes the lineage of a dataset by a breadth-first traversal of the relation graph,
 * alternating between dataset -> program and program -> dataset hops for up to
 * {@code levels} iterations.
 *
 * @param sourceData the dataset to start the traversal from
 * @param startMillis start of the time range, in milliseconds
 * @param endMillis end of the time range, in milliseconds
 * @param levels maximum number of traversal levels away from the source dataset
 * @param rollup if non-null and contains "workflow", inner program runs are collapsed
 *               into their enclosing workflow runs and local-dataset relations are dropped
 * @return the computed lineage
 */
private Lineage doComputeLineage(DatasetId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) {
LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
boolean rollUpWorkflow = rollup != null && rollup.contains("workflow");
// Convert start time and end time period into scan keys in terms of program start times.
Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
ScanRangeWithFilter scanRange = getScanRange(runningInRange);
LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
// Relations collected so far, keyed so that duplicates with unknown access type can be
// collapsed at the end (see COLLAPSE_UNKNOWN_TYPE_FUNCTION below).
Multimap<RelationKey, Relation> relations = HashMultimap.create();
// BFS frontier bookkeeping: "visited" sets prevent revisiting nodes (the graph may contain
// cycles), "toVisit" sets hold the frontier for the next hop.
Set<DatasetId> visitedDatasets = new HashSet<>();
Set<DatasetId> toVisitDatasets = new HashSet<>();
Set<ProgramId> visitedPrograms = new HashSet<>();
Set<ProgramId> toVisitPrograms = new HashSet<>();
// this map is to map the inner program run id to the workflow run id, this is needed to collapse the inner
// program and local datasets
Map<ProgramRunId, ProgramRunId> programWorkflowMap = new HashMap<>();
toVisitDatasets.add(sourceData);
// Each iteration performs one dataset->program hop followed by one program->dataset hop.
for (int i = 0; i < levels; ++i) {
LOG.trace("Level {}", i);
toVisitPrograms.clear();
for (DatasetId d : toVisitDatasets) {
// add() returns false if the dataset was already visited, so each dataset is expanded once.
if (visitedDatasets.add(d)) {
LOG.trace("Visiting dataset {}", d);
// Fetch related programs, the programs will be the inner programs which access the datasets. For example,
// mapreduce or spark program in a workflow
Set<Relation> programRelations = lineageStoreReader.getRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
LOG.trace("Got program relations {}", programRelations);
// determine if a dataset is local dataset. The local dataset always ends with the workflow run id
if (rollUpWorkflow) {
computeWorkflowInnerPrograms(toVisitPrograms, programWorkflowMap, programRelations);
}
// add to the relations, replace the inner program with the workflow using the map, ignore the
// local datasets relations, the local dataset always ends with the run id of the workflow
filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, programRelations);
toVisitPrograms.addAll(programRelations.stream().map(Relation::getProgram).collect(Collectors.toSet()));
}
}
toVisitDatasets.clear();
for (ProgramId p : toVisitPrograms) {
if (visitedPrograms.add(p)) {
LOG.trace("Visiting program {}", p);
// Fetch related datasets
Set<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
LOG.trace("Got data relations {}", datasetRelations);
// Local datasets filtered out by filterAndAddRelations must not be expanded further.
Set<DatasetId> localDatasets = filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, datasetRelations);
toVisitDatasets.addAll(datasetRelations.stream().map(relation -> (DatasetId) relation.getData()).filter(datasetId -> !localDatasets.contains(datasetId)).collect(Collectors.toSet()));
}
}
}
// Collapse relations that differ only in unknown access type before building the result.
Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION::apply).values()));
LOG.trace("Got lineage {}", lineage);
return lineage;
}
Example use of io.cdap.cdap.data2.metadata.lineage.Relation in the cdapio/cdap project:
class LineageAdmin, method filterAndAddRelations.
/**
* Filter the relations based on the rollUp flag, if set to true, the method will replace the inner program with
* the workflow using the map and ignore the local datasets relations. The local dataset always ends with the run
* id of the workflow. The set of filtered local datasets is returned
*/
/**
 * Filter the relations based on the rollUp flag. If set to true, the method replaces each inner
 * program with its enclosing workflow (using the given map) and ignores relations on local
 * datasets. A local dataset's name always ends with the run id of its workflow.
 *
 * @param rollUpWorkflow whether inner programs should be collapsed into their workflows
 * @param relations the multimap that accepted relations are added to
 * @param programWorkflowMap map from inner program run id to its workflow run id
 * @param inputRelations the relations to filter and add
 * @return the set of local dataset ids that were filtered out
 */
private Set<DatasetId> filterAndAddRelations(boolean rollUpWorkflow,
                                             Multimap<RelationKey, Relation> relations,
                                             Map<ProgramRunId, ProgramRunId> programWorkflowMap,
                                             Set<Relation> inputRelations) {
  Set<DatasetId> localDatasets = new HashSet<>();
  for (Relation relation : inputRelations) {
    if (rollUpWorkflow) {
      // Single lookup instead of containsKey + get; null means the run is not an inner program.
      ProgramRunId workflowId = programWorkflowMap.get(relation.getProgramRunId());
      if (workflowId != null) {
        // Skip relations on local datasets; they always end with the workflow run id.
        DatasetId data = (DatasetId) relation.getData();
        if (data.getDataset().endsWith(workflowId.getRun())) {
          localDatasets.add(data);
          continue;
        }
        // Attribute the access to the workflow run instead of the inner program run.
        relation = new Relation(data, workflowId.getParent(), relation.getAccess(),
                                RunIds.fromString(workflowId.getRun()));
      }
    }
    relations.put(new RelationKey(relation), relation);
  }
  return localDatasets;
}
Aggregations