use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageHTTPHandler method datasetFieldLineageSummary.
/**
 * Get the field level lineage for the specified field of a dataset.
 *
 * @param field the field name for which to compute field level lineage
 * @param directionStr the direction in which to compute the field level lineage; can be INCOMING, OUTGOING or BOTH
 * @param startStr the start time string, either a specific timestamp in milliseconds or a relative time
 *                 based on now (for example, now-1h)
 * @param endStr the end time string, either a specific timestamp in milliseconds or a relative time
 *               based on now (for example, now-1h)
*/
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}")
public void datasetFieldLineageSummary(HttpRequest request, HttpResponder responder,
                                       @PathParam("namespace-id") String namespaceId,
                                       @PathParam("dataset-id") String datasetId,
                                       @PathParam("field-name") String field,
                                       @QueryParam("direction") String directionStr,
                                       @QueryParam("start") String startStr,
                                       @QueryParam("end") String endStr) throws Exception {
  accessEnforcer.enforce(new DatasetId(namespaceId, datasetId),
                         authenticationContext.getPrincipal(), StandardPermission.GET);
  TimeRange range = parseRange(startStr, endStr);
  Constants.FieldLineage.Direction direction = parseDirection(directionStr);
  EndPointField endPointField = new EndPointField(EndPoint.of(namespaceId, datasetId), field);
  FieldLineageSummary summary =
      fieldLineageAdmin.getFieldLineage(direction, endPointField, range.getStart(), range.getEnd());
  responder.sendJson(HttpResponseStatus.OK, GSON.toJson(summary));
}
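
Since the snippet only shows the relative route, here is a minimal sketch of calling the endpoint from a client. The v3 API prefix, router port 11015, and the namespace/dataset/field names are assumptions for illustration, not taken from the handler above.

// Minimal sketch: query the field lineage endpoint over HTTP.
// Host, port, v3 prefix, namespace, dataset, and field names are assumptions.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class FieldLineageClient {
  public static void main(String[] args) throws Exception {
    URL url = new URL("http://localhost:11015/v3/namespaces/default/datasets/purchases"
        + "/lineage/fields/price?direction=BOTH&start=now-7d&end=now");
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("GET");
    try (BufferedReader reader =
             new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
      // The handler responds with the FieldLineageSummary serialized as JSON
      reader.lines().forEach(System.out::println);
    } finally {
      conn.disconnect();
    }
  }
}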
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class FieldLineageAdmin method getDatasetFieldLineage.
/**
 * Get the summary for the specified dataset over a given time range, depending on the direction specified.
 * The summary will contain all the field level lineage relations for all the fields in the dataset.
 *
 * @param direction the direction in which the summary needs to be computed
 * @param endPoint the EndPoint which represents the dataset for which field level lineage is computed
 * @param start start time (inclusive) in milliseconds
 * @param end end time (exclusive) in milliseconds
 * @return the summary which contains all the field level lineage information about all the fields in the dataset
 * @throws IOException if it fails to get the schema of the dataset
*/
public DatasetFieldLineageSummary getDatasetFieldLineage(Constants.FieldLineage.Direction direction,
                                                         EndPoint endPoint,
                                                         long start, long end) throws IOException {
  Set<String> lineageFields = fieldLineageReader.getFields(endPoint, start, end);
  Map<DatasetId, Set<FieldRelation>> incomingRelations = new HashMap<>();
  Map<DatasetId, Set<FieldRelation>> outgoingRelations = new HashMap<>();
  Map<DatasetId, Integer> fieldCount = new HashMap<>();
  for (String field : lineageFields) {
    EndPointField endPointField = new EndPointField(endPoint, field);
    // compute the incoming field level lineage
    if (direction == Constants.FieldLineage.Direction.INCOMING
        || direction == Constants.FieldLineage.Direction.BOTH) {
      Map<DatasetId, Set<String>> incomingSummary =
          convertSummaryToDatasetMap(fieldLineageReader.getIncomingSummary(endPointField, start, end));
      // compute the field count for all incoming datasets
      incomingSummary.keySet().forEach(datasetId -> {
        fieldCount.computeIfAbsent(datasetId, missingDataset -> missingDataset == null ? 0 :
            fieldLineageReader.getFields(EndPoint.of(missingDataset.getNamespace(),
                                                     missingDataset.getDataset()), start, end).size());
      });
      // here the field itself will be the destination
      computeAndAddRelations(incomingRelations, field, true, incomingSummary);
    }
    // compute the outgoing field level lineage
    if (direction == Constants.FieldLineage.Direction.OUTGOING
        || direction == Constants.FieldLineage.Direction.BOTH) {
      Map<DatasetId, Set<String>> outgoingSummary =
          convertSummaryToDatasetMap(fieldLineageReader.getOutgoingSummary(endPointField, start, end));
      // compute the field count for all outgoing datasets
      outgoingSummary.keySet().forEach(datasetId -> {
        fieldCount.computeIfAbsent(datasetId, missingDataset -> missingDataset == null ? 0 :
            fieldLineageReader.getFields(EndPoint.of(missingDataset.getNamespace(),
                                                     missingDataset.getDataset()), start, end).size());
      });
      // here the field itself will be the source
      computeAndAddRelations(outgoingRelations, field, false, outgoingSummary);
    }
  }
  Set<String> noLineageFields = getFieldsWithNoFieldLineage(endPoint, lineageFields);
  Set<String> allFields = ImmutableSet.<String>builder().addAll(lineageFields).addAll(noLineageFields).build();
  return new DatasetFieldLineageSummary(direction, start, end,
                                        new DatasetId(endPoint.getNamespace(), endPoint.getName()),
                                        allFields, fieldCount, incomingRelations, outgoingRelations);
}
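
The method delegates to computeAndAddRelations with a flag saying whether the queried field is the destination (incoming lineage) or the source (outgoing lineage). That helper is not shown in this snippet; below is a plausible sketch of what it could do, assuming the FieldRelation(source, destination) constructor seen in the test further down. The actual CDAP implementation may differ.

// Hedged sketch of the computeAndAddRelations helper used above (not shown in the
// snippet). Assumes FieldRelation(source, destination), as used in the tests below.
private static void computeAndAddRelations(Map<DatasetId, Set<FieldRelation>> relations,
                                           String field, boolean fieldIsDestination,
                                           Map<DatasetId, Set<String>> summary) {
  summary.forEach((datasetId, relatedFields) -> {
    Set<FieldRelation> datasetRelations =
        relations.computeIfAbsent(datasetId, id -> new HashSet<>());
    for (String related : relatedFields) {
      // For incoming lineage the queried field is the destination of the relation;
      // for outgoing lineage it is the source.
      datasetRelations.add(fieldIsDestination
                               ? new FieldRelation(related, field)
                               : new FieldRelation(field, related));
    }
  });
}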
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdmin method doComputeLineage.
private Lineage doComputeLineage(DatasetId sourceData, long startMillis, long endMillis,
                                 int levels, @Nullable String rollup) {
  LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}",
            sourceData, startMillis, endMillis, levels);
  boolean rollUpWorkflow = rollup != null && rollup.contains("workflow");
  // Convert the start and end time period into scan keys in terms of program start times.
  Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis),
                                                      TimeUnit.MILLISECONDS.toSeconds(endMillis));
  LOG.trace("Got {} runIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
  ScanRangeWithFilter scanRange = getScanRange(runningInRange);
  LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
  Multimap<RelationKey, Relation> relations = HashMultimap.create();
  Set<DatasetId> visitedDatasets = new HashSet<>();
  Set<DatasetId> toVisitDatasets = new HashSet<>();
  Set<ProgramId> visitedPrograms = new HashSet<>();
  Set<ProgramId> toVisitPrograms = new HashSet<>();
  // this map maps an inner program run id to its workflow run id; it is needed to collapse
  // the inner programs and local datasets into the workflow
  Map<ProgramRunId, ProgramRunId> programWorkflowMap = new HashMap<>();
  toVisitDatasets.add(sourceData);
  for (int i = 0; i < levels; ++i) {
    LOG.trace("Level {}", i);
    toVisitPrograms.clear();
    for (DatasetId d : toVisitDatasets) {
      if (visitedDatasets.add(d)) {
        LOG.trace("Visiting dataset {}", d);
        // Fetch related programs; these are the inner programs which access the dataset,
        // for example a MapReduce or Spark program in a workflow
        Set<Relation> programRelations =
            lineageStoreReader.getRelations(d, scanRange.getStart(), scanRange.getEnd(),
                                            scanRange.getFilter());
        LOG.trace("Got program relations {}", programRelations);
        // determine if a dataset is a local dataset; a local dataset name always ends
        // with the workflow run id
        if (rollUpWorkflow) {
          computeWorkflowInnerPrograms(toVisitPrograms, programWorkflowMap, programRelations);
        }
        // add to the relations, replacing each inner program with its workflow using the map,
        // and ignore relations of local datasets, whose names always end with the workflow run id
        filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, programRelations);
        toVisitPrograms.addAll(programRelations.stream()
                                   .map(Relation::getProgram)
                                   .collect(Collectors.toSet()));
      }
    }
    toVisitDatasets.clear();
    for (ProgramId p : toVisitPrograms) {
      if (visitedPrograms.add(p)) {
        LOG.trace("Visiting program {}", p);
        // Fetch related datasets
        Set<Relation> datasetRelations =
            lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(),
                                            scanRange.getFilter());
        LOG.trace("Got data relations {}", datasetRelations);
        Set<DatasetId> localDatasets =
            filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, datasetRelations);
        toVisitDatasets.addAll(datasetRelations.stream()
                                   .map(relation -> (DatasetId) relation.getData())
                                   .filter(datasetId -> !localDatasets.contains(datasetId))
                                   .collect(Collectors.toSet()));
      }
    }
  }
  Lineage lineage = new Lineage(Iterables.concat(
      Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION::apply).values()));
  LOG.trace("Got lineage {}", lineage);
  return lineage;
}
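
The final step applies COLLAPSE_UNKNOWN_TYPE_FUNCTION to each group of relations that shares a RelationKey. Its definition is not shown here, but the testSimpleLineage case below documents the contract: UNKNOWN access is dropped when a READ or WRITE exists for the same key, and kept when it is the only access type. A minimal sketch under that assumption:

// Hedged sketch of the per-RelationKey collapse step; the actual
// COLLAPSE_UNKNOWN_TYPE_FUNCTION in CDAP may be implemented differently.
private static Collection<Relation> collapseUnknownType(Collection<Relation> relations) {
  boolean hasKnownAccess = relations.stream()
      .anyMatch(relation -> relation.getAccess() != AccessType.UNKNOWN);
  if (!hasKnownAccess) {
    // UNKNOWN is the only access type recorded for this key, so keep it
    return relations;
  }
  // Otherwise drop UNKNOWN relations in favor of the explicit READ/WRITE ones
  return relations.stream()
      .filter(relation -> relation.getAccess() != AccessType.UNKNOWN)
      .collect(Collectors.toSet());
}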
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class DataPipelineTest method testActionFieldLineage.
private void testActionFieldLineage(Engine engine) throws Exception {
  String readDataset = "ActionReadDataset" + engine;
  String writeDataset = "ActionWriteDataset" + engine;
  List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
  Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");
  List<Operation> operations = new ArrayList<>();
  /*
   *                    |---------> srcField1 -> destField1 ---|
   *                    |                                      |
   * ActionReadDataset -|---------> srcField2 -> destField2 ---|-> ActionWriteDataset
   *                    |                                      |
   *                    |---------> srcField3 -> destField3 ---|
   */
  operations.add(new ReadOperation("Read", "1st operation",
                                   EndPoint.of("default", readDataset), srcFields));
  operations.add(new TransformOperation("Transform1", "2nd operation",
                                        Collections.singletonList(InputField.of("Read", "srcField1")),
                                        "destField1"));
  operations.add(new TransformOperation("Transform2", "3rd operation",
                                        Collections.singletonList(InputField.of("Read", "srcField2")),
                                        "destField2"));
  operations.add(new TransformOperation("Transform3", "4th operation",
                                        Collections.singletonList(InputField.of("Read", "srcField3")),
                                        "destField3"));
  operations.add(new WriteOperation("Write", "5th operation",
                                    EndPoint.of("default", writeDataset),
                                    ImmutableList.of(InputField.of("Transform1", "destField1"),
                                                     InputField.of("Transform2", "destField2"),
                                                     InputField.of("Transform3", "destField3"))));
  ETLStage action = new ETLStage("action", FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
  ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action).setEngine(engine).build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
  ApplicationManager appManager = deployApplication(appId, appRequest);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
  // get field lineage for the dest dataset
  DatasetFieldLineageSummary summary =
      fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH,
                                        EndPoint.of("default", writeDataset),
                                        0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
  Assert.assertEquals(destFields, summary.getFields());
  Assert.assertTrue(summary.getOutgoing().isEmpty());
  Assert.assertEquals(1, summary.getIncoming().size());
  Set<FieldRelation> fieldRelations = ImmutableSet.of(new FieldRelation("srcField1", "destField1"),
                                                      new FieldRelation("srcField2", "destField2"),
                                                      new FieldRelation("srcField3", "destField3"));
  DatasetFieldLineageSummary.FieldLineageRelations expectedRelations =
      new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(readDataset),
                                                           3, fieldRelations);
  Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());
  // get field lineage for the src dataset
  summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH,
                                              EndPoint.of("default", readDataset),
                                              0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
  Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
  Assert.assertTrue(summary.getIncoming().isEmpty());
  Assert.assertEquals(1, summary.getOutgoing().size());
  expectedRelations =
      new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(writeDataset),
                                                           3, fieldRelations);
  Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());
  LineageAdmin lineageAdmin = getLineageAdmin();
  ProgramId programId = appId.workflow(SmartWorkflow.NAME);
  RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());
  // get dataset lineage for the src dataset
  Tasks.waitFor(2, () -> {
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset),
                                                  0, System.currentTimeMillis(), 1, "workflow");
    return lineage.getRelations().size();
  }, 10, TimeUnit.SECONDS);
  Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset),
                                                0, System.currentTimeMillis(), 1, "workflow");
  Set<Relation> expectedLineage =
      ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId,
                                   AccessType.READ, runId),
                      new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId,
                                   AccessType.WRITE, runId));
  Assert.assertEquals(expectedLineage, lineage.getRelations());
  // get dataset lineage for the dest dataset; in this test it should be the same
  lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset),
                                        0, System.currentTimeMillis(), 1, "workflow");
  Assert.assertEquals(2, lineage.getRelations().size());
  Assert.assertEquals(expectedLineage, lineage.getRelations());
}
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdminTest method testSimpleLineage.
@Test
public void testSimpleLineage() {
  // Lineage for D3 -> P2 -> D2 -> P1 -> D1
  TransactionRunner transactionRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader lineageReader = new DefaultLineageStoreReader(transactionRunner);
  LineageWriter lineageWriter = new BasicLineageWriter(transactionRunner);
  Store store = getInjector().getInstance(Store.class);
  LineageAdmin lineageAdmin = new LineageAdmin(lineageReader, store);
  // Add accesses for D3 -> P2 -> D2 -> P1 -> D1 <-> P3
  // We need to use the current time here since the metadata store records access time as the current time
  ProgramRunId run1 = program1.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId run2 = program2.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId run3 = program3.run(RunIds.generate(System.currentTimeMillis()).getId());
  addRuns(store, run1, run2, run3);
  // It is okay to use the current time here since access time is ignored during assertions
  lineageWriter.addAccess(run1, dataset1, AccessType.UNKNOWN);
  lineageWriter.addAccess(run1, dataset1, AccessType.WRITE);
  lineageWriter.addAccess(run1, dataset2, AccessType.READ);
  lineageWriter.addAccess(run2, dataset2, AccessType.WRITE);
  lineageWriter.addAccess(run2, dataset3, AccessType.READ);
  lineageWriter.addAccess(run3, dataset1, AccessType.UNKNOWN, null);
  // The UNKNOWN access type gets filtered out if there is a READ/WRITE for the same relation;
  // it is preserved if it is the only access type
  Lineage expectedLineage = new Lineage(ImmutableSet.of(
      new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1)),
      new Relation(dataset2, program1, AccessType.READ, twillRunId(run1)),
      new Relation(dataset2, program2, AccessType.WRITE, twillRunId(run2)),
      new Relation(dataset3, program2, AccessType.READ, twillRunId(run2)),
      new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3))));
  // Lineage for D1
  Assert.assertEquals(expectedLineage,
                      lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100));
  // Lineage for D2
  Assert.assertEquals(expectedLineage,
                      lineageAdmin.computeLineage(dataset2, 500, System.currentTimeMillis() + 10000, 100));
  // Lineage for D1 for one level should be D2 -> P1 -> D1 <-> P3
  Lineage oneLevelLineage = lineageAdmin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 1);
  Assert.assertEquals(ImmutableSet.of(
                          new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run1)),
                          new Relation(dataset2, program1, AccessType.READ, twillRunId(run1)),
                          new Relation(dataset1, program3, AccessType.UNKNOWN, twillRunId(run3))),
                      oneLevelLineage.getRelations());
  // Assert that in a different namespace both lineage and metadata are empty
  NamespaceId customNamespace = new NamespaceId("custom_namespace");
  DatasetId customDataset1 = customNamespace.dataset(dataset1.getEntityName());
  Assert.assertEquals(new Lineage(ImmutableSet.of()),
                      lineageAdmin.computeLineage(customDataset1, 500, System.currentTimeMillis() + 10000, 100));
}
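
The assertions above use a twillRunId helper that is not part of the snippet. A plausible sketch, assuming it simply re-parses the run id string of a ProgramRunId into a Twill RunId (the actual test utility may differ):

// Hypothetical sketch of the twillRunId helper used in the assertions above;
// assumes ProgramRunId.getRun() returns the run id string.
private RunId twillRunId(ProgramRunId programRunId) {
  return RunIds.fromString(programRunId.getRun());
}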