Use of io.cdap.cdap.metadata.DatasetFieldLineageSummary in project cdap by caskdata.
From the class DataPipelineTest, method testActionFieldLineage.
/**
 * Deploys and runs a pipeline made of a single {@code FieldLineageAction} stage on the given engine, then
 * checks the field-level and dataset-level lineage reported for the two datasets the action declares.
 *
 * <p>The action is configured with five operations: one read of three source fields, three one-to-one
 * transforms, and one write of the three destination fields.
 *
 * @param engine the engine set on the pipeline config; it is also appended to the dataset and application names
 * @throws Exception if deployment, the workflow run, or any lineage lookup fails or times out
 */
private void testActionFieldLineage(Engine engine) throws Exception {
  String readDataset = "ActionReadDataset" + engine;
  String writeDataset = "ActionWriteDataset" + engine;
  List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
  Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");
  List<Operation> operations = new ArrayList<>();
  /*
   *                    |---> srcField1 -> destField1 ---|
   *                    |                                |
   * ActionReadDataset -|---> srcField2 -> destField2 ---|-> ActionWriteDataset
   *                    |                                |
   *                    |---> srcField3 -> destField3 ---|
   */
  operations.add(new ReadOperation("Read", "1st operation", EndPoint.of("default", readDataset), srcFields));
  operations.add(new TransformOperation("Transform1", "2nd operation",
                                        Collections.singletonList(InputField.of("Read", "srcField1")),
                                        "destField1"));
  operations.add(new TransformOperation("Transform2", "3rd operation",
                                        Collections.singletonList(InputField.of("Read", "srcField2")),
                                        "destField2"));
  operations.add(new TransformOperation("Transform3", "4th operation",
                                        Collections.singletonList(InputField.of("Read", "srcField3")),
                                        "destField3"));
  operations.add(new WriteOperation("Write", "5th operation", EndPoint.of("default", writeDataset),
                                    ImmutableList.of(InputField.of("Transform1", "destField1"),
                                                     InputField.of("Transform2", "destField2"),
                                                     InputField.of("Transform3", "destField3"))));

  ETLStage action = new ETLStage("action", FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
  ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action).setEngine(engine).build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
  ApplicationManager appManager = deployApplication(appId, appRequest);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
  EndPoint readEndPoint = EndPoint.of("default", readDataset);
  EndPoint writeEndPoint = EndPoint.of("default", writeDataset);

  // Field lineage may not be queryable the instant the run is reported as completed (the dataset-lineage
  // check further down already polls for that reason), so poll until both sides show up before asserting
  // on the exact content.
  Tasks.waitFor(true, () -> {
    DatasetFieldLineageSummary destLineage = fieldAdmin.getDatasetFieldLineage(
      Constants.FieldLineage.Direction.BOTH, writeEndPoint, 0, System.currentTimeMillis());
    DatasetFieldLineageSummary srcLineage = fieldAdmin.getDatasetFieldLineage(
      Constants.FieldLineage.Direction.BOTH, readEndPoint, 0, System.currentTimeMillis());
    return !destLineage.getIncoming().isEmpty() && !srcLineage.getOutgoing().isEmpty();
  }, 10, TimeUnit.SECONDS);

  // get field lineage for dest dataset
  DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, writeEndPoint, 0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
  Assert.assertEquals(destFields, summary.getFields());
  Assert.assertTrue(summary.getOutgoing().isEmpty());
  Assert.assertEquals(1, summary.getIncoming().size());
  Set<FieldRelation> fieldRelations = ImmutableSet.of(new FieldRelation("srcField1", "destField1"),
                                                      new FieldRelation("srcField2", "destField2"),
                                                      new FieldRelation("srcField3", "destField3"));
  DatasetFieldLineageSummary.FieldLineageRelations expectedRelations =
    new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(readDataset), 3,
                                                         fieldRelations);
  Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());

  // get field lineage for src dataset
  summary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, readEndPoint, 0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
  Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
  Assert.assertTrue(summary.getIncoming().isEmpty());
  Assert.assertEquals(1, summary.getOutgoing().size());
  expectedRelations =
    new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(writeDataset), 3,
                                                         fieldRelations);
  Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());

  LineageAdmin lineageAdmin = getLineageAdmin();
  ProgramId programId = appId.workflow(SmartWorkflow.NAME);
  RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());

  // get dataset lineage for src dataset; poll until both the READ and the WRITE relation are present
  Tasks.waitFor(2, () -> {
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0,
                                                  System.currentTimeMillis(), 1, "workflow");
    return lineage.getRelations().size();
  }, 10, TimeUnit.SECONDS);
  Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0,
                                                System.currentTimeMillis(), 1, "workflow");
  Set<Relation> expectedLineage = ImmutableSet.of(
    new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId, AccessType.READ, runId),
    new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId, AccessType.WRITE, runId));
  Assert.assertEquals(expectedLineage, lineage.getRelations());

  // get dataset lineage for dest dataset, in this test they should be same
  lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset), 0,
                                        System.currentTimeMillis(), 1, "workflow");
  Assert.assertEquals(2, lineage.getRelations().size());
  Assert.assertEquals(expectedLineage, lineage.getRelations());
}
Use of io.cdap.cdap.metadata.DatasetFieldLineageSummary in project cdap by caskdata.
From the class DataStreamsTest, method testLineageWithMacros.
/**
 * Checks dataset-level and field-level lineage for a streaming pipeline whose sink is configured through the
 * {@code ${output}} macro.
 *
 * <p>The same application is run twice through {@code testLineageWithMacro}, first with "lineageOutput1" and
 * then with "lineageOutput2". After each run the lineage of the source dataset, queried over the time window
 * for that run, is expected to reference only the sink passed to that run.
 */
@Test
public void testLineageWithMacros() throws Exception {
  Schema.Field keyField = Schema.Field.of("key", Schema.of(Schema.Type.STRING));
  Schema.Field valueField = Schema.Field.of("value", Schema.of(Schema.Type.STRING));
  Schema schema = Schema.recordOf("test", keyField, valueField);

  StructuredRecord firstRecord =
    StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build();
  StructuredRecord secondRecord =
    StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build();
  List<StructuredRecord> input = ImmutableList.of(firstRecord, secondRecord);

  String sourceDataset = "lineageSource";
  String firstSink = "lineageOutput1";
  String secondSink = "lineageOutput2";

  DataStreamsConfig pipelineConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, sourceDataset)))
    .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
    .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
    .addConnection("source", "identity")
    .addConnection("identity", "sink")
    .setCheckpointDir(checkpointDir)
    .setBatchInterval("1s")
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
  AppRequest<DataStreamsConfig> request = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationManager appManager = deployApplication(appId, request);
  ProgramId sparkProgram = appId.spark(DataStreamsSparkLauncher.NAME);

  // ---- first run: sink resolves to lineageOutput1 ----
  RunId firstRunId = testLineageWithMacro(appManager, new HashSet<>(input), firstSink);

  FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
  LineageAdmin lineageAdmin = getLineageAdmin();
  EndPoint sourceEndPoint = EndPoint.of("default", sourceDataset);
  // every field is passed through unchanged by the identity transform
  Set<FieldRelation> passThroughRelations =
    ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"));

  // poll until both the dataset lineage and the field lineage of the source are available
  Tasks.waitFor(true, () -> {
    Lineage polledLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(sourceDataset), 0,
                                                        System.currentTimeMillis(), 1, "workflow");
    DatasetFieldLineageSummary polledSummary = fieldAdmin.getDatasetFieldLineage(
      Constants.FieldLineage.Direction.BOTH, sourceEndPoint, 0, System.currentTimeMillis());
    boolean hasBothRelations = polledLineage.getRelations().size() == 2;
    return hasBothRelations && !polledSummary.getOutgoing().isEmpty();
  }, 10, TimeUnit.SECONDS);

  Lineage firstLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(sourceDataset), 0,
                                                     System.currentTimeMillis(), 1, "workflow");
  Set<Relation> firstExpectedLineage = ImmutableSet.of(
    new Relation(NamespaceId.DEFAULT.dataset(sourceDataset), sparkProgram, AccessType.READ, firstRunId),
    new Relation(NamespaceId.DEFAULT.dataset(firstSink), sparkProgram, AccessType.WRITE, firstRunId));
  Assert.assertEquals(firstExpectedLineage, firstLineage.getRelations());

  DatasetFieldLineageSummary firstSummary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, sourceEndPoint, 0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(sourceDataset), firstSummary.getDatasetId());
  Assert.assertEquals(ImmutableSet.of("key", "value"), firstSummary.getFields());
  Assert.assertTrue(firstSummary.getIncoming().isEmpty());
  Set<DatasetFieldLineageSummary.FieldLineageRelations> firstOutgoing = firstSummary.getOutgoing();
  Assert.assertEquals(1, firstOutgoing.size());
  Set<DatasetFieldLineageSummary.FieldLineageRelations> firstExpectedOutgoing = Collections.singleton(
    new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(firstSink), 2,
                                                         passThroughRelations));
  Assert.assertEquals(firstExpectedOutgoing, firstOutgoing);

  // ---- second run: sink resolves to lineageOutput2 ----
  // here sleep for 1 seconds to start the second run because the dataset lineage is storing based on unit second
  TimeUnit.SECONDS.sleep(1);
  long startTimeMillis = System.currentTimeMillis();
  RunId secondRunId = testLineageWithMacro(appManager, new HashSet<>(input), secondSink);

  // poll again, this time restricting both queries to the window that starts after the first run
  Tasks.waitFor(true, () -> {
    Lineage polledLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(sourceDataset),
                                                        startTimeMillis, System.currentTimeMillis(), 1,
                                                        "workflow");
    long end = System.currentTimeMillis();
    DatasetFieldLineageSummary polledSummary = fieldAdmin.getDatasetFieldLineage(
      Constants.FieldLineage.Direction.BOTH, sourceEndPoint, startTimeMillis, end);
    boolean hasBothRelations = polledLineage.getRelations().size() == 2;
    return hasBothRelations && !polledSummary.getOutgoing().isEmpty();
  }, 10, TimeUnit.SECONDS);

  Lineage secondLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(sourceDataset),
                                                      startTimeMillis, System.currentTimeMillis(), 1,
                                                      "workflow");
  Set<Relation> secondExpectedLineage = ImmutableSet.of(
    new Relation(NamespaceId.DEFAULT.dataset(sourceDataset), sparkProgram, AccessType.READ, secondRunId),
    new Relation(NamespaceId.DEFAULT.dataset(secondSink), sparkProgram, AccessType.WRITE, secondRunId));
  Assert.assertEquals(secondExpectedLineage, secondLineage.getRelations());

  DatasetFieldLineageSummary secondSummary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, sourceEndPoint, startTimeMillis, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(sourceDataset), secondSummary.getDatasetId());
  Assert.assertEquals(ImmutableSet.of("key", "value"), secondSummary.getFields());
  Assert.assertTrue(secondSummary.getIncoming().isEmpty());
  Set<DatasetFieldLineageSummary.FieldLineageRelations> secondOutgoing = secondSummary.getOutgoing();
  Assert.assertEquals(1, secondOutgoing.size());
  Set<DatasetFieldLineageSummary.FieldLineageRelations> secondExpectedOutgoing = Collections.singleton(
    new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(secondSink), 2,
                                                         passThroughRelations));
  Assert.assertEquals(secondExpectedOutgoing, secondOutgoing);
}
Aggregations