use of io.cdap.cdap.data2.metadata.writer.LineageWriter in project cdap by caskdata.
the class LineageAdminTest method testBranchLineage.
@Test
public void testBranchLineage() {
  // Branching lineage, as recorded by the accesses below:
  //
  //   P1 (run1): reads D1         -> writes D2 and D4
  //   P2 (run2): reads D2         -> writes D3 and D5
  //   P3 (run3): reads D5         -> writes D6
  //   P4 (run4): reads D2 and D3  -> writes D7
  //
  // Each query at the end is expected to return the complete set of relations.
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader storeReader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);
  Store store = getInjector().getInstance(Store.class);
  LineageAdmin admin = new LineageAdmin(storeReader, store);

  // Register the program runs before recording their dataset accesses
  addRuns(store, run1, run2, run3, run4, run5);

  // Access time is ignored during assertions, so recording the accesses "now" is fine.
  // P1
  writer.addAccess(run1, dataset1, AccessType.READ);
  writer.addAccess(run1, dataset2, AccessType.WRITE);
  writer.addAccess(run1, dataset4, AccessType.WRITE);
  // P2
  writer.addAccess(run2, dataset2, AccessType.READ);
  writer.addAccess(run2, dataset3, AccessType.WRITE);
  writer.addAccess(run2, dataset5, AccessType.WRITE);
  // P3 (recorded with an explicit null component)
  writer.addAccess(run3, dataset5, AccessType.READ, null);
  writer.addAccess(run3, dataset6, AccessType.WRITE, null);
  // P4 (recorded with an explicit null component)
  writer.addAccess(run4, dataset2, AccessType.READ, null);
  writer.addAccess(run4, dataset3, AccessType.READ, null);
  writer.addAccess(run4, dataset7, AccessType.WRITE, null);

  // One relation per recorded access
  ImmutableSet<Relation> allRelations = ImmutableSet.of(
    new Relation(dataset1, program1, AccessType.READ, twillRunId(run1)),
    new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1)),
    new Relation(dataset4, program1, AccessType.WRITE, twillRunId(run1)),
    new Relation(dataset2, program2, AccessType.READ, twillRunId(run2)),
    new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2)),
    new Relation(dataset5, program2, AccessType.WRITE, twillRunId(run2)),
    new Relation(dataset5, program3, AccessType.READ, twillRunId(run3)),
    new Relation(dataset6, program3, AccessType.WRITE, twillRunId(run3)),
    new Relation(dataset2, program4, AccessType.READ, twillRunId(run4)),
    new Relation(dataset3, program4, AccessType.READ, twillRunId(run4)),
    new Relation(dataset7, program4, AccessType.WRITE, twillRunId(run4)));
  Lineage fullLineage = new Lineage(allRelations);

  // Starting from D7
  Lineage fromD7 = admin.computeLineage(dataset7, 500, 20000, 100);
  Assert.assertEquals(fullLineage, fromD7);

  // Starting from D6
  Lineage fromD6 = admin.computeLineage(dataset6, 500, 20000, 100);
  Assert.assertEquals(fullLineage, fromD6);

  // Starting from D3
  Lineage fromD3 = admin.computeLineage(dataset3, 500, 20000, 100);
  Assert.assertEquals(fullLineage, fromD3);
}
use of io.cdap.cdap.data2.metadata.writer.LineageWriter in project cdap by caskdata.
the class LineageAdminTest method testLocalDatasetsInWorkflow.
@Test
public void testLocalDatasetsInWorkflow() throws Exception {
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader storeReader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);

  ApplicationId testApp = NamespaceId.DEFAULT.app("testLocalDatasets");
  ProgramId workflowId = testApp.workflow("wf1");
  // Inner MapReduce/Spark jobs of a workflow must live in the same application as the workflow
  ProgramId mrId1 = testApp.mr("mr1");
  ProgramId mrId2 = testApp.mr("mr2");
  ProgramId sparkId = testApp.spark("spark1");

  // Workflow "wf1" runs mr1, then mr2, then spark1
  WorkflowNode mr1Node =
    new WorkflowActionNode("mr1", new ScheduleProgramInfo(SchedulableProgramType.MAPREDUCE, "mr1"));
  WorkflowNode mr2Node =
    new WorkflowActionNode("mr2", new ScheduleProgramInfo(SchedulableProgramType.MAPREDUCE, "mr2"));
  WorkflowNode sparkNode =
    new WorkflowActionNode("spark1", new ScheduleProgramInfo(SchedulableProgramType.SPARK, "spark1"));
  ImmutableList<WorkflowNode> nodes = ImmutableList.of(mr1Node, mr2Node, sparkNode);

  WorkflowSpecification wfSpec = new WorkflowSpecification(
    "test", "wf1", "", Collections.emptyMap(), nodes, Collections.emptyMap(), Collections.emptyMap());

  // Application spec whose only populated program map is the workflow map
  ApplicationSpecification appSpec = new DefaultApplicationSpecification(
    "testLocalDatasets",
    ProjectInfo.getVersion().toString(),
    "dummy app",
    null,
    NamespaceId.DEFAULT.artifact("testArtifact", "1.0").toApiArtifactId(),
    Collections.emptyMap(),
    Collections.emptyMap(),
    Collections.emptyMap(),
    Collections.emptyMap(),
    ImmutableMap.of(workflowId.getProgram(), wfSpec),
    Collections.emptyMap(),
    Collections.emptyMap(),
    Collections.emptyMap(),
    Collections.emptyMap());

  Store store = getInjector().getInstance(Store.class);
  store.addApplication(testApp, appSpec);
  LineageAdmin admin = new LineageAdmin(storeReader, store);

  // Accesses recorded below:
  //
  //   D1 --|
  //        |--> MR1 -> LOCAL1 -> MR2 -> LOCAL2 -> SPARK -> D3
  //   D2 --|
  //
  // MR1, MR2 and SPARK are inner programs of the workflow.
  // Run ids are generated from the current time because the metadata store records access time
  // using the current time.
  ProgramRunId mr1Run = mrId1.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId mr2Run = mrId2.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId sparkRun = sparkId.run(RunIds.generate(System.currentTimeMillis()).getId());
  ProgramRunId workflowRun = workflowId.run(RunIds.generate(System.currentTimeMillis()).getId());
  String workflowRunId = workflowRun.getRun();

  // Local dataset names always end with the workflow run id
  DatasetId localDataset1 = NamespaceId.DEFAULT.dataset("localDataset1" + workflowRunId);
  DatasetId localDataset2 = NamespaceId.DEFAULT.dataset("localDataset2" + workflowRunId);

  addRuns(store, workflowRun);
  // Only MapReduce and Spark programs can be inner programs of a workflow
  addWorkflowRuns(store, workflowRun.getProgram(), workflowRunId, mr1Run, mr2Run, sparkRun);

  // MR1
  writer.addAccess(mr1Run, dataset1, AccessType.READ);
  writer.addAccess(mr1Run, dataset2, AccessType.READ);
  writer.addAccess(mr1Run, localDataset1, AccessType.WRITE);
  // MR2
  writer.addAccess(mr2Run, localDataset1, AccessType.READ);
  writer.addAccess(mr2Run, localDataset2, AccessType.WRITE);
  // SPARK
  writer.addAccess(sparkRun, localDataset2, AccessType.READ);
  writer.addAccess(sparkRun, dataset3, AccessType.WRITE);

  // Stage 1: no roll up. Inner programs and local datasets must appear as-is.
  Lineage detailedLineage = new Lineage(ImmutableSet.of(
    new Relation(dataset1, mrId1, AccessType.READ, twillRunId(mr1Run)),
    new Relation(dataset2, mrId1, AccessType.READ, twillRunId(mr1Run)),
    new Relation(localDataset1, mrId1, AccessType.WRITE, twillRunId(mr1Run)),
    new Relation(localDataset1, mrId2, AccessType.READ, twillRunId(mr2Run)),
    new Relation(localDataset2, mrId2, AccessType.WRITE, twillRunId(mr2Run)),
    new Relation(localDataset2, sparkId, AccessType.READ, twillRunId(sparkRun)),
    new Relation(dataset3, sparkId, AccessType.WRITE, twillRunId(sparkRun))));

  // Lineage for D1
  Lineage fromD1 = admin.computeLineage(dataset1, 500, System.currentTimeMillis() + 10000, 100, null);
  Assert.assertEquals(detailedLineage, fromD1);

  // D3 should yield the same lineage when all levels are traversed
  Lineage fromD3 = admin.computeLineage(dataset3, 500, System.currentTimeMillis() + 10000, 100, null);
  Assert.assertEquals(detailedLineage, fromD3);

  // Stage 2: a single level and still no roll up. Only the inner program touching D3 and the
  // local dataset it reads are returned.
  Lineage oneLevelFromD3 = new Lineage(ImmutableSet.of(
    new Relation(dataset3, sparkId, AccessType.WRITE, twillRunId(sparkRun)),
    new Relation(localDataset2, sparkId, AccessType.READ, twillRunId(sparkRun))));
  Lineage actualOneLevel = admin.computeLineage(dataset3, 500, System.currentTimeMillis() + 10000, 1, null);
  Assert.assertEquals(oneLevelFromD3, actualOneLevel);

  // Stage 3: roll up to the workflow. Inner programs and local datasets disappear from the result
  // and only the workflow's own relations to D1, D2 and D3 remain.
  Lineage rolledUpLineage = new Lineage(ImmutableSet.of(
    new Relation(dataset1, workflowId, AccessType.READ, twillRunId(workflowRun)),
    new Relation(dataset2, workflowId, AccessType.READ, twillRunId(workflowRun)),
    new Relation(dataset3, workflowId, AccessType.WRITE, twillRunId(workflowRun))));

  // D1, D2 and D3 should all give the same rolled-up result
  for (DatasetId queried : ImmutableList.of(dataset1, dataset2, dataset3)) {
    Lineage actualRolledUp =
      admin.computeLineage(queried, 500, System.currentTimeMillis() + 10000, 1, "workflow");
    Assert.assertEquals(rolledUpLineage, actualRolledUp);
  }
}
use of io.cdap.cdap.data2.metadata.writer.LineageWriter in project cdap by caskdata.
the class LineageAdminTest method testSimpleLoopLineage.
@Test
public void testSimpleLoopLineage() {
  // Lineage containing a loop between D1 and D2, as recorded by the accesses below:
  //
  //   P1 (run1): reads D1  -> writes D2
  //   P2 (run2): reads D2  -> writes D1 (closing the loop) and D3
  //   P3 (run3): reads D3  -> writes D4
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader storeReader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);
  Store store = getInjector().getInstance(Store.class);
  LineageAdmin admin = new LineageAdmin(storeReader, store);

  // Register the program runs before recording their dataset accesses
  addRuns(store, run1, run2, run3, run4, run5);

  // Access time is ignored during assertions, so recording the accesses "now" is fine.
  // P1
  writer.addAccess(run1, dataset1, AccessType.READ);
  writer.addAccess(run1, dataset2, AccessType.WRITE);
  // P2
  writer.addAccess(run2, dataset2, AccessType.READ);
  writer.addAccess(run2, dataset1, AccessType.WRITE);
  writer.addAccess(run2, dataset3, AccessType.WRITE);
  // P3 (recorded with an explicit null component)
  writer.addAccess(run3, dataset3, AccessType.READ, null);
  writer.addAccess(run3, dataset4, AccessType.WRITE, null);

  // Full lineage: one relation per recorded access
  ImmutableSet<Relation> allRelations = ImmutableSet.of(
    new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1)),
    new Relation(dataset1, program1, AccessType.READ, twillRunId(run1)),
    new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2)),
    new Relation(dataset2, program2, AccessType.READ, twillRunId(run2)),
    new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2)),
    new Relation(dataset4, program3, AccessType.WRITE, twillRunId(run3)),
    new Relation(dataset3, program3, AccessType.READ, twillRunId(run3)));
  Lineage fullLineage = new Lineage(allRelations);

  // Starting from D1
  Lineage fromD1 = admin.computeLineage(dataset1, 500, 20000, 100);
  Assert.assertEquals(fullLineage, fromD1);

  // Starting from D2
  Lineage fromD2 = admin.computeLineage(dataset2, 500, 20000, 100);
  Assert.assertEquals(fullLineage, fromD2);

  // Restricting the query on D1 to one level keeps the relations of P1 and P2
  // (D1 -> P1 -> D2 -> P2 -> D3, plus P2's write back to D1) and drops those of P3.
  Lineage oneLevelLineage = admin.computeLineage(dataset1, 500, 20000, 1);
  ImmutableSet<Relation> oneLevelRelations = ImmutableSet.of(
    new Relation(dataset2, program1, AccessType.WRITE, twillRunId(run1)),
    new Relation(dataset1, program1, AccessType.READ, twillRunId(run1)),
    new Relation(dataset1, program2, AccessType.WRITE, twillRunId(run2)),
    new Relation(dataset2, program2, AccessType.READ, twillRunId(run2)),
    new Relation(dataset3, program2, AccessType.WRITE, twillRunId(run2)));
  Assert.assertEquals(oneLevelRelations, oneLevelLineage.getRelations());
}
use of io.cdap.cdap.data2.metadata.writer.LineageWriter in project cdap by caskdata.
the class LineageAdminTest method testDirectCycleTwoRuns.
@Test
public void testDirectCycleTwoRuns() {
  // The same program touches the same dataset in two different runs:
  //
  //   D1 -> P1   (read, run id of run1)
  //   D1 <- P1   (write, run id of run2)
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader storeReader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);
  Store store = getInjector().getInstance(Store.class);
  LineageAdmin admin = new LineageAdmin(storeReader, store);

  // Register the program runs before recording their dataset accesses
  addRuns(store, run1, run2, run3, run4, run5);

  // Access time is ignored during assertions, so recording the accesses "now" is fine.
  writer.addAccess(run1, dataset1, AccessType.READ);

  // The write comes from a different run: same namespace, application, program type and program
  // as run1, but carrying the entity name of run2.
  ProgramRunId writingRun = new ProgramRunId(
    run1.getNamespace(),
    run1.getApplication(),
    run1.getParent().getType(),
    run1.getProgram(),
    run2.getEntityName());
  writer.addAccess(writingRun, dataset1, AccessType.WRITE);

  // Both relations point at program1 and differ only in access type and run id
  ImmutableSet<Relation> bothRuns = ImmutableSet.of(
    new Relation(dataset1, program1, AccessType.READ, twillRunId(run1)),
    new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2)));
  Lineage expected = new Lineage(bothRuns);

  Lineage actual = admin.computeLineage(dataset1, 500, 20000, 100);
  Assert.assertEquals(expected, actual);
}
use of io.cdap.cdap.data2.metadata.writer.LineageWriter in project cdap by caskdata.
the class LineageLimitingTest method testLineageLimiting.
@Test
public void testLineageLimiting() throws InterruptedException, ExecutionException, TimeoutException {
  LineageStoreReader storeReader = getInjector().getInstance(LineageStoreReader.class);
  ProgramRunId serviceRun = service1.run(RunIds.generate());

  // Publish dataset-level lineage for the service run
  LineageWriter datasetLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
  datasetLineageWriter.addAccess(serviceRun, dataset1, AccessType.READ);
  datasetLineageWriter.addAccess(serviceRun, dataset2, AccessType.WRITE);

  // Publish field-level lineage for two Spark runs: read (offset, body) -> parse body into
  // (name, address) -> write (offset, name, address)
  FieldLineageWriter fieldLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
  ProgramRunId firstSparkRun = spark1.run(RunIds.generate(100));
  ReadOperation read =
    new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
  TransformOperation parse = new TransformOperation(
    "parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
  WriteOperation write = new WriteOperation(
    "write", "write data", EndPoint.of("ns", "endpoint2"),
    Arrays.asList(
      InputField.of("read", "offset"),
      InputField.of("parse", "name"),
      InputField.of("parse", "address")));
  // Operations are listed in the order read, write, parse
  List<Operation> operations = new ArrayList<>(Arrays.<Operation>asList(read, write, parse));
  FieldLineageInfo fieldLineage = new FieldLineageInfo(operations);
  fieldLineageWriter.write(firstSparkRun, fieldLineage);
  ProgramRunId secondSparkRun = spark1.run(RunIds.generate(200));
  fieldLineageWriter.write(secondSparkRun, fieldLineage);

  // The dataset-level lineage is smaller than the maximum specified size, so it must show up.
  // NOTE(review): the size limit itself is configured outside this method - confirm in the test setup.
  Set<NamespacedEntityId> expectedEntities = new HashSet<>();
  expectedEntities.add(serviceRun.getParent());
  expectedEntities.add(dataset1);
  expectedEntities.add(dataset2);
  // Poll every 100 ms, for at most 10 seconds, until the reader returns the expected entities
  Tasks.waitFor(true, () -> expectedEntities.equals(storeReader.getEntitiesForRun(serviceRun)),
                10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);

  // The field-level lineage, by contrast, must have been written as empty: no incoming operations
  // are found for the "offset" field of endpoint2.
  FieldLineageReader fieldLineageReader = getInjector().getInstance(FieldLineageReader.class);
  EndPointField offsetAtDestination = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
  List<ProgramRunOperations> incomingOperations =
    fieldLineageReader.getIncomingOperations(offsetAtDestination, 1L, Long.MAX_VALUE - 1);
  Assert.assertTrue(incomingOperations.isEmpty());
}
Aggregations