Use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class LineageAdminTest, method testDirectCycleTwoRuns.
/**
 * Records a READ of dataset1 under run1 and a WRITE of dataset1 under a second run id,
 * then checks that the lineage computed for dataset1 contains exactly those two relations.
 *
 * <pre>
 *   D1 -> P1 (run1)
 *   D1 <- P1 (run2)
 * </pre>
 */
@Test
public void testDirectCycleTwoRuns() {
  TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
  LineageStoreReader reader = new DefaultLineageStoreReader(txRunner);
  LineageWriter writer = new BasicLineageWriter(txRunner);
  Store programStore = getInjector().getInstance(Store.class);
  LineageAdmin admin = new LineageAdmin(reader, programStore);

  // Register the runs with the store before recording any accesses
  addRuns(programStore, run1, run2, run3, run4, run5);

  // Using the current time is fine here because access time is ignored during assertions
  writer.addAccess(run1, dataset1, AccessType.READ);

  // The write is recorded under a different run: same program coordinates as run1,
  // but carrying run2's entity name as the run id
  ProgramRunId writeRun = new ProgramRunId(run1.getNamespace(),
                                           run1.getApplication(),
                                           run1.getParent().getType(),
                                           run1.getProgram(),
                                           run2.getEntityName());
  writer.addAccess(writeRun, dataset1, AccessType.WRITE);

  Relation readRelation = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1));
  Relation writeRelation = new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2));
  Lineage expected = new Lineage(ImmutableSet.of(readRelation, writeRelation));

  Assert.assertEquals(expected, admin.computeLineage(dataset1, 500, 20000, 100));
}
Use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class LineageLimitingTest, method testLineageLimiting.
/**
 * Writes dataset-level lineage for one service run and field-level lineage for two Spark runs
 * through {@code MessagingLineageWriter}, then asserts that the dataset-level lineage becomes
 * readable while no incoming field operations are found for the written endpoint field.
 *
 * @throws InterruptedException declared for the wait on the lineage reader
 * @throws ExecutionException declared for the wait on the lineage reader
 * @throws TimeoutException declared for the wait on the lineage reader
 */
@Test
public void testLineageLimiting() throws InterruptedException, ExecutionException, TimeoutException {
  LineageStoreReader storeReader = getInjector().getInstance(LineageStoreReader.class);
  ProgramRunId serviceRun = service1.run(RunIds.generate());

  // Emit dataset-level lineage: one read and one write for the service run
  LineageWriter datasetLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
  datasetLineageWriter.addAccess(serviceRun, dataset1, AccessType.READ);
  datasetLineageWriter.addAccess(serviceRun, dataset2, AccessType.WRITE);

  // Emit field-level lineage through the same writer class
  FieldLineageWriter fieldWriter = getInjector().getInstance(MessagingLineageWriter.class);
  ProgramRunId firstSparkRun = spark1.run(RunIds.generate(100));

  ReadOperation readOp =
    new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
  TransformOperation parseOp =
    new TransformOperation("parse", "parse body",
                           Collections.singletonList(InputField.of("read", "body")),
                           "name", "address");
  WriteOperation writeOp =
    new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"),
                       Arrays.asList(InputField.of("read", "offset"),
                                     InputField.of("parse", "name"),
                                     InputField.of("parse", "address")));

  // Same insertion order as before: read, write, parse
  List<Operation> fieldOperations = new ArrayList<>(Arrays.asList(readOp, writeOp, parseOp));
  FieldLineageInfo fieldLineageInfo = new FieldLineageInfo(fieldOperations);
  fieldWriter.write(firstSparkRun, fieldLineageInfo);

  ProgramRunId secondSparkRun = spark1.run(RunIds.generate(200));
  fieldWriter.write(secondSparkRun, fieldLineageInfo);

  // The dataset-level lineage is expected to be written since it is smaller than the maximum specified size
  Set<NamespacedEntityId> expectedEntities =
    new HashSet<>(Arrays.asList(serviceRun.getParent(), dataset1, dataset2));
  Tasks.waitFor(true, () -> expectedEntities.equals(storeReader.getEntitiesForRun(serviceRun)),
                10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);

  // The field-level lineage is expected to be empty for the written endpoint field
  FieldLineageReader fieldReader = getInjector().getInstance(FieldLineageReader.class);
  EndPointField offsetField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
  List<ProgramRunOperations> incoming =
    fieldReader.getIncomingOperations(offsetField, 1L, Long.MAX_VALUE - 1);
  Assert.assertTrue(incoming.isEmpty());
}
Use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class FieldLineageAdminTest, method testFieldsWithDsSchema.
/**
 * Exercises {@code FieldLineageAdmin.getFields} against a fake field lineage reader, before and
 * after a dataset with a schema is registered for the same endpoint, covering the
 * {@code includeCurrent} flag and prefix filtering.
 *
 * @throws Exception if adding the dataset instance or waiting for its metadata fails
 */
@Test
public void testFieldsWithDsSchema() throws Exception {
  FieldLineageAdmin admin = new FieldLineageAdmin(
    new FakeFieldLineageReader(getFieldNames(), Collections.emptySet(), Collections.emptySet()),
    metadataAdmin);
  EndPoint fileEndPoint = EndPoint.of(NamespaceId.DEFAULT.getNamespace(), "file");

  // With no schema information present for the dataset, requesting lineage with
  // includeCurrent set to true should still return the lineage fields correctly.
  Set<Field> lineageOnly = getFields(getFieldNames());
  Set<Field> withoutSchema = admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, null, true);
  Assert.assertEquals(lineageOnly, withoutSchema);

  // Build a schema containing fields that differ from the ones known to the lineage store
  Schema.Field nameField =
    Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
  Schema.Field addressField =
    Schema.Field.of("address", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
  Schema.Field extraStringField =
    Schema.Field.of("addiffField1", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
  Schema.Field extraIntField =
    Schema.Field.of("diffField2", Schema.nullableOf(Schema.of(Schema.Type.INT)));
  Schema dsSchema = Schema.recordOf("record", nameField, addressField, extraStringField, extraIntField);

  // Register the dataset carrying that schema
  TableProperties.Builder tableProps = TableProperties.builder();
  TableProperties.setSchema(tableProps, dsSchema);
  TableProperties.setRowFieldName(tableProps, "name");
  DatasetId fileDataset = NamespaceId.DEFAULT.dataset("file");
  MetadataEntity entity = fileDataset.toMetadataEntity();
  datasetFramework.addInstance("table", fileDataset, tableProps.build());

  // Wait until the system metadata for this dataset is no longer empty
  Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, entity).isEmpty(),
                5, TimeUnit.SECONDS);

  // includeCurrent == false: only the fields known to the lineage store are expected,
  // none of the fields that exist solely in the dataset schema.
  Set<Field> expectedExcludingCurrent = getFields(getFieldNames());
  Set<Field> actualExcludingCurrent = admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, null, false);
  Assert.assertEquals(expectedExcludingCurrent, actualExcludingCurrent);

  // includeCurrent == true: the lineage store fields plus the schema-only fields are expected.
  // Fields common to both (for example "address") keep their lineage flag set to true,
  // while schema-only fields are reported with the flag set to false.
  Set<Field> expectedIncludingCurrent = getFields(getFieldNames());
  expectedIncludingCurrent.addAll(
    Arrays.asList(new Field("addiffField1", false), new Field("diffField2", false)));
  Set<Field> actualIncludingCurrent = admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, null, true);
  Assert.assertEquals(expectedIncludingCurrent, actualIncludingCurrent);

  // Prefix "add" without includeCurrent: the dataset schema field should not show up
  Set<Field> expectedPrefixLineageOnly = new HashSet<>(
    Arrays.asList(new Field("address", true), new Field("address_original", true)));
  Assert.assertEquals(expectedPrefixLineageOnly,
                      admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, "add", false));

  // Prefix "add" with includeCurrent: the dataset schema field should show up as well
  Set<Field> expectedPrefixWithCurrent = new HashSet<>(
    Arrays.asList(new Field("address", true), new Field("address_original", true),
                  new Field("addiffField1", false)));
  Assert.assertEquals(expectedPrefixWithCurrent,
                      admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, "add", true));

  // Prefix "ADD": matching is expected to be case insensitive, giving the same result
  Set<Field> expectedUpperCasePrefix = new HashSet<>(
    Arrays.asList(new Field("address", true), new Field("address_original", true),
                  new Field("addiffField1", false)));
  Assert.assertEquals(expectedUpperCasePrefix,
                      admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, "ADD", true));
}
Aggregations