Search in sources :

Example 36 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageAdminTest method testDirectCycleTwoRuns.

/**
 * Computes lineage for a dataset that is read by a program in one run and written by the
 * same program in a second run, and compares it with a lineage built from those two relations.
 */
@Test
public void testDirectCycleTwoRuns() {
    // Scenario under test:
    //
    //   D1 -> P1 (run1)
    //   D1 <- P1 (run2)
    //
    TransactionRunner txRunner = getInjector().getInstance(TransactionRunner.class);
    LineageStoreReader reader = new DefaultLineageStoreReader(txRunner);
    LineageWriter writer = new BasicLineageWriter(txRunner);
    Store programStore = getInjector().getInstance(Store.class);
    LineageAdmin admin = new LineageAdmin(reader, programStore);

    // Set up the runs through the addRuns helper before recording any access
    addRuns(programStore, run1, run2, run3, run4, run5);

    // Using the current time is fine here: access time is ignored during assertions
    writer.addAccess(run1, dataset1, AccessType.READ);

    // The write is attributed to a different run: same namespace, application, program type and
    // program as run1, but carrying run2's entity name as the run
    ProgramRunId writeRun = new ProgramRunId(
        run1.getNamespace(),
        run1.getApplication(),
        run1.getParent().getType(),
        run1.getProgram(),
        run2.getEntityName());
    writer.addAccess(writeRun, dataset1, AccessType.WRITE);

    // One relation per access, each tied to the run it was recorded under
    Relation readRelation = new Relation(dataset1, program1, AccessType.READ, twillRunId(run1));
    Relation writeRelation = new Relation(dataset1, program1, AccessType.WRITE, twillRunId(run2));
    Lineage expected = new Lineage(ImmutableSet.of(readRelation, writeRelation));

    Lineage computed = admin.computeLineage(dataset1, 500, 20000, 100);
    Assert.assertEquals(expected, computed);
}
Also used : Relation(io.cdap.cdap.data2.metadata.lineage.Relation) BasicLineageWriter(io.cdap.cdap.data2.metadata.writer.BasicLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) TransactionRunner(io.cdap.cdap.spi.data.transaction.TransactionRunner) DefaultLineageStoreReader(io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) DefaultLineageStoreReader(io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader) Store(io.cdap.cdap.app.store.Store) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) BasicLineageWriter(io.cdap.cdap.data2.metadata.writer.BasicLineageWriter) Test(org.junit.Test)

Example 37 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageLimitingTest method testLineageLimiting.

/**
 * Writes dataset-level lineage for one service run and field-level lineage for two spark runs
 * through {@code MessagingLineageWriter}, then checks that the dataset-level entities become
 * readable for the service run while no incoming field operations are returned for the
 * queried endpoint field.
 */
@Test
public void testLineageLimiting() throws InterruptedException, ExecutionException, TimeoutException {
    LineageStoreReader storeReader = getInjector().getInstance(LineageStoreReader.class);
    ProgramRunId serviceRun = service1.run(RunIds.generate());

    // Dataset-level lineage: the service run reads dataset1 and writes dataset2
    LineageWriter datasetLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    datasetLineageWriter.addAccess(serviceRun, dataset1, AccessType.READ);
    datasetLineageWriter.addAccess(serviceRun, dataset2, AccessType.WRITE);

    // Field-level lineage: a read -> parse -> write chain of operations
    FieldLineageWriter fieldWriter = getInjector().getInstance(MessagingLineageWriter.class);
    ProgramRunId firstSparkRun = spark1.run(RunIds.generate(100));

    EndPoint sourceEndPoint = EndPoint.of("ns", "endpoint1");
    ReadOperation read = new ReadOperation("read", "some read", sourceEndPoint, "offset", "body");

    List<InputField> parseInputs = Collections.singletonList(InputField.of("read", "body"));
    TransformOperation parse = new TransformOperation("parse", "parse body", parseInputs, "name", "address");

    EndPoint sinkEndPoint = EndPoint.of("ns", "endpoint2");
    List<InputField> writeInputs = Arrays.asList(
        InputField.of("read", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "address"));
    WriteOperation write = new WriteOperation("write", "write data", sinkEndPoint, writeInputs);

    // Same element order as before: read, write, parse
    List<Operation> fieldOperations = new ArrayList<>(Arrays.<Operation>asList(read, write, parse));
    FieldLineageInfo fieldLineageInfo = new FieldLineageInfo(fieldOperations);

    // The same field lineage info is written for two separate spark runs
    fieldWriter.write(firstSparkRun, fieldLineageInfo);
    ProgramRunId secondSparkRun = spark1.run(RunIds.generate(200));
    fieldWriter.write(secondSparkRun, fieldLineageInfo);

    // Dataset-level lineage is expected to show up, since it is smaller than the maximum specified size;
    // poll every 100 ms for up to 10 seconds until the reader returns the expected entities
    Set<NamespacedEntityId> expectedEntities =
        new HashSet<>(Arrays.asList(serviceRun.getParent(), dataset1, dataset2));
    Tasks.waitFor(
        true,
        () -> expectedEntities.equals(storeReader.getEntitiesForRun(serviceRun)),
        10, TimeUnit.SECONDS,
        100, TimeUnit.MILLISECONDS);

    // Field-level lineage is expected to have been written empty: no incoming operations are found
    FieldLineageReader fieldReader = getInjector().getInstance(FieldLineageReader.class);
    EndPointField queriedField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
    List<ProgramRunOperations> incoming =
        fieldReader.getIncomingOperations(queriedField, 1L, Long.MAX_VALUE - 1);
    Assert.assertTrue(incoming.isEmpty());
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) FieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) MessagingLineageWriter(io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 38 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class FieldLineageAdminTest method testFieldsWithDsSchema.

/**
 * Exercises {@code FieldLineageAdmin.getFields} against a fake lineage reader, first without
 * and then with a dataset schema present, covering both values of the includeCurrent flag
 * and prefix filtering.
 */
@Test
public void testFieldsWithDsSchema() throws Exception {
    FakeFieldLineageReader fakeReader =
        new FakeFieldLineageReader(getFieldNames(), Collections.emptySet(), Collections.emptySet());
    FieldLineageAdmin admin = new FieldLineageAdmin(fakeReader, metadataAdmin);
    EndPoint endPoint = EndPoint.of(NamespaceId.DEFAULT.getNamespace(), "file");

    // Stage 1: no schema information exists for the dataset yet; with includeCurrent = true
    // the result should still be the lineage fields
    Set<Field> expectedNoSchema = getFields(getFieldNames());
    Set<Field> actualNoSchema = admin.getFields(endPoint, 0, Long.MAX_VALUE, null, true);
    Assert.assertEquals(expectedNoSchema, actualNoSchema);

    // Schema containing "name" and "address" plus two fields that differ from those known to
    // the lineage store
    Schema.Field nameField =
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
    Schema.Field addressField =
        Schema.Field.of("address", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
    Schema.Field extraStringField =
        Schema.Field.of("addiffField1", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
    Schema.Field extraIntField =
        Schema.Field.of("diffField2", Schema.nullableOf(Schema.of(Schema.Type.INT)));
    Schema schema = Schema.recordOf("record", nameField, addressField, extraStringField, extraIntField);

    // Create the dataset instance carrying that schema
    TableProperties.Builder tableProps = TableProperties.builder();
    TableProperties.setSchema(tableProps, schema);
    TableProperties.setRowFieldName(tableProps, "name");
    DatasetId datasetId = NamespaceId.DEFAULT.dataset("file");
    MetadataEntity entity = datasetId.toMetadataEntity();
    datasetFramework.addInstance("table", datasetId, tableProps.build());

    // Block (up to 5 seconds) until the SYSTEM-scope properties of the dataset's metadata
    // entity are no longer empty, i.e. the dataset metadata has been stored
    Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, entity).isEmpty(),
                  5, TimeUnit.SECONDS);

    // Stage 2: includeCurrent = false, so only the fields known to the lineage store are
    // expected and none of the schema-only fields
    Set<Field> expectedLineageOnly = getFields(getFieldNames());
    Set<Field> actualLineageOnly = admin.getFields(endPoint, 0, Long.MAX_VALUE, null, false);
    Assert.assertEquals(expectedLineageOnly, actualLineageOnly);

    // Stage 3: includeCurrent = true, so the schema-only fields are expected as well, flagged
    // as having no lineage; fields present in both places (e.g. address) keep their lineage
    // flag set to true
    Set<Field> expectedWithSchema = getFields(getFieldNames());
    expectedWithSchema.addAll(
        Arrays.asList(new Field("addiffField1", false), new Field("diffField2", false)));
    Set<Field> actualWithSchema = admin.getFields(endPoint, 0, Long.MAX_VALUE, null, true);
    Assert.assertEquals(expectedWithSchema, actualWithSchema);

    // Prefix "add" with includeCurrent = false: the schema-only field must not show up
    Set<Field> expectedPrefixLineageOnly = new HashSet<>(
        Arrays.asList(new Field("address", true), new Field("address_original", true)));
    Assert.assertEquals(expectedPrefixLineageOnly,
                        admin.getFields(endPoint, 0, Long.MAX_VALUE, "add", false));

    // Prefix "add" with includeCurrent = true: the schema-only field shows up too
    Set<Field> expectedPrefixWithSchema = new HashSet<>(
        Arrays.asList(new Field("address", true), new Field("address_original", true),
                      new Field("addiffField1", false)));
    Assert.assertEquals(expectedPrefixWithSchema,
                        admin.getFields(endPoint, 0, Long.MAX_VALUE, "add", true));

    // Prefix "ADD": matching is expected to be case insensitive, giving the same result
    Set<Field> expectedUpperCasePrefix = new HashSet<>(
        Arrays.asList(new Field("address", true), new Field("address_original", true),
                      new Field("addiffField1", false)));
    Assert.assertEquals(expectedUpperCasePrefix,
                        admin.getFields(endPoint, 0, Long.MAX_VALUE, "ADD", true));
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) DatasetField(io.cdap.cdap.proto.metadata.lineage.DatasetField) Field(io.cdap.cdap.proto.metadata.lineage.Field) InputField(io.cdap.cdap.api.lineage.field.InputField) MetadataEntity(io.cdap.cdap.api.metadata.MetadataEntity) Schema(io.cdap.cdap.api.data.schema.Schema) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TableProperties(io.cdap.cdap.api.dataset.table.TableProperties) DatasetId(io.cdap.cdap.proto.id.DatasetId) Test(org.junit.Test)

Aggregations

Test (org.junit.Test) 22, Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage) 13, Lineage (co.cask.cdap.data2.metadata.lineage.Lineage) 12, Relation (io.cdap.cdap.data2.metadata.lineage.Relation) 12, DatasetId (io.cdap.cdap.proto.id.DatasetId) 12, LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) 11, LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter) 11, Relation (co.cask.cdap.data2.metadata.lineage.Relation) 10, Store (io.cdap.cdap.app.store.Store) 9, DefaultLineageStoreReader (io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader) 9, TransactionRunner (io.cdap.cdap.spi.data.transaction.TransactionRunner) 9, EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField) 8, BasicLineageWriter (io.cdap.cdap.data2.metadata.writer.BasicLineageWriter) 8, ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId) 8, Store (co.cask.cdap.app.store.Store) 7, LineageStore (co.cask.cdap.data2.metadata.lineage.LineageStore) 7, MetadataStore (co.cask.cdap.data2.metadata.store.MetadataStore) 7, ProgramId (io.cdap.cdap.proto.id.ProgramId) 7, HashSet (java.util.HashSet) 7, ApplicationId (io.cdap.cdap.proto.id.ApplicationId) 6