Search in sources :

Example 6 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class FieldLineageAdminTest method testSummary.

/**
 * Checks that {@code getFieldLineage} populates only the requested side(s) of the summary:
 * incoming only, outgoing only, or both, depending on the direction passed in.
 */
@Test
public void testSummary() {
    FakeFieldLineageReader fakeReader = new FakeFieldLineageReader(Collections.emptySet(), summary(), Collections.emptySet());
    FieldLineageAdmin admin = new FieldLineageAdmin(fakeReader, metadataAdmin);

    // The same six fields that summary() returns, grouped by the dataset they belong to.
    DatasetField fileFields = new DatasetField(new DatasetId("ns", "file"), new HashSet<>(Arrays.asList("a", "b", "c")));
    DatasetField anotherFileFields = new DatasetField(new DatasetId("ns", "anotherfile"), new HashSet<>(Arrays.asList("x", "y", "z")));
    Set<DatasetField> expected = new HashSet<>(Arrays.asList(fileFields, anotherFileFields));

    // The arguments passed to getFieldLineage below do not matter, since the data returned is mocked.
    EndPointField queried = new EndPointField(EndPoint.of("ns", "file"), "somefield");

    // INCOMING: only the incoming side is populated.
    FieldLineageSummary result = admin.getFieldLineage(Constants.FieldLineage.Direction.INCOMING, queried, 0, Long.MAX_VALUE);
    Assert.assertEquals(expected, result.getIncoming());
    Assert.assertNull(result.getOutgoing());

    // OUTGOING: only the outgoing side is populated.
    result = admin.getFieldLineage(Constants.FieldLineage.Direction.OUTGOING, queried, 0, Long.MAX_VALUE);
    Assert.assertEquals(expected, result.getOutgoing());
    Assert.assertNull(result.getIncoming());

    // BOTH: both sides are populated.
    result = admin.getFieldLineage(Constants.FieldLineage.Direction.BOTH, queried, 0, Long.MAX_VALUE);
    Assert.assertEquals(expected, result.getOutgoing());
    Assert.assertEquals(expected, result.getIncoming());
}
Also used : FieldLineageSummary(io.cdap.cdap.proto.metadata.lineage.FieldLineageSummary) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) DatasetField(io.cdap.cdap.proto.metadata.lineage.DatasetField) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) DatasetId(io.cdap.cdap.proto.id.DatasetId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 7 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class MetadataSubscriberServiceTest method testSubscriber.

/**
 * Writes dataset-access lineage, field-level lineage and usage information through the
 * messaging-based writers, then waits until each of them becomes visible through the
 * corresponding reader/registry.
 *
 * @throws InterruptedException propagated from {@code Tasks.waitFor}
 * @throws ExecutionException propagated from {@code Tasks.waitFor}
 * @throws TimeoutException propagated from {@code Tasks.waitFor}; presumably raised when a condition
 *     is not met within the 10 second limit (confirm against Tasks)
 */
@Test
public void testSubscriber() throws InterruptedException, ExecutionException, TimeoutException {
    LineageStoreReader lineageReader = getInjector().getInstance(LineageStoreReader.class);
    ProgramRunId run1 = service1.run(RunIds.generate());
    // Try to read lineage, which should be empty since we haven't started the MetadataSubscriberService yet.
    Set<NamespacedEntityId> entities = lineageReader.getEntitiesForRun(run1);
    Assert.assertTrue(entities.isEmpty());
    // Write out some lineage information
    LineageWriter lineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    lineageWriter.addAccess(run1, dataset1, AccessType.READ);
    lineageWriter.addAccess(run1, dataset2, AccessType.WRITE);
    // Write the field level lineage
    FieldLineageWriter fieldLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    ProgramRunId spark1Run1 = spark1.run(RunIds.generate(100));
    // Operation graph #1: read(offset, body) -> parse(body => name, address) -> write(offset, name, address)
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
    // Note: the operations are added as read, write, parse, i.e. not in data-flow order.
    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    operations.add(parse);
    FieldLineageInfo info1 = new FieldLineageInfo(operations);
    fieldLineageWriter.write(spark1Run1, info1);
    // A second run writes the very same field lineage info as the first run.
    ProgramRunId spark1Run2 = spark1.run(RunIds.generate(200));
    fieldLineageWriter.write(spark1Run2, info1);
    // Operation graph #2: same read and parse, plus a "normalize" step on the address and a different write operation.
    List<Operation> operations2 = new ArrayList<>();
    operations2.add(read);
    operations2.add(parse);
    TransformOperation normalize = new TransformOperation("normalize", "normalize address", Collections.singletonList(InputField.of("parse", "address")), "address");
    operations2.add(normalize);
    WriteOperation anotherWrite = new WriteOperation("anotherwrite", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address")));
    operations2.add(anotherWrite);
    FieldLineageInfo info2 = new FieldLineageInfo(operations2);
    // The third run is the only one that writes the second operation graph.
    ProgramRunId spark1Run3 = spark1.run(RunIds.generate(300));
    fieldLineageWriter.write(spark1Run3, info2);
    // Emit some usages
    UsageWriter usageWriter = getInjector().getInstance(MessagingUsageWriter.class);
    usageWriter.register(spark1, dataset1);
    usageWriter.registerAll(Collections.singleton(spark1), dataset3);
    // Verifies lineage has been written
    // (presumably Tasks.waitFor re-evaluates the condition every 100 ms for up to 10 s - confirm against Tasks)
    Set<NamespacedEntityId> expectedLineage = new HashSet<>(Arrays.asList(run1.getParent(), dataset1, dataset2));
    Tasks.waitFor(true, () -> expectedLineage.equals(lineageReader.getEntitiesForRun(run1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // There shouldn't be any lineage for the "spark1" program, as only usage has been emitted.
    // (no addAccess call was made for any spark1 run above; only field-level lineage and usage were written for it)
    Assert.assertTrue(lineageReader.getRelations(spark1, 0L, Long.MAX_VALUE, x -> true).isEmpty());
    FieldLineageReader fieldLineageReader = getInjector().getInstance(FieldLineageReader.class);
    // Expected incoming operations for field "offset" of endpoint2: "offset" is passed from the read
    // operation straight into the write operations, so only read + the respective write are expected.
    Set<Operation> expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(anotherWrite);
    List<ProgramRunOperations> expected = new ArrayList<>();
    // Descending order of program execution
    expected.add(new ProgramRunOperations(Collections.singleton(spark1Run3), expectedOperations));
    expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(write);
    // Runs 1 and 2 wrote identical field lineage info, so they are expected to be grouped into one entry.
    expected.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(spark1Run1, spark1Run2)), expectedOperations));
    EndPointField endPointField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
    Tasks.waitFor(expected, () -> fieldLineageReader.getIncomingOperations(endPointField, 1L, Long.MAX_VALUE - 1), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // Verifies usage has been written
    Set<EntityId> expectedUsage = new HashSet<>(Arrays.asList(dataset1, dataset3));
    UsageRegistry usageRegistry = getInjector().getInstance(UsageRegistry.class);
    Tasks.waitFor(true, () -> expectedUsage.equals(usageRegistry.getDatasets(spark1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) UsageWriter(io.cdap.cdap.data2.registry.UsageWriter) MessagingUsageWriter(io.cdap.cdap.data2.registry.MessagingUsageWriter) FieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) UsageRegistry(io.cdap.cdap.data2.registry.UsageRegistry) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) EntityId(io.cdap.cdap.proto.id.EntityId) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MessagingLineageWriter(io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 8 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by cdapio.

the class FieldLineageAdminTest method summary.

/**
 * Builds the canned summary handed to the fake lineage reader: fields "a", "b", "c" of
 * endpoint ns/file and fields "x", "y", "z" of endpoint ns/anotherfile.
 *
 * @return a mutable set containing the six endpoint fields
 */
private Set<EndPointField> summary() {
    EndPoint file = EndPoint.of("ns", "file");
    EndPoint anotherFile = EndPoint.of("ns", "anotherfile");
    Set<EndPointField> result = new HashSet<>();
    for (String fieldName : new String[] {"a", "b", "c"}) {
        result.add(new EndPointField(file, fieldName));
    }
    for (String fieldName : new String[] {"x", "y", "z"}) {
        result.add(new EndPointField(anotherFile, fieldName));
    }
    return result;
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) HashSet(java.util.HashSet)

Example 9 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by cdapio.

the class FieldLineageAdmin method convertSummaryToDatasetMap.

/**
 * Groups the field names of a summary by the dataset their endpoint refers to.
 *
 * @param summary the endpoint fields to group
 * @return a mutable map from dataset id to the field names of that dataset; fields whose endpoint
 *     has no namespace or no name are collected under the {@code null} key
 */
private Map<DatasetId, Set<String>> convertSummaryToDatasetMap(Set<EndPointField> summary) {
    Map<DatasetId, Set<String>> fieldsByDataset = new HashMap<>();
    for (EndPointField entry : summary) {
        EndPoint endPoint = entry.getEndPoint();
        // stays null if the field is not related to any dataset, it can either be generated or dropped
        DatasetId datasetId = null;
        if (endPoint.getNamespace() != null && endPoint.getName() != null) {
            datasetId = new DatasetId(endPoint.getNamespace(), endPoint.getName());
        }
        fieldsByDataset.computeIfAbsent(datasetId, key -> new HashSet<>()).add(entry.getField());
    }
    return fieldsByDataset;
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) HashMap(java.util.HashMap) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Example 10 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by cdapio.

the class FieldLineageAdmin method getDatasetFieldLineage.

/**
 * Get the summary for the specified dataset over a given time range depending on the direction specified.
 * The summary will contain all the field level lineage relations about all the fields in a dataset.
 *
 * @param direction the direction in which the summary needs to be computed
 * @param endPoint the EndPoint which represents the dataset that field level lineage needs to get computed for
 * @param start start time (inclusive) in milliseconds
 * @param end end time (exclusive) in milliseconds
 * @return the summary which contains all the field level lineage information about all the fields in a dataset
 * @throws IOException if fails to get the schema of the dataset
 */
public DatasetFieldLineageSummary getDatasetFieldLineage(Constants.FieldLineage.Direction direction, EndPoint endPoint, long start, long end) throws IOException {
    Set<String> lineageFields = fieldLineageReader.getFields(endPoint, start, end);
    Map<DatasetId, Set<FieldRelation>> incomingRelations = new HashMap<>();
    Map<DatasetId, Set<FieldRelation>> outgoingRelations = new HashMap<>();
    Map<DatasetId, Integer> fieldCount = new HashMap<>();
    // the direction does not change per field, so decide once which sides have to be computed
    boolean includeIncoming = direction == Constants.FieldLineage.Direction.INCOMING || direction == Constants.FieldLineage.Direction.BOTH;
    boolean includeOutgoing = direction == Constants.FieldLineage.Direction.OUTGOING || direction == Constants.FieldLineage.Direction.BOTH;
    for (String field : lineageFields) {
        EndPointField endPointField = new EndPointField(endPoint, field);
        // compute the incoming field level lineage
        if (includeIncoming) {
            Map<DatasetId, Set<String>> incomingSummary = convertSummaryToDatasetMap(fieldLineageReader.getIncomingSummary(endPointField, start, end));
            // compute the field count for all incoming datasets
            cacheFieldCounts(fieldCount, incomingSummary.keySet(), start, end);
            // here the field itself will be the destination
            computeAndAddRelations(incomingRelations, field, true, incomingSummary);
        }
        // compute the outgoing field level lineage
        if (includeOutgoing) {
            Map<DatasetId, Set<String>> outgoingSummary = convertSummaryToDatasetMap(fieldLineageReader.getOutgoingSummary(endPointField, start, end));
            // compute the field count for all outgoing datasets
            cacheFieldCounts(fieldCount, outgoingSummary.keySet(), start, end);
            // here the field itself will be the source
            computeAndAddRelations(outgoingRelations, field, false, outgoingSummary);
        }
    }
    // fields of the dataset that have no lineage in the time range are still reported in the summary
    Set<String> noLineageFields = getFieldsWithNoFieldLineage(endPoint, lineageFields);
    Set<String> allFields = ImmutableSet.<String>builder().addAll(lineageFields).addAll(noLineageFields).build();
    return new DatasetFieldLineageSummary(direction, start, end, new DatasetId(endPoint.getNamespace(), endPoint.getName()), allFields, fieldCount, incomingRelations, outgoingRelations);
}

/**
 * Records, for every given dataset that is not yet present in {@code fieldCount}, the number of fields
 * the lineage reader reports for it in the time range. The {@code null} dataset (fields that do not
 * belong to any dataset) is recorded with a count of 0. Datasets already present are not looked up again.
 *
 * @param fieldCount the map to add the missing counts to
 * @param datasetIds the datasets to make sure a count exists for; may contain {@code null}
 * @param start start time (inclusive) in milliseconds
 * @param end end time (exclusive) in milliseconds
 */
private void cacheFieldCounts(Map<DatasetId, Integer> fieldCount, Set<DatasetId> datasetIds, long start, long end) {
    for (DatasetId datasetId : datasetIds) {
        fieldCount.computeIfAbsent(datasetId, missingDataset -> missingDataset == null ? 0 : fieldLineageReader.getFields(EndPoint.of(missingDataset.getNamespace(), missingDataset.getDataset()), start, end).size());
    }
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) HashMap(java.util.HashMap) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Aggregations

EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)20 HashSet (java.util.HashSet)16 DatasetId (io.cdap.cdap.proto.id.DatasetId)12 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)10 Test (org.junit.Test)10 ImmutableSet (com.google.common.collect.ImmutableSet)6 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)6 ArrayList (java.util.ArrayList)6 Set (java.util.Set)6 Operation (io.cdap.cdap.api.lineage.field.Operation)4 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)4 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)4 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)4 LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader)4 FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)4 FieldLineageReader (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader)4 FieldLineageWriter (io.cdap.cdap.data2.metadata.writer.FieldLineageWriter)4 LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter)4 MessagingLineageWriter (io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter)4 NamespacedEntityId (io.cdap.cdap.proto.id.NamespacedEntityId)4