Search in sources :

Example 21 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageInfoTest method testSimpleFieldLineageSummary.

/**
 * Verifies destination fields, incoming/outgoing summaries and the per-field operation sets
 * for a simple linear pipeline:
 *
 * <pre>
 *   read:   file -> (offset, body)
 *   parse:  (body) -> (first_name, last_name)
 *   concat: (first_name, last_name) -> (name)
 *   write:  (offset, name) -> another_file
 * </pre>
 */
@Test
public void testSimpleFieldLineageSummary() {
    EndPoint source = EndPoint.of("endpoint1");
    EndPoint destination = EndPoint.of("myns", "another_file");

    ReadOperation read = new ReadOperation("read", "some read", source, "offset", "body");
    TransformOperation parse = new TransformOperation(
        "parse", "parsing body",
        Collections.singletonList(InputField.of("read", "body")),
        "first_name", "last_name");
    TransformOperation concat = new TransformOperation(
        "concat", "concatinating the fields",
        Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")),
        "name");
    WriteOperation write = new WriteOperation(
        "write_op", "writing data to file", destination,
        Arrays.asList(InputField.of("read", "offset"), InputField.of("concat", "name")));

    // the operations are handed over in the order parse, concat, read, write
    List<Operation> operations = new ArrayList<>(Arrays.asList(parse, concat, read, write));
    FieldLineageInfo info = new FieldLineageInfo(operations);

    // the only destination is EndPoint(myns, another_file) and it carries the fields offset and name
    Map<EndPoint, Set<String>> destinationFields = info.getDestinationFields();
    Assert.assertEquals(1, destinationFields.size());
    Set<String> expectedFieldNames = new HashSet<>(Arrays.asList("offset", "name"));
    Assert.assertEquals(expectedFieldNames, destinationFields.get(destination));

    Map<EndPointField, Set<EndPointField>> incomingSummary = info.getIncomingSummary();
    Map<EndPointField, Set<EndPointField>> outgoingSummary = info.getOutgoingSummary();

    // incoming summary: destination.offset comes from source.offset only
    EndPointField destinationOffset = new EndPointField(destination, "offset");
    EndPointField sourceOffset = new EndPointField(source, "offset");
    Set<EndPointField> offsetOrigins = incomingSummary.get(destinationOffset);
    Assert.assertEquals(1, offsetOrigins.size());
    Assert.assertEquals(sourceOffset, offsetOrigins.iterator().next());

    // the offset field is touched by read and write only, in both directions
    Set<Operation> offsetOperations = new HashSet<>(Arrays.asList(write, read));
    Assert.assertEquals(offsetOperations, info.getIncomingOperationsForField(destinationOffset));
    Assert.assertEquals(offsetOperations, info.getOutgoingOperationsForField(sourceOffset));

    // incoming summary: destination.name comes from source.body only
    EndPointField destinationName = new EndPointField(destination, "name");
    EndPointField sourceBody = new EndPointField(source, "body");
    Set<EndPointField> nameOrigins = incomingSummary.get(destinationName);
    Assert.assertEquals(1, nameOrigins.size());
    Assert.assertEquals(sourceBody, nameOrigins.iterator().next());

    // producing name involves every operation of the pipeline
    Set<Operation> nameOperations = new HashSet<>(Arrays.asList(write, concat, parse, read));
    Assert.assertEquals(nameOperations, info.getIncomingOperationsForField(destinationName));

    // outgoing summary: source.offset only affects destination.offset
    Set<EndPointField> offsetTargets = outgoingSummary.get(sourceOffset);
    Assert.assertEquals(1, offsetTargets.size());
    Assert.assertEquals(destinationOffset, offsetTargets.iterator().next());

    // outgoing operations for source.body are the same four operations
    Assert.assertEquals(nameOperations, info.getOutgoingOperationsForField(sourceBody));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 22 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageInfoTest method testCycle.

/**
 * Topological sorting must reject a set of operations that depend on each other:
 * here "parse" consumes normalize.name while "normalize" consumes parse.name.
 * The test passes only if an {@link IllegalArgumentException} is thrown.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
    EndPoint source = EndPoint.of("ns", "file1");
    EndPoint sink = EndPoint.of("ns", "file2");

    ReadOperation read = new ReadOperation("read", "read", source, "offset", "body");
    // parse <-> normalize form the cycle
    TransformOperation parse = new TransformOperation(
        "parse", "parse",
        Arrays.asList(InputField.of("read", "body"), InputField.of("normalize", "name")),
        "name", "address");
    TransformOperation normalize = new TransformOperation(
        "normalize", "normalize",
        Collections.singletonList(InputField.of("parse", "name")),
        "name");
    WriteOperation write = new WriteOperation(
        "write", "writing to another file", sink,
        Arrays.asList(InputField.of("normalize", "name"), InputField.of("parse", "address")));

    FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(Arrays.asList(parse, read, normalize, write)));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 23 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageInfoTest method testNonCycle.

/**
 * Checks the topological order produced for an acyclic set of operations that
 * contains a transform without any inputs.
 */
@Test
public void testNonCycle() {
    EndPoint source = EndPoint.of("ns", "src");
    EndPoint sink = EndPoint.of("ns", "dest");

    ReadOperation read = new ReadOperation("read", "read", source, "a", "b");
    TransformOperation combine = new TransformOperation(
        "combine", "combine",
        Arrays.asList(InputField.of("read", "a"), InputField.of("read", "b")),
        "a", "b");
    // An operation with no incoming inputs must not be considered a cycle;
    // it should be treated like a read operation.
    TransformOperation generate = new TransformOperation("generate", "generate", Collections.emptyList(), "c");
    WriteOperation write = new WriteOperation(
        "write", "write", sink,
        Arrays.asList(InputField.of("combine", "a"), InputField.of("combine", "b"), InputField.of("generate", "c")));

    Set<Operation> unordered = new HashSet<>(Arrays.asList(combine, read, generate, write));
    List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(unordered);

    Assert.assertEquals(ImmutableList.of(read, generate, combine, write), sorted);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 24 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageInfoTest method testRenameThenDropFields.

/**
 * Checks the summaries when a field is renamed twice and then dropped:
 *
 * <pre>
 *   read:              endpoint1 -> (first_name, last_name, social)
 *   renameSocial:      read.social -> ssn
 *   renameSocialAgain: renameSocial.ssn -> ssn2
 *   dropSocial:        renameSocialAgain.ssn2 -> ()
 *   write:             (read.first_name, read.last_name) -> endpoint2
 * </pre>
 */
@Test
public void testRenameThenDropFields() {
    EndPoint source = EndPoint.of("endpoint1");
    EndPoint sink = EndPoint.of("endpoint2");

    ReadOperation read = new ReadOperation("read", "some read", source, "first_name", "last_name", "social");
    TransformOperation renameSocial = new TransformOperation(
        "renameSocial", "rename social",
        Collections.singletonList(InputField.of("read", "social")),
        "ssn");
    TransformOperation renameSocialAgain = new TransformOperation(
        "renameSocialAgain", "rename social again",
        Collections.singletonList(InputField.of("renameSocial", "ssn")),
        "ssn2");
    // no output fields: the renamed field is dropped here
    TransformOperation dropSocial = new TransformOperation(
        "dropSocial", "drop ssn2",
        Collections.singletonList(InputField.of("renameSocialAgain", "ssn2")));
    WriteOperation write = new WriteOperation(
        "write", "write data", sink,
        Arrays.asList(InputField.of("read", "first_name"), InputField.of("read", "last_name")));

    Set<Operation> operations = Sets.newHashSet(read, renameSocial, renameSocialAgain, dropSocial, write);
    FieldLineageInfo info = new FieldLineageInfo(operations);

    EndPointField sinkLastName = new EndPointField(sink, "last_name");
    EndPointField sinkFirstName = new EndPointField(sink, "first_name");
    EndPointField sourceLastName = new EndPointField(source, "last_name");
    EndPointField sourceFirstName = new EndPointField(source, "first_name");
    EndPointField sourceSocial = new EndPointField(source, "social");

    // outgoing: the two name fields reach the sink, the dropped social field maps to NULL_EPF
    Map<EndPointField, Set<EndPointField>> expectedOutgoing = new HashMap<>();
    expectedOutgoing.put(sourceFirstName, Collections.singleton(sinkFirstName));
    expectedOutgoing.put(sourceLastName, Collections.singleton(sinkLastName));
    expectedOutgoing.put(sourceSocial, Collections.singleton(FieldLineageInfo.NULL_EPF));
    Map<EndPointField, Set<EndPointField>> actualOutgoing = info.getOutgoingSummary();
    Assert.assertEquals(expectedOutgoing, actualOutgoing);

    // incoming: only the two fields written to the sink appear
    Map<EndPointField, Set<EndPointField>> expectedIncoming = new HashMap<>();
    expectedIncoming.put(sinkLastName, Collections.singleton(sourceLastName));
    expectedIncoming.put(sinkFirstName, Collections.singleton(sourceFirstName));
    Assert.assertEquals(expectedIncoming, info.getIncomingSummary());
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashMap(java.util.HashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 25 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageAdmin method getFieldsWithNoFieldLineage.

/**
 * Returns the names of the fields that appear in the dataset's schema but are not contained in
 * the given set of lineage fields.
 *
 * <p>The schema is read from the dataset's SYSTEM-scope metadata properties under
 * {@code MetadataConstants.SCHEMA_KEY}. If that property is absent a trace message is logged;
 * if the parsed schema reports no field list, nothing is logged. In both cases an empty set is
 * returned.
 *
 * @param dataset the endpoint whose namespace and name identify the dataset to look up
 * @param lineageFields field names that should be excluded from the result
 * @return the schema field names that are not in {@code lineageFields}, or an empty set when no
 *         schema (or no schema field list) is available
 * @throws IOException declared by this method; presumably raised by the metadata lookup or the
 *         schema parsing (confirm against those APIs)
 */
private Set<String> getFieldsWithNoFieldLineage(EndPoint dataset, Set<String> lineageFields) throws IOException {
    // get the system properties of this dataset
    Map<String, String> properties = metadataAdmin.getProperties(
        MetadataScope.SYSTEM, MetadataEntity.ofDataset(dataset.getNamespace(), dataset.getName()));
    // the system metadata contains the schema of the dataset which is written by the DatasetSystemMetadataWriter
    if (properties.containsKey(MetadataConstants.SCHEMA_KEY)) {
        Schema sc = Schema.parseJson(properties.get(MetadataConstants.SCHEMA_KEY));
        // fetch the field list once; it is null-checked before use
        List<Schema.Field> fields = sc.getFields();
        if (fields != null) {
            // keep only the schema fields that are not part of the lineageFields
            return fields.stream()
                .map(Schema.Field::getName)
                .filter(name -> !lineageFields.contains(name))
                .collect(Collectors.toSet());
        }
    } else {
        LOG.trace("Received request to include schema fields for {} but no schema was found. Only fields present in "
            + "the lineage store will be returned.", dataset);
    }
    return Collections.emptySet();
}
Also used : EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldLineageDetails(io.cdap.cdap.proto.metadata.lineage.FieldLineageDetails) Inject(com.google.inject.Inject) LoggerFactory(org.slf4j.LoggerFactory) DefaultFieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.DefaultFieldLineageReader) HashMap(java.util.HashMap) FieldLineageSummary(io.cdap.cdap.proto.metadata.lineage.FieldLineageSummary) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.FieldOperationInfo) ArrayList(java.util.ArrayList) FieldOperationInput(io.cdap.cdap.proto.metadata.lineage.FieldOperationInput) FieldOperationOutput(io.cdap.cdap.proto.metadata.lineage.FieldOperationOutput) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramFieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.ProgramFieldOperationInfo) DatasetId(io.cdap.cdap.proto.id.DatasetId) Map(java.util.Map) MetadataEntity(io.cdap.cdap.api.metadata.MetadataEntity) Operation(io.cdap.cdap.api.lineage.field.Operation) Nullable(javax.annotation.Nullable) ImmutableSet(com.google.common.collect.ImmutableSet) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Logger(org.slf4j.Logger) RunIds(io.cdap.cdap.common.app.RunIds) DatasetField(io.cdap.cdap.proto.metadata.lineage.DatasetField) Field(io.cdap.cdap.proto.metadata.lineage.Field) ProgramId(io.cdap.cdap.proto.id.ProgramId) Set(java.util.Set) ProgramInfo(io.cdap.cdap.proto.metadata.lineage.ProgramInfo) IOException(java.io.IOException) Schema(io.cdap.cdap.api.data.schema.Schema) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Stream(java.util.stream.Stream) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) 
MetadataScope(io.cdap.cdap.api.metadata.MetadataScope) MetadataConstants(io.cdap.cdap.spi.metadata.MetadataConstants) Constants(io.cdap.cdap.common.conf.Constants) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) VisibleForTesting(com.google.common.annotations.VisibleForTesting) FieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader) Collections(java.util.Collections) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) DatasetField(io.cdap.cdap.proto.metadata.lineage.DatasetField) Field(io.cdap.cdap.proto.metadata.lineage.Field) Schema(io.cdap.cdap.api.data.schema.Schema)

Aggregations

EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)33 HashSet (java.util.HashSet)28 Test (org.junit.Test)26 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)24 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)24 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)24 Operation (io.cdap.cdap.api.lineage.field.Operation)23 ArrayList (java.util.ArrayList)19 HashMap (java.util.HashMap)14 List (java.util.List)11 ImmutableList (com.google.common.collect.ImmutableList)10 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)10 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)10 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)10 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)10 Connection (io.cdap.cdap.etl.proto.Connection)10 ImmutableSet (com.google.common.collect.ImmutableSet)9 Set (java.util.Set)9 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)8 LinkedHashSet (java.util.LinkedHashSet)7