Search in sources :

Example 26 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testCycleWithNonExistentOperationNames.

/**
 * Topological sorting must be rejected with an {@link IllegalArgumentException} when "parse" and
 * "normalize" consume each other's output, and the inputs additionally name origins
 * ("nop1", "nop2", "nop3") that are not among the operations handed to the sort.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
    ReadOperation read = new ReadOperation("read", "read", EndPoint.of("ns", "file1"), "offset", "body");

    // parse depends on normalize (and on the unknown origin "nop1")
    List<InputField> parseInputs = Arrays.asList(
        InputField.of("read", "body"),
        InputField.of("normalize", "name"),
        InputField.of("nop1", "field1"));
    TransformOperation parse = new TransformOperation("parse", "parse", parseInputs, "name", "address");

    // normalize depends on parse (and on the unknown origin "nop2"), closing the cycle
    List<InputField> normalizeInputs = Arrays.asList(
        InputField.of("parse", "name"),
        InputField.of("nop2", "field2"));
    TransformOperation normalize = new TransformOperation("normalize", "normalize", normalizeInputs, "name");

    List<InputField> writeInputs = Arrays.asList(
        InputField.of("normalize", "name"),
        InputField.of("parse", "address"),
        InputField.of("nop3", "field3"));
    WriteOperation write = new WriteOperation("write", "writing to another file", EndPoint.of("ns", "file2"), writeInputs);

    List<Operation> operations = Arrays.asList(parse, read, normalize, write);
    FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 27 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiSourceDroppedFields.

/**
 * Verifies the summaries when one source field ("social") is consumed by a transform that
 * produces no output: the outgoing summary maps it to {@code FieldLineageInfo.NULL_EPF}, and
 * the incoming summary only lists the fields that reach the destination.
 */
@Test
public void testMultiSourceDroppedFields() {
    EndPoint source = EndPoint.of("endpoint1");
    EndPoint sink = EndPoint.of("endpoint2");

    ReadOperation read = new ReadOperation("read", "some read", source, "first_name", "last_name", "social");
    TransformOperation combineNames = new TransformOperation(
        "combineNames", "combine names",
        Arrays.asList(InputField.of("read", "first_name"), InputField.of("read", "last_name")),
        "full_name");
    // dropSocial declares an input but no output fields
    TransformOperation dropSocial = new TransformOperation(
        "dropSocial", "drop social",
        Collections.singletonList(InputField.of("read", "social")));
    WriteOperation write = new WriteOperation(
        "write", "write data", sink,
        Collections.singletonList(InputField.of("combineNames", "full_name")));

    Set<Operation> operations = Sets.newHashSet(read, write, combineNames, dropSocial);
    FieldLineageInfo lineage = new FieldLineageInfo(operations);

    EndPointField firstName = new EndPointField(source, "first_name");
    EndPointField lastName = new EndPointField(source, "last_name");
    EndPointField social = new EndPointField(source, "social");
    EndPointField fullName = new EndPointField(sink, "full_name");

    // source field -> destination fields it contributes to
    Map<EndPointField, Set<EndPointField>> expectedOutgoing = new HashMap<>();
    expectedOutgoing.put(firstName, Collections.singleton(fullName));
    expectedOutgoing.put(lastName, Collections.singleton(fullName));
    expectedOutgoing.put(social, Collections.singleton(FieldLineageInfo.NULL_EPF));
    Assert.assertEquals(expectedOutgoing, lineage.getOutgoingSummary());

    // destination field -> source fields it was derived from
    Map<EndPointField, Set<EndPointField>> expectedIncoming = new HashMap<>();
    expectedIncoming.put(fullName, Sets.newHashSet(firstName, lastName));
    Assert.assertEquals(expectedIncoming, lineage.getIncomingSummary());
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashMap(java.util.HashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 28 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testSimpleFieldLineageSummary.

/**
 * Checks destination fields, incoming/outgoing summaries and per-field operation sets for a
 * linear read -> parse -> concat -> write pipeline.
 */
@Test
public void testSimpleFieldLineageSummary() {
    // Pipeline under test:
    //   read:   file -> (offset, body)
    //   parse:  (body) -> (first_name, last_name)
    //   concat: (first_name, last_name) -> (name)
    //   write:  (offset, name) -> another_file
    EndPoint source = EndPoint.of("endpoint1");
    EndPoint destination = EndPoint.of("myns", "another_file");

    ReadOperation read = new ReadOperation("read", "some read", source, "offset", "body");
    TransformOperation parse = new TransformOperation(
        "parse", "parsing body",
        Collections.singletonList(InputField.of("read", "body")),
        "first_name", "last_name");
    TransformOperation concat = new TransformOperation(
        "concat", "concatinating the fields",
        Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")),
        "name");
    WriteOperation write = new WriteOperation(
        "write_op", "writing data to file", destination,
        Arrays.asList(InputField.of("read", "offset"), InputField.of("concat", "name")));

    // operations are supplied in the order parse, concat, read, write
    List<Operation> operations = new ArrayList<>(Arrays.asList(parse, concat, read, write));
    FieldLineageInfo info = new FieldLineageInfo(operations);

    // EndPoint(myns, another_file) should have two fields: offset and name
    Map<EndPoint, Set<String>> destinationFields = info.getDestinationFields();
    Assert.assertEquals(1, destinationFields.size());
    Assert.assertEquals(new HashSet<>(Arrays.asList("offset", "name")), destinationFields.get(destination));

    Map<EndPointField, Set<EndPointField>> incomingSummary = info.getIncomingSummary();
    Map<EndPointField, Set<EndPointField>> outgoingSummary = info.getOutgoingSummary();

    EndPointField sourceOffset = new EndPointField(source, "offset");
    EndPointField sourceBody = new EndPointField(source, "body");
    EndPointField destinationOffset = new EndPointField(destination, "offset");
    EndPointField destinationName = new EndPointField(destination, "name");

    // offset in the destination is generated from offset field read from source
    Set<EndPointField> offsetOrigins = incomingSummary.get(destinationOffset);
    Assert.assertEquals(1, offsetOrigins.size());
    Assert.assertEquals(sourceOffset, offsetOrigins.iterator().next());

    // only read and write touch the offset field, in both directions
    Set<Operation> offsetOperations = new HashSet<>(Arrays.asList(write, read));
    Assert.assertEquals(offsetOperations, info.getIncomingOperationsForField(destinationOffset));
    Assert.assertEquals(offsetOperations, info.getOutgoingOperationsForField(sourceOffset));

    // name in the destination is generated from body field read from source
    Set<EndPointField> nameOrigins = incomingSummary.get(destinationName);
    Assert.assertEquals(1, nameOrigins.size());
    Assert.assertEquals(sourceBody, nameOrigins.iterator().next());

    // every operation of the pipeline is involved in producing name from body
    Set<Operation> nameOperations = new HashSet<>(Arrays.asList(write, concat, parse, read));
    Assert.assertEquals(nameOperations, info.getIncomingOperationsForField(destinationName));

    // offset in the source should only affect the field offset in the destination
    Set<EndPointField> offsetTargets = outgoingSummary.get(sourceOffset);
    Assert.assertEquals(1, offsetTargets.size());
    Assert.assertEquals(destinationOffset, offsetTargets.iterator().next());

    // test outgoing operations for body field
    Assert.assertEquals(nameOperations, info.getOutgoingOperationsForField(sourceBody));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 29 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testLargeLineageOperation.

/**
 * Builds a lineage over 100 fields that passes through five chained {@code generateLineage}
 * stages and expects, within the 10 second timeout, an incoming summary in which every
 * destination field is derived from every source field.
 */
@Test(timeout = 10000)
public void testLargeLineageOperation() {
    List<String> inputs = new ArrayList<>();
    int fieldIndex = 0;
    while (fieldIndex < 100) {
        inputs.add("num" + fieldIndex);
        fieldIndex++;
    }

    List<Operation> operations = new ArrayList<>();
    ReadOperation read = new ReadOperation("read", "Read from something", EndPoint.of("start"), inputs);
    operations.add(read);

    // generate 500+ operations with 5 identity + all-to-all combos
    generateLineage(inputs, operations, "first identity", "read", "alltoall1");
    generateLineage(inputs, operations, "second identity", "alltoall1", "alltoall2");
    generateLineage(inputs, operations, "third identity", "alltoall2", "alltoall3");
    generateLineage(inputs, operations, "forth identity", "alltoall3", "alltoall4");
    generateLineage(inputs, operations, "fifth identity", "alltoall4", "alltoall5");

    // the write consumes every field produced by the last all-to-all stage
    List<InputField> writeInputs = new ArrayList<>();
    for (String field : inputs) {
        writeInputs.add(InputField.of("alltoall5", field));
    }
    operations.add(new WriteOperation("Write", "", EndPoint.of("dest"), writeInputs));

    FieldLineageInfo info = new FieldLineageInfo(operations);
    Assert.assertNotNull(info);

    // every destination field maps to the same (shared) set holding all source fields
    Set<EndPointField> relatedSources = new HashSet<>();
    Map<EndPointField, Set<EndPointField>> expectedIncoming = new HashMap<>();
    for (int i = 0; i < inputs.size(); i++) {
        String fieldName = "num" + i;
        relatedSources.add(new EndPointField(EndPoint.of("start"), fieldName));
        expectedIncoming.put(new EndPointField(EndPoint.of("dest"), fieldName), relatedSources);
    }
    Assert.assertEquals(expectedIncoming, info.getIncomingSummary());
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 30 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testCycle.

/**
 * Topological sorting must be rejected with an {@link IllegalArgumentException} when "parse"
 * takes "normalize.name" as input while "normalize" takes "parse.name" as input.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
    ReadOperation read = new ReadOperation("read", "read", EndPoint.of("ns", "file1"), "offset", "body");

    // parse <-> normalize reference each other's "name" field
    List<InputField> parseInputs = Arrays.asList(
        InputField.of("read", "body"),
        InputField.of("normalize", "name"));
    TransformOperation parse = new TransformOperation("parse", "parse", parseInputs, "name", "address");
    TransformOperation normalize = new TransformOperation(
        "normalize", "normalize",
        Collections.singletonList(InputField.of("parse", "name")),
        "name");

    List<InputField> writeInputs = Arrays.asList(
        InputField.of("normalize", "name"),
        InputField.of("parse", "address"));
    WriteOperation write = new WriteOperation("write", "writing to another file", EndPoint.of("ns", "file2"), writeInputs);

    List<Operation> operations = Arrays.asList(parse, read, normalize, write);
    FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Aggregations

ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation): 42, TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation): 42, WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation): 42, Operation (io.cdap.cdap.api.lineage.field.Operation): 40, Test (org.junit.Test): 33, ArrayList (java.util.ArrayList): 30, HashSet (java.util.HashSet): 30, EndPoint (io.cdap.cdap.api.lineage.field.EndPoint): 24, HashMap (java.util.HashMap): 17, LinkedHashSet (java.util.LinkedHashSet): 15, FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation): 14, FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation): 14, FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation): 14, ImmutableList (com.google.common.collect.ImmutableList): 13, FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation): 13, Connection (io.cdap.cdap.etl.proto.Connection): 13, List (java.util.List): 13, Set (java.util.Set): 11, ImmutableSet (com.google.common.collect.ImmutableSet): 10, InputField (io.cdap.cdap.api.lineage.field.InputField): 8