Search in sources :

Example 16 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiPathFieldLineage.

/**
 * Exercises a lineage graph in which two reads feed one merge step, and the merge output reaches
 * the destination both directly and through a parse step. Checks the destination fields, the
 * incoming and outgoing summaries, and the outgoing operations for every source field.
 */
@Test
public void testMultiPathFieldLineage() {
    // read1: file1 -> (offset, body)
    // read2: file2 -> (offset, body)
    // merge: (read1.offset, read1.body, read2.offset, read2.body) -> (offset, body)
    // parse: (merge.body) -> (name,address)
    // write: (parse.name, parse.address, merge.offset) -> file
    EndPoint source1 = EndPoint.of("ns1", "file1");
    EndPoint source2 = EndPoint.of("ns2", "file2");
    EndPoint destination = EndPoint.of("ns3", "file");

    ReadOperation read1 = new ReadOperation("read1", "Reading from file1", source1, "offset", "body");
    ReadOperation read2 = new ReadOperation("read2", "Reading from file2", source2, "offset", "body");
    List<InputField> mergeInputs = Arrays.asList(
        InputField.of("read1", "offset"),
        InputField.of("read2", "offset"),
        InputField.of("read1", "body"),
        InputField.of("read2", "body"));
    TransformOperation merge = new TransformOperation("merge", "merging fields", mergeInputs, "offset", "body");
    TransformOperation parse = new TransformOperation(
        "parse", "parsing body", Collections.singletonList(InputField.of("merge", "body")), "name", "address");
    List<InputField> writeInputs = Arrays.asList(
        InputField.of("merge", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "address"));
    WriteOperation write = new WriteOperation("write", "writing to another file", destination, writeInputs);

    // The operations are deliberately supplied out of dependency order.
    List<Operation> operations = new ArrayList<>(Arrays.<Operation>asList(parse, merge, read1, read2, write));
    FieldLineageInfo lineage = new FieldLineageInfo(operations);

    // Destination fields: a single endpoint carrying all three written fields.
    List<String> writtenFields = Arrays.asList("name", "address", "offset");
    Map<EndPoint, Set<String>> destinationFields = lineage.getDestinationFields();
    Assert.assertEquals(1, destinationFields.size());
    Assert.assertEquals(new HashSet<>(writtenFields), destinationFields.get(destination));

    // Incoming summary: every destination field is expected to trace back to all four source fields.
    Map<EndPointField, Set<EndPointField>> incoming = lineage.getIncomingSummary();
    Assert.assertEquals(3, incoming.size());
    Set<EndPointField> allSourceFields = new HashSet<>(Arrays.asList(
        new EndPointField(source1, "body"),
        new EndPointField(source1, "offset"),
        new EndPointField(source2, "body"),
        new EndPointField(source2, "offset")));
    for (String field : writtenFields) {
        Assert.assertEquals(allSourceFields, incoming.get(new EndPointField(destination, field)));
    }

    // Outgoing summary: every source field is expected to reach all three destination fields.
    Map<EndPointField, Set<EndPointField>> outgoing = lineage.getOutgoingSummary();
    Assert.assertEquals(4, outgoing.size());
    Set<EndPointField> allDestinationFields = new HashSet<>(Arrays.asList(
        new EndPointField(destination, "offset"),
        new EndPointField(destination, "name"),
        new EndPointField(destination, "address")));
    List<String> readFields = Arrays.asList("offset", "body");
    for (EndPoint source : Arrays.asList(source1, source2)) {
        for (String field : readFields) {
            Assert.assertEquals(allDestinationFields, outgoing.get(new EndPointField(source, field)));
        }
    }

    // test outgoing operations of all source endpoints
    Set<Operation> expectedFromRead1 = new HashSet<>(Arrays.<Operation>asList(read1, merge, parse, write));
    for (String field : readFields) {
        Assert.assertEquals(expectedFromRead1, lineage.getOutgoingOperationsForField(new EndPointField(source1, field)));
    }
    Set<Operation> expectedFromRead2 = new HashSet<>(Arrays.<Operation>asList(read2, merge, parse, write));
    for (String field : readFields) {
        Assert.assertEquals(expectedFromRead2, lineage.getOutgoingOperationsForField(new EndPointField(source2, field)));
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 17 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testValidOperations.

/**
 * Builds lineage info from a valid read/parse/write set and checks that it compares equal after a
 * JSON round trip and after reordering the operations, and compares unequal once the write
 * operation targets a different namespace.
 */
@Test
public void testValidOperations() {
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation(
        "parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation(
        "write", "write data", EndPoint.of("ns", "endpoint2"),
        Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body")));

    // One list is reused (cleared and refilled) for every scenario below.
    List<Operation> operations = new ArrayList<>();
    operations.addAll(Arrays.<Operation>asList(read, write, parse));
    FieldLineageInfo baseline = new FieldLineageInfo(operations);

    // Serializing and deserializing should result in the same checksum.
    String serialized = GSON.toJson(baseline.getOperations());
    Type operationSetType = new TypeToken<Set<Operation>>() {
    }.getType();
    Set<Operation> deserialized = GSON.fromJson(serialized, operationSetType);
    FieldLineageInfo roundTripped = new FieldLineageInfo(deserialized);
    Assert.assertEquals(baseline, roundTripped);

    // Create lineage info with different ordering of same operations. Checksum should still be same.
    operations.clear();
    operations.addAll(Arrays.<Operation>asList(write, parse, read));
    FieldLineageInfo reordered = new FieldLineageInfo(operations);
    Assert.assertEquals(baseline, reordered);

    // Change the namespace name of the write operation from ns to myns. The checksum should change now.
    operations.clear();
    WriteOperation writeToOtherNamespace = new WriteOperation(
        "write", "write data", EndPoint.of("myns", "endpoint2"),
        Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body")));
    operations.addAll(Arrays.<Operation>asList(writeToOtherNamespace, parse, read));
    FieldLineageInfo differentNamespace = new FieldLineageInfo(operations);
    Assert.assertNotEquals(baseline, differentNamespace);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Type(java.lang.reflect.Type) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 18 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfoTest method generateLineage.

/**
 * Appends a fan-in lineage fragment to {@code operations}: one single-field "identity transform"
 * per entry of {@code inputs}, followed by one "all to all transform" that consumes the output of
 * every identity transform and emits all of {@code inputs} again.
 *
 * @param inputs field names to generate operations for
 * @param operations list the generated operations are appended to
 * @param identityNamePrefix prefix of each identity transform's name; the field's index is appended
 * @param identityOrigin origin name each identity transform reads its field from
 * @param transform name given to the final all-to-all transform
 */
private void generateLineage(List<String> inputs, List<Operation> operations, String identityNamePrefix, String identityOrigin, String transform) {
    // Inputs of the final transform; tracing back from it has to visit every identity transform.
    List<InputField> allToAllInputs = new ArrayList<>();
    int index = 0;
    for (String field : inputs) {
        String identityName = identityNamePrefix + index;
        // emit identity transform for this field
        operations.add(new TransformOperation(
            identityName, "identity transform", Collections.singletonList(InputField.of(identityOrigin, field)), field));
        allToAllInputs.add(InputField.of(identityName, field));
        index++;
    }
    operations.add(new TransformOperation(transform, "all to all transform", allToAllInputs, inputs));
}
Also used : InputField(io.cdap.cdap.api.lineage.field.InputField) ArrayList(java.util.ArrayList) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint)

Example 19 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testInvalidOperations.

/**
 * Checks that constructing lineage info is rejected with an {@link IllegalArgumentException} when
 * the operations lack a read, lack a write, repeat an operation name, or reference origins for
 * which no operation was supplied.
 */
@Test
public void testInvalidOperations() {
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation(
        "parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation(
        "write", "write data", EndPoint.of("ns", "endpoint2"),
        Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body")));

    // Case 1: no read operation.
    List<Operation> operations = new ArrayList<>();
    operations.addAll(Arrays.<Operation>asList(parse, write));
    try {
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since no read operation is specified.");
    } catch (IllegalArgumentException expected) {
        Assert.assertEquals("Field level lineage requires at least one operation of type 'READ'.", expected.getMessage());
    }

    // Case 2: no write operation.
    operations.clear();
    operations.addAll(Arrays.<Operation>asList(read, parse));
    try {
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since no write operation is specified.");
    } catch (IllegalArgumentException expected) {
        Assert.assertEquals("Field level lineage requires at least one operation of type 'WRITE'.", expected.getMessage());
    }

    // Case 3: two operations share the name 'write' (read and parse are still in the list).
    WriteOperation duplicateWrite = new WriteOperation(
        "write", "write data", EndPoint.of("ns", "endpoint3"),
        Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body")));
    operations.addAll(Arrays.<Operation>asList(write, duplicateWrite));
    try {
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since operation name 'write' is repeated.");
    } catch (IllegalArgumentException expected) {
        Assert.assertTrue(expected.getMessage().contains("Operation name 'write' is repeated"));
    }

    // Case 4: a transform whose inputs name origins that no supplied operation provides.
    operations.clear();
    TransformOperation invalidOrigin = new TransformOperation(
        "anotherparse", "parse body",
        Arrays.asList(InputField.of("invalid", "body"), InputField.of("anotherinvalid", "body")), "name", "address");
    operations.addAll(Arrays.<Operation>asList(read, parse, write, invalidOrigin));
    try {
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since operation with name 'invalid' "
            + "and 'anotherinvalid' do not exist.");
    } catch (IllegalArgumentException expected) {
        // NOTE(review): the expected text hard-codes the order in which the two origins are printed;
        // confirm the implementation guarantees that order.
        Assert.assertEquals("No operation is associated with the origins '[invalid, anotherinvalid]'.", expected.getMessage());
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 20 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfo method computeAndValidateFieldLineageInfo.

/**
 * Indexes the supplied operations and validates that they can form a field level lineage graph.
 *
 * <p>Resets and repopulates {@code operationsMap}, {@code readOperations}, {@code writeOperations}
 * and {@code operationOutgoingConnections}. After the call every operation name has an entry in
 * {@code operationOutgoingConnections}; operations nothing reads from map to an empty set.
 *
 * @param operations the operations to index and validate
 * @throws IllegalArgumentException if an operation name is repeated, a read operation has no source
 *     endpoint, a write operation has no destination endpoint, no read or no write operation is
 *     present, or an input field names an origin for which no operation was supplied
 */
private void computeAndValidateFieldLineageInfo(Collection<? extends Operation> operations) {
    Set<String> allOrigins = new HashSet<>();
    this.operationsMap = new HashMap<>();
    this.writeOperations = new HashSet<>();
    this.readOperations = new HashSet<>();
    this.operationOutgoingConnections = new HashMap<>();

    for (Operation operation : operations) {
        if (operationsMap.containsKey(operation.getName())) {
            throw new IllegalArgumentException(String.format(
                "All operations provided for creating field "
                    + "level lineage info must have unique names. "
                    + "Operation name '%s' is repeated.",
                operation.getName()));
        }
        operationsMap.put(operation.getName(), operation);

        switch (operation.getType()) {
            case READ:
                ReadOperation read = (ReadOperation) operation;
                if (read.getSource() == null) {
                    throw new IllegalArgumentException(String.format(
                        "Source endpoint cannot be null for the read " + "operation '%s'.", read.getName()));
                }
                readOperations.add(read);
                break;
            case TRANSFORM:
                TransformOperation transform = (TransformOperation) operation;
                Set<String> transformOrigins =
                    transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
                addOutgoingConnections(transformOrigins, transform);
                allOrigins.addAll(transformOrigins);
                if (transform.getOutputs().isEmpty()) {
                    // NOTE(review): unlike the collections above, dropTransforms is not initialized in
                    // this method - confirm it is initialized before this method runs.
                    dropTransforms.add(transform);
                }
                break;
            case WRITE:
                WriteOperation write = (WriteOperation) operation;
                if (write.getDestination() == null) {
                    throw new IllegalArgumentException(String.format(
                        "Destination endpoint cannot be null for the write " + "operation '%s'.", write.getName()));
                }
                Set<String> writeOrigins =
                    write.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
                addOutgoingConnections(writeOrigins, write);
                allOrigins.addAll(writeOrigins);
                writeOperations.add(write);
                break;
            default:
                // no bookkeeping is done for any other operation type
                break;
        }
    }

    // Put an empty set for operations with no outgoing connection rather than checking for null later.
    // Iterating the operation names directly avoids mutating operationOutgoingConnections while
    // walking a live Sets.difference view that is backed by its key set.
    for (String operationName : operationsMap.keySet()) {
        operationOutgoingConnections.computeIfAbsent(operationName, k -> new HashSet<>());
    }

    if (readOperations.isEmpty()) {
        throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'READ'.");
    }
    if (writeOperations.isEmpty()) {
        throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'WRITE'.");
    }

    // Every origin referenced by an input field must be the name of a supplied operation.
    Sets.SetView<String> invalidOrigins = Sets.difference(allOrigins, operationsMap.keySet());
    if (!invalidOrigins.isEmpty()) {
        throw new IllegalArgumentException(String.format("No operation is associated with the origins '%s'.", invalidOrigins));
    }
}

/**
 * Records a connection from each of the given origins to {@code target} in
 * {@code operationOutgoingConnections}, creating the origin's connection set on first use.
 *
 * @param origins names of the operations whose output {@code target} consumes
 * @param target the operation consuming fields produced by the origins
 */
private void addOutgoingConnections(Set<String> origins, Operation target) {
    for (String origin : origins) {
        operationOutgoingConnections.computeIfAbsent(origin, k -> new HashSet<>()).add(target);
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Sets(com.google.common.collect.Sets) HashSet(java.util.HashSet)

Aggregations

TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)42 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)39 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)38 Operation (io.cdap.cdap.api.lineage.field.Operation)36 ArrayList (java.util.ArrayList)29 HashSet (java.util.HashSet)29 Test (org.junit.Test)29 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)23 HashMap (java.util.HashMap)18 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)14 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)14 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)14 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)14 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)13 Connection (io.cdap.cdap.etl.proto.Connection)13 LinkedHashSet (java.util.LinkedHashSet)12 InputField (io.cdap.cdap.api.lineage.field.InputField)10 Set (java.util.Set)9 ImmutableSet (com.google.common.collect.ImmutableSet)7