Search in sources :

Example 26 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfo method getIncomingOperationsForField.

/**
 * Returns the subset of operations that took part in computing the given field of the given
 * destination.
 *
 * <p>For example, given the following operations:</p>
 * <pre>
 * pRead: personFile -> (offset, body)
 * parse: body -> (id, name, address)
 * cRead: codeFile -> id
 * codeGen: (parse.id, cRead.id) -> id
 * sWrite: (codeGen.id, parse.name, parse.address) -> secureStore
 * iWrite: (parse.id, parse.name, parse.address) -> insecureStore
 * </pre>
 * <p>For the 'id' field of insecureStore the result contains the operations iWrite, parse and pRead.</p>
 * <p>For the 'id' field of secureStore the result contains the operations sWrite, codeGen, parse, pRead
 * and cRead.</p>
 *
 * @param destinationField the EndPointField whose contributing operations are requested
 * @return the operations reached by walking backward from every matching write operation; empty if no
 *         write operation targets the field's endpoint with an input of that name
 */
Set<Operation> getIncomingOperationsForField(EndPointField destinationField) {
    // writeOperations has not been populated yet, so build the lineage structures first
    if (writeOperations == null) {
        computeAndValidateFieldLineageInfo(this.operations);
    }
    Set<Operation> reached = new HashSet<>();
    for (WriteOperation write : writeOperations) {
        // only writes whose destination is the endpoint owning destinationField are relevant
        if (write.getDestination().equals(destinationField.getEndPoint())) {
            write.getInputs().stream()
                .filter(input -> input.getName().equals(destinationField.getField()))
                .collect(Collectors.toSet())
                .forEach(input -> {
                    // the write itself is part of the result as soon as one of its inputs matches
                    reached.add(write);
                    // walk backward from the operation that produced this input (its origin)
                    getIncomingOperationsForFieldHelper(operationsMap.get(input.getOrigin()), reached);
                });
        }
    }
    return reached;
}
Also used : EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Checksums(io.cdap.cdap.common.utils.Checksums) GsonBuilder(com.google.gson.GsonBuilder) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Gson(com.google.gson.Gson) Map(java.util.Map) Operation(io.cdap.cdap.api.lineage.field.Operation) Charsets(com.google.common.base.Charsets) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Collection(java.util.Collection) OperationType(io.cdap.cdap.api.lineage.field.OperationType) Set(java.util.Set) OperationTypeAdapter(io.cdap.cdap.proto.codec.OperationTypeAdapter) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) InputField(io.cdap.cdap.api.lineage.field.InputField) List(java.util.List) Comparator(java.util.Comparator) Collections(java.util.Collections) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet)

Example 27 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfo method containsInputField.

/**
 * Tells whether the given operation consumes the given input field.
 *
 * @param nextOperation the operation to inspect; only {@link TransformOperation} and {@link
 * WriteOperation} carry inputs, any other operation type yields {@code false}
 * @param inputField the field whose usage needs to be checked
 * @return true if {@code inputField} is among the inputs of {@code nextOperation}
 */
private boolean containsInputField(Operation nextOperation, InputField inputField) {
    OperationType type = nextOperation.getType();
    if (OperationType.WRITE == type) {
        Set<InputField> writeInputs = new HashSet<>(((WriteOperation) nextOperation).getInputs());
        return writeInputs.contains(inputField);
    }
    if (OperationType.TRANSFORM == type) {
        Set<InputField> transformInputs = new HashSet<>(((TransformOperation) nextOperation).getInputs());
        return transformInputs.contains(inputField);
    }
    // operations of any other type have no inputs to match against
    return false;
}
Also used : InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) HashSet(java.util.HashSet)

Example 28 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiSourceSingleDestinationWithoutMerge.

@Test
public void testMultiSourceSingleDestinationWithoutMerge() {
    // Operation graph exercised by this test:
    //   pRead: personFile -> (offset, body)
    //   parse: body -> (id, name, address)
    //   cRead: codeFile -> id
    //   codeGen: (parse.id, cRead.id) -> id
    //   sWrite: (codeGen.id, parse.name, parse.address) -> secureStore
    //   iWrite: (parse.id, parse.name, parse.address) -> insecureStore
    EndPoint personEp = EndPoint.of("ns", "personFile");
    EndPoint codeEp = EndPoint.of("ns", "codeFile");
    EndPoint secureEp = EndPoint.of("ns", "secureStore");
    EndPoint insecureEp = EndPoint.of("ns", "insecureStore");

    ReadOperation pRead = new ReadOperation("pRead", "Reading from person file", personEp, "offset", "body");
    ReadOperation cRead = new ReadOperation("cRead", "Reading from code file", codeEp, "id");
    TransformOperation parse = new TransformOperation("parse", "parsing body",
        Collections.singletonList(InputField.of("pRead", "body")), "id", "name", "address");
    TransformOperation codeGen = new TransformOperation("codeGen", "Generate secure code",
        Arrays.asList(InputField.of("parse", "id"), InputField.of("cRead", "id")), "id");
    WriteOperation sWrite = new WriteOperation("sWrite", "writing secure store", secureEp,
        Arrays.asList(InputField.of("codeGen", "id"), InputField.of("parse", "name"), InputField.of("parse", "address")));
    WriteOperation iWrite = new WriteOperation("iWrite", "writing insecure store", insecureEp,
        Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name"), InputField.of("parse", "address")));

    List<Operation> operations =
        new ArrayList<>(Arrays.<Operation>asList(pRead, cRead, parse, codeGen, sWrite, iWrite));
    FieldLineageInfo lineage = new FieldLineageInfo(operations);

    // both stores receive the same three fields; the person file is never written to
    Map<EndPoint, Set<String>> destinationFields = lineage.getDestinationFields();
    Set<String> writtenFields = new HashSet<>(Arrays.asList("id", "name", "address"));
    Assert.assertEquals(writtenFields, destinationFields.get(secureEp));
    Assert.assertEquals(writtenFields, destinationFields.get(insecureEp));
    Assert.assertNull(destinationFields.get(personEp));

    Map<EndPointField, Set<EndPointField>> incomingSummary = lineage.getIncomingSummary();
    Assert.assertEquals(6, incomingSummary.size());
    EndPointField personBody = new EndPointField(personEp, "body");
    // every field of the insecure endpoint depends only on the body field of the person file
    for (String field : Arrays.asList("id", "name", "address")) {
        Set<EndPointField> sources = incomingSummary.get(new EndPointField(insecureEp, field));
        Assert.assertEquals(1, sources.size());
        Assert.assertEquals(personBody, sources.iterator().next());
    }
    // name and address of the secure endpoint also depend only on the body field of the person file
    for (String field : Arrays.asList("name", "address")) {
        Set<EndPointField> sources = incomingSummary.get(new EndPointField(secureEp, field));
        Assert.assertEquals(1, sources.size());
        Assert.assertEquals(personBody, sources.iterator().next());
    }
    // id of the secure endpoint depends on both the person file body and the code file id
    Set<EndPointField> expectedSet = new HashSet<>(Arrays.asList(
        new EndPointField(personEp, "body"),
        new EndPointField(codeEp, "id")));
    Assert.assertEquals(expectedSet, incomingSummary.get(new EndPointField(secureEp, "id")));

    Map<EndPointField, Set<EndPointField>> outgoingSummary = lineage.getOutgoingSummary();
    // the outgoing summary has no entry for offset, only for personFile.body and codeFile.id
    Assert.assertEquals(2, outgoingSummary.size());
    // body affects all fields of both the secure and the insecure endpoint
    expectedSet = new HashSet<>(Arrays.asList(
        new EndPointField(insecureEp, "id"),
        new EndPointField(insecureEp, "name"),
        new EndPointField(insecureEp, "address"),
        new EndPointField(secureEp, "id"),
        new EndPointField(secureEp, "name"),
        new EndPointField(secureEp, "address")));
    Assert.assertEquals(expectedSet, outgoingSummary.get(new EndPointField(personEp, "body")));
    // the code file id only affects the id field of the secure endpoint
    expectedSet = new HashSet<>(Collections.singletonList(new EndPointField(secureEp, "id")));
    Assert.assertEquals(expectedSet, outgoingSummary.get(new EndPointField(codeEp, "id")));

    // incoming operations for every destination field
    Set<Operation> viaInsecureWrite = new HashSet<>(Arrays.<Operation>asList(iWrite, parse, pRead));
    Set<Operation> viaSecureWrite = new HashSet<>(Arrays.<Operation>asList(sWrite, parse, pRead));

    Set<Operation> incomingOps = lineage.getIncomingOperationsForField(new EndPointField(insecureEp, "id"));
    Assert.assertEquals(viaInsecureWrite, incomingOps);

    // this case compares lineage infos built from the two operation sets rather than the raw sets
    incomingOps = lineage.getIncomingOperationsForField(new EndPointField(insecureEp, "name"));
    Assert.assertEquals(new FieldLineageInfo(viaInsecureWrite), new FieldLineageInfo(incomingOps));

    incomingOps = lineage.getIncomingOperationsForField(new EndPointField(insecureEp, "address"));
    Assert.assertEquals(viaInsecureWrite, incomingOps);

    incomingOps = lineage.getIncomingOperationsForField(new EndPointField(secureEp, "id"));
    Assert.assertEquals(new HashSet<>(Arrays.<Operation>asList(sWrite, codeGen, cRead, parse, pRead)), incomingOps);

    incomingOps = lineage.getIncomingOperationsForField(new EndPointField(secureEp, "name"));
    Assert.assertEquals(viaSecureWrite, incomingOps);

    incomingOps = lineage.getIncomingOperationsForField(new EndPointField(secureEp, "address"));
    Assert.assertEquals(viaSecureWrite, incomingOps);

    // outgoing operations for every source field
    Set<Operation> outgoingOps = lineage.getOutgoingOperationsForField(new EndPointField(personEp, "offset"));
    Assert.assertEquals(new HashSet<>(Collections.<Operation>singletonList(pRead)), outgoingOps);

    outgoingOps = lineage.getOutgoingOperationsForField(new EndPointField(personEp, "body"));
    Assert.assertEquals(new HashSet<>(Arrays.<Operation>asList(sWrite, iWrite, codeGen, parse, pRead)), outgoingOps);

    outgoingOps = lineage.getOutgoingOperationsForField(new EndPointField(codeEp, "id"));
    Assert.assertEquals(new HashSet<>(Arrays.<Operation>asList(sWrite, codeGen, cRead)), outgoingOps);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 29 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testCycleWithNonExistentOperationNames.

@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
    // parse and normalize consume each other's output (a cycle), and every non-read operation also
    // lists an input whose origin (nop1, nop2, nop3) is not among the operations passed in.
    // Sorting such a set is expected to fail with IllegalArgumentException.
    EndPoint sourceFile = EndPoint.of("ns", "file1");
    EndPoint targetFile = EndPoint.of("ns", "file2");

    ReadOperation read = new ReadOperation("read", "read", sourceFile, "offset", "body");

    List<InputField> parseInputs = Arrays.asList(
        InputField.of("read", "body"),
        InputField.of("normalize", "name"),
        InputField.of("nop1", "field1"));
    TransformOperation parse = new TransformOperation("parse", "parse", parseInputs, "name", "address");

    List<InputField> normalizeInputs = Arrays.asList(
        InputField.of("parse", "name"),
        InputField.of("nop2", "field2"));
    TransformOperation normalize = new TransformOperation("normalize", "normalize", normalizeInputs, "name");

    List<InputField> writeInputs = Arrays.asList(
        InputField.of("normalize", "name"),
        InputField.of("parse", "address"),
        InputField.of("nop3", "field3"));
    WriteOperation write = new WriteOperation("write", "writing to another file", targetFile, writeInputs);

    List<Operation> operations = Arrays.<Operation>asList(parse, read, normalize, write);
    FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 30 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiSourceDroppedFields.

@Test
public void testMultiSourceDroppedFields() {
    // read produces first_name, last_name and social; the two names are combined into full_name and
    // written out, while social is consumed by a transform that produces no output fields.
    EndPoint ep1 = EndPoint.of("endpoint1");
    EndPoint ep2 = EndPoint.of("endpoint2");

    ReadOperation read = new ReadOperation("read", "some read", ep1, "first_name", "last_name", "social");
    TransformOperation combineNames = new TransformOperation("combineNames", "combine names",
        Arrays.asList(InputField.of("read", "first_name"), InputField.of("read", "last_name")), "full_name");
    TransformOperation dropSocial = new TransformOperation("dropSocial", "drop social",
        Collections.singletonList(InputField.of("read", "social")));
    WriteOperation write = new WriteOperation("write", "write data", ep2,
        Collections.singletonList(InputField.of("combineNames", "full_name")));

    Set<Operation> operations = Sets.newHashSet(read, write, combineNames, dropSocial);
    FieldLineageInfo info1 = new FieldLineageInfo(operations);

    EndPointField firstName = new EndPointField(ep1, "first_name");
    EndPointField lastName = new EndPointField(ep1, "last_name");
    EndPointField fullName = new EndPointField(ep2, "full_name");

    // both name fields flow into full_name; the dropped social field maps to the NULL_EPF marker
    Map<EndPointField, Set<EndPointField>> expectedOutgoingSummary = new HashMap<>();
    expectedOutgoingSummary.put(firstName, Collections.singleton(fullName));
    expectedOutgoingSummary.put(lastName, Collections.singleton(fullName));
    expectedOutgoingSummary.put(new EndPointField(ep1, "social"), Collections.singleton(FieldLineageInfo.NULL_EPF));
    Assert.assertEquals(expectedOutgoingSummary, info1.getOutgoingSummary());

    // full_name is the only destination field and it is derived from the two name fields
    Map<EndPointField, Set<EndPointField>> expectedIncomingSummary = new HashMap<>();
    expectedIncomingSummary.put(fullName, Sets.newHashSet(firstName, lastName));
    Assert.assertEquals(expectedIncomingSummary, info1.getIncomingSummary());
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashMap(java.util.HashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Aggregations

TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)45 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)45 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)44 Operation (io.cdap.cdap.api.lineage.field.Operation)42 HashSet (java.util.HashSet)33 Test (org.junit.Test)33 ArrayList (java.util.ArrayList)32 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)25 HashMap (java.util.HashMap)19 LinkedHashSet (java.util.LinkedHashSet)15 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)14 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)14 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)14 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)13 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)13 Connection (io.cdap.cdap.etl.proto.Connection)13 Set (java.util.Set)13 InputField (io.cdap.cdap.api.lineage.field.InputField)11 ImmutableSet (com.google.common.collect.ImmutableSet)10