Search in sources:

Example 6 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageInfo method computeIncomingSummary.

/**
 * Computes the incoming summary: a map from each destination field to the set of fields
 * that were collected as its origins.
 *
 * <p>Fields written by a {@link WriteOperation} are keyed by an {@link EndPointField} built from the
 * write destination and the input field name. Inputs of the drop transforms are all accumulated under
 * the shared {@code NULL_EPF} key.</p>
 *
 * @return the mapping from destination field to the fields it was computed from
 */
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
    if (writeOperations == null) {
        computeAndValidateFieldLineageInfo(this.operations);
    }

    // results already computed for an origin; shared with (and passed to) the helper
    Map<String, Set<EndPointField>> originCache = new HashMap<>();
    Map<EndPointField, Set<EndPointField>> incoming = new HashMap<>();

    for (WriteOperation writeOp : writeOperations) {
        for (InputField field : writeOp.getInputs()) {
            EndPointField target = new EndPointField(writeOp.getDestination(), field.getName());
            Set<EndPointField> sources = incoming.computeIfAbsent(target, key -> new HashSet<>());

            if (originCache.containsKey(field.getOrigin())) {
                sources.addAll(originCache.get(field.getOrigin()));
                continue;
            }

            Operation producer = operationsMap.get(field.getOrigin());
            if (producer.getType() == OperationType.READ) {
                // special case for read -> write: the written field maps one to one onto the field of
                // the read operation, since a write operation can only take a list of input fields
                // that come from the previous stage
                sources.add(new EndPointField(((ReadOperation) producer).getSource(), field.getName()));
            } else {
                sources.addAll(computeIncomingSummaryHelper(producer, writeOp, originCache));
            }
        }
    }

    for (TransformOperation dropOp : dropTransforms) {
        for (InputField field : dropOp.getInputs()) {
            Operation producer = operationsMap.get(field.getOrigin());
            // every drop transform contributes to the same NULL endpoint entry
            Set<EndPointField> dropped = incoming.computeIfAbsent(NULL_EPF, key -> new HashSet<>());

            if (originCache.containsKey(field.getOrigin())) {
                dropped.addAll(originCache.get(field.getOrigin()));
            } else {
                dropped.addAll(computeIncomingSummaryHelper(producer, dropOp, originCache));
            }
        }
    }
    return incoming;
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) Set(java.util.Set) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation)

Example 7 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageInfo method getIncomingOperationsForField.

/**
 * <p>Returns the subset of operations that were responsible for computing the specified field of
 * the specified destination.</p>
 * <p>For example, if the operations are as follows:</p>
 * <pre>
 * pRead: personFile -> (offset, body)
 * parse: body -> (id, name, address)
 * cRead: codeFile -> id
 * codeGen: (parse.id, cRead.id) -> id
 * sWrite: (codeGen.id, parse.name, parse.address) -> secureStore
 * iWrite: (parse.id, parse.name, parse.address) -> insecureStore
 * </pre>
 * <p>If the destination field is the 'id' field of insecureStore, the result set contains the operations
 * iWrite, parse, pRead.</p>
 * <p>If the destination field is the 'id' field of secureStore, the result set contains the operations
 * sWrite, codeGen, parse, pRead, cRead.</p>
 *
 * @param destinationField the EndPointField whose contributing operations are to be found
 * @return the subset of operations
 */
Set<Operation> getIncomingOperationsForField(EndPointField destinationField) {
    if (writeOperations == null) {
        computeAndValidateFieldLineageInfo(this.operations);
    }

    Set<Operation> visited = new HashSet<>();
    for (WriteOperation writeOp : writeOperations) {
        // only writes into the dataset that the destinationField belongs to are relevant
        if (writeOp.getDestination().equals(destinationField.getEndPoint())) {
            Set<InputField> matchingInputs = writeOp.getInputs().stream()
                .filter(candidate -> candidate.getName().equals(destinationField.getField()))
                .collect(Collectors.toSet());

            for (InputField matched : matchingInputs) {
                // the write itself is part of the result once it has a matching input
                visited.add(writeOp);
                // walk backward through the graph, starting at the operation that is the origin
                // of this input field
                getIncomingOperationsForFieldHelper(operationsMap.get(matched.getOrigin()), visited);
            }
        }
    }
    return visited;
}
Also used : EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Checksums(io.cdap.cdap.common.utils.Checksums) GsonBuilder(com.google.gson.GsonBuilder) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Gson(com.google.gson.Gson) Map(java.util.Map) Operation(io.cdap.cdap.api.lineage.field.Operation) Charsets(com.google.common.base.Charsets) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Collection(java.util.Collection) OperationType(io.cdap.cdap.api.lineage.field.OperationType) Set(java.util.Set) OperationTypeAdapter(io.cdap.cdap.proto.codec.OperationTypeAdapter) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) InputField(io.cdap.cdap.api.lineage.field.InputField) List(java.util.List) Comparator(java.util.Comparator) Collections(java.util.Collections) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet)

Example 8 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageInfo method containsInputField.

/**
 * Tells whether the given field is one of the inputs of the next operation.
 *
 * @param nextOperation the next operation, which should either be a {@link TransformOperation} or a
 * {@link WriteOperation}; for any other operation type the result is false
 * @param inputField the field whose usage needs to be checked
 * @return true if the field is among the inputs of nextOperation
 */
private boolean containsInputField(Operation nextOperation, InputField inputField) {
    if (OperationType.WRITE == nextOperation.getType()) {
        Set<InputField> writeInputs = new HashSet<>(((WriteOperation) nextOperation).getInputs());
        return writeInputs.contains(inputField);
    }
    if (OperationType.TRANSFORM == nextOperation.getType()) {
        Set<InputField> transformInputs = new HashSet<>(((TransformOperation) nextOperation).getInputs());
        return transformInputs.contains(inputField);
    }
    // neither a write nor a transform: there are no input fields to match against
    return false;
}
Also used : InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) HashSet(java.util.HashSet)

Example 9 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageInfoTest method testLargeLineageOperation.

/**
 * Builds a lineage of one read over 100 fields, five generated identity/all-to-all stages and a final
 * write, then checks that every written field reports all 100 read fields as its incoming sources.
 * Must complete within the 10 second test timeout.
 */
@Test(timeout = 10000)
public void testLargeLineageOperation() {
    List<String> fieldNames = new ArrayList<>();
    for (int idx = 0; idx < 100; idx++) {
        fieldNames.add("num" + idx);
    }

    List<Operation> pipeline = new ArrayList<>();
    pipeline.add(new ReadOperation("read", "Read from something", EndPoint.of("start"), fieldNames));
    // generate 500+ operations with 5 identity + all-to-all combos
    generateLineage(fieldNames, pipeline, "first identity", "read", "alltoall1");
    generateLineage(fieldNames, pipeline, "second identity", "alltoall1", "alltoall2");
    generateLineage(fieldNames, pipeline, "third identity", "alltoall2", "alltoall3");
    generateLineage(fieldNames, pipeline, "forth identity", "alltoall3", "alltoall4");
    generateLineage(fieldNames, pipeline, "fifth identity", "alltoall4", "alltoall5");

    // the final write consumes every field produced by the last all-to-all stage
    List<InputField> writeInputs = new ArrayList<>();
    for (String fieldName : fieldNames) {
        writeInputs.add(InputField.of("alltoall5", fieldName));
    }
    pipeline.add(new WriteOperation("Write", "", EndPoint.of("dest"), writeInputs));

    FieldLineageInfo info = new FieldLineageInfo(pipeline);
    Assert.assertNotNull(info);

    // every destination field maps to the same (shared) set holding all source fields
    Set<EndPointField> allSources = new HashSet<>();
    Map<EndPointField, Set<EndPointField>> expectedIncoming = new HashMap<>();
    for (int idx = 0; idx < fieldNames.size(); idx++) {
        allSources.add(new EndPointField(EndPoint.of("start"), "num" + idx));
        expectedIncoming.put(new EndPointField(EndPoint.of("dest"), "num" + idx), allSources);
    }
    Assert.assertEquals(expectedIncoming, info.getIncomingSummary());
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 10 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageInfoTest method testLinearTopologicalSort.

/**
 * Checks that a linear pipeline is topologically sorted into the same relative order
 * regardless of the order in which its operations were inserted into the input set.
 */
@Test
public void testLinearTopologicalSort() {
    // read---->parse---->normalize--->write
    ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse descr",
        Collections.singletonList(InputField.of("read", "body")), "name", "address");
    TransformOperation normalize = new TransformOperation("normalize", "normalize descr",
        Collections.singletonList(InputField.of("parse", "address")), "address");
    List<InputField> writeInputs = new ArrayList<>();
    writeInputs.add(InputField.of("parse", "name"));
    writeInputs.add(InputField.of("normalize", "address"));
    WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);

    // insertion order matches the pipeline order
    Set<Operation> pipelineOrder = new LinkedHashSet<>();
    pipelineOrder.add(read);
    pipelineOrder.add(parse);
    pipelineOrder.add(normalize);
    pipelineOrder.add(write);
    assertLinearPipelineOrder(pipelineOrder, read, parse, normalize, write);

    // try with few different insertion orders, the topological sort should give the same results
    Set<Operation> readLast = new LinkedHashSet<>();
    readLast.add(parse);
    readLast.add(normalize);
    readLast.add(write);
    readLast.add(read);
    assertLinearPipelineOrder(readLast, read, parse, normalize, write);

    Set<Operation> reversed = new LinkedHashSet<>();
    reversed.add(write);
    reversed.add(normalize);
    reversed.add(parse);
    reversed.add(read);
    assertLinearPipelineOrder(reversed, read, parse, normalize, write);
}

/**
 * Topologically sorts the given operations and asserts the read, parse, normalize, write ordering.
 */
private void assertLinearPipelineOrder(Set<Operation> operations, ReadOperation read, TransformOperation parse,
                                       TransformOperation normalize, WriteOperation write) {
    List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(operations);
    assertBefore(sorted, read, parse);
    assertBefore(sorted, parse, normalize);
    assertBefore(sorted, normalize, write);
    assertBefore(sorted, read, write);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Aggregations

InputField (io.cdap.cdap.api.lineage.field.InputField)13 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)12 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)11 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)10 Operation (io.cdap.cdap.api.lineage.field.Operation)9 ArrayList (java.util.ArrayList)8 HashSet (java.util.HashSet)8 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)6 HashMap (java.util.HashMap)5 LinkedHashSet (java.util.LinkedHashSet)4 Set (java.util.Set)4 Test (org.junit.Test)4 Sets (com.google.common.collect.Sets)2 Charsets (com.google.common.base.Charsets)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Gson (com.google.gson.Gson)1 GsonBuilder (com.google.gson.GsonBuilder)1 OperationType (io.cdap.cdap.api.lineage.field.OperationType)1 Checksums (io.cdap.cdap.common.utils.Checksums)1 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)1