Search in sources :

Example 1 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

The following example shows the method getTopologicallySortedOperations of the class FieldLineageInfo.

/**
 * Sort the operations in topological order. In topological order, each operation in the list
 * is guaranteed to occur before any other operation that reads its outputs.
 *
 * For example, consider following scenario:
 *
 *    read-----------------------write
 *       \                        /
 *       ----parse----normalize---
 *
 * Since write operation is dependent on the read and normalize for its input, it would be
 * last in the order. normalize depends on the parse, so it would appear after parse. Similarly
 * parse operation would appear after the read but before normalize in the returned list.
 *
 * Implementation note: this is Kahn's algorithm — operations without incoming edges are
 * repeatedly removed from the graph and appended to the result; edges are deleted as their
 * source operations are emitted.
 *
 * @param operations set of operations to be sorted
 * @return the list containing topologically sorted operations
 * @throws IllegalArgumentException if the operations contain a cycle
 */
public static List<Operation> getTopologicallySortedOperations(Set<Operation> operations) {
    // Index operations by name, and seed the work set with the graph "sources":
    // READ operations never have incoming edges, and a TRANSFORM with no inputs
    // behaves the same way for sorting purposes.
    Map<String, Operation> operationMap = new HashMap<>();
    Set<String> operationsWithNoIncomings = new HashSet<>();
    for (Operation operation : operations) {
        operationMap.put(operation.getName(), operation);
        if (OperationType.READ == operation.getType()) {
            operationsWithNoIncomings.add(operation.getName());
        }
        // an input-less transform is treated like a read operation
        if (OperationType.TRANSFORM == operation.getType() && ((TransformOperation) operation).getInputs().isEmpty()) {
            operationsWithNoIncomings.add(operation.getName());
        }
    }
    // Map of operation name to the set of operation names which take the output of the given operation as
    // an input. This map basically represents the adjacency list for operation.
    // For example consider the following scenario:
    //
    //    read-----------------------write
    //       \                        /
    //       ----parse----normalize---
    //
    // The map would contain:
    // read -> [parse, write]
    // parse -> [normalize]
    // normalize -> [write]
    // write -> []
    Map<String, Set<String>> outgoingOperations = new HashMap<>();
    // Map of operation name to the set of operation names outputs of which given operation takes as an input.
    // For example consider the following scenario:
    //
    //    read-----------------------write
    //       \                        /
    //       ----parse----normalize---
    //
    // The map would contain:
    // read -> []
    // parse -> [read]
    // normalize -> [parse]
    // write -> [read, normalize]
    Map<String, Set<String>> incomingOperations = new HashMap<>();
    // Build both edge maps from the declared input fields of each operation.
    for (Operation operation : operations) {
        List<InputField> inputFields = new ArrayList<>();
        switch(operation.getType()) {
            case READ:
                // read has no incoming operation
                incomingOperations.put(operation.getName(), new HashSet<>());
                break;
            case TRANSFORM:
                TransformOperation transform = (TransformOperation) operation;
                inputFields.addAll(transform.getInputs());
                break;
            case WRITE:
                WriteOperation write = (WriteOperation) operation;
                inputFields.addAll(write.getInputs());
                // write has no outgoing operation
                outgoingOperations.put(operation.getName(), new HashSet<>());
                break;
        }
        for (InputField inputField : inputFields) {
            // The input set may reference an origin that is not part of the given operations
            // (e.g. a lineage query returned only a subset of operations); such dangling
            // references are ignored for topological sorting.
            if (!operationMap.containsKey(inputField.getOrigin())) {
                continue;
            }
            // Current operation is the outgoing operation for origin represented by the input field.
            Set<String> outgoings = outgoingOperations.computeIfAbsent(inputField.getOrigin(), k -> new HashSet<>());
            outgoings.add(operation.getName());
            // Origin represented by the input field is the incoming operation for the current operation.
            Set<String> incomings = incomingOperations.computeIfAbsent(operation.getName(), k -> new HashSet<>());
            incomings.add(inputField.getOrigin());
        }
    }
    // Kahn's algorithm main loop: repeatedly emit an operation that has no remaining
    // incoming edges, then delete its outgoing edges from the graph.
    List<Operation> orderedOperations = new ArrayList<>();
    while (!operationsWithNoIncomings.isEmpty()) {
        String current = operationsWithNoIncomings.iterator().next();
        operationsWithNoIncomings.remove(current);
        // current is always a key of operationMap (it was put there above), so this
        // null check is defensive only.
        if (operationMap.get(current) != null) {
            orderedOperations.add(operationMap.get(current));
        }
        // it is possible that there are no outgoings for the field, since it is possible some field is not used in the
        // downstream of plugins
        Iterator<String> outgoingsIter = outgoingOperations.getOrDefault(current, Collections.emptySet()).iterator();
        while (outgoingsIter.hasNext()) {
            String next = outgoingsIter.next();
            // Removing through the iterator mutates outgoingOperations in place; the leftover
            // non-empty entries after the loop below are what the cycle check relies on.
            outgoingsIter.remove();
            incomingOperations.get(next).remove(current);
            if (incomingOperations.get(next).isEmpty()) {
                operationsWithNoIncomings.add(next);
            }
        }
    }
    // check if any cycles: after the loop all edges of an acyclic graph have been deleted,
    // so any remaining outgoing edges belong to operations that are part of a cycle.
    // remove the entries which has empty outgoing operations now
    outgoingOperations.entrySet().removeIf(next -> next.getValue().isEmpty());
    if (!outgoingOperations.isEmpty()) {
        throw new IllegalArgumentException(String.format("Cycle detected in graph for operations %s", outgoingOperations));
    }
    return orderedOperations;
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet)

Example 2 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

The following example shows the method testBranchTopologicalSort of the class FieldLineageInfoTest.

@Test
public void testBranchTopologicalSort() {
    // Graph under test:
    //
    //    read-----------------------write
    //       \                        /
    //       ----parse----normalize---
    ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse descr", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    TransformOperation normalize = new TransformOperation("normalize", "normalize descr", Collections.singletonList(InputField.of("parse", "address")), "address");
    List<InputField> writeInputs = new ArrayList<>();
    writeInputs.add(InputField.of("read", "offset"));
    writeInputs.add(InputField.of("parse", "name"));
    writeInputs.add(InputField.of("normalize", "address"));
    WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
    // The topological order must respect the graph's edges regardless of the order in
    // which the operations were inserted into the set.
    Operation[][] insertionOrders = {
        { read, parse, normalize, write },
        { parse, normalize, write, read },
        { write, normalize, parse, read }
    };
    for (Operation[] insertionOrder : insertionOrders) {
        Set<Operation> operations = new LinkedHashSet<>();
        for (Operation operation : insertionOrder) {
            operations.add(operation);
        }
        List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(operations);
        assertBefore(sorted, read, parse);
        assertBefore(sorted, parse, normalize);
        assertBefore(sorted, normalize, write);
        assertBefore(sorted, read, write);
    }
    // When the field lineage is queried for the offset field, only the read and write
    // operations are returned, since parse and normalize do not affect the offset field.
    // Even though the write operation has an input with origin normalize, the topological
    // sort must not be affected by the normalize operation itself being absent.
    Set<Operation> operations = new LinkedHashSet<>();
    operations.add(write);
    operations.add(read);
    List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(operations);
    assertBefore(sorted, read, write);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 3 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

The following example shows the method generateLineage of the class FieldLineageInfoTest.

/**
 * Appends to {@code operations} one identity transform per input field (named
 * {@code identityNamePrefix + index}, reading the field from {@code identityOrigin}),
 * followed by a single all-to-all transform named {@code transform} whose inputs are
 * the outputs of every identity transform. Tracking back through the all-to-all
 * transform therefore visits all of the identity transforms.
 */
private void generateLineage(List<String> inputs, List<Operation> operations, String identityNamePrefix, String identityOrigin, String transform) {
    List<InputField> allToAllInputs = new ArrayList<>();
    int index = 0;
    for (String field : inputs) {
        String identityName = identityNamePrefix + index;
        // identity transform for this single field
        operations.add(new TransformOperation(identityName, "identity transform", Collections.singletonList(InputField.of(identityOrigin, field)), field));
        // the all-to-all transform consumes the output of this identity transform
        allToAllInputs.add(InputField.of(identityName, field));
        index++;
    }
    operations.add(new TransformOperation(transform, "all to all transform", allToAllInputs, inputs));
}
Also used : InputField(io.cdap.cdap.api.lineage.field.InputField) ArrayList(java.util.ArrayList) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint)

Example 4 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

The following example shows the method computeAndValidateFieldLineageInfo of the class FieldLineageInfo.

/**
 * Validates the given operations and builds the lookup structures used for field lineage
 * computation: {@code operationsMap} (name to operation), {@code readOperations},
 * {@code writeOperations}, {@code dropTransforms} (transforms with no outputs), and
 * {@code operationOutgoingConnections} (operation name to the operations consuming its output).
 *
 * @param operations the operations to validate and index
 * @throws IllegalArgumentException if operation names are not unique, if a read operation has a
 *         null source or a write operation has a null destination, if there is no READ or no
 *         WRITE operation, or if some input field references an origin with no matching operation
 */
private void computeAndValidateFieldLineageInfo(Collection<? extends Operation> operations) {
    Set<String> allOrigins = new HashSet<>();
    this.operationsMap = new HashMap<>();
    this.writeOperations = new HashSet<>();
    this.readOperations = new HashSet<>();
    this.operationOutgoingConnections = new HashMap<>();
    for (Operation operation : operations) {
        if (operationsMap.containsKey(operation.getName())) {
            throw new IllegalArgumentException(String.format("All operations provided for creating field " + "level lineage info must have unique names. " + "Operation name '%s' is repeated.", operation.getName()));
        }
        operationsMap.put(operation.getName(), operation);
        switch(operation.getType()) {
            case READ:
                ReadOperation read = (ReadOperation) operation;
                EndPoint source = read.getSource();
                if (source == null) {
                    throw new IllegalArgumentException(String.format("Source endpoint cannot be null for the read " + "operation '%s'.", read.getName()));
                }
                readOperations.add(read);
                break;
            case TRANSFORM:
                TransformOperation transform = (TransformOperation) operation;
                Set<String> transformOrigins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
                addOutgoingConnections(transformOrigins, transform);
                allOrigins.addAll(transformOrigins);
                // a transform producing no output fields effectively drops its inputs
                if (transform.getOutputs().isEmpty()) {
                    dropTransforms.add(transform);
                }
                break;
            case WRITE:
                WriteOperation write = (WriteOperation) operation;
                EndPoint destination = write.getDestination();
                if (destination == null) {
                    throw new IllegalArgumentException(String.format("Destination endpoint cannot be null for the write " + "operation '%s'.", write.getName()));
                }
                Set<String> writeOrigins = write.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
                addOutgoingConnections(writeOrigins, write);
                allOrigins.addAll(writeOrigins);
                writeOperations.add(write);
                break;
            default:
                // operation types other than READ/TRANSFORM/WRITE are intentionally ignored
        }
    }
    // Put an empty set for operations with no outgoing connection rather than checking for null
    // later. Copy the Sets.difference view first: it is a live view backed by the two key sets,
    // and we mutate operationOutgoingConnections while filling in the entries.
    Set<String> operationsWithNoOutgoingConnections =
        new HashSet<>(Sets.difference(operationsMap.keySet(), operationOutgoingConnections.keySet()));
    for (String operation : operationsWithNoOutgoingConnections) {
        operationOutgoingConnections.put(operation, new HashSet<>());
    }
    if (readOperations.isEmpty()) {
        throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'READ'.");
    }
    if (writeOperations.isEmpty()) {
        throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'WRITE'.");
    }
    // every origin referenced by an input field must correspond to a provided operation
    Sets.SetView<String> invalidOrigins = Sets.difference(allOrigins, operationsMap.keySet());
    if (!invalidOrigins.isEmpty()) {
        throw new IllegalArgumentException(String.format("No operation is associated with the origins '%s'.", invalidOrigins));
    }
}

/**
 * Records a connection from each of the given origin names to {@code target} in
 * {@code operationOutgoingConnections}, i.e. marks {@code target} as a consumer of the
 * output of every origin.
 */
private void addOutgoingConnections(Set<String> origins, Operation target) {
    for (String origin : origins) {
        operationOutgoingConnections.computeIfAbsent(origin, k -> new HashSet<>()).add(target);
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Sets(com.google.common.collect.Sets) HashSet(java.util.HashSet)

Example 5 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

The following example shows the method computeIncomingSummaryHelper of the class FieldLineageInfo.

/**
 * Helper method to compute the incoming summary by traversing the operation graph backward
 * from {@code previousOperation} to {@code currentOperation}.
 *
 * @param currentOperation the operation being processed. Since we are processing incoming, this operation is on the
 * left side if the graph is imagined in horizontal orientation, i.e. this operation is the input to
 * previousOperation
 * @param previousOperation the previous operation which is already processed and resides to the right of the current
 * operation if the graph is imagined to be in horizontal orientation
 * @param operationEndPointMap a map that contains the operation name to the final endpoint fields it will generate;
 * this is used to track the paths we already computed to ensure we do not do the same computation again
 * @return the source endpoint fields reachable backward from {@code currentOperation}; empty if
 * {@code currentOperation} is neither a READ nor a TRANSFORM (i.e. a WRITE), since neither branch
 * below applies in that case
 */
private Set<EndPointField> computeIncomingSummaryHelper(Operation currentOperation, Operation previousOperation, Map<String, Set<EndPointField>> operationEndPointMap) {
    if (currentOperation.getType() == OperationType.READ) {
        // if current operation is of type READ, previous operation must be of type TRANSFORM or WRITE
        // get only the input fields from the previous operations for which the origin is current READ operation
        Set<InputField> inputFields = new HashSet<>();
        if (OperationType.WRITE == previousOperation.getType()) {
            WriteOperation previousWrite = (WriteOperation) previousOperation;
            inputFields = new HashSet<>(previousWrite.getInputs());
        } else if (OperationType.TRANSFORM == previousOperation.getType()) {
            TransformOperation previousTransform = (TransformOperation) previousOperation;
            inputFields = new HashSet<>(previousTransform.getInputs());
        }
        Set<EndPointField> sourceEndPointFields = new HashSet<>();
        // for all the input fields of the previous operation if the origin was current operation (remember we are
        // traversing backward)
        ReadOperation read = (ReadOperation) currentOperation;
        EndPoint source = read.getSource();
        for (InputField inputField : inputFields) {
            if (inputField.getOrigin().equals(currentOperation.getName())) {
                sourceEndPointFields.add(new EndPointField(source, inputField.getName()));
            }
        }
        // reached the end of graph unwind the recursive calls
        return sourceEndPointFields;
    }
    Set<EndPointField> relatedSources = new HashSet<>();
    // for transform we traverse backward in graph further through the inputs of the transform
    if (currentOperation.getType() == OperationType.TRANSFORM) {
        TransformOperation transform = (TransformOperation) currentOperation;
        // optimization to avoid repeating work if there are input fields with the same origin
        Set<String> transformOrigins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
        for (String transformOrigin : transformOrigins) {
            // reuse a previously computed result for this origin if available, otherwise recurse
            if (operationEndPointMap.containsKey(transformOrigin)) {
                relatedSources.addAll(operationEndPointMap.get(transformOrigin));
            } else {
                relatedSources.addAll(computeIncomingSummaryHelper(operationsMap.get(transformOrigin), currentOperation, operationEndPointMap));
            }
        }
        // Note: only TRANSFORM results are memoized here. The READ branch above is not cached
        // under the read's name because its result depends on previousOperation's input fields,
        // not on the read operation alone.
        operationEndPointMap.put(currentOperation.getName(), relatedSources);
    }
    return relatedSources;
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) HashSet(java.util.HashSet)

Aggregations

InputField (io.cdap.cdap.api.lineage.field.InputField)13 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)12 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)11 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)10 Operation (io.cdap.cdap.api.lineage.field.Operation)9 ArrayList (java.util.ArrayList)8 HashSet (java.util.HashSet)8 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)6 HashMap (java.util.HashMap)5 LinkedHashSet (java.util.LinkedHashSet)4 Set (java.util.Set)4 Test (org.junit.Test)4 Sets (com.google.common.collect.Sets)2 Charsets (com.google.common.base.Charsets)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Gson (com.google.gson.Gson)1 GsonBuilder (com.google.gson.GsonBuilder)1 OperationType (io.cdap.cdap.api.lineage.field.OperationType)1 Checksums (io.cdap.cdap.common.utils.Checksums)1 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)1