Search in sources :

Example 46 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class LineageOperationsProcessor method computeProcessedOperations.

/**
 * Converts all the stage operations into platform operations. The stages are visited in topological order, so
 * that a later stage always knows the origin of the fields it consumes.
 * If a stage has multiple inputs and is not listed in {@code noMergeRequiredStages} (i.e. it is not a joiner),
 * implicit merge operations are generated first so that further stages can look up the origins.
 * For joiners, the input field names should already contain the previous stage name.
 *
 * <p>As a side effect, the per-stage entry of {@code stageOutputsWithOrigins} is updated so that every field
 * produced by a READ or TRANSFORM operation maps to the name of the operation that produced it last.</p>
 *
 * @return a {@link Map} containing the operations, keyed by operation name, with the corresponding
 * platform {@link Operation} as value
 * @throws IllegalArgumentException if a stage operation has a type other than READ, TRANSFORM or WRITE
 */
private Map<String, Operation> computeProcessedOperations() {
    Map<String, Operation> processedOperations = new HashMap<>();
    for (String stageName : topologicalOrder) {
        Set<String> stageInputs = stageDag.getNodeInputs(stageName);
        // if the stage has multiple inputs and it is not a joiner, compute the merge operations
        if (stageInputs.size() > 1 && !noMergeRequiredStages.contains(stageName)) {
            addMergeOperation(stageInputs, processedOperations);
        }
        for (FieldOperation fieldOperation : stageOperations.get(stageName)) {
            String newOperationName = prefixedName(stageName, fieldOperation.getName());
            // Only READ and TRANSFORM operations produce fields; for WRITE this set stays empty
            Set<String> currentOperationOutputs = new LinkedHashSet<>();
            Operation newOperation;
            switch (fieldOperation.getType()) {
                case READ: {
                    FieldReadOperation read = (FieldReadOperation) fieldOperation;
                    newOperation = new ReadOperation(newOperationName, read.getDescription(), read.getSource(), read.getOutputFields());
                    currentOperationOutputs.addAll(read.getOutputFields());
                    break;
                }
                case TRANSFORM: {
                    FieldTransformOperation transform = (FieldTransformOperation) fieldOperation;
                    List<InputField> inputFields = createInputFields(transform.getInputFields(), stageName, processedOperations);
                    newOperation = new TransformOperation(newOperationName, transform.getDescription(), inputFields, transform.getOutputFields());
                    currentOperationOutputs.addAll(transform.getOutputFields());
                    break;
                }
                case WRITE: {
                    FieldWriteOperation write = (FieldWriteOperation) fieldOperation;
                    List<InputField> inputFields = createInputFields(write.getInputFields(), stageName, processedOperations);
                    newOperation = new WriteOperation(newOperationName, write.getDescription(), write.getSink(), inputFields);
                    break;
                }
                default:
                    // Fail with a descriptive message instead of a NullPointerException further down
                    throw new IllegalArgumentException(String.format("Unsupported field operation type '%s' for operation '%s' of stage '%s'.", fieldOperation.getType(), fieldOperation.getName(), stageName));
            }
            for (String currentOperationOutput : currentOperationOutputs) {
                // For all fields outputted by the current operation assign the operation name as origin.
                // If the field appears in the output again for some other operation belonging to the same stage,
                // its origin will get updated to the new operation
                stageOutputsWithOrigins.get(stageName).put(currentOperationOutput, newOperation.getName());
            }
            processedOperations.put(newOperation.getName(), newOperation);
        }
    }
    return processedOperations;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation)

Example 47 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfo method computeIncomingSummary.

/**
 * Computes, for every destination field written by a write operation, the set of source fields it was derived
 * from. Inputs of the transforms held in {@code dropTransforms} are collected under the shared
 * {@code NULL_EPF} key.
 *
 * @return a map from destination {@link EndPointField} to the source {@link EndPointField}s feeding it
 */
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
    // the derived state is computed lazily if it is not available yet
    if (writeOperations == null) {
        computeAndValidateFieldLineageInfo(this.operations);
    }
    Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
    // operation name -> source fields already resolved for it, so no path is traversed twice
    Map<String, Set<EndPointField>> resolvedSources = new HashMap<>();

    for (WriteOperation write : writeOperations) {
        for (InputField input : write.getInputs()) {
            EndPointField destinationField = new EndPointField(write.getDestination(), input.getName());
            Set<EndPointField> sources = summary.computeIfAbsent(destinationField, k -> new HashSet<>());
            String originName = input.getOrigin();
            if (resolvedSources.containsKey(originName)) {
                sources.addAll(resolvedSources.get(originName));
                continue;
            }
            Operation originOperation = operationsMap.get(originName);
            if (OperationType.READ == originOperation.getType()) {
                // special case read -> write: the written field maps one to one onto the field of the read
                // operation, since a write operation can only take input fields coming from the previous stage
                ReadOperation read = (ReadOperation) originOperation;
                sources.add(new EndPointField(read.getSource(), input.getName()));
            } else {
                sources.addAll(computeIncomingSummaryHelper(originOperation, write, resolvedSources));
            }
        }
    }

    for (TransformOperation dropTransform : dropTransforms) {
        for (InputField input : dropTransform.getInputs()) {
            String originName = input.getOrigin();
            Operation originOperation = operationsMap.get(originName);
            // all drop transforms share the common NULL endpoint field as key
            Set<EndPointField> droppedSources = summary.computeIfAbsent(NULL_EPF, k -> new HashSet<>());
            if (resolvedSources.containsKey(originName)) {
                droppedSources.addAll(resolvedSources.get(originName));
            } else {
                droppedSources.addAll(computeIncomingSummaryHelper(originOperation, dropTransform, resolvedSources));
            }
        }
    }
    return summary;
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) Set(java.util.Set) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation)

Example 48 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfo method computeIncomingSummaryHelper.

/**
 * Recursive helper for computing the incoming summary; it walks the operation graph backward.
 *
 * @param currentOperation the operation being processed. Since the traversal goes backward, this operation is
 * an input to {@code previousOperation}, i.e. it sits to its left if the graph is imagined horizontally
 * @param previousOperation the operation processed just before, which consumes the output of
 * {@code currentOperation} and sits to its right if the graph is imagined horizontally
 * @param operationEndPointMap a map from operation name to the source endpoint fields already computed for it;
 * it is consulted and filled for transforms so the same path is not computed twice
 * @return the source endpoint fields reachable through {@code currentOperation}; empty if the operation is
 * neither a READ nor a TRANSFORM
 */
private Set<EndPointField> computeIncomingSummaryHelper(Operation currentOperation, Operation previousOperation, Map<String, Set<EndPointField>> operationEndPointMap) {
    if (OperationType.READ == currentOperation.getType()) {
        // A READ is consumed by a TRANSFORM or a WRITE; gather the inputs of that consumer
        Set<InputField> consumerInputs;
        if (previousOperation.getType() == OperationType.WRITE) {
            consumerInputs = new HashSet<>(((WriteOperation) previousOperation).getInputs());
        } else if (previousOperation.getType() == OperationType.TRANSFORM) {
            consumerInputs = new HashSet<>(((TransformOperation) previousOperation).getInputs());
        } else {
            consumerInputs = new HashSet<>();
        }
        EndPoint source = ((ReadOperation) currentOperation).getSource();
        String readName = currentOperation.getName();
        // Keep only the inputs that originate from this READ (remember the traversal goes backward).
        // This is the end of the graph, so the recursion unwinds from here.
        return consumerInputs.stream()
            .filter(field -> field.getOrigin().equals(readName))
            .map(field -> new EndPointField(source, field.getName()))
            .collect(Collectors.toCollection(HashSet::new));
    }

    Set<EndPointField> relatedSources = new HashSet<>();
    if (OperationType.TRANSFORM != currentOperation.getType()) {
        return relatedSources;
    }
    // for a transform, keep traversing backward through the origins of its inputs
    TransformOperation transform = (TransformOperation) currentOperation;
    // collapse the inputs to distinct origins to avoid repeating work for fields with the same origin
    Set<String> distinctOrigins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
    for (String originName : distinctOrigins) {
        Set<EndPointField> originSources = operationEndPointMap.containsKey(originName)
            ? operationEndPointMap.get(originName)
            : computeIncomingSummaryHelper(operationsMap.get(originName), currentOperation, operationEndPointMap);
        relatedSources.addAll(originSources);
    }
    operationEndPointMap.put(currentOperation.getName(), relatedSources);
    return relatedSources;
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) HashSet(java.util.HashSet)

Example 49 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfo method computeAndValidateFieldLineageInfo.

/**
 * Rebuilds the derived lookup structures ({@code operationsMap}, {@code readOperations},
 * {@code writeOperations}, {@code operationOutgoingConnections}) from the given operations and validates them.
 *
 * @param operations the operations to index and validate
 * @throws IllegalArgumentException if an operation name is repeated, a read has no source, a write has no
 * destination, there is no READ or no WRITE operation, or an input refers to an origin that is not the name
 * of any given operation
 */
private void computeAndValidateFieldLineageInfo(Collection<? extends Operation> operations) {
    this.operationsMap = new HashMap<>();
    this.writeOperations = new HashSet<>();
    this.readOperations = new HashSet<>();
    this.operationOutgoingConnections = new HashMap<>();
    // NOTE(review): dropTransforms is not reset here unlike the collections above - confirm it is
    // initialized elsewhere before this method runs
    Set<String> referencedOrigins = new HashSet<>();

    for (Operation operation : operations) {
        if (operationsMap.putIfAbsent(operation.getName(), operation) != null) {
            throw new IllegalArgumentException(String.format("All operations provided for creating field level lineage info must have unique names. Operation name '%s' is repeated.", operation.getName()));
        }
        switch (operation.getType()) {
            case READ: {
                ReadOperation read = (ReadOperation) operation;
                if (read.getSource() == null) {
                    throw new IllegalArgumentException(String.format("Source endpoint cannot be null for the read operation '%s'.", read.getName()));
                }
                readOperations.add(read);
                break;
            }
            case TRANSFORM: {
                TransformOperation transform = (TransformOperation) operation;
                Set<String> transformOrigins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
                // every origin of an input field has an outgoing connection to this transform
                transformOrigins.forEach(originName -> operationOutgoingConnections.computeIfAbsent(originName, k -> new HashSet<>()).add(transform));
                referencedOrigins.addAll(transformOrigins);
                // a transform without outputs is tracked as a drop transform
                if (transform.getOutputs().isEmpty()) {
                    dropTransforms.add(transform);
                }
                break;
            }
            case WRITE: {
                WriteOperation write = (WriteOperation) operation;
                if (write.getDestination() == null) {
                    throw new IllegalArgumentException(String.format("Destination endpoint cannot be null for the write operation '%s'.", write.getName()));
                }
                Set<String> writeOrigins = write.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
                // every origin of an input field has an outgoing connection to this write
                writeOrigins.forEach(originName -> operationOutgoingConnections.computeIfAbsent(originName, k -> new HashSet<>()).add(write));
                referencedOrigins.addAll(writeOrigins);
                writeOperations.add(write);
                break;
            }
            default:
                break;
        }
    }

    // give operations without outgoing connections an empty set rather than checking for null later
    for (String operationName : operationsMap.keySet()) {
        operationOutgoingConnections.computeIfAbsent(operationName, k -> new HashSet<>());
    }

    if (readOperations.isEmpty()) {
        throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'READ'.");
    }
    if (writeOperations.isEmpty()) {
        throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'WRITE'.");
    }

    // every origin referenced by an input must be the name of one of the given operations
    Sets.SetView<String> unknownOrigins = Sets.difference(referencedOrigins, operationsMap.keySet());
    if (!unknownOrigins.isEmpty()) {
        throw new IllegalArgumentException(String.format("No operation is associated with the origins '%s'.", unknownOrigins));
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Sets(com.google.common.collect.Sets) HashSet(java.util.HashSet)

Example 50 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfo method getIncomingOperationsForField.

/**
 * <p>Returns the subset of operations that were responsible for computing the given field of a given
 * destination.</p>
 * <p>For example, if the operations are as follows</p>
 * <pre>
 * pRead: personFile -> (offset, body)
 * parse: body -> (id, name, address)
 * cRead: codeFile -> id
 * codeGen: (parse.id, cRead.id) -> id
 * sWrite: (codeGen.id, parse.name, parse.address) -> secureStore
 * iWrite: (parse.id, parse.name, parse.address) -> insecureStore
 * </pre>
 * <p>then for the 'id' field of insecureStore the result contains the operations iWrite, parse, pRead,</p>
 * <p>and for the 'id' field of secureStore the result contains the operations sWrite, codeGen, parse, pRead,
 * cRead.</p>
 *
 * @param destinationField the EndPointField whose contributing operations are looked up
 * @return the subset of operations
 */
Set<Operation> getIncomingOperationsForField(EndPointField destinationField) {
    // the derived state is computed lazily if it is not available yet
    if (writeOperations == null) {
        computeAndValidateFieldLineageInfo(this.operations);
    }
    Set<Operation> visitedOperations = new HashSet<>();
    for (WriteOperation write : writeOperations) {
        // only writes targeting the dataset the destinationField belongs to are of interest
        if (write.getDestination().equals(destinationField.getEndPoint())) {
            String fieldName = destinationField.getField();
            Set<InputField> matchingInputs = new HashSet<>();
            for (InputField candidate : write.getInputs()) {
                if (candidate.getName().equals(fieldName)) {
                    matchingInputs.add(candidate);
                }
            }
            for (InputField matchingInput : matchingInputs) {
                // mark this write operation as visited
                visitedOperations.add(write);
                // continue backward in the graph from the origin of this input, i.e. the operation which
                // computed the destinationField
                Operation originOperation = operationsMap.get(matchingInput.getOrigin());
                getIncomingOperationsForFieldHelper(originOperation, visitedOperations);
            }
        }
    }
    return visitedOperations;
}
Also used : EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Checksums(io.cdap.cdap.common.utils.Checksums) GsonBuilder(com.google.gson.GsonBuilder) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Gson(com.google.gson.Gson) Map(java.util.Map) Operation(io.cdap.cdap.api.lineage.field.Operation) Charsets(com.google.common.base.Charsets) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Collection(java.util.Collection) OperationType(io.cdap.cdap.api.lineage.field.OperationType) Set(java.util.Set) OperationTypeAdapter(io.cdap.cdap.proto.codec.OperationTypeAdapter) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) InputField(io.cdap.cdap.api.lineage.field.InputField) List(java.util.List) Comparator(java.util.Comparator) Collections(java.util.Collections) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet)

Aggregations

TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)90 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)90 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)88 Operation (io.cdap.cdap.api.lineage.field.Operation)84 HashSet (java.util.HashSet)66 Test (org.junit.Test)66 ArrayList (java.util.ArrayList)64 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)50 HashMap (java.util.HashMap)38 LinkedHashSet (java.util.LinkedHashSet)30 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)28 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)28 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)28 List (java.util.List)28 ImmutableList (com.google.common.collect.ImmutableList)26 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)26 Connection (io.cdap.cdap.etl.proto.Connection)26 Set (java.util.Set)26 InputField (io.cdap.cdap.api.lineage.field.InputField)22 ImmutableSet (com.google.common.collect.ImmutableSet)20