Search in sources :

Example 41 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class LineageOperationsProcessor method addMergeOperation.

/**
 * Generates the implicit merge operations for a stage that has several (non-joined) inputs.
 * One merge operation is produced per field found in the outputs of the input stages' parent
 * stages; it behaves like an identity transform whose inputs are the most recent origins of that
 * field on each input branch and whose single output is the field itself. Operation names are built
 * by {@code getMergeOperationNamePrefix} and {@code prefixedName}; see the example for their shape.
 *
 * For example, if a pipeline looks like this, the stageInputs is [t1, t2]:
 * src1[read, body] -> t1(parse, body -> a, b, c) ----------|
 *                                                          |-> t3(not joiner) -> sink
 * src2[read, a, b, c] -> t2(identitya, a->a, b->b, c->c) --|
 * At stage t3, 4 merge operations are generated, one for each of the fields a, b, c and body:
 * 1. name: t1,t2.merge.a, input fields: t1.parse, t2.identitya, output fields: a
 * 2. name: t1,t2.merge.b, input fields: t1.parse, t2.identityb, output fields: b
 * 3. name: t1,t2.merge.c, input fields: t1.parse, t2.identityc, output fields: c
 * 4. name: t1,t2.merge.body, input fields: src1.read, output fields: body
 *
 * @param stageInputs the names of the input stages; the size of this set should be greater than 1
 * @param processedOperations the operations collected so far, keyed by operation name; new merge
 *                            operations are added to it, names that are already present are left untouched
 */
private void addMergeOperation(Set<String> stageInputs, Map<String, Operation> processedOperations) {
    // Sort the stage names so that the generated prefix and description do not depend on set ordering.
    Set<String> orderedStages = new TreeSet<>(stageInputs);
    String namePrefix = getMergeOperationNamePrefix(orderedStages);
    String description = "Merged stages: " + Joiner.on(",").join(orderedStages);

    // field name -> origins (operation names) producing it, at most one origin per input branch.
    // For the Javadoc example this ends up as:
    //   a:    t1.parse, t2.identitya
    //   b:    t1.parse, t2.identityb
    //   c:    t1.parse, t2.identityc
    //   body: src1.read
    Map<String, List<String>> originsByField = new LinkedHashMap<>();
    for (String stage : orderedStages) {
        // The closest parent sits at the end of the returned list (per the original note), so the list
        // is reversed in place to visit the nearest stages first.
        List<String> ancestors = findParentStages(stage);
        Collections.reverse(ancestors);

        // Fields already resolved on this branch; a farther ancestor must not contribute another origin.
        Set<String> resolvedFields = new HashSet<>();
        for (String ancestor : ancestors) {
            Map<String, String> outputsToOrigins = stageOutputsWithOrigins.get(ancestor);
            for (Map.Entry<String, String> output : outputsToOrigins.entrySet()) {
                String field = output.getKey();
                // Set.add returns false when the field was recorded by a closer ancestor already.
                if (!resolvedFields.add(field)) {
                    continue;
                }
                originsByField.computeIfAbsent(field, k -> new ArrayList<>()).add(output.getValue());
            }
        }
    }

    for (Map.Entry<String, List<String>> fieldWithOrigins : originsByField.entrySet()) {
        String field = fieldWithOrigins.getKey();
        String mergeName = prefixedName(namePrefix, field);
        // Only a single merge operation is kept per name, so an existing one is not replaced.
        if (processedOperations.containsKey(mergeName)) {
            continue;
        }
        List<InputField> mergeInputs = new ArrayList<>();
        for (String origin : fieldWithOrigins.getValue()) {
            mergeInputs.add(InputField.of(origin, field));
        }
        TransformOperation merge =
            new TransformOperation(mergeName, description, mergeInputs, Collections.singletonList(field));
        processedOperations.put(merge.getName(), merge);
    }
}
Also used : TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) LinkedHashMap(java.util.LinkedHashMap) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 42 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class LineageOperationsProcessor method computeProcessedOperations.

/**
 * Converts all the stage-level field operations into platform operations. Stages are visited in
 * topological order, so that a later stage can always look up the origin of the fields it consumes.
 * If a stage has multiple inputs and is not listed in {@code noMergeRequiredStages} (joiners, per the
 * original note), implicit merge operations are generated first so that further stages can look up
 * the origins.
 * For joiners, the input field name should already contain the previous stage name.
 *
 * @return a {@link Map} containing the operations, with the operation name as key and the
 * corresponding platform {@link Operation} as value
 */
private Map<String, Operation> computeProcessedOperations() {
    Map<String, Operation> result = new HashMap<>();
    for (String stageName : topologicalOrder) {
        Set<String> incomingStages = stageDag.getNodeInputs(stageName);
        // Several inputs on a stage that is not exempt from merging: add the implicit merge operations.
        boolean needsImplicitMerge = incomingStages.size() > 1 && !noMergeRequiredStages.contains(stageName);
        if (needsImplicitMerge) {
            addMergeOperation(incomingStages, result);
        }

        for (FieldOperation stageOperation : stageOperations.get(stageName)) {
            String platformName = prefixedName(stageName, stageOperation.getName());
            Operation platformOperation = null;
            // Fields produced by this operation; stays empty for write operations.
            Set<String> producedFields = new LinkedHashSet<>();

            switch (stageOperation.getType()) {
                case READ: {
                    FieldReadOperation readOp = (FieldReadOperation) stageOperation;
                    platformOperation = new ReadOperation(platformName, readOp.getDescription(),
                                                          readOp.getSource(), readOp.getOutputFields());
                    producedFields.addAll(readOp.getOutputFields());
                    break;
                }
                case TRANSFORM: {
                    FieldTransformOperation transformOp = (FieldTransformOperation) stageOperation;
                    List<InputField> transformInputs =
                        createInputFields(transformOp.getInputFields(), stageName, result);
                    platformOperation = new TransformOperation(platformName, transformOp.getDescription(),
                                                               transformInputs, transformOp.getOutputFields());
                    producedFields.addAll(transformOp.getOutputFields());
                    break;
                }
                case WRITE: {
                    FieldWriteOperation writeOp = (FieldWriteOperation) stageOperation;
                    List<InputField> writeInputs = createInputFields(writeOp.getInputFields(), stageName, result);
                    platformOperation = new WriteOperation(platformName, writeOp.getDescription(),
                                                           writeOp.getSink(), writeInputs);
                    break;
                }
            }

            String registeredName = platformOperation.getName();
            // Every field produced by this operation gets the operation name as its origin. If a later
            // operation of the same stage outputs the field again, the origin is overwritten with that
            // later operation.
            Map<String, String> stageOrigins = stageOutputsWithOrigins.get(stageName);
            for (String producedField : producedFields) {
                stageOrigins.put(producedField, registeredName);
            }
            result.put(registeredName, platformOperation);
        }
    }
    return result;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation)

Aggregations

TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)42 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)39 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)38 Operation (io.cdap.cdap.api.lineage.field.Operation)36 ArrayList (java.util.ArrayList)29 HashSet (java.util.HashSet)29 Test (org.junit.Test)29 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)23 HashMap (java.util.HashMap)18 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)14 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)14 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)14 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)14 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)13 Connection (io.cdap.cdap.etl.proto.Connection)13 LinkedHashSet (java.util.LinkedHashSet)12 InputField (io.cdap.cdap.api.lineage.field.InputField)10 Set (java.util.Set)9 ImmutableSet (com.google.common.collect.ImmutableSet)7