Usage of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata:
class LineageOperationsProcessor, method addMergeOperation.
/**
 * Creates the implicit merge operations for a stage that has multiple (non-joiner) inputs.
 * Each merge operation is named {stage1-name},{stage2-name},...,{stageN-name}.merge appended
 * with the field name, and acts as an identity transform for that field: its input fields are
 * the most recent origins of the field on each incoming branch, and its single output is the
 * field itself.
 * For example, given the pipeline below with stageInputs = [t1, t2]:
 *   src1[read, body] -> t1(parse, body -> a, b, c) ----------|
 *                                                            |-> t3(not joiner) -> sink
 *   src2[read, a, b, c] -> t2(identitya, a->a, b->b, c->c) --|
 * at stage t3, four merge operations are generated:
 *   1. name: t1,t2.merge.a,    input fields: t1.parse, t2.identitya, output fields: a
 *   2. name: t1,t2.merge.b,    input fields: t1.parse, t2.identitya, output fields: b
 *   3. name: t1,t2.merge.c,    input fields: t1.parse, t2.identitya, output fields: c
 *   4. name: t1,t2.merge.body, input fields: src1.read,              output fields: body
 *
 * @param stageInputs the input stages of the merging stage; expected to contain more than one element
 * @param processedOperations accumulator of all platform operations produced so far, keyed by
 *   operation name; the generated merge operations are added to it
 */
private void addMergeOperation(Set<String> stageInputs, Map<String, Operation> processedOperations) {
  Set<String> orderedStages = new TreeSet<>(stageInputs);
  String namePrefix = getMergeOperationNamePrefix(orderedStages);
  String description = "Merged stages: " + Joiner.on(",").join(orderedStages);
  // Maps each field appearing in the outputs of the parent stages to the origins that produce
  // it — at most one origin per incoming branch, namely the closest ancestor on that branch
  // which outputs the field. Insertion order is kept so merge operations are emitted
  // deterministically.
  Map<String, List<String>> originsByField = new LinkedHashMap<>();
  for (String branch : orderedStages) {
    List<String> ancestry = findParentStages(branch);
    // The closest parent sits at the end of the ancestry list, so walk it backwards.
    // Once a field's origin is found on this branch, farther ancestors must not
    // contribute another origin for the same field.
    Set<String> seenFields = new HashSet<>();
    for (int i = ancestry.size() - 1; i >= 0; i--) {
      // all output field -> origin mappings recorded for this ancestor stage
      Map<String, String> outputsToOrigins = stageOutputsWithOrigins.get(ancestry.get(i));
      for (Map.Entry<String, String> fieldOrigin : outputsToOrigins.entrySet()) {
        if (seenFields.add(fieldOrigin.getKey())) {
          originsByField.computeIfAbsent(fieldOrigin.getKey(), unused -> new ArrayList<>())
            .add(fieldOrigin.getValue());
        }
      }
    }
  }
  for (Map.Entry<String, List<String>> fieldEntry : originsByField.entrySet()) {
    String field = fieldEntry.getKey();
    String mergeName = prefixedName(namePrefix, field);
    if (processedOperations.containsKey(mergeName)) {
      // an equivalent merge was already generated (e.g. by a sibling stage with the
      // same set of inputs); only a single merge operation should exist per name
      continue;
    }
    List<InputField> mergeInputs = new ArrayList<>();
    for (String origin : fieldEntry.getValue()) {
      mergeInputs.add(InputField.of(origin, field));
    }
    processedOperations.put(mergeName,
      new TransformOperation(mergeName, description, mergeInputs, Collections.singletonList(field)));
  }
}
Usage of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata:
class LineageOperationsProcessor, method computeProcessedOperations.
/**
 * Converts all the stage operations to platform operations. The pipeline is traversed in
 * topological order so that a later stage always knows the origins of its input fields.
 * If a stage has multiple inputs and is not a joiner, implicit merge operations are generated
 * so that further stages can look up the origins. For joiners, the input field names are
 * expected to already contain the previous stage name.
 *
 * @return a {@link Map} from operation name to the corresponding platform {@link Operation}
 * @throws IllegalStateException if a stage contains a field operation of an unsupported type
 */
private Map<String, Operation> computeProcessedOperations() {
  Map<String, Operation> processedOperations = new HashMap<>();
  for (String stageName : topologicalOrder) {
    Set<String> stageInputs = stageDag.getNodeInputs(stageName);
    // if the stage has multiple inputs and it is not a joiner, compute the merge operations
    if (stageInputs.size() > 1 && !noMergeRequiredStages.contains(stageName)) {
      addMergeOperation(stageInputs, processedOperations);
    }
    List<FieldOperation> fieldOperations = stageOperations.get(stageName);
    for (FieldOperation fieldOperation : fieldOperations) {
      // declared without an initializer so the compiler enforces assignment on every
      // switch path; an unhandled operation type fails fast in the default branch
      // instead of surfacing later as a NullPointerException
      Operation newOperation;
      String newOperationName = prefixedName(stageName, fieldOperation.getName());
      // fields written by this operation; LinkedHashSet keeps the declared field order
      Set<String> currentOperationOutputs = new LinkedHashSet<>();
      switch (fieldOperation.getType()) {
        case READ:
          FieldReadOperation read = (FieldReadOperation) fieldOperation;
          newOperation = new ReadOperation(newOperationName, read.getDescription(),
                                           read.getSource(), read.getOutputFields());
          currentOperationOutputs.addAll(read.getOutputFields());
          break;
        case TRANSFORM:
          FieldTransformOperation transform = (FieldTransformOperation) fieldOperation;
          List<InputField> transformInputs =
            createInputFields(transform.getInputFields(), stageName, processedOperations);
          newOperation = new TransformOperation(newOperationName, transform.getDescription(),
                                                transformInputs, transform.getOutputFields());
          currentOperationOutputs.addAll(transform.getOutputFields());
          break;
        case WRITE:
          FieldWriteOperation write = (FieldWriteOperation) fieldOperation;
          List<InputField> writeInputs =
            createInputFields(write.getInputFields(), stageName, processedOperations);
          newOperation = new WriteOperation(newOperationName, write.getDescription(),
                                            write.getSink(), writeInputs);
          // a write has no output fields, so currentOperationOutputs stays empty
          break;
        default:
          throw new IllegalStateException(String.format(
            "Unsupported field operation type '%s' for operation '%s' in stage '%s'.",
            fieldOperation.getType(), fieldOperation.getName(), stageName));
      }
      for (String currentOperationOutput : currentOperationOutputs) {
        // For all fields outputted by the current operation assign the operation name as origin.
        // If the field appears in the output again for some other operation belonging to the
        // same stage, its origin will get updated to the new operation.
        stageOutputsWithOrigins.get(stageName).put(currentOperationOutput, newOperation.getName());
      }
      processedOperations.put(newOperation.getName(), newOperation);
    }
  }
  return processedOperations;
}
Aggregations