Use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata:
from the class LineageOperationsProcessor, method computeProcessedOperations.
/**
 * Converts all the stage operations to platform operations. This method goes through the pipeline in
 * topological order, so that a later stage always knows the origins of its operations.
 * If a stage has multiple inputs and is not a joiner, implicit merge operations are generated so that
 * further stages can look up the origins.
 * For joiners, the input field name should already contain the previous stage name.
 *
 * @return a {@link Map} containing the operations with key of operation name and value of the corresponding
 * platform {@link Operation}
 */
private Map<String, Operation> computeProcessedOperations() {
  Map<String, Operation> processedOperations = new HashMap<>();
  for (String stageName : topologicalOrder) {
    Set<String> stageInputs = stageDag.getNodeInputs(stageName);
    // if the stage has multiple inputs and it is not a joiner, compute the merge operations
    if (stageInputs.size() > 1 && !noMergeRequiredStages.contains(stageName)) {
      addMergeOperation(stageInputs, processedOperations);
    }
    List<FieldOperation> fieldOperations = stageOperations.get(stageName);
    for (FieldOperation fieldOperation : fieldOperations) {
      // stays null only for an operation type outside READ/TRANSFORM/WRITE, which the
      // FieldOperation type enum does not currently allow
      Operation newOperation = null;
      String newOperationName = prefixedName(stageName, fieldOperation.getName());
      // LinkedHashSet keeps output-field iteration in declaration order
      Set<String> currentOperationOutputs = new LinkedHashSet<>();
      // each case is brace-scoped so its locals cannot leak into the other cases
      switch (fieldOperation.getType()) {
        case READ: {
          FieldReadOperation read = (FieldReadOperation) fieldOperation;
          newOperation = new ReadOperation(newOperationName, read.getDescription(),
                                           read.getSource(), read.getOutputFields());
          currentOperationOutputs.addAll(read.getOutputFields());
          break;
        }
        case TRANSFORM: {
          FieldTransformOperation transform = (FieldTransformOperation) fieldOperation;
          List<InputField> inputFields =
            createInputFields(transform.getInputFields(), stageName, processedOperations);
          newOperation = new TransformOperation(newOperationName, transform.getDescription(),
                                                inputFields, transform.getOutputFields());
          currentOperationOutputs.addAll(transform.getOutputFields());
          break;
        }
        case WRITE: {
          FieldWriteOperation write = (FieldWriteOperation) fieldOperation;
          List<InputField> inputFields =
            createInputFields(write.getInputFields(), stageName, processedOperations);
          newOperation = new WriteOperation(newOperationName, write.getDescription(),
                                            write.getSink(), inputFields);
          // a write has no output fields, so currentOperationOutputs stays empty
          break;
        }
      }
      for (String currentOperationOutput : currentOperationOutputs) {
        // For all fields outputted by the current operation assign the operation name as origin.
        // If the field appears in the output again for some other operation belonging to the same
        // stage, its origin will get updated to the new operation.
        stageOutputsWithOrigins.get(stageName).put(currentOperationOutput, newOperation.getName());
      }
      processedOperations.put(newOperation.getName(), newOperation);
    }
  }
  return processedOperations;
}
Use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata:
from the class FieldLineageInfo, method computeIncomingSummary.
/**
 * Computes the incoming summary: for every destination (sink) field, the set of source
 * {@link EndPointField}s it was derived from, found by traversing the operation graph
 * backward from each write operation. Drop transforms (transforms with no outputs) are
 * summarized under the shared {@code NULL_EPF} key.
 *
 * @return a map from destination field to the set of source fields contributing to it
 */
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
  // lazily build the operation lookup structures on first use
  if (writeOperations == null) {
    computeAndValidateFieldLineageInfo(this.operations);
  }
  // memoizes, per operation name, the source endpoint fields that operation resolves to,
  // so shared sub-paths of the lineage graph are only walked once
  Map<String, Set<EndPointField>> operationEndPointMap = new HashMap<>();
  Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
  for (WriteOperation write : writeOperations) {
    List<InputField> inputs = write.getInputs();
    for (InputField input : inputs) {
      EndPointField dest = new EndPointField(write.getDestination(), input.getName());
      Set<EndPointField> fields = summary.computeIfAbsent(dest, k -> new HashSet<>());
      if (operationEndPointMap.containsKey(input.getOrigin())) {
        fields.addAll(operationEndPointMap.get(input.getOrigin()));
      } else {
        // handle a special case for read -> write
        // in this case, the write operation has to be one to one relation with the fields in the read operation,
        // since a write operation can only take a list of input fields that come from the previous stage
        Operation origin = operationsMap.get(input.getOrigin());
        if (origin.getType() == OperationType.READ) {
          fields.add(new EndPointField(((ReadOperation) origin).getSource(), input.getName()));
          continue;
        }
        fields.addAll(computeIncomingSummaryHelper(origin, write, operationEndPointMap));
      }
    }
  }
  for (TransformOperation transform : dropTransforms) {
    for (InputField input : transform.getInputs()) {
      Operation previous = operationsMap.get(input.getOrigin());
      // drop transforms use a common NULL endpoint as key
      Set<EndPointField> endPointFields = summary.computeIfAbsent(NULL_EPF, k -> new HashSet<>());
      if (operationEndPointMap.containsKey(input.getOrigin())) {
        // addAll copies the elements itself; no intermediate HashSet needed
        endPointFields.addAll(operationEndPointMap.get(input.getOrigin()));
        continue;
      }
      endPointFields.addAll(computeIncomingSummaryHelper(previous, transform, operationEndPointMap));
    }
  }
  return summary;
}
Use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata:
from the class FieldLineageInfo, method computeIncomingSummaryHelper.
/**
 * Helper method to compute the incoming summary by walking the operation graph backward.
 *
 * @param currentOperation the operation being processed. Since we are processing incoming, this
 * operation sits on the left side if the graph is imagined in horizontal orientation, i.e. it is
 * an input to previousOperation
 * @param previousOperation the already-processed operation sitting to the right of the current
 * operation in that horizontal orientation
 * @param operationEndPointMap a map from operation name to the final endpoint fields it resolves
 * to; used to memoize paths already computed so the same work is not repeated
 * @return the set of source endpoint fields the current operation resolves to
 */
private Set<EndPointField> computeIncomingSummaryHelper(Operation currentOperation, Operation previousOperation, Map<String, Set<EndPointField>> operationEndPointMap) {
  OperationType currentType = currentOperation.getType();
  if (currentType == OperationType.READ) {
    // base case: a READ ends the backward traversal. The previous operation must be a
    // TRANSFORM or a WRITE; gather its input fields so we can match them against this read.
    OperationType previousType = previousOperation.getType();
    Set<InputField> previousInputs;
    if (previousType == OperationType.WRITE) {
      previousInputs = new HashSet<>(((WriteOperation) previousOperation).getInputs());
    } else if (previousType == OperationType.TRANSFORM) {
      previousInputs = new HashSet<>(((TransformOperation) previousOperation).getInputs());
    } else {
      previousInputs = new HashSet<>();
    }
    ReadOperation readOp = (ReadOperation) currentOperation;
    EndPoint sourceEndPoint = readOp.getSource();
    // keep only the previous operation's inputs whose origin is this read (remember we are
    // traversing backward), and map each to a field of the read's source endpoint
    Set<EndPointField> sourceFields = new HashSet<>();
    String currentName = currentOperation.getName();
    for (InputField candidate : previousInputs) {
      if (currentName.equals(candidate.getOrigin())) {
        sourceFields.add(new EndPointField(sourceEndPoint, candidate.getName()));
      }
    }
    // reached the end of the graph: unwind the recursion
    return sourceFields;
  }
  if (currentType != OperationType.TRANSFORM) {
    // neither READ nor TRANSFORM: nothing further to traverse
    return new HashSet<>();
  }
  // for a transform, keep traversing backward through its inputs
  TransformOperation transformOp = (TransformOperation) currentOperation;
  Set<EndPointField> resolvedSources = new HashSet<>();
  // de-duplicate origins up front so input fields sharing an origin are resolved once
  Set<String> distinctOrigins =
    transformOp.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
  for (String originName : distinctOrigins) {
    Set<EndPointField> memoized = operationEndPointMap.get(originName);
    if (memoized != null) {
      resolvedSources.addAll(memoized);
    } else {
      resolvedSources.addAll(
        computeIncomingSummaryHelper(operationsMap.get(originName), currentOperation, operationEndPointMap));
    }
  }
  operationEndPointMap.put(currentOperation.getName(), resolvedSources);
  return resolvedSources;
}
Use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata:
from the class FieldLineageInfo, method computeAndValidateFieldLineageInfo.
/**
 * Builds the lookup structures (operations map, read/write sets, outgoing-connection map) from the
 * given operations and validates them: names must be unique, read sources and write destinations
 * must be non-null, at least one READ and one WRITE must exist, and every referenced origin must
 * correspond to a provided operation.
 *
 * @param operations the operations to index and validate
 * @throws IllegalArgumentException if any of the above validation rules is violated
 */
private void computeAndValidateFieldLineageInfo(Collection<? extends Operation> operations) {
  Set<String> allOrigins = new HashSet<>();
  this.operationsMap = new HashMap<>();
  this.writeOperations = new HashSet<>();
  this.readOperations = new HashSet<>();
  this.operationOutgoingConnections = new HashMap<>();
  for (Operation operation : operations) {
    if (operationsMap.containsKey(operation.getName())) {
      throw new IllegalArgumentException(String.format("All operations provided for creating field " + "level lineage info must have unique names. " + "Operation name '%s' is repeated.", operation.getName()));
    }
    operationsMap.put(operation.getName(), operation);
    switch (operation.getType()) {
      case READ: {
        ReadOperation read = (ReadOperation) operation;
        EndPoint source = read.getSource();
        if (source == null) {
          throw new IllegalArgumentException(String.format("Source endpoint cannot be null for the read " + "operation '%s'.", read.getName()));
        }
        readOperations.add(read);
        break;
      }
      case TRANSFORM: {
        TransformOperation transform = (TransformOperation) operation;
        allOrigins.addAll(recordOutgoingConnections(transform, transform.getInputs()));
        // a transform with no outputs drops all of its input fields
        if (transform.getOutputs().isEmpty()) {
          dropTransforms.add(transform);
        }
        break;
      }
      case WRITE: {
        WriteOperation write = (WriteOperation) operation;
        EndPoint destination = write.getDestination();
        if (destination == null) {
          throw new IllegalArgumentException(String.format("Destination endpoint cannot be null for the write " + "operation '%s'.", write.getName()));
        }
        allOrigins.addAll(recordOutgoingConnections(write, write.getInputs()));
        writeOperations.add(write);
        break;
      }
      default:
    }
  }
  Set<String> operationsWithNoOutgoingConnections = Sets.difference(operationsMap.keySet(), operationOutgoingConnections.keySet());
  // put empty set for operations with no outgoing connection rather than checking for null later
  for (String operation : operationsWithNoOutgoingConnections) {
    operationOutgoingConnections.put(operation, new HashSet<>());
  }
  if (readOperations.isEmpty()) {
    throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'READ'.");
  }
  if (writeOperations.isEmpty()) {
    throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'WRITE'.");
  }
  Sets.SetView<String> invalidOrigins = Sets.difference(allOrigins, operationsMap.keySet());
  if (!invalidOrigins.isEmpty()) {
    throw new IllegalArgumentException(String.format("No operation is associated with the origins '%s'.", invalidOrigins));
  }
}

/**
 * Registers an outgoing connection from each distinct origin of the given input fields to the
 * given operation, and returns the set of those origins.
 *
 * @param operation the operation that consumes the input fields
 * @param inputs the input fields whose origins feed the operation
 * @return the distinct origin names referenced by the input fields
 */
private Set<String> recordOutgoingConnections(Operation operation, List<InputField> inputs) {
  Set<String> origins = inputs.stream().map(InputField::getOrigin).collect(Collectors.toSet());
  // for each origin corresponding to the input fields there is a connection from that origin to this operation
  for (String origin : origins) {
    operationOutgoingConnections.computeIfAbsent(origin, k -> new HashSet<>()).add(operation);
  }
  return origins;
}
Use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata:
from the class FieldLineageInfo, method getIncomingOperationsForField.
/**
 * <p>Get the subset of operations that were responsible for computing the specified field of
 * a specified destination.</p>
 * <p>For example if the operation are as follow</p>
 * <pre>
 * pRead: personFile -> (offset, body)
 * parse: body -> (id, name, address)
 * cRead: codeFile -> id
 * codeGen: (parse.id, cRead.id) -> id
 * sWrite: (codeGen.id, parse.name, parse.address) -> secureStore
 * iWrite: (parse.id, parse.name, parse.address) -> insecureStore
 * </pre>
 * <p>If the destination field is 'id' field of insecureStore then the result set will contain the operations iWrite,
 * parse, pRead.</p>
 * <p>If the destination field is 'id' field of secureStore then the result set will contain the operations sWrite,
 * codeGen, parse, pRead, cRead.</p>
 *
 * @param destinationField the EndPointField for which the operations need to find out
 * @return the subset of operations
 */
Set<Operation> getIncomingOperationsForField(EndPointField destinationField) {
  // lazily build the operation lookup structures on first use
  if (writeOperations == null) {
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Set<Operation> visited = new HashSet<>();
  for (WriteOperation writeOp : writeOperations) {
    // skip write operations whose destination is not the dataset the destinationField belongs to
    if (!writeOp.getDestination().equals(destinationField.getEndPoint())) {
      continue;
    }
    // collect the distinct input fields of this write whose name matches the requested field
    Set<InputField> matchingInputs = new HashSet<>();
    for (InputField candidate : writeOp.getInputs()) {
      if (candidate.getName().equals(destinationField.getField())) {
        matchingInputs.add(candidate);
      }
    }
    for (InputField input : matchingInputs) {
      // mark this write operation as visited
      visited.add(writeOp);
      // traverse backward in the graph by looking up the origin of this input field, which is
      // the operation that computed this destinationField
      getIncomingOperationsForFieldHelper(operationsMap.get(input.getOrigin()), visited);
    }
  }
  return visited;
}
Aggregations