use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
the class FieldLineageInfo method getTopologicallySortedOperations.
/**
* Sort the operations in topological order. In topological order, each operation in the list
* is guaranteed to occur before any other operation that reads its outputs.
*
* For example, consider the following scenario:
*
*    read-----------------------write
*      \                       /
*       ----parse----normalize---
*
* Since the write operation depends on read and normalize for its inputs, it appears last in the
* order. normalize depends on parse, so it appears after parse. Similarly, the parse
* operation appears after read but before normalize in the returned list.
*
* @param operations set of operations to be sorted
* @return the list containing topologically sorted operations
*/
public static List<Operation> getTopologicallySortedOperations(Set<Operation> operations) {
Map<String, Operation> operationMap = new HashMap<>();
Set<String> operationsWithNoIncomings = new HashSet<>();
for (Operation operation : operations) {
operationMap.put(operation.getName(), operation);
if (OperationType.READ == operation.getType()) {
operationsWithNoIncomings.add(operation.getName());
}
// a transform with no input fields has no incoming edges, so treat it like a read operation
if (OperationType.TRANSFORM == operation.getType() && ((TransformOperation) operation).getInputs().isEmpty()) {
operationsWithNoIncomings.add(operation.getName());
}
}
// Map of operation name to the set of operation names which take the output of the given operation as
// an input. This map essentially represents the adjacency list of the operation graph.
// For example consider the following scenario:
//
//    read----------------------write
//      \                      /
//       ----parse---normalize
//
// The map would contain:
// read -> [parse, write]
// parse -> [normalize]
// normalize -> [write]
// write -> []
Map<String, Set<String>> outgoingOperations = new HashMap<>();
// Map of operation name to the set of operation names whose outputs the given operation takes as input.
// For example consider the following scenario:
//
//    read----------------------write
//      \                      /
//       ----parse---normalize
//
// The map would contain:
// read -> []
// parse -> [read]
// normalize -> [parse]
// write -> [read, normalize]
Map<String, Set<String>> incomingOperations = new HashMap<>();
for (Operation operation : operations) {
List<InputField> inputFields = new ArrayList<>();
switch(operation.getType()) {
case READ:
// read has no incoming operation
incomingOperations.put(operation.getName(), new HashSet<>());
break;
case TRANSFORM:
TransformOperation transform = (TransformOperation) operation;
inputFields.addAll(transform.getInputs());
break;
case WRITE:
WriteOperation write = (WriteOperation) operation;
inputFields.addAll(write.getInputs());
// write has no outgoing operation
outgoingOperations.put(operation.getName(), new HashSet<>());
break;
}
for (InputField inputField : inputFields) {
// an input field may have an origin that is not part of the given operations (for example, when only a
// subset of the operations is being sorted); such origins should be ignored for topological sorting.
if (!operationMap.containsKey(inputField.getOrigin())) {
continue;
}
// Current operation is the outgoing operation for origin represented by the input field.
Set<String> outgoings = outgoingOperations.computeIfAbsent(inputField.getOrigin(), k -> new HashSet<>());
outgoings.add(operation.getName());
// Origin represented by the input field is the incoming operation for the current operation.
Set<String> incomings = incomingOperations.computeIfAbsent(operation.getName(), k -> new HashSet<>());
incomings.add(inputField.getOrigin());
}
}
List<Operation> orderedOperations = new ArrayList<>();
while (!operationsWithNoIncomings.isEmpty()) {
String current = operationsWithNoIncomings.iterator().next();
operationsWithNoIncomings.remove(current);
if (operationMap.get(current) != null) {
orderedOperations.add(operationMap.get(current));
}
// it is possible that the current operation has no outgoing edges, since some of its output fields
// may not be used by any downstream operation
Iterator<String> outgoingsIter = outgoingOperations.getOrDefault(current, Collections.emptySet()).iterator();
while (outgoingsIter.hasNext()) {
String next = outgoingsIter.next();
outgoingsIter.remove();
incomingOperations.get(next).remove(current);
if (incomingOperations.get(next).isEmpty()) {
operationsWithNoIncomings.add(next);
}
}
}
// check for cycles: remove the entries whose outgoing operation sets are now empty;
// any entries that remain correspond to edges that were never processed, which indicates a cycle
outgoingOperations.entrySet().removeIf(next -> next.getValue().isEmpty());
if (!outgoingOperations.isEmpty()) {
throw new IllegalArgumentException(String.format("Cycle detected in graph for operations %s", outgoingOperations));
}
return orderedOperations;
}
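The method above is essentially Kahn's algorithm specialized to lineage operations. As a minimal standalone sketch of the same idea over a plain adjacency map (a hypothetical helper, not part of the CDAP API), the core loop looks like this:

// requires java.util imports: ArrayDeque, ArrayList, Collections, Deque, HashMap, List, Map, Set
private static List<String> kahnSort(Map<String, Set<String>> outgoing) {
  // count the incoming edges of every node
  Map<String, Integer> inDegree = new HashMap<>();
  outgoing.keySet().forEach(node -> inDegree.putIfAbsent(node, 0));
  for (Set<String> targets : outgoing.values()) {
    for (String target : targets) {
      inDegree.merge(target, 1, Integer::sum);
    }
  }
  // start with the nodes that have no incoming edges
  Deque<String> ready = new ArrayDeque<>();
  inDegree.forEach((node, degree) -> {
    if (degree == 0) {
      ready.add(node);
    }
  });
  List<String> ordered = new ArrayList<>();
  while (!ready.isEmpty()) {
    String current = ready.poll();
    ordered.add(current);
    // removing 'current' decrements the in-degree of all its targets
    for (String next : outgoing.getOrDefault(current, Collections.emptySet())) {
      if (inDegree.merge(next, -1, Integer::sum) == 0) {
        ready.add(next);
      }
    }
  }
  // nodes that never reached in-degree zero are part of a cycle
  if (ordered.size() != inDegree.size()) {
    throw new IllegalArgumentException("Cycle detected in graph");
  }
  return ordered;
}

Feeding it the adjacency map from the comment above (read -> [parse, write], parse -> [normalize], normalize -> [write], write -> []) yields [read, parse, normalize, write].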
use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
the class FieldLineageInfoTest method testBranchTopologicalSort.
@Test
public void testBranchTopologicalSort() {
//    read----------------------write
//      \                      /
//       ----parse---normalize
ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parse descr", Collections.singletonList(InputField.of("read", "body")), "name", "address");
TransformOperation normalize = new TransformOperation("normalize", "normalize descr", Collections.singletonList(InputField.of("parse", "address")), "address");
List<InputField> writeInputs = new ArrayList<>();
writeInputs.add(InputField.of("read", "offset"));
writeInputs.add(InputField.of("parse", "name"));
writeInputs.add(InputField.of("normalize", "address"));
WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
Set<Operation> operations = new LinkedHashSet<>();
operations.add(read);
operations.add(parse);
operations.add(normalize);
operations.add(write);
List<Operation> topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
assertBefore(topologicallySortedOperations, read, parse);
assertBefore(topologicallySortedOperations, parse, normalize);
assertBefore(topologicallySortedOperations, normalize, write);
assertBefore(topologicallySortedOperations, read, write);
// try with different insertion orders
operations = new LinkedHashSet<>();
operations.add(parse);
operations.add(normalize);
operations.add(write);
operations.add(read);
topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
assertBefore(topologicallySortedOperations, read, parse);
assertBefore(topologicallySortedOperations, parse, normalize);
assertBefore(topologicallySortedOperations, normalize, write);
assertBefore(topologicallySortedOperations, read, write);
operations = new LinkedHashSet<>();
operations.add(write);
operations.add(normalize);
operations.add(parse);
operations.add(read);
topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
assertBefore(topologicallySortedOperations, read, parse);
assertBefore(topologicallySortedOperations, parse, normalize);
assertBefore(topologicallySortedOperations, normalize, write);
assertBefore(topologicallySortedOperations, read, write);
// When the field lineage is queried for the offset field, only the read and write operations are
// returned, since the parse and normalize operations do not affect the offset field in any way.
// In this case, even though the write operation has an input with origin normalize, the
// topological sort should not be affected by the fact that the normalize operation itself is
// missing from the set of operations.
operations = new LinkedHashSet<>();
operations.add(write);
operations.add(read);
topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
assertBefore(topologicallySortedOperations, read, write);
}
use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
the class FieldLineageInfoTest method generateLineage.
private void generateLineage(List<String> inputs, List<Operation> operations, String identityNamePrefix, String identityOrigin, String transform) {
// emit an identity transform for each input field
for (int i = 0; i < inputs.size(); i++) {
operations.add(new TransformOperation(identityNamePrefix + i, "identity transform", Collections.singletonList(InputField.of(identityOrigin, inputs.get(i))), inputs.get(i)));
}
// generate an all-to-all transform, so that when tracing back, this operation has to track back to all of the
// previous identity transforms
List<InputField> inputFields = new ArrayList<>();
for (int i = 0; i < inputs.size(); i++) {
inputFields.add(InputField.of(identityNamePrefix + i, inputs.get(i)));
}
TransformOperation parse = new TransformOperation(transform, "all to all transform", inputFields, inputs);
operations.add(parse);
}
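A hypothetical invocation of this helper (the operation names, endpoint, and fields below are illustrative assumptions, not taken from the test) might look like:

List<String> inputs = Arrays.asList("offset", "body");
List<Operation> operations = new ArrayList<>();
// the identityOrigin argument must match the name of an upstream operation, here the read
operations.add(new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body"));
generateLineage(inputs, operations, "identity", "read", "parse");
// operations now additionally contains:
//   identity0: identity transform of "offset" with origin "read"
//   identity1: identity transform of "body" with origin "read"
//   parse:     all-to-all transform reading from identity0 and identity1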
use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
the class FieldLineageInfo method computeAndValidateFieldLineageInfo.
private void computeAndValidateFieldLineageInfo(Collection<? extends Operation> operations) {
Set<String> allOrigins = new HashSet<>();
this.operationsMap = new HashMap<>();
this.writeOperations = new HashSet<>();
this.readOperations = new HashSet<>();
this.operationOutgoingConnections = new HashMap<>();
for (Operation operation : operations) {
if (operationsMap.containsKey(operation.getName())) {
throw new IllegalArgumentException(String.format("All operations provided for creating field " + "level lineage info must have unique names. " + "Operation name '%s' is repeated.", operation.getName()));
}
operationsMap.put(operation.getName(), operation);
switch(operation.getType()) {
case READ:
ReadOperation read = (ReadOperation) operation;
EndPoint source = read.getSource();
if (source == null) {
throw new IllegalArgumentException(String.format("Source endpoint cannot be null for the read " + "operation '%s'.", read.getName()));
}
readOperations.add(read);
break;
case TRANSFORM:
TransformOperation transform = (TransformOperation) operation;
Set<String> origins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
// for each origin corresponding to the input fields there is a connection from that origin to this operation
for (String origin : origins) {
Set<Operation> connections = operationOutgoingConnections.computeIfAbsent(origin, k -> new HashSet<>());
connections.add(transform);
}
allOrigins.addAll(origins);
if (transform.getOutputs().isEmpty()) {
dropTransforms.add(transform);
}
break;
case WRITE:
WriteOperation write = (WriteOperation) operation;
EndPoint destination = write.getDestination();
if (destination == null) {
throw new IllegalArgumentException(String.format("Destination endpoint cannot be null for the write " + "operation '%s'.", write.getName()));
}
origins = write.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
// for each origin corresponding to the input fields there is a connection from that origin to this operation
for (String origin : origins) {
Set<Operation> connections = operationOutgoingConnections.computeIfAbsent(origin, k -> new HashSet<>());
connections.add(write);
}
allOrigins.addAll(origins);
writeOperations.add(write);
break;
default:
}
}
Set<String> operationsWithNoOutgoingConnections = Sets.difference(operationsMap.keySet(), operationOutgoingConnections.keySet());
// put an empty set for operations with no outgoing connections rather than checking for null later
for (String operation : operationsWithNoOutgoingConnections) {
operationOutgoingConnections.put(operation, new HashSet<>());
}
if (readOperations.isEmpty()) {
throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'READ'.");
}
if (writeOperations.isEmpty()) {
throw new IllegalArgumentException("Field level lineage requires at least one operation of type 'WRITE'.");
}
Sets.SetView<String> invalidOrigins = Sets.difference(allOrigins, operationsMap.keySet());
if (!invalidOrigins.isEmpty()) {
throw new IllegalArgumentException(String.format("No operation is associated with the origins '%s'.", invalidOrigins));
}
}
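Assuming the FieldLineageInfo constructor delegates to this validation (as the method name suggests), a sketch of an operation set that fails the last check, because a write refers to an origin that is not among the provided operations, would be:

ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "body");
// "missing" does not name any provided operation, so it is an invalid origin
WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"),
    Collections.singletonList(InputField.of("missing", "body")));
// expected: IllegalArgumentException("No operation is associated with the origins '[missing]'.")
new FieldLineageInfo(Arrays.asList(read, write));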
use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
the class FieldLineageInfo method computeIncomingSummaryHelper.
/**
* Helper method to compute the incoming summary
*
* @param currentOperation the operation being processed. Since we are traversing incoming connections, this
* operation is on the left side if the graph is imagined in a horizontal orientation; that is, it provides
* input to the previousOperation
* @param previousOperation the operation that has already been processed and resides to the right of the
* current operation if the graph is imagined in a horizontal orientation
* @param operationEndPointMap a map from operation name to the final endpoint fields that operation resolves to;
* used to remember paths that have already been computed so that the same computation is not repeated
* @return the set of source {@link EndPointField}s reached by traversing backward from the current operation
*/
private Set<EndPointField> computeIncomingSummaryHelper(Operation currentOperation, Operation previousOperation, Map<String, Set<EndPointField>> operationEndPointMap) {
if (currentOperation.getType() == OperationType.READ) {
// if the current operation is of type READ, the previous operation must be of type TRANSFORM or WRITE;
// get only the input fields of the previous operation whose origin is the current READ operation
Set<InputField> inputFields = new HashSet<>();
if (OperationType.WRITE == previousOperation.getType()) {
WriteOperation previousWrite = (WriteOperation) previousOperation;
inputFields = new HashSet<>(previousWrite.getInputs());
} else if (OperationType.TRANSFORM == previousOperation.getType()) {
TransformOperation previousTransform = (TransformOperation) previousOperation;
inputFields = new HashSet<>(previousTransform.getInputs());
}
Set<EndPointField> sourceEndPointFields = new HashSet<>();
// for all the input fields of the previous operation whose origin is the current operation (remember we are
// traversing backward), record the corresponding source endpoint field
ReadOperation read = (ReadOperation) currentOperation;
EndPoint source = read.getSource();
for (InputField inputField : inputFields) {
if (inputField.getOrigin().equals(currentOperation.getName())) {
sourceEndPointFields.add(new EndPointField(source, inputField.getName()));
}
}
// reached the end of the graph; unwind the recursive calls
return sourceEndPointFields;
}
Set<EndPointField> relatedSources = new HashSet<>();
// for transform we traverse backward in graph further through the inputs of the transform
if (currentOperation.getType() == OperationType.TRANSFORM) {
TransformOperation transform = (TransformOperation) currentOperation;
// optimization to avoid repeating work if there are input fields with the same origin
Set<String> transformOrigins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
for (String transformOrigin : transformOrigins) {
if (operationEndPointMap.containsKey(transformOrigin)) {
relatedSources.addAll(operationEndPointMap.get(transformOrigin));
} else {
relatedSources.addAll(computeIncomingSummaryHelper(operationsMap.get(transformOrigin), currentOperation, operationEndPointMap));
}
}
operationEndPointMap.put(currentOperation.getName(), relatedSources);
}
return relatedSources;
}
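A minimal sketch of how this helper might be driven for a single write operation (the driver method is hypothetical, the actual caller inside FieldLineageInfo may differ, and the assumption that a written field keeps the name of its input field is illustrative):

private Map<EndPointField, Set<EndPointField>> computeIncomingSummary(WriteOperation write) {
  // memoization map shared across the traversal for this write operation
  Map<String, Set<EndPointField>> operationEndPointMap = new HashMap<>();
  Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
  for (InputField inputField : write.getInputs()) {
    Operation origin = operationsMap.get(inputField.getOrigin());
    // the field written to the destination, assumed here to keep the same name as the input field
    EndPointField destinationField = new EndPointField(write.getDestination(), inputField.getName());
    // traverse backward from the origin of this input field to collect the source endpoint fields
    summary.computeIfAbsent(destinationField, k -> new HashSet<>())
        .addAll(computeIncomingSummaryHelper(origin, write, operationEndPointMap));
  }
  return summary;
}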