Use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
Class FieldLineageInfo, method computeIncomingSummary.
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
  if (writeOperations == null) {
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Map<String, Set<EndPointField>> operationEndPointMap = new HashMap<>();
  Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
  for (WriteOperation write : writeOperations) {
    List<InputField> inputs = write.getInputs();
    for (InputField input : inputs) {
      EndPointField dest = new EndPointField(write.getDestination(), input.getName());
      Set<EndPointField> fields = summary.computeIfAbsent(dest, k -> new HashSet<>());
      if (operationEndPointMap.containsKey(input.getOrigin())) {
        fields.addAll(operationEndPointMap.get(input.getOrigin()));
      } else {
        // Special case for a direct read -> write: the write's input fields map
        // one-to-one to the read's output fields, since a write operation can only
        // take input fields produced by the previous stage.
        Operation origin = operationsMap.get(input.getOrigin());
        if (origin.getType() == OperationType.READ) {
          fields.add(new EndPointField(((ReadOperation) origin).getSource(), input.getName()));
          continue;
        }
        fields.addAll(computeIncomingSummaryHelper(origin, write, operationEndPointMap));
      }
    }
  }
  for (TransformOperation transform : dropTransforms) {
    for (InputField input : transform.getInputs()) {
      Operation previous = operationsMap.get(input.getOrigin());
      // drop transforms use a common NULL endpoint as the summary key
      Set<EndPointField> endPointFields = summary.computeIfAbsent(NULL_EPF, k -> new HashSet<>());
      if (operationEndPointMap.containsKey(input.getOrigin())) {
        endPointFields.addAll(operationEndPointMap.get(input.getOrigin()));
        continue;
      }
      endPointFields.addAll(computeIncomingSummaryHelper(previous, transform, operationEndPointMap));
    }
  }
  return summary;
}
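For a concrete sense of what this summary holds, here is a minimal sketch, assuming the same API types used above and the usual java.util imports, that builds a three-operation pipeline and reads the computed map through getIncomingSummary(), the public accessor exercised in the tests further down this page. The endpoint and field names are illustrative.

// Minimal sketch: read produces (offset, body), parse derives name from body,
// and write persists parse.name.
ReadOperation read = new ReadOperation("read", "read", EndPoint.of("ns", "input"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parse",
  Collections.singletonList(InputField.of("read", "body")), "name");
WriteOperation write = new WriteOperation("write", "write", EndPoint.of("ns", "output"),
  Collections.singletonList(InputField.of("parse", "name")));
List<Operation> ops = new ArrayList<>();
ops.add(read);
ops.add(parse);
ops.add(write);
FieldLineageInfo info = new FieldLineageInfo(ops);
// The summary should map output.name back to input.body, i.e. the written field
// traces through parse to the read field that produced it.
Map<EndPointField, Set<EndPointField>> summary = info.getIncomingSummary();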
Use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
Class FieldLineageInfo, method getIncomingOperationsForField.
/**
 * <p>Get the subset of operations that were responsible for computing the specified field of
 * a specified destination.</p>
 * <p>For example, if the operations are as follows:</p>
 * <pre>
 * pRead: personFile -> (offset, body)
 * parse: body -> (id, name, address)
 * cRead: codeFile -> id
 * codeGen: (parse.id, cRead.id) -> id
 * sWrite: (codeGen.id, parse.name, parse.address) -> secureStore
 * iWrite: (parse.id, parse.name, parse.address) -> insecureStore
 * </pre>
 * <p>If the destination field is the 'id' field of insecureStore, the result set will contain
 * the operations iWrite, parse, pRead.</p>
 * <p>If the destination field is the 'id' field of secureStore, the result set will contain
 * the operations sWrite, codeGen, parse, pRead, cRead.</p>
 *
 * @param destinationField the EndPointField for which the responsible operations need to be found
 * @return the subset of operations responsible for computing the given field
 */
Set<Operation> getIncomingOperationsForField(EndPointField destinationField) {
  if (writeOperations == null) {
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Set<Operation> visitedOperations = new HashSet<>();
  for (WriteOperation write : writeOperations) {
    // skip this write operation if its destination is not the dataset to which
    // the destinationField belongs
    if (!write.getDestination().equals(destinationField.getEndPoint())) {
      continue;
    }
    Set<InputField> filteredInputs = write.getInputs().stream()
      .filter(input -> input.getName().equals(destinationField.getField()))
      .collect(Collectors.toSet());
    for (InputField input : filteredInputs) {
      // mark this write operation as visited
      visitedOperations.add(write);
      // traverse the graph backwards by looking up the origin of this input field,
      // i.e. the operation that computed this destinationField
      getIncomingOperationsForFieldHelper(operationsMap.get(input.getOrigin()), visitedOperations);
    }
  }
  return visitedOperations;
}
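As a hedged usage sketch, the example pipeline from the javadoc above can be assembled with the same constructors the tests on this page use and then queried for the 'id' field of secureStore. Since getIncomingOperationsForField is package-private, a call like this would have to run from the same package, as the tests do; descriptions are left empty for brevity.

ReadOperation pRead = new ReadOperation("pRead", "", EndPoint.of("personFile"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "",
  Collections.singletonList(InputField.of("pRead", "body")), "id", "name", "address");
ReadOperation cRead = new ReadOperation("cRead", "", EndPoint.of("codeFile"), "id");
TransformOperation codeGen = new TransformOperation("codeGen", "",
  Arrays.asList(InputField.of("parse", "id"), InputField.of("cRead", "id")), "id");
WriteOperation sWrite = new WriteOperation("sWrite", "", EndPoint.of("secureStore"),
  Arrays.asList(InputField.of("codeGen", "id"), InputField.of("parse", "name"),
                InputField.of("parse", "address")));
WriteOperation iWrite = new WriteOperation("iWrite", "", EndPoint.of("insecureStore"),
  Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name"),
                InputField.of("parse", "address")));
List<Operation> ops = new ArrayList<>(Arrays.asList(pRead, parse, cRead, codeGen, sWrite, iWrite));
FieldLineageInfo info = new FieldLineageInfo(ops);
// Per the javadoc: expected to contain sWrite, codeGen, parse, pRead and cRead
Set<Operation> responsible = info.getIncomingOperationsForField(
  new EndPointField(EndPoint.of("secureStore"), "id"));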
Use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
Class FieldLineageInfo, method containsInputField.
/**
 * Checks whether the given field is used by the next operation.
 *
 * @param nextOperation the next operation, which should be either a {@link TransformOperation} or a
 *   {@link WriteOperation}
 * @param inputField the field whose usage needs to be checked
 * @return true if the field is used by the nextOperation
 */
private boolean containsInputField(Operation nextOperation, InputField inputField) {
  Set<InputField> inputFields = new HashSet<>();
  if (OperationType.WRITE == nextOperation.getType()) {
    WriteOperation nextWrite = (WriteOperation) nextOperation;
    inputFields = new HashSet<>(nextWrite.getInputs());
  } else if (OperationType.TRANSFORM == nextOperation.getType()) {
    TransformOperation nextTransform = (TransformOperation) nextOperation;
    inputFields = new HashSet<>(nextTransform.getInputs());
  }
  // return true if the next operation's input fields contain the given field
  return inputFields.contains(inputField);
}
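A short sketch of how this check behaves, assuming InputField's value equality on (origin, name), which the HashSet lookup above already relies on:

TransformOperation normalize = new TransformOperation("normalize", "",
  Collections.singletonList(InputField.of("parse", "address")), "address");
containsInputField(normalize, InputField.of("parse", "address")); // true: normalize consumes parse.address
containsInputField(normalize, InputField.of("parse", "name"));    // false: not among normalize's inputs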
Use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
Class FieldLineageInfoTest, method testLargeLineageOperation.
@Test(timeout = 10000)
public void testLargeLineageOperation() {
  List<String> inputs = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    inputs.add("num" + i);
  }
  List<Operation> operations = new ArrayList<>();
  operations.add(new ReadOperation("read", "Read from something", EndPoint.of("start"), inputs));
  // generate 500+ operations from 5 identity + all-to-all combos
  generateLineage(inputs, operations, "first identity", "read", "alltoall1");
  generateLineage(inputs, operations, "second identity", "alltoall1", "alltoall2");
  generateLineage(inputs, operations, "third identity", "alltoall2", "alltoall3");
  generateLineage(inputs, operations, "fourth identity", "alltoall3", "alltoall4");
  generateLineage(inputs, operations, "fifth identity", "alltoall4", "alltoall5");
  List<InputField> newList = new ArrayList<>();
  inputs.forEach(s -> newList.add(InputField.of("alltoall5", s)));
  WriteOperation operation = new WriteOperation("Write", "", EndPoint.of("dest"), newList);
  operations.add(operation);
  FieldLineageInfo info = new FieldLineageInfo(operations);
  Assert.assertNotNull(info);
  // every field written to 'dest' should trace back to every field read from 'start',
  // because the all-to-all stages connect each output to all inputs
  Set<EndPointField> relatedSources = new HashSet<>();
  Map<EndPointField, Set<EndPointField>> expectedIncoming = new HashMap<>();
  for (int i = 0; i < inputs.size(); i++) {
    relatedSources.add(new EndPointField(EndPoint.of("start"), "num" + i));
  }
  for (int i = 0; i < inputs.size(); i++) {
    EndPointField key = new EndPointField(EndPoint.of("dest"), "num" + i);
    expectedIncoming.put(key, relatedSources);
  }
  Assert.assertEquals(expectedIncoming, info.getIncomingSummary());
}
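The generateLineage helper is not shown on this page. Below is a hypothetical sketch consistent with its call sites and the "5 identity + all-to-all combos" comment: each call adds one identity transform per field reading from the previous stage, plus a single all-to-all transform that feeds every output field from every identity output. That would yield 101 operations per call, matching the "500+ operations" comment across the five calls. The signature and operation naming are assumptions, not the project's actual helper.

// Hypothetical sketch of generateLineage (actual body not shown on this page).
private void generateLineage(List<String> inputs, List<Operation> operations,
                             String identityNamePrefix, String origin, String allToAllName) {
  // one identity transform per field, e.g. "first identity num3": origin.num3 -> num3
  for (String field : inputs) {
    operations.add(new TransformOperation(identityNamePrefix + " " + field, "identity",
      Collections.singletonList(InputField.of(origin, field)), field));
  }
  // one all-to-all transform: every identity output feeds every output field
  List<InputField> allInputs = new ArrayList<>();
  for (String field : inputs) {
    allInputs.add(InputField.of(identityNamePrefix + " " + field, field));
  }
  operations.add(new TransformOperation(allToAllName, "all-to-all", allInputs,
    inputs.toArray(new String[0])));
}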
Use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.
Class FieldLineageInfoTest, method testLinearTopologicalSort.
@Test
public void testLinearTopologicalSort() {
  // read ----> parse ----> normalize ----> write
  ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parse descr",
    Collections.singletonList(InputField.of("read", "body")), "name", "address");
  TransformOperation normalize = new TransformOperation("normalize", "normalize descr",
    Collections.singletonList(InputField.of("parse", "address")), "address");
  List<InputField> writeInputs = new ArrayList<>();
  writeInputs.add(InputField.of("parse", "name"));
  writeInputs.add(InputField.of("normalize", "address"));
  WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
  Set<Operation> operations = new LinkedHashSet<>();
  operations.add(read);
  operations.add(parse);
  operations.add(normalize);
  operations.add(write);
  List<Operation> topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
  assertBefore(topologicallySortedOperations, read, parse);
  assertBefore(topologicallySortedOperations, parse, normalize);
  assertBefore(topologicallySortedOperations, normalize, write);
  assertBefore(topologicallySortedOperations, read, write);
  // try a few different insertion orders; the topological sort should give the same result
  operations = new LinkedHashSet<>();
  operations.add(parse);
  operations.add(normalize);
  operations.add(write);
  operations.add(read);
  topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
  assertBefore(topologicallySortedOperations, read, parse);
  assertBefore(topologicallySortedOperations, parse, normalize);
  assertBefore(topologicallySortedOperations, normalize, write);
  assertBefore(topologicallySortedOperations, read, write);
  operations = new LinkedHashSet<>();
  operations.add(write);
  operations.add(normalize);
  operations.add(parse);
  operations.add(read);
  topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
  assertBefore(topologicallySortedOperations, read, parse);
  assertBefore(topologicallySortedOperations, parse, normalize);
  assertBefore(topologicallySortedOperations, normalize, write);
  assertBefore(topologicallySortedOperations, read, write);
}
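The assertBefore helper is also elided on this page. A plausible sketch, assuming it simply checks relative positions in the sorted list:

// Hypothetical sketch of assertBefore (the actual helper is not shown here):
// asserts that both operations are present and that 'first' sorts before 'second'.
private void assertBefore(List<Operation> operations, Operation first, Operation second) {
  int firstIndex = operations.indexOf(first);
  int secondIndex = operations.indexOf(second);
  Assert.assertTrue(firstIndex >= 0 && secondIndex >= 0);
  Assert.assertTrue(firstIndex < secondIndex);
}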