use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
the class FieldLineageInfo method computeIncomingSummaryHelper.
/**
 * Recursively walks the operation graph backwards from {@code currentOperation} and collects the source
 * {@link EndPointField}s that contribute to it.
 *
 * @param currentOperation the operation being processed. Because the incoming summary is computed by traversing
 * backwards, this operation sits to the left of {@code previousOperation} when the graph is imagined
 * horizontally, i.e. it is an input to {@code previousOperation}
 * @param previousOperation the operation that was processed just before this one and sits to the right of
 * {@code currentOperation} when the graph is imagined horizontally
 * @param operationEndPointMap cache from operation name to the source endpoint fields already computed for it;
 * it is consulted before recursing into an origin and updated with the result of every transform processed
 * here, so the same path is not computed twice
 * @return for a READ operation, the fields of its source that {@code previousOperation} takes from it; for a
 * TRANSFORM operation, the union of the source fields of all of its origins; for any other operation type,
 * an empty set
 */
private Set<EndPointField> computeIncomingSummaryHelper(Operation currentOperation, Operation previousOperation, Map<String, Set<EndPointField>> operationEndPointMap) {
  OperationType currentType = currentOperation.getType();

  if (currentType == OperationType.READ) {
    // Base case: a READ has no inputs of its own, so the recursion unwinds here.
    // The previous operation is expected to be a TRANSFORM or a WRITE; gather the fields it consumes.
    Set<InputField> consumedFields = new HashSet<>();
    OperationType previousType = previousOperation.getType();
    if (previousType == OperationType.WRITE) {
      consumedFields = new HashSet<>(((WriteOperation) previousOperation).getInputs());
    } else if (previousType == OperationType.TRANSFORM) {
      consumedFields = new HashSet<>(((TransformOperation) previousOperation).getInputs());
    }

    EndPoint source = ((ReadOperation) currentOperation).getSource();
    String readName = currentOperation.getName();
    // keep only the consumed fields whose origin is this READ and map them onto the READ's source endpoint
    return consumedFields.stream()
      .filter(field -> field.getOrigin().equals(readName))
      .map(field -> new EndPointField(source, field.getName()))
      .collect(Collectors.toCollection(HashSet::new));
  }

  Set<EndPointField> relatedSources = new HashSet<>();
  if (currentType != OperationType.TRANSFORM) {
    // neither READ nor TRANSFORM: nothing to traverse and nothing to cache
    return relatedSources;
  }

  // For a transform, keep traversing backwards through the origins of its inputs. Origins are de-duplicated
  // first so that several input fields with the same origin do not trigger repeated work.
  TransformOperation transform = (TransformOperation) currentOperation;
  Set<String> distinctOrigins = new HashSet<>();
  for (InputField input : transform.getInputs()) {
    distinctOrigins.add(input.getOrigin());
  }

  for (String originName : distinctOrigins) {
    Set<EndPointField> originSources = operationEndPointMap.containsKey(originName)
      ? operationEndPointMap.get(originName)
      : computeIncomingSummaryHelper(operationsMap.get(originName), currentOperation, operationEndPointMap);
    relatedSources.addAll(originSources);
  }

  // remember the result for this transform so later lookups can reuse it
  operationEndPointMap.put(currentOperation.getName(), relatedSources);
  return relatedSources;
}
use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
the class FieldLineageInfo method computeIncomingSummary.
/**
 * Builds the incoming summary: for every destination field written by a write operation, the set of source
 * endpoint fields it was computed from. Inputs of drop transforms are collected under the shared
 * {@code NULL_EPF} key.
 *
 * @return map from destination {@link EndPointField} to the source {@link EndPointField}s feeding it
 */
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
  // run the compute-and-validate step first if the write operations are not available yet
  // (presumably that call is what populates writeOperations -- confirm against its definition)
  if (writeOperations == null) {
    computeAndValidateFieldLineageInfo(this.operations);
  }

  Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
  // cache shared with the recursive helper: operation name -> source fields already computed for it
  Map<String, Set<EndPointField>> operationEndPointMap = new HashMap<>();

  for (WriteOperation write : writeOperations) {
    for (InputField input : write.getInputs()) {
      EndPointField destination = new EndPointField(write.getDestination(), input.getName());
      Set<EndPointField> sources = summary.computeIfAbsent(destination, key -> new HashSet<>());

      String originName = input.getOrigin();
      if (operationEndPointMap.containsKey(originName)) {
        sources.addAll(operationEndPointMap.get(originName));
        continue;
      }

      Operation origin = operationsMap.get(originName);
      if (origin.getType() == OperationType.READ) {
        // Special case read -> write: the written field maps one to one onto the field of the same name in
        // the read's source, since a write can only take input fields that come from the previous stage.
        sources.add(new EndPointField(((ReadOperation) origin).getSource(), input.getName()));
      } else {
        sources.addAll(computeIncomingSummaryHelper(origin, write, operationEndPointMap));
      }
    }
  }

  for (TransformOperation dropTransform : dropTransforms) {
    for (InputField input : dropTransform.getInputs()) {
      String originName = input.getOrigin();
      // all drop transforms share the common NULL endpoint field as their key
      Set<EndPointField> droppedSources = summary.computeIfAbsent(NULL_EPF, key -> new HashSet<>());

      if (operationEndPointMap.containsKey(originName)) {
        droppedSources.addAll(operationEndPointMap.get(originName));
      } else {
        Operation origin = operationsMap.get(originName);
        droppedSources.addAll(computeIncomingSummaryHelper(origin, dropTransform, operationEndPointMap));
      }
    }
  }

  return summary;
}
use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
the class FieldLineageInfo method containsInputField.
/**
 * Tells whether the given input field is consumed by the given next operation.
 *
 * @param nextOperation the next operation, which should be either a {@link TransformOperation} or a {@link
 * WriteOperation}; for any other operation type the result is {@code false}
 * @param inputField the field whose usage needs to be checked
 * @return true if {@code inputField} is among the inputs of {@code nextOperation}
 */
private boolean containsInputField(Operation nextOperation, InputField inputField) {
  OperationType nextType = nextOperation.getType();

  if (nextType == OperationType.WRITE) {
    WriteOperation nextWrite = (WriteOperation) nextOperation;
    return new HashSet<InputField>(nextWrite.getInputs()).contains(inputField);
  }

  if (nextType == OperationType.TRANSFORM) {
    TransformOperation nextTransform = (TransformOperation) nextOperation;
    return new HashSet<InputField>(nextTransform.getInputs()).contains(inputField);
  }

  // neither a write nor a transform: there are no inputs to look the field up in
  return false;
}
use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
the class FieldLineageInfoTest method testMultiSourceSingleDestinationWithoutMerge.
@Test
public void testMultiSourceSingleDestinationWithoutMerge() {
  // Operation graph under test:
  //   pRead:   personFile -> (offset, body)
  //   parse:   body -> (id, name, address)
  //   cRead:   codeFile -> id
  //   codeGen: (parse.id, cRead.id) -> id
  //   sWrite:  (codeGen.id, parse.name, parse.address) -> secureStore
  //   iWrite:  (parse.id, parse.name, parse.address) -> insecureStore
  EndPoint pEndPoint = EndPoint.of("ns", "personFile");
  EndPoint cEndPoint = EndPoint.of("ns", "codeFile");
  EndPoint sEndPoint = EndPoint.of("ns", "secureStore");
  EndPoint iEndPoint = EndPoint.of("ns", "insecureStore");

  ReadOperation pRead = new ReadOperation("pRead", "Reading from person file", pEndPoint, "offset", "body");
  ReadOperation cRead = new ReadOperation("cRead", "Reading from code file", cEndPoint, "id");
  TransformOperation parse = new TransformOperation("parse", "parsing body",
                                                    Collections.singletonList(InputField.of("pRead", "body")),
                                                    "id", "name", "address");
  TransformOperation codeGen = new TransformOperation("codeGen", "Generate secure code",
                                                      Arrays.asList(InputField.of("parse", "id"),
                                                                    InputField.of("cRead", "id")),
                                                      "id");
  WriteOperation sWrite = new WriteOperation("sWrite", "writing secure store", sEndPoint,
                                             Arrays.asList(InputField.of("codeGen", "id"),
                                                           InputField.of("parse", "name"),
                                                           InputField.of("parse", "address")));
  WriteOperation iWrite = new WriteOperation("iWrite", "writing insecure store", iEndPoint,
                                             Arrays.asList(InputField.of("parse", "id"),
                                                           InputField.of("parse", "name"),
                                                           InputField.of("parse", "address")));

  List<Operation> operations =
    new ArrayList<>(Arrays.<Operation>asList(pRead, cRead, parse, codeGen, sWrite, iWrite));
  FieldLineageInfo fllInfo = new FieldLineageInfo(operations);

  // ---- destination fields ----
  Map<EndPoint, Set<String>> destinationFields = fllInfo.getDestinationFields();
  Set<String> writtenFieldNames = new HashSet<>(Arrays.asList("id", "name", "address"));
  Assert.assertEquals(writtenFieldNames, destinationFields.get(sEndPoint));
  Assert.assertEquals(writtenFieldNames, destinationFields.get(iEndPoint));
  Assert.assertNull(destinationFields.get(pEndPoint));

  // ---- incoming summary ----
  Map<EndPointField, Set<EndPointField>> incomingSummary = fllInfo.getIncomingSummary();
  Assert.assertEquals(6, incomingSummary.size());
  EndPointField personBody = new EndPointField(pEndPoint, "body");

  // every field of the insecure endpoint depends only on the body field of pEndPoint
  for (String fieldName : Arrays.asList("id", "name", "address")) {
    Assert.assertEquals(1, incomingSummary.get(new EndPointField(iEndPoint, fieldName)).size());
    Assert.assertEquals(personBody,
                        incomingSummary.get(new EndPointField(iEndPoint, fieldName)).iterator().next());
  }

  // name and address from secure endpoint also depend only on the body field of pEndPoint
  for (String fieldName : Arrays.asList("name", "address")) {
    Assert.assertEquals(1, incomingSummary.get(new EndPointField(sEndPoint, fieldName)).size());
    Assert.assertEquals(personBody,
                        incomingSummary.get(new EndPointField(sEndPoint, fieldName)).iterator().next());
  }

  // id of secure endpoint depends on both body field of pEndPoint and id field of cEndPoint
  Set<EndPointField> secureIdSources = new HashSet<>(Arrays.asList(new EndPointField(pEndPoint, "body"),
                                                                   new EndPointField(cEndPoint, "id")));
  Assert.assertEquals(secureIdSources, incomingSummary.get(new EndPointField(sEndPoint, "id")));

  // ---- outgoing summary ----
  Map<EndPointField, Set<EndPointField>> outgoingSummary = fllInfo.getOutgoingSummary();
  // outgoing summary will not contain offset but only body from pEndPoint and id from cEndPoint
  Assert.assertEquals(2, outgoingSummary.size());

  // body affects all fields from both secure and insecure endpoints
  Set<EndPointField> bodyDestinations = new HashSet<>(Arrays.asList(new EndPointField(iEndPoint, "id"),
                                                                    new EndPointField(iEndPoint, "name"),
                                                                    new EndPointField(iEndPoint, "address"),
                                                                    new EndPointField(sEndPoint, "id"),
                                                                    new EndPointField(sEndPoint, "name"),
                                                                    new EndPointField(sEndPoint, "address")));
  Assert.assertEquals(bodyDestinations, outgoingSummary.get(new EndPointField(pEndPoint, "body")));

  // id field of cEndPoint only affects id field of secure endpoint
  Set<EndPointField> codeIdDestinations =
    new HashSet<>(Collections.singletonList(new EndPointField(sEndPoint, "id")));
  Assert.assertEquals(codeIdDestinations, outgoingSummary.get(new EndPointField(cEndPoint, "id")));

  // ---- incoming operations for every destination field ----
  Set<Operation> inComingOperations = fllInfo.getIncomingOperationsForField(new EndPointField(iEndPoint, "id"));
  Set<Operation> expectedOperations = new HashSet<>(Arrays.<Operation>asList(iWrite, parse, pRead));
  Assert.assertEquals(expectedOperations, inComingOperations);

  inComingOperations = fllInfo.getIncomingOperationsForField(new EndPointField(iEndPoint, "name"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(iWrite, parse, pRead));
  // this one check compares FieldLineageInfo instances built from each operation set, not the raw sets
  Assert.assertEquals(new FieldLineageInfo(expectedOperations), new FieldLineageInfo(inComingOperations));

  inComingOperations = fllInfo.getIncomingOperationsForField(new EndPointField(iEndPoint, "address"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(iWrite, parse, pRead));
  Assert.assertEquals(expectedOperations, inComingOperations);

  inComingOperations = fllInfo.getIncomingOperationsForField(new EndPointField(sEndPoint, "id"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(sWrite, codeGen, cRead, parse, pRead));
  Assert.assertEquals(expectedOperations, inComingOperations);

  inComingOperations = fllInfo.getIncomingOperationsForField(new EndPointField(sEndPoint, "name"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(sWrite, parse, pRead));
  Assert.assertEquals(expectedOperations, inComingOperations);

  inComingOperations = fllInfo.getIncomingOperationsForField(new EndPointField(sEndPoint, "address"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(sWrite, parse, pRead));
  Assert.assertEquals(expectedOperations, inComingOperations);

  // ---- outgoing operations for every source field ----
  Set<Operation> outgoingOperations = fllInfo.getOutgoingOperationsForField(new EndPointField(pEndPoint, "offset"));
  expectedOperations = new HashSet<>(Collections.<Operation>singletonList(pRead));
  Assert.assertEquals(expectedOperations, outgoingOperations);

  outgoingOperations = fllInfo.getOutgoingOperationsForField(new EndPointField(pEndPoint, "body"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(sWrite, iWrite, codeGen, parse, pRead));
  Assert.assertEquals(expectedOperations, outgoingOperations);

  outgoingOperations = fllInfo.getOutgoingOperationsForField(new EndPointField(cEndPoint, "id"));
  expectedOperations = new HashSet<>(Arrays.<Operation>asList(sWrite, codeGen, cRead));
  Assert.assertEquals(expectedOperations, outgoingOperations);
}
use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
the class FieldLineageInfoTest method testCycleWithNonExistentOperationNames.
@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
  EndPoint readEndPoint = EndPoint.of("ns", "file1");
  EndPoint writeEndPoint = EndPoint.of("ns", "file2");

  ReadOperation read = new ReadOperation("read", "read", readEndPoint, "offset", "body");
  // parse and normalize consume each other's "name" output, and parse, normalize and write each also name
  // an origin (nop1, nop2, nop3) for which no operation is defined in this test
  TransformOperation parse = new TransformOperation("parse", "parse",
                                                    Arrays.asList(InputField.of("read", "body"),
                                                                  InputField.of("normalize", "name"),
                                                                  InputField.of("nop1", "field1")),
                                                    "name", "address");
  TransformOperation normalize = new TransformOperation("normalize", "normalize",
                                                        Arrays.asList(InputField.of("parse", "name"),
                                                                      InputField.of("nop2", "field2")),
                                                        "name");
  WriteOperation write = new WriteOperation("write", "writing to another file", writeEndPoint,
                                            Arrays.asList(InputField.of("normalize", "name"),
                                                          InputField.of("parse", "address"),
                                                          InputField.of("nop3", "field3")));

  // same insertion order as before: parse, read, normalize, write
  List<Operation> operations = new ArrayList<>(Arrays.<Operation>asList(parse, read, normalize, write));

  // the test passes only if this call throws IllegalArgumentException
  FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
Aggregations