Usage example of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata.
From class FieldLineageInfoTest, method testMultiSourceDroppedFields.
@Test
public void testMultiSourceDroppedFields() {
  // The read exposes three fields; "social" is consumed by a transform that emits no
  // output fields, so the outgoing summary must map it to the NULL_EPF sentinel.
  ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "first_name", "last_name", "social");
  TransformOperation combineNames = new TransformOperation("combineNames", "combine names",
                                                           Arrays.asList(InputField.of("read", "first_name"),
                                                                         InputField.of("read", "last_name")),
                                                           "full_name");
  TransformOperation dropSocial = new TransformOperation("dropSocial", "drop social",
                                                         Collections.singletonList(InputField.of("read", "social")));
  WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("endpoint2"),
                                            Collections.singletonList(InputField.of("combineNames", "full_name")));
  FieldLineageInfo info1 = new FieldLineageInfo(Sets.newHashSet(read, write, combineNames, dropSocial));
  EndPoint source = EndPoint.of("endpoint1");
  EndPoint destination = EndPoint.of("endpoint2");
  // Outgoing: both name parts flow into full_name; the dropped field flows nowhere.
  Map<EndPointField, Set<EndPointField>> expectedOutgoing = new HashMap<>();
  expectedOutgoing.put(new EndPointField(source, "first_name"),
                       Collections.singleton(new EndPointField(destination, "full_name")));
  expectedOutgoing.put(new EndPointField(source, "last_name"),
                       Collections.singleton(new EndPointField(destination, "full_name")));
  expectedOutgoing.put(new EndPointField(source, "social"), Collections.singleton(FieldLineageInfo.NULL_EPF));
  Assert.assertEquals(expectedOutgoing, info1.getOutgoingSummary());
  // Incoming: full_name derives from the two name fields; the dropped field does not appear.
  Map<EndPointField, Set<EndPointField>> expectedIncoming = new HashMap<>();
  expectedIncoming.put(new EndPointField(destination, "full_name"),
                       Sets.newHashSet(new EndPointField(source, "first_name"),
                                       new EndPointField(source, "last_name")));
  Assert.assertEquals(expectedIncoming, info1.getIncomingSummary());
}
Usage example of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata.
From class FieldLineageInfoTest, method testSimpleFieldLineageSummary.
@Test
public void testSimpleFieldLineageSummary() {
  // Pipeline under test:
  //   read:   file -> (offset, body)
  //   parse:  (body) -> (first_name, last_name)
  //   concat: (first_name, last_name) -> (name)
  //   write:  (offset, name) -> another_file
  ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parsing body",
      Collections.singletonList(InputField.of("read", "body")), "first_name", "last_name");
  TransformOperation concat = new TransformOperation("concat", "concatinating the fields",
      Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")), "name");
  WriteOperation write = new WriteOperation("write_op", "writing data to file", EndPoint.of("myns", "another_file"),
      Arrays.asList(InputField.of("read", "offset"), InputField.of("concat", "name")));
  List<Operation> operationList = Arrays.asList(parse, concat, read, write);
  FieldLineageInfo info = new FieldLineageInfo(operationList);
  EndPoint destination = EndPoint.of("myns", "another_file");
  EndPoint source = EndPoint.of("endpoint1");
  // The single destination EndPoint(myns, another_file) exposes exactly the written fields.
  Map<EndPoint, Set<String>> destinationFields = info.getDestinationFields();
  Assert.assertEquals(1, destinationFields.size());
  Assert.assertEquals(new HashSet<>(Arrays.asList("offset", "name")), destinationFields.get(destination));
  Map<EndPointField, Set<EndPointField>> incomingSummary = info.getIncomingSummary();
  Map<EndPointField, Set<EndPointField>> outgoingSummary = info.getOutgoingSummary();
  // Incoming: destination "offset" traces back solely to the source "offset".
  EndPointField destOffset = new EndPointField(destination, "offset");
  Set<EndPointField> offsetSources = incomingSummary.get(destOffset);
  Assert.assertEquals(1, offsetSources.size());
  EndPointField srcOffset = new EndPointField(source, "offset");
  Assert.assertEquals(srcOffset, offsetSources.iterator().next());
  // Only the read and write operations participate in the offset lineage, in both directions.
  Set<Operation> readWriteOnly = new HashSet<>();
  readWriteOnly.add(write);
  readWriteOnly.add(read);
  Assert.assertEquals(readWriteOnly, info.getIncomingOperationsForField(destOffset));
  Assert.assertEquals(readWriteOnly, info.getOutgoingOperationsForField(srcOffset));
  // Incoming: destination "name" traces back to the source "body".
  EndPointField destName = new EndPointField(destination, "name");
  Set<EndPointField> nameSources = incomingSummary.get(destName);
  Assert.assertEquals(1, nameSources.size());
  EndPointField srcBody = new EndPointField(source, "body");
  Assert.assertEquals(srcBody, nameSources.iterator().next());
  // Every operation participates in the name lineage.
  Set<Operation> allOperations = new HashSet<>();
  allOperations.add(write);
  allOperations.add(concat);
  allOperations.add(parse);
  allOperations.add(read);
  Assert.assertEquals(allOperations, info.getIncomingOperationsForField(destName));
  // Outgoing: source "offset" only influences destination "offset".
  Set<EndPointField> offsetDestinations = outgoingSummary.get(srcOffset);
  Assert.assertEquals(1, offsetDestinations.size());
  Assert.assertEquals(destOffset, offsetDestinations.iterator().next());
  // Outgoing: source "body" flows through parse and concat into the write.
  Assert.assertEquals(allOperations, info.getOutgoingOperationsForField(srcBody));
}
Usage example of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata.
From class FieldLineageInfoTest, method testCycle.
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
  // parse consumes "normalize.name" while normalize consumes "parse.name", forming a cycle;
  // the topological sort must reject this operation set with IllegalArgumentException.
  ReadOperation read = new ReadOperation("read", "read", EndPoint.of("ns", "file1"), "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parse",
      Arrays.asList(InputField.of("read", "body"), InputField.of("normalize", "name")), "name", "address");
  TransformOperation normalize = new TransformOperation("normalize", "normalize",
      Collections.singletonList(InputField.of("parse", "name")), "name");
  WriteOperation write = new WriteOperation("write", "writing to another file", EndPoint.of("ns", "file2"),
      Arrays.asList(InputField.of("normalize", "name"), InputField.of("parse", "address")));
  Set<Operation> operations = new HashSet<>(Arrays.<Operation>asList(parse, read, normalize, write));
  FieldLineageInfo.getTopologicallySortedOperations(operations);
}
Usage example of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata.
From class FieldLineageInfoTest, method testLinearTopologicalSort.
@Test
public void testLinearTopologicalSort() {
  // Linear chain: read ---> parse ---> normalize ---> write
  ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parse descr",
      Collections.singletonList(InputField.of("read", "body")), "name", "address");
  TransformOperation normalize = new TransformOperation("normalize", "normalize descr",
      Collections.singletonList(InputField.of("parse", "address")), "address");
  List<InputField> writeInputs = new ArrayList<>();
  writeInputs.add(InputField.of("parse", "name"));
  writeInputs.add(InputField.of("normalize", "address"));
  WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
  // The sort must honor the chain no matter in which order operations were inserted.
  List<List<Operation>> insertionOrders = Arrays.asList(
      Arrays.<Operation>asList(read, parse, normalize, write),
      Arrays.<Operation>asList(parse, normalize, write, read),
      Arrays.<Operation>asList(write, normalize, parse, read));
  for (List<Operation> insertionOrder : insertionOrders) {
    List<Operation> sorted =
      FieldLineageInfo.getTopologicallySortedOperations(new LinkedHashSet<>(insertionOrder));
    assertBefore(sorted, read, parse);
    assertBefore(sorted, parse, normalize);
    assertBefore(sorted, normalize, write);
    assertBefore(sorted, read, write);
  }
}
Usage example of io.cdap.cdap.api.lineage.field.TransformOperation in the project cdap by caskdata.
From class FieldLineageInfoTest, method testNonCycle.
@Test
public void testNonCycle() {
  ReadOperation read = new ReadOperation("read", "read", EndPoint.of("ns", "src"), "a", "b");
  TransformOperation combine = new TransformOperation("combine", "combine",
      Arrays.asList(InputField.of("read", "a"), InputField.of("read", "b")), "a", "b");
  // "generate" has no incoming inputs; it must not be flagged as a cycle and should be
  // treated like a read operation (i.e. a source) by the topological sort.
  TransformOperation generate = new TransformOperation("generate", "generate", Collections.emptyList(), "c");
  WriteOperation write = new WriteOperation("write", "write", EndPoint.of("ns", "dest"),
      Arrays.asList(InputField.of("combine", "a"), InputField.of("combine", "b"), InputField.of("generate", "c")));
  Set<Operation> unordered = new HashSet<>(Arrays.<Operation>asList(combine, read, generate, write));
  List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(unordered);
  // Sources (read, generate) come first, then combine, then the terminal write.
  Assert.assertEquals(ImmutableList.of(read, generate, combine, write), sorted);
}
Aggregations