Use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
From class FieldLineageInfoTest, method testSelfReferentialOperations.
@Test(expected = IllegalArgumentException.class)
public void testSelfReferentialOperations() {
TransformOperation parse = new TransformOperation("parse", "parse", Arrays.asList(InputField.of("read", "body"), InputField.of("parse", "name")), "name", "address");
FieldLineageInfo.getTopologicallySortedOperations(Collections.singleton(parse));
}
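The IllegalArgumentException above is triggered by the cycle parse -> parse: the operation lists its own output field as one of its inputs. As a minimal contrast sketch (endpoint and field names assumed, not taken from the original test), the same transform sorts cleanly once its only input comes from an upstream read:
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "body");
// "parse" now consumes only the output of "read", so there is no self-reference.
TransformOperation acyclicParse = new TransformOperation("parse", "parse", Collections.singletonList(InputField.of("read", "body")), "name", "address");
// Sorting succeeds because the operation graph read -> parse is acyclic.
FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(Arrays.asList(read, acyclicParse)));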
Use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
From class FieldLineageTableTest, method generateOperations.
private List<Operation> generateOperations(boolean addAdditionalField) {
// read: file -> (offset, body)
// parse: (body) -> (first_name, last_name)
// concat: (first_name, last_name) -> (name)
// write: (offset, name) -> another_file
List<String> readOutput = new ArrayList<>();
readOutput.add("offset");
readOutput.add("body");
if (addAdditionalField) {
readOutput.add("file_name");
}
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), readOutput);
TransformOperation parse = new TransformOperation("parse", "parsing body", Collections.singletonList(InputField.of("read", "body")), "first_name", "last_name");
TransformOperation concat = new TransformOperation("concat", "concatinating the fields", Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")), "name");
List<InputField> writeInput = new ArrayList<>();
writeInput.add(InputField.of("read", "offset"));
writeInput.add(InputField.of("concat", "name"));
if (addAdditionalField) {
writeInput.add(InputField.of("read", "file_name"));
}
WriteOperation write = new WriteOperation("write_op", "writing data to file", EndPoint.of("myns", "another_file"), writeInput);
List<Operation> operations = new ArrayList<>();
operations.add(parse);
operations.add(concat);
operations.add(read);
operations.add(write);
return operations;
}
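A brief usage sketch (assumed, not part of the test class): the helper's output can be handed straight to FieldLineageInfo, which accepts the operations in any order and computes the field-level summaries itself.
FieldLineageInfo info = new FieldLineageInfo(generateOperations(false));
// Incoming summary: destination fields mapped back to the source fields they came from.
Map<EndPointField, Set<EndPointField>> incoming = info.getIncomingSummary();
// Outgoing summary: source fields mapped forward to the destination fields they feed.
Map<EndPointField, Set<EndPointField>> outgoing = info.getOutgoingSummary();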
Use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
From class FieldLineageInfoTest, method testRenameThenDropFields.
@Test
public void testRenameThenDropFields() {
// read: endpoint1 -> (first_name, last_name, social)
// renameSocial: read.social -> ssn
// renameSocialAgain: renameSocial.ssn -> ssn2
// dropSocial: renameSocialAgain.ssn2 -> ()
// write: (read.first_name, read.last_name) -> endpoint2
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "first_name", "last_name", "social");
TransformOperation renameSocial = new TransformOperation("renameSocial", "rename social", Collections.singletonList(InputField.of("read", "social")), "ssn");
TransformOperation renameSocialAgain = new TransformOperation("renameSocialAgain", "rename social again", Collections.singletonList(InputField.of("renameSocial", "ssn")), "ssn2");
TransformOperation dropSocial = new TransformOperation("dropSocial", "drop ssn2", Collections.singletonList(InputField.of("renameSocialAgain", "ssn2")));
WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("endpoint2"), Arrays.asList(InputField.of("read", "first_name"), InputField.of("read", "last_name")));
Set<Operation> operations = Sets.newHashSet(read, renameSocial, renameSocialAgain, dropSocial, write);
FieldLineageInfo info = new FieldLineageInfo(operations);
EndPoint ep1 = EndPoint.of("endpoint1");
EndPoint ep2 = EndPoint.of("endpoint2");
EndPointField ep2ln = new EndPointField(ep2, "last_name");
EndPointField ep2fn = new EndPointField(ep2, "first_name");
EndPointField ep1ln = new EndPointField(ep1, "last_name");
EndPointField ep1fn = new EndPointField(ep1, "first_name");
Map<EndPointField, Set<EndPointField>> expectedOutgoingSummary = new HashMap<>();
expectedOutgoingSummary.put(ep1fn, Collections.singleton(ep2fn));
expectedOutgoingSummary.put(ep1ln, Collections.singleton(ep2ln));
expectedOutgoingSummary.put(new EndPointField(ep1, "social"), Collections.singleton(FieldLineageInfo.NULL_EPF));
Map<EndPointField, Set<EndPointField>> outgoingSummary = info.getOutgoingSummary();
Assert.assertEquals(expectedOutgoingSummary, outgoingSummary);
Map<EndPointField, Set<EndPointField>> expectedIncomingSummary = new HashMap<>();
expectedIncomingSummary.put(ep2ln, Collections.singleton(ep1ln));
expectedIncomingSummary.put(ep2fn, Collections.singleton(ep1fn));
Assert.assertEquals(expectedIncomingSummary, info.getIncomingSummary());
}
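One point worth illustrating (an assumed extra check building on the variables of the test above, not part of the original): because the social -> ssn -> ssn2 chain is dropped before any write, it appears only in the outgoing summary, as NULL_EPF, and leaves no trace in the incoming summary of endpoint2.
// No field of endpoint2 is derived from the dropped chain, so no such key exists.
Assert.assertFalse(info.getIncomingSummary().containsKey(new EndPointField(ep2, "ssn2")));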
Use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
From class LineageLimitingTest, method testLineageLimiting.
@Test
public void testLineageLimiting() throws InterruptedException, ExecutionException, TimeoutException {
LineageStoreReader lineageReader = getInjector().getInstance(LineageStoreReader.class);
ProgramRunId run1 = service1.run(RunIds.generate());
// Write out some lineage information
LineageWriter lineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
lineageWriter.addAccess(run1, dataset1, AccessType.READ);
lineageWriter.addAccess(run1, dataset2, AccessType.WRITE);
// Write the field level lineage
FieldLineageWriter fieldLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
ProgramRunId spark1Run1 = spark1.run(RunIds.generate(100));
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
List<Operation> operations = new ArrayList<>();
operations.add(read);
operations.add(write);
operations.add(parse);
FieldLineageInfo info1 = new FieldLineageInfo(operations);
fieldLineageWriter.write(spark1Run1, info1);
ProgramRunId spark1Run2 = spark1.run(RunIds.generate(200));
fieldLineageWriter.write(spark1Run2, info1);
// Verify that the lineage has been written, since it is smaller than the maximum specified size
Set<NamespacedEntityId> expectedLineage = new HashSet<>(Arrays.asList(run1.getParent(), dataset1, dataset2));
Tasks.waitFor(true, () -> expectedLineage.equals(lineageReader.getEntitiesForRun(run1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
FieldLineageReader fieldLineageReader = getInjector().getInstance(FieldLineageReader.class);
// Verify that empty field lineage has been written, since the field lineage info exceeded the maximum specified size
EndPointField endPointField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
List<ProgramRunOperations> incomingOperations = fieldLineageReader.getIncomingOperations(endPointField, 1L, Long.MAX_VALUE - 1);
Assert.assertTrue(incomingOperations.isEmpty());
}
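A possible symmetric check (assumed: FieldLineageReader also exposes getOutgoingOperations with the same signature as getIncomingOperations; this is not in the original test): the outgoing view of the source endpoint should be empty as well, since the whole FieldLineageInfo was rejected for exceeding the size limit.
EndPointField sourceField = new EndPointField(EndPoint.of("ns", "endpoint1"), "body");
// Expected to be empty for the same reason the incoming operations are empty.
List<ProgramRunOperations> outgoingOperations = fieldLineageReader.getOutgoingOperations(sourceField, 1L, Long.MAX_VALUE - 1);
Assert.assertTrue(outgoingOperations.isEmpty());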
Use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.
From class LineageOperationProcessorTest, method testSimpleJoinWithAdditionalFields.
@Test
public void testSimpleJoinWithAdditionalFields() {
Set<Connection> connections = new HashSet<>();
connections.add(new Connection("n1", "n3"));
connections.add(new Connection("n2", "n3"));
connections.add(new Connection("n3", "n4"));
EndPoint cEndPoint = EndPoint.of("default", "customer");
EndPoint pEndPoint = EndPoint.of("default", "purchase");
EndPoint cpEndPoint = EndPoint.of("default", "customer_purchase");
// customer -> (id)------------
//                             |
//                           JOIN ------->(id, customer_id)
//                             |
// purchase -> (customer_id)---
Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
stageOperations.put("n1", Collections.singletonList(new FieldReadOperation("ReadCustomer", "read description", cEndPoint, "id", "name")));
stageOperations.put("n2", Collections.singletonList(new FieldReadOperation("ReadPurchase", "read description", pEndPoint, "customer_id", "item")));
List<FieldOperation> operationsFromJoin = new ArrayList<>();
operationsFromJoin.add(new FieldTransformOperation("Join", "Join Operation", Arrays.asList("n1.id", "n2.customer_id"), Arrays.asList("id", "customer_id")));
operationsFromJoin.add(new FieldTransformOperation("Identity name", "Identity Operation", Collections.singletonList("n1.name"), Collections.singletonList("name")));
operationsFromJoin.add(new FieldTransformOperation("Identity item", "Identity Operation", Collections.singletonList("n2.item"), Collections.singletonList("item")));
stageOperations.put("n3", operationsFromJoin);
stageOperations.put("n4", Collections.singletonList(new FieldWriteOperation("Write", "write description", cpEndPoint, "id", "name", "customer_id", "item")));
LineageOperationsProcessor processor = new LineageOperationsProcessor(connections, stageOperations, Collections.singleton("n3"));
Set<Operation> expectedOperations = new HashSet<>();
expectedOperations.add(new ReadOperation("n1.ReadCustomer", "read description", cEndPoint, "id", "name"));
expectedOperations.add(new ReadOperation("n2.ReadPurchase", "read description", pEndPoint, "customer_id", "item"));
expectedOperations.add(new TransformOperation("n3.Join", "Join Operation", Arrays.asList(InputField.of("n1.ReadCustomer", "id"), InputField.of("n2.ReadPurchase", "customer_id")), "id", "customer_id"));
expectedOperations.add(new TransformOperation("n3.Identity name", "Identity Operation", Collections.singletonList(InputField.of("n1.ReadCustomer", "name")), "name"));
expectedOperations.add(new TransformOperation("n3.Identity item", "Identity Operation", Collections.singletonList(InputField.of("n2.ReadPurchase", "item")), "item"));
expectedOperations.add(new WriteOperation("n4.Write", "write description", cpEndPoint, Arrays.asList(InputField.of("n3.Join", "id"), InputField.of("n3.Identity name", "name"), InputField.of("n3.Join", "customer_id"), InputField.of("n3.Identity item", "item"))));
Set<Operation> processedOperations = processor.process();
Assert.assertEquals(expectedOperations, processedOperations);
}
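The processed operations are ordinary field-lineage operations, so they can be wrapped in FieldLineageInfo just like the earlier examples; a minimal sketch (assumed usage, building on the variables of the test above):
FieldLineageInfo joinInfo = new FieldLineageInfo(processedOperations);
// Because the Join transform has two inputs and two outputs, each written join field
// is expected to trace back to both customer.id and purchase.customer_id.
Set<EndPointField> idSources = joinInfo.getIncomingSummary().get(new EndPointField(cpEndPoint, "id"));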