Usage of io.cdap.cdap.api.lineage.field.EndPoint in project cdap (by caskdata).
Class: EndpointFieldDeserializer — method: deserialize.
@Override
public EndPointField deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException {
  // An EndPointField is serialized as {"endPoint": {...}, "field": "<name>"}.
  JsonObject jsonObject = json.getAsJsonObject();
  EndPoint deserializedEndPoint = context.deserialize(jsonObject.getAsJsonObject("endPoint"), EndPoint.class);
  String fieldName = jsonObject.getAsJsonPrimitive("field").getAsString();
  // Intern the instance: if an equal EndPointField was deserialized before, reuse it;
  // otherwise cache and return the freshly constructed one.
  EndPointField candidate = new EndPointField(deserializedEndPoint, fieldName);
  return endpointFields.computeIfAbsent(candidate, key -> key);
}
Usage of io.cdap.cdap.api.lineage.field.EndPoint in project cdap (by caskdata).
Class: FieldLineageInfoTest — method: testSourceToMultipleDestinations.
@Test
public void testSourceToMultipleDestinations() {
  // Pipeline under test:
  //   read:   file -> (offset, body)
  //   parse:  body -> (id, name, address, zip)
  //   write1: (parse.id, parse.name) -> info
  //   write2: (parse.address, parse.zip) -> location
  EndPoint sourceEndPoint = EndPoint.of("ns", "file");
  EndPoint infoEndPoint = EndPoint.of("ns", "info");
  EndPoint locationEndPoint = EndPoint.of("ns", "location");
  ReadOperation read = new ReadOperation("read", "Reading from file", sourceEndPoint, "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parsing body",
    Collections.singletonList(InputField.of("read", "body")), "id", "name", "address", "zip");
  WriteOperation infoWrite = new WriteOperation("infoWrite", "writing info", infoEndPoint,
    Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name")));
  WriteOperation locationWrite = new WriteOperation("locationWrite", "writing location", locationEndPoint,
    Arrays.asList(InputField.of("parse", "address"), InputField.of("parse", "zip")));
  FieldLineageInfo lineageInfo =
    new FieldLineageInfo(Arrays.<Operation>asList(read, parse, infoWrite, locationWrite));

  // Two destinations, each receiving its own pair of parsed fields.
  Map<EndPoint, Set<String>> destinationFields = lineageInfo.getDestinationFields();
  Assert.assertEquals(2, destinationFields.size());
  Assert.assertEquals(new HashSet<>(Arrays.asList("id", "name")), destinationFields.get(infoEndPoint));
  Assert.assertEquals(new HashSet<>(Arrays.asList("address", "zip")), destinationFields.get(locationEndPoint));

  // Every destination field traces back to exactly one source field: "body".
  Map<EndPointField, Set<EndPointField>> incomingSummary = lineageInfo.getIncomingSummary();
  Assert.assertEquals(4, incomingSummary.size());
  Set<EndPointField> expectedIncoming = Collections.singleton(new EndPointField(sourceEndPoint, "body"));
  Assert.assertEquals(expectedIncoming, incomingSummary.get(new EndPointField(infoEndPoint, "id")));
  Assert.assertEquals(expectedIncoming, incomingSummary.get(new EndPointField(infoEndPoint, "name")));
  Assert.assertEquals(expectedIncoming, incomingSummary.get(new EndPointField(locationEndPoint, "address")));
  Assert.assertEquals(expectedIncoming, incomingSummary.get(new EndPointField(locationEndPoint, "zip")));

  // The outgoing summary contains a single entry: the "offset" field read from
  // the source does not contribute to any destination field.
  Map<EndPointField, Set<EndPointField>> outgoingSummary = lineageInfo.getOutgoingSummary();
  Assert.assertEquals(1, outgoingSummary.size());
  Set<EndPointField> expectedOutgoing = new HashSet<>(Arrays.asList(
    new EndPointField(infoEndPoint, "id"),
    new EndPointField(infoEndPoint, "name"),
    new EndPointField(locationEndPoint, "address"),
    new EndPointField(locationEndPoint, "zip")));
  Set<EndPointField> bodyOutgoing = outgoingSummary.get(new EndPointField(sourceEndPoint, "body"));
  Assert.assertEquals(4, bodyOutgoing.size());
  Assert.assertEquals(expectedOutgoing, bodyOutgoing);

  // Outgoing operations for "offset": only the read itself, since no downstream
  // operation ever consumes that field.
  Set<Operation> offsetOperations =
    lineageInfo.getOutgoingOperationsForField(new EndPointField(sourceEndPoint, "offset"));
  Assert.assertEquals(Collections.singleton(read), offsetOperations);

  // "body" flows through parse into both writes, so all four operations are involved.
  Set<Operation> bodyOperations =
    lineageInfo.getOutgoingOperationsForField(new EndPointField(sourceEndPoint, "body"));
  Assert.assertEquals(new HashSet<>(Arrays.<Operation>asList(read, parse, infoWrite, locationWrite)),
                      bodyOperations);
}
Usage of io.cdap.cdap.api.lineage.field.EndPoint in project cdap (by caskdata).
Class: FieldLineageInfoTest — method: testMultiPathFieldLineage.
@Test
public void testMultiPathFieldLineage() {
  // Pipeline under test:
  //   read1: file1 -> (offset, body)
  //   read2: file2 -> (offset, body)
  //   merge: (read1.offset, read1.body, read2.offset, read2.body) -> (offset, body)
  //   parse: (merge.body) -> (name, address)
  //   write: (parse.name, parse.address, merge.offset) -> file
  EndPoint read1EndPoint = EndPoint.of("ns1", "file1");
  EndPoint read2EndPoint = EndPoint.of("ns2", "file2");
  EndPoint fileEndPoint = EndPoint.of("ns3", "file");
  ReadOperation read1 = new ReadOperation("read1", "Reading from file1", read1EndPoint, "offset", "body");
  ReadOperation read2 = new ReadOperation("read2", "Reading from file2", read2EndPoint, "offset", "body");
  TransformOperation merge = new TransformOperation("merge", "merging fields",
    Arrays.asList(InputField.of("read1", "offset"), InputField.of("read2", "offset"),
                  InputField.of("read1", "body"), InputField.of("read2", "body")), "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parsing body",
    Collections.singletonList(InputField.of("merge", "body")), "name", "address");
  WriteOperation write = new WriteOperation("write", "writing to another file", fileEndPoint,
    Arrays.asList(InputField.of("merge", "offset"), InputField.of("parse", "name"),
                  InputField.of("parse", "address")));
  // Operations are deliberately supplied out of topological order.
  FieldLineageInfo lineageInfo =
    new FieldLineageInfo(Arrays.<Operation>asList(parse, merge, read1, read2, write));

  // Single destination receiving all three written fields.
  Map<EndPoint, Set<String>> destinationFields = lineageInfo.getDestinationFields();
  Assert.assertEquals(1, destinationFields.size());
  Assert.assertEquals(new HashSet<>(Arrays.asList("name", "address", "offset")),
                      destinationFields.get(fileEndPoint));

  // Since merge combines both reads, every destination field traces back to all
  // four source fields.
  Map<EndPointField, Set<EndPointField>> incomingSummary = lineageInfo.getIncomingSummary();
  Assert.assertEquals(3, incomingSummary.size());
  Set<EndPointField> allSourceFields = new HashSet<>(Arrays.asList(
    new EndPointField(read1EndPoint, "body"),
    new EndPointField(read1EndPoint, "offset"),
    new EndPointField(read2EndPoint, "body"),
    new EndPointField(read2EndPoint, "offset")));
  for (String destinationField : Arrays.asList("name", "address", "offset")) {
    Assert.assertEquals(allSourceFields,
                        incomingSummary.get(new EndPointField(fileEndPoint, destinationField)));
  }

  // Conversely, each of the four source fields reaches all three destination fields.
  Map<EndPointField, Set<EndPointField>> outgoingSummary = lineageInfo.getOutgoingSummary();
  Assert.assertEquals(4, outgoingSummary.size());
  Set<EndPointField> allDestinationFields = new HashSet<>(Arrays.asList(
    new EndPointField(fileEndPoint, "offset"),
    new EndPointField(fileEndPoint, "name"),
    new EndPointField(fileEndPoint, "address")));
  Assert.assertEquals(allDestinationFields, outgoingSummary.get(new EndPointField(read1EndPoint, "offset")));
  Assert.assertEquals(allDestinationFields, outgoingSummary.get(new EndPointField(read1EndPoint, "body")));
  Assert.assertEquals(allDestinationFields, outgoingSummary.get(new EndPointField(read2EndPoint, "offset")));
  Assert.assertEquals(allDestinationFields, outgoingSummary.get(new EndPointField(read2EndPoint, "body")));

  // Outgoing operations of all source endpoints: each path involves its own read
  // plus the shared merge, parse and write operations.
  Set<Operation> read1Path = new HashSet<>(Arrays.<Operation>asList(read1, merge, parse, write));
  Assert.assertEquals(read1Path,
    lineageInfo.getOutgoingOperationsForField(new EndPointField(read1EndPoint, "offset")));
  Assert.assertEquals(read1Path,
    lineageInfo.getOutgoingOperationsForField(new EndPointField(read1EndPoint, "body")));
  Set<Operation> read2Path = new HashSet<>(Arrays.<Operation>asList(read2, merge, parse, write));
  Assert.assertEquals(read2Path,
    lineageInfo.getOutgoingOperationsForField(new EndPointField(read2EndPoint, "offset")));
  Assert.assertEquals(read2Path,
    lineageInfo.getOutgoingOperationsForField(new EndPointField(read2EndPoint, "body")));
}
Usage of io.cdap.cdap.api.lineage.field.EndPoint in project cdap (by caskdata).
Class: FieldLineageTableTest — method: testMergeSummaries.
@Test
public void testMergeSummaries() {
  // Two runs of the same workflow write the same destination field
  // ("ns.endpoint3", field "body") from two different sources; the incoming
  // summary over a time window covering both runs must contain the union of
  // the two source fields.
  ProgramId workflow = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
  final ProgramRunId programRun1 = workflow.run(RunIds.generate(10000).getId());
  final ProgramRunId programRun2 = workflow.run(RunIds.generate(11000).getId());

  // First run: endpoint1.body -> endpoint3.
  List<Operation> operations = new ArrayList<>();
  ReadOperation read = new ReadOperation("read", "some read",
    EndPoint.of("ns1", "endpoint1"), "offset", "body");
  WriteOperation write = new WriteOperation("write", "some write",
    EndPoint.of("ns", "endpoint3"), InputField.of("read", "body"));
  operations.add(read);
  operations.add(write);
  final FieldLineageInfo info1 = new FieldLineageInfo(operations);

  // Second run additionally carries endpoint2.body -> endpoint3 (the list is
  // reused, so info2 contains all four operations).
  ReadOperation anotherRead = new ReadOperation("anotherRead", "another read",
    EndPoint.of("ns1", "endpoint2"), "offset", "body");
  WriteOperation anotherWrite = new WriteOperation("anotherWrite", "another write",
    EndPoint.of("ns", "endpoint3"), InputField.of("anotherRead", "body"));
  operations.add(anotherRead);
  operations.add(anotherWrite);
  final FieldLineageInfo info2 = new FieldLineageInfo(operations);

  TransactionRunners.run(transactionRunner, context -> {
    FieldLineageTable table = FieldLineageTable.create(context);
    table.addFieldLineageInfo(programRun1, info1);
    table.addFieldLineageInfo(programRun2, info2);
  });

  TransactionRunners.run(transactionRunner, context -> {
    FieldLineageTable table = FieldLineageTable.create(context);
    EndPointField destinationBody = new EndPointField(EndPoint.of("ns", "endpoint3"), "body");
    Set<EndPointField> expected = new HashSet<>(Arrays.asList(
      new EndPointField(EndPoint.of("ns1", "endpoint1"), "body"),
      new EndPointField(EndPoint.of("ns1", "endpoint2"), "body")));
    // End time 11001 is past both runs, so the summaries of both runs are merged.
    Assert.assertEquals(expected, table.getIncomingSummary(destinationBody, 0, 11001));
  });
}
Usage of io.cdap.cdap.api.lineage.field.EndPoint in project cdap (by caskdata).
Class: FieldLineageTableTest — method: testSimpleOperations.
@Test
public void testSimpleOperations() {
// Scenario: two runs of workflow1 (recorded at t=10000 and t=11000) plus one
// run of workflow3 (at t=12000). The assertions probe how the table filters
// lineage by a [start, end) time range — the end time is exclusive throughout.
RunId runId = RunIds.generate(10000);
ProgramId program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
final ProgramRunId programRun1 = program.run(runId.getId());
runId = RunIds.generate(11000);
program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
final ProgramRunId programRun2 = program.run(runId.getId());
// NOTE(review): generateOperations is defined elsewhere in this test class;
// from the assertions below it appears the `true` variant adds a "file_name"
// field on both source and destination — confirm against its definition.
final FieldLineageInfo info1 = new FieldLineageInfo(generateOperations(false));
final FieldLineageInfo info2 = new FieldLineageInfo(generateOperations(true));
TransactionRunners.run(transactionRunner, context -> {
FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
fieldLineageTable.addFieldLineageInfo(programRun1, info1);
fieldLineageTable.addFieldLineageInfo(programRun2, info2);
});
// A third run (a different workflow) reuses info2, so the same operations end
// up associated with two program runs — exercised at the end of this test.
runId = RunIds.generate(12000);
program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow3");
final ProgramRunId programRun3 = program.run(runId.getId());
TransactionRunners.run(transactionRunner, context -> {
FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
fieldLineageTable.addFieldLineageInfo(programRun3, info2);
});
TransactionRunners.run(transactionRunner, context -> {
FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
EndPoint source = EndPoint.of("ns1", "endpoint1");
EndPoint destination = EndPoint.of("myns", "another_file");
// end time 10000 should return empty set since its exclusive and run was added at time 10000
Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getFields(source, 0, 10000));
Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getFields(destination, 0, 10000));
Set<String> expectedDestinationFields = new HashSet<>(Arrays.asList("offset", "name"));
Set<String> expectedSourceFields = new HashSet<>(Arrays.asList("offset", "body"));
// end time 10001 should return the data for the run which was added at time 10000
Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 0, 10001));
Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 0, 10001));
// providing start time as 10000 and endtime as 11000 should still return the same set of fields
Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 10000, 11000));
// NOTE(review): the comment above says end time 11000 but this call uses 10001;
// both bounds exclude the t=11000 run so the expected result is identical —
// confirm which bound was intended.
Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 10000, 10001));
// setting endtime to 11001 should include the information for from programRun2 as well, which added additional
// field to the dataset.
expectedDestinationFields.add("file_name");
expectedSourceFields.add("file_name");
Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 10000, 11001));
Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 10000, 11001));
// Incoming summaries: same exclusive-end-time behavior as getFields above.
// end time 10000 should return empty set since its exclusive and run was added at time 10000
Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getIncomingSummary(new EndPointField(destination, "offset"), 0, 10000));
EndPointField expectedEndPointField = new EndPointField(source, "offset");
Set<EndPointField> actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "offset"), 0, 10001);
Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
expectedEndPointField = new EndPointField(source, "body");
actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "name"), 0, 10001);
Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
// end time is 10001, file_name is not written yet
actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "file_name"), 0, 10001);
Assert.assertEquals(Collections.EMPTY_SET, actualEndPointFields);
// Outgoing summaries: mirror image of the incoming checks above.
// end time 10000 should return empty set since its exclusive and run was added at time 10000
Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getOutgoingSummary(new EndPointField(destination, "offset"), 0, 10000));
expectedEndPointField = new EndPointField(destination, "offset");
actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "offset"), 0, 10001);
Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
expectedEndPointField = new EndPointField(destination, "name");
actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "body"), 0, 10001);
Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
// no outgoing summary should exist for the field file_name at time 10001
actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 10001);
Assert.assertEquals(Collections.EMPTY_SET, actualEndPointFields);
// no outgoing summary should exist for the field file_name at end time time 11000 since end time is exclusive
actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 11000);
Assert.assertEquals(Collections.EMPTY_SET, actualEndPointFields);
// outgoing summary should exist for file_name at 11001, since the corresponding run executed at 11000
expectedEndPointField = new EndPointField(destination, "file_name");
actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 11001);
Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
// Operation-level lineage: with end time 10001 only programRun1 is in range,
// and the incoming view of the destination must equal the outgoing view of the source.
Set<ProgramRunOperations> incomingOperations = fieldLineageTable.getIncomingOperations(destination, 0, 10001);
Set<ProgramRunOperations> outgoingOperations = fieldLineageTable.getOutgoingOperations(source, 0, 10001);
Assert.assertEquals(1, incomingOperations.size());
Assert.assertEquals(incomingOperations, outgoingOperations);
ProgramRunOperations programRunOperations = incomingOperations.iterator().next();
Assert.assertEquals(Collections.singleton(programRun1), programRunOperations.getProgramRunIds());
// test with bigger time range for incoming and outgoing operations
incomingOperations = fieldLineageTable.getIncomingOperations(destination, 10000, 12001);
outgoingOperations = fieldLineageTable.getOutgoingOperations(source, 10000, 12001);
Assert.assertEquals(2, incomingOperations.size());
Assert.assertEquals(incomingOperations, outgoingOperations);
// programRun2 and programRun3 shared info2, so they are grouped into a single
// ProgramRunOperations entry carrying both run ids.
Set<ProgramRunOperations> expectedSet = new HashSet<>();
expectedSet.add(new ProgramRunOperations(Collections.singleton(programRun1), info1.getOperations()));
expectedSet.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(programRun2, programRun3)), info2.getOperations()));
Assert.assertEquals(expectedSet, incomingOperations);
Assert.assertEquals(expectedSet, outgoingOperations);
});
}
Aggregations