use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.
the class MetadataSubscriberServiceTest method testSubscriber.
@Test
public void testSubscriber() throws InterruptedException, ExecutionException, TimeoutException {
LineageStoreReader lineageReader = getInjector().getInstance(LineageStoreReader.class);
ProgramRunId run1 = service1.run(RunIds.generate());
// Try to read lineage, which should be empty since we haven't started the MetadataSubscriberService yet.
Set<NamespacedEntityId> entities = lineageReader.getEntitiesForRun(run1);
Assert.assertTrue(entities.isEmpty());
// Write out some lineage information
LineageWriter lineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
lineageWriter.addAccess(run1, dataset1, AccessType.READ);
lineageWriter.addAccess(run1, dataset2, AccessType.WRITE);
// Write the field level lineage
FieldLineageWriter fieldLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
ProgramRunId spark1Run1 = spark1.run(RunIds.generate(100));
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
List<Operation> operations = new ArrayList<>();
operations.add(read);
operations.add(write);
operations.add(parse);
FieldLineageInfo info1 = new FieldLineageInfo(operations);
fieldLineageWriter.write(spark1Run1, info1);
ProgramRunId spark1Run2 = spark1.run(RunIds.generate(200));
fieldLineageWriter.write(spark1Run2, info1);
List<Operation> operations2 = new ArrayList<>();
operations2.add(read);
operations2.add(parse);
TransformOperation normalize = new TransformOperation("normalize", "normalize address", Collections.singletonList(InputField.of("parse", "address")), "address");
operations2.add(normalize);
WriteOperation anotherWrite = new WriteOperation("anotherwrite", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address")));
operations2.add(anotherWrite);
FieldLineageInfo info2 = new FieldLineageInfo(operations2);
ProgramRunId spark1Run3 = spark1.run(RunIds.generate(300));
fieldLineageWriter.write(spark1Run3, info2);
// Emit some usages
UsageWriter usageWriter = getInjector().getInstance(MessagingUsageWriter.class);
usageWriter.register(spark1, dataset1);
usageWriter.registerAll(Collections.singleton(spark1), dataset3);
// Verifies lineage has been written
Set<NamespacedEntityId> expectedLineage = new HashSet<>(Arrays.asList(run1.getParent(), dataset1, dataset2));
Tasks.waitFor(true, () -> expectedLineage.equals(lineageReader.getEntitiesForRun(run1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
// There shouldn't be any lineage for the "spark1" program, as only usage has been emitted.
Assert.assertTrue(lineageReader.getRelations(spark1, 0L, Long.MAX_VALUE, x -> true).isEmpty());
FieldLineageReader fieldLineageReader = getInjector().getInstance(FieldLineageReader.class);
Set<Operation> expectedOperations = new HashSet<>();
expectedOperations.add(read);
expectedOperations.add(anotherWrite);
List<ProgramRunOperations> expected = new ArrayList<>();
// Descending order of program execution
expected.add(new ProgramRunOperations(Collections.singleton(spark1Run3), expectedOperations));
expectedOperations = new HashSet<>();
expectedOperations.add(read);
expectedOperations.add(write);
expected.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(spark1Run1, spark1Run2)), expectedOperations));
EndPointField endPointField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
Tasks.waitFor(expected, () -> fieldLineageReader.getIncomingOperations(endPointField, 1L, Long.MAX_VALUE - 1), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
// Verifies usage has been written
Set<EntityId> expectedUsage = new HashSet<>(Arrays.asList(dataset1, dataset3));
UsageRegistry usageRegistry = getInjector().getInstance(UsageRegistry.class);
Tasks.waitFor(true, () -> expectedUsage.equals(usageRegistry.getDatasets(spark1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
}
use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.
the class FieldLineageInfo method getTopologicallySortedOperations.
/**
* Sort the operations in topological order. In topological order, each operation in the list
* is guaranteed to occur before any other operation that reads its outputs.
*
* For example, consider the following scenario:
*
*    read-------------------------write
*        \                       /
*         ----parse----normalize-
*
* Since the write operation depends on read and normalize for its inputs, it appears last
* in the order. normalize depends on parse, so it appears after parse. Similarly, the
* parse operation appears after read but before normalize in the returned list.
*
* @param operations set of operations to be sorted
* @return the list containing topologically sorted operations
*/
public static List<Operation> getTopologicallySortedOperations(Set<Operation> operations) {
Map<String, Operation> operationMap = new HashMap<>();
Set<String> operationsWithNoIncomings = new HashSet<>();
for (Operation operation : operations) {
operationMap.put(operation.getName(), operation);
if (OperationType.READ == operation.getType()) {
operationsWithNoIncomings.add(operation.getName());
}
// a transform operation with no input fields has no incoming operations, so it is treated like a read operation
if (OperationType.TRANSFORM == operation.getType() && ((TransformOperation) operation).getInputs().isEmpty()) {
operationsWithNoIncomings.add(operation.getName());
}
}
// Map of operation name to the set of operation names which take the output of the given operation as
// an input. This map essentially represents the adjacency list of the operation graph.
// For example consider the following scenario:
//
//      read----------------------write
//          \                    /
//           ----parse---normalize
//
// The map would contain:
// read -> [parse, write]
// parse -> [normalize]
// normalize -> [write]
// write -> []
Map<String, Set<String>> outgoingOperations = new HashMap<>();
// Map of operation name to the set of operation names whose outputs the given operation takes as input.
// For example consider the following scenario:
//
//      read----------------------write
//          \                    /
//           ----parse---normalize
//
// The map would contain:
// read -> []
// parse -> [read]
// normalize -> [parse]
// write -> [read, normalize]
Map<String, Set<String>> incomingOperations = new HashMap<>();
for (Operation operation : operations) {
List<InputField> inputFields = new ArrayList<>();
switch(operation.getType()) {
case READ:
// read has no incoming operation
incomingOperations.put(operation.getName(), new HashSet<>());
break;
case TRANSFORM:
TransformOperation transform = (TransformOperation) operation;
inputFields.addAll(transform.getInputs());
break;
case WRITE:
WriteOperation write = (WriteOperation) operation;
inputFields.addAll(write.getInputs());
// write has no outgoing operation
outgoingOperations.put(operation.getName(), new HashSet<>());
break;
}
for (InputField inputField : inputFields) {
// input fields whose origin is not part of the given operation set are ignored for topological sorting.
if (!operationMap.containsKey(inputField.getOrigin())) {
continue;
}
// Current operation is the outgoing operation for origin represented by the input field.
Set<String> outgoings = outgoingOperations.computeIfAbsent(inputField.getOrigin(), k -> new HashSet<>());
outgoings.add(operation.getName());
// Origin represented by the input field is the incoming operation for the current operation.
Set<String> incomings = incomingOperations.computeIfAbsent(operation.getName(), k -> new HashSet<>());
incomings.add(inputField.getOrigin());
}
}
List<Operation> orderedOperations = new ArrayList<>();
while (!operationsWithNoIncomings.isEmpty()) {
String current = operationsWithNoIncomings.iterator().next();
operationsWithNoIncomings.remove(current);
if (operationMap.get(current) != null) {
orderedOperations.add(operationMap.get(current));
}
// it is possible that there are no outgoing operations for the current operation, since some of its
// output fields may not be used by any downstream plugin
Iterator<String> outgoingsIter = outgoingOperations.getOrDefault(current, Collections.emptySet()).iterator();
while (outgoingsIter.hasNext()) {
String next = outgoingsIter.next();
outgoingsIter.remove();
incomingOperations.get(next).remove(current);
if (incomingOperations.get(next).isEmpty()) {
operationsWithNoIncomings.add(next);
}
}
}
// check for cycles: remove the entries which now have empty outgoing operations;
// any entries left over indicate a cycle
outgoingOperations.entrySet().removeIf(next -> next.getValue().isEmpty());
if (!outgoingOperations.isEmpty()) {
throw new IllegalArgumentException(String.format("Cycle detected in graph for operations %s", outgoingOperations));
}
return orderedOperations;
}
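As a usage sketch of the scenario described in the Javadoc above, the operations can be built with the same constructors used by the tests on this page and passed to getTopologicallySortedOperations; the endpoint names and descriptions below are illustrative only.
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "input"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "address");
TransformOperation normalize = new TransformOperation("normalize", "normalize address", Collections.singletonList(InputField.of("parse", "address")), "address");
WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "output"), Arrays.asList(InputField.of("read", "offset"), InputField.of("normalize", "address")));
// The input set is unordered; the returned list respects the dependencies, so read appears
// before parse, parse before normalize, and write is last.
List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(Arrays.asList(write, normalize, parse, read)));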
use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.
the class FieldLineageInfoTest method testWriteToSameEndpoint.
@Test
public void testWriteToSameEndpoint() {
List<Operation> operations = new ArrayList<>();
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), "offset", "body");
WriteOperation write = new WriteOperation("write", "some write", EndPoint.of("ns", "endpoint3"), InputField.of("read", "body"));
operations.add(read);
operations.add(write);
ReadOperation anotherRead = new ReadOperation("anotherRead", "another read", EndPoint.of("ns1", "endpoint2"), "offset", "body");
// this write also writes to the field "body" of the same endpoint
WriteOperation anotherWrite = new WriteOperation("anotherWrite", "another write", EndPoint.of("ns", "endpoint3"), InputField.of("anotherRead", "body"));
operations.add(anotherRead);
operations.add(anotherWrite);
FieldLineageInfo info = new FieldLineageInfo(operations);
Map<EndPointField, Set<EndPointField>> incoming = info.getIncomingSummary();
Map<EndPointField, Set<EndPointField>> expected = Collections.singletonMap(new EndPointField(EndPoint.of("ns", "endpoint3"), "body"), ImmutableSet.of(new EndPointField(EndPoint.of("ns1", "endpoint1"), "body"), new EndPointField(EndPoint.of("ns1", "endpoint2"), "body")));
Assert.assertEquals(expected, incoming);
Map<EndPointField, Set<EndPointField>> outgoing = info.getOutgoingSummary();
expected = ImmutableMap.of(new EndPointField(EndPoint.of("ns1", "endpoint1"), "body"), Collections.singleton(new EndPointField(EndPoint.of("ns", "endpoint3"), "body")), new EndPointField(EndPoint.of("ns1", "endpoint2"), "body"), Collections.singleton(new EndPointField(EndPoint.of("ns", "endpoint3"), "body")));
Assert.assertEquals(expected, outgoing);
}
use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.
the class FieldLineageInfoTest method testSourceToMultipleDestinations.
@Test
public void testSourceToMultipleDestinations() {
// read: file -> (offset, body)
// parse: body -> (id, name, address, zip)
// write1: (parse.id, parse.name) -> info
// write2: (parse.address, parse.zip) -> location
EndPoint source = EndPoint.of("ns", "file");
EndPoint info = EndPoint.of("ns", "info");
EndPoint location = EndPoint.of("ns", "location");
ReadOperation read = new ReadOperation("read", "Reading from file", source, "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parsing body", Collections.singletonList(InputField.of("read", "body")), "id", "name", "address", "zip");
WriteOperation infoWrite = new WriteOperation("infoWrite", "writing info", info, Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name")));
WriteOperation locationWrite = new WriteOperation("locationWrite", "writing location", location, Arrays.asList(InputField.of("parse", "address"), InputField.of("parse", "zip")));
List<Operation> operations = new ArrayList<>();
operations.add(read);
operations.add(parse);
operations.add(infoWrite);
operations.add(locationWrite);
FieldLineageInfo fllInfo = new FieldLineageInfo(operations);
Map<EndPoint, Set<String>> destinationFields = fllInfo.getDestinationFields();
Assert.assertEquals(2, destinationFields.size());
Assert.assertEquals(new HashSet<>(Arrays.asList("id", "name")), destinationFields.get(info));
Assert.assertEquals(new HashSet<>(Arrays.asList("address", "zip")), destinationFields.get(location));
Map<EndPointField, Set<EndPointField>> incomingSummary = fllInfo.getIncomingSummary();
Assert.assertEquals(4, incomingSummary.size());
EndPointField expected = new EndPointField(source, "body");
Assert.assertEquals(1, incomingSummary.get(new EndPointField(info, "id")).size());
Assert.assertEquals(expected, incomingSummary.get(new EndPointField(info, "id")).iterator().next());
Assert.assertEquals(1, incomingSummary.get(new EndPointField(info, "name")).size());
Assert.assertEquals(expected, incomingSummary.get(new EndPointField(info, "name")).iterator().next());
Assert.assertEquals(1, incomingSummary.get(new EndPointField(location, "address")).size());
Assert.assertEquals(expected, incomingSummary.get(new EndPointField(location, "address")).iterator().next());
Assert.assertEquals(1, incomingSummary.get(new EndPointField(location, "zip")).size());
Assert.assertEquals(expected, incomingSummary.get(new EndPointField(location, "zip")).iterator().next());
Map<EndPointField, Set<EndPointField>> outgoingSummary = fllInfo.getOutgoingSummary();
// Note that the outgoing summary contains just one entry, because the offset field from the source
// does not contribute to any destination field
Assert.assertEquals(1, outgoingSummary.size());
Set<EndPointField> expectedSet = new HashSet<>();
expectedSet.add(new EndPointField(info, "id"));
expectedSet.add(new EndPointField(info, "name"));
expectedSet.add(new EndPointField(location, "address"));
expectedSet.add(new EndPointField(location, "zip"));
Assert.assertEquals(4, outgoingSummary.get(new EndPointField(source, "body")).size());
Assert.assertEquals(expectedSet, outgoingSummary.get(new EndPointField(source, "body")));
// test outgoing operations: the offset field is read from the source but never processed by any downstream operation
EndPointField endPointField = new EndPointField(source, "offset");
Set<Operation> operationsForField = fllInfo.getOutgoingOperationsForField(endPointField);
Set<Operation> expectedOperations = new HashSet<>();
expectedOperations.add(read);
Assert.assertEquals(expectedOperations, operationsForField);
// body is used by the other operations, hence all of them must appear in its outgoing operations
endPointField = new EndPointField(source, "body");
operationsForField = fllInfo.getOutgoingOperationsForField(endPointField);
expectedOperations = new HashSet<>();
expectedOperations.add(read);
expectedOperations.add(parse);
expectedOperations.add(infoWrite);
expectedOperations.add(locationWrite);
Assert.assertEquals(expectedOperations, operationsForField);
}
use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.
the class FieldLineageInfoTest method testDisjointBranches.
@Test
public void testDisjointBranches() {
// read1 -----> write1
// read2 -----> write2
ReadOperation read1 = new ReadOperation("read1", "read descr", EndPoint.of("ns", "input1"), "offset", "body");
WriteOperation write1 = new WriteOperation("write1", "write descr", EndPoint.of("ns", "output"), InputField.of("read1", "offset"));
ReadOperation read2 = new ReadOperation("read2", "read descr", EndPoint.of("ns", "input2"), "offset", "body");
WriteOperation write2 = new WriteOperation("write2", "write descr", EndPoint.of("ns", "output"), InputField.of("read2", "offset"));
Set<Operation> operations = new LinkedHashSet<>();
operations.add(write1);
operations.add(write2);
operations.add(read2);
operations.add(read1);
List<Operation> topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
assertBefore(topologicallySortedOperations, read1, write1);
assertBefore(topologicallySortedOperations, read2, write2);
}
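The assertBefore helper is not shown in this excerpt; a minimal sketch, assuming it simply compares the positions of the two operations in the sorted list, could look like this:
// Hypothetical helper: asserts that both operations are present in the sorted list
// and that 'first' appears at a smaller index than 'second'.
private void assertBefore(List<Operation> operations, Operation first, Operation second) {
  int firstIndex = operations.indexOf(first);
  int secondIndex = operations.indexOf(second);
  Assert.assertTrue(firstIndex >= 0 && secondIndex >= 0);
  Assert.assertTrue(firstIndex < secondIndex);
}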