Example 11 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class MetadataSubscriberServiceTest method testSubscriber.

@Test
public void testSubscriber() throws InterruptedException, ExecutionException, TimeoutException {
    LineageStoreReader lineageReader = getInjector().getInstance(LineageStoreReader.class);
    ProgramRunId run1 = service1.run(RunIds.generate());
    // Try to read lineage, which should be empty since we haven't started the MetadataSubscriberService yet.
    Set<NamespacedEntityId> entities = lineageReader.getEntitiesForRun(run1);
    Assert.assertTrue(entities.isEmpty());
    // Write out some lineage information
    LineageWriter lineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    lineageWriter.addAccess(run1, dataset1, AccessType.READ);
    lineageWriter.addAccess(run1, dataset2, AccessType.WRITE);
    // Write the field level lineage
    FieldLineageWriter fieldLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    ProgramRunId spark1Run1 = spark1.run(RunIds.generate(100));
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    operations.add(parse);
    FieldLineageInfo info1 = new FieldLineageInfo(operations);
    fieldLineageWriter.write(spark1Run1, info1);
    ProgramRunId spark1Run2 = spark1.run(RunIds.generate(200));
    fieldLineageWriter.write(spark1Run2, info1);
    List<Operation> operations2 = new ArrayList<>();
    operations2.add(read);
    operations2.add(parse);
    TransformOperation normalize = new TransformOperation("normalize", "normalize address", Collections.singletonList(InputField.of("parse", "address")), "address");
    operations2.add(normalize);
    WriteOperation anotherWrite = new WriteOperation("anotherwrite", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address")));
    operations2.add(anotherWrite);
    FieldLineageInfo info2 = new FieldLineageInfo(operations2);
    ProgramRunId spark1Run3 = spark1.run(RunIds.generate(300));
    fieldLineageWriter.write(spark1Run3, info2);
    // Emit some usages
    UsageWriter usageWriter = getInjector().getInstance(MessagingUsageWriter.class);
    usageWriter.register(spark1, dataset1);
    usageWriter.registerAll(Collections.singleton(spark1), dataset3);
    // Verifies lineage has been written
    Set<NamespacedEntityId> expectedLineage = new HashSet<>(Arrays.asList(run1.getParent(), dataset1, dataset2));
    Tasks.waitFor(true, () -> expectedLineage.equals(lineageReader.getEntitiesForRun(run1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // There shouldn't be any lineage for the "spark1" program, as only usage has been emitted.
    Assert.assertTrue(lineageReader.getRelations(spark1, 0L, Long.MAX_VALUE, x -> true).isEmpty());
    FieldLineageReader fieldLineageReader = getInjector().getInstance(FieldLineageReader.class);
    Set<Operation> expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(anotherWrite);
    List<ProgramRunOperations> expected = new ArrayList<>();
    // Descending order of program execution
    expected.add(new ProgramRunOperations(Collections.singleton(spark1Run3), expectedOperations));
    expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(write);
    expected.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(spark1Run1, spark1Run2)), expectedOperations));
    EndPointField endPointField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
    Tasks.waitFor(expected, () -> fieldLineageReader.getIncomingOperations(endPointField, 1L, Long.MAX_VALUE - 1), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // Verifies usage has been written
    Set<EntityId> expectedUsage = new HashSet<>(Arrays.asList(dataset1, dataset3));
    UsageRegistry usageRegistry = getInjector().getInstance(UsageRegistry.class);
    Tasks.waitFor(true, () -> expectedUsage.equals(usageRegistry.getDatasets(spark1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) UsageWriter(io.cdap.cdap.data2.registry.UsageWriter) MessagingUsageWriter(io.cdap.cdap.data2.registry.MessagingUsageWriter) FieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) UsageRegistry(io.cdap.cdap.data2.registry.UsageRegistry) ArrayList(java.util.ArrayList) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) EntityId(io.cdap.cdap.proto.id.EntityId) MessagingLineageWriter(io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 12 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfo method getTopologicallySortedOperations.

/**
 * Sort the operations in topological order. In topological order, each operation in the list
 * is guaranteed to occur before any other operation that reads its outputs.
 *
 * For example, consider the following scenario:
 *
 *    read-----------------------write
 *       \                        /
 *       ----parse----normalize---
 *
 * Since the write operation depends on both read and normalize for its input, it appears
 * last in the order. normalize depends on parse, so it appears after parse. Similarly,
 * parse appears after read but before normalize in the returned list.
 *
 * @param operations set of operations to be sorted
 * @return the list containing topologically sorted operations
 */
public static List<Operation> getTopologicallySortedOperations(Set<Operation> operations) {
    Map<String, Operation> operationMap = new HashMap<>();
    Set<String> operationsWithNoIncomings = new HashSet<>();
    for (Operation operation : operations) {
        operationMap.put(operation.getName(), operation);
        if (OperationType.READ == operation.getType()) {
            operationsWithNoIncomings.add(operation.getName());
        }
        // a transform operation with no inputs has no incoming connections, so treat it like a read operation
        if (OperationType.TRANSFORM == operation.getType() && ((TransformOperation) operation).getInputs().isEmpty()) {
            operationsWithNoIncomings.add(operation.getName());
        }
    }
    // Map of operation name to the set of operation names that take the output of the given operation as
    // an input. This map essentially represents the adjacency list of the operation graph.
    // For example consider the following scenario:
    // 
    //    read-----------------------write
    //       \                        /
    //       ----parse----normalize---
    // 
    // The map would contain:
    // read -> [parse, write]
    // parse -> [normalize]
    // normalize -> [write]
    // write -> []
    Map<String, Set<String>> outgoingOperations = new HashMap<>();
    // Map of operation name to the set of operation names whose outputs the given operation takes as input.
    // For example consider the following scenario:
    // 
    //    read-----------------------write
    //       \                        /
    //       ----parse----normalize---
    // 
    // The map would contain:
    // read -> []
    // parse -> [read]
    // normalize -> [parse]
    // write -> [read, normalize]
    Map<String, Set<String>> incomingOperations = new HashMap<>();
    for (Operation operation : operations) {
        List<InputField> inputFields = new ArrayList<>();
        switch(operation.getType()) {
            case READ:
                // read has no incoming operation
                incomingOperations.put(operation.getName(), new HashSet<>());
                break;
            case TRANSFORM:
                TransformOperation transform = (TransformOperation) operation;
                inputFields.addAll(transform.getInputs());
                break;
            case WRITE:
                WriteOperation write = (WriteOperation) operation;
                inputFields.addAll(write.getInputs());
                // write has no outgoing operation
                outgoingOperations.put(operation.getName(), new HashSet<>());
                break;
        }
        for (InputField inputField : inputFields) {
            // Ignore input fields whose origin operation is not in the set being sorted (this can happen when only a subset of operations is sorted).
            if (!operationMap.containsKey(inputField.getOrigin())) {
                continue;
            }
            // Current operation is the outgoing operation for origin represented by the input field.
            Set<String> outgoings = outgoingOperations.computeIfAbsent(inputField.getOrigin(), k -> new HashSet<>());
            outgoings.add(operation.getName());
            // Origin represented by the input field is the incoming operation for the current operation.
            Set<String> incomings = incomingOperations.computeIfAbsent(operation.getName(), k -> new HashSet<>());
            incomings.add(inputField.getOrigin());
        }
    }
    List<Operation> orderedOperations = new ArrayList<>();
    while (!operationsWithNoIncomings.isEmpty()) {
        String current = operationsWithNoIncomings.iterator().next();
        operationsWithNoIncomings.remove(current);
        if (operationMap.get(current) != null) {
            orderedOperations.add(operationMap.get(current));
        }
        // An operation may have no outgoing operations at all, since some of its output fields
        // may not be consumed by any downstream operation.
        Iterator<String> outgoingsIter = outgoingOperations.getOrDefault(current, Collections.emptySet()).iterator();
        while (outgoingsIter.hasNext()) {
            String next = outgoingsIter.next();
            outgoingsIter.remove();
            incomingOperations.get(next).remove(current);
            if (incomingOperations.get(next).isEmpty()) {
                operationsWithNoIncomings.add(next);
            }
        }
    }
    // Cycle detection: remove the entries whose outgoing operations are now empty;
    // any remaining entries represent edges that were never cleared, i.e. a cycle.
    outgoingOperations.entrySet().removeIf(next -> next.getValue().isEmpty());
    if (!outgoingOperations.isEmpty()) {
        throw new IllegalArgumentException(String.format("Cycle detected in graph for operations %s", outgoingOperations));
    }
    return orderedOperations;
}
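
A minimal usage sketch of this sort, applied to the branching graph from the Javadoc above; the operation names and descriptions here are illustrative, not taken from the project's tests:

// Build the branching graph: read feeds both write and parse, parse feeds normalize,
// and write consumes outputs of read, parse, and normalize.
ReadOperation read = new ReadOperation("read", "read raw records", EndPoint.of("ns", "input"), "offset", "body");
TransformOperation parse = new TransformOperation("parse", "parse the body",
    Collections.singletonList(InputField.of("read", "body")), "name", "address");
TransformOperation normalize = new TransformOperation("normalize", "normalize the address",
    Collections.singletonList(InputField.of("parse", "address")), "address");
WriteOperation write = new WriteOperation("write", "write the output", EndPoint.of("ns", "output"),
    Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address")));
Set<Operation> operations = new LinkedHashSet<>();
operations.add(write);
operations.add(normalize);
operations.add(parse);
operations.add(read);
List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(operations);
// In the returned list, "read" precedes "parse" and "write", "parse" precedes "normalize",
// and "normalize" precedes "write", regardless of the insertion order above.
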
Also used : HashSet(java.util.HashSet) Set(java.util.Set) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation)

Example 13 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfo method computeOutgoing.

/**
 * Helper method to compute the outgoing connections.
 * @param currentOperation the current operation to be evaluated
 * @param visitedOperations a {@link Set} containing all the operations that have been processed so far.
 */
private void computeOutgoing(Operation currentOperation, Set<Operation> visitedOperations) {
    // mark this operation as visited; stop if it has already been processed
    if (!visitedOperations.add(currentOperation)) {
        return;
    }
    // base case: if the current operation is a write, we have reached the end of the lineage path
    if (currentOperation.getType() == OperationType.WRITE) {
        return;
    }
    // for transform operations, continue the traversal along the outgoing connections
    if (currentOperation.getType() == OperationType.TRANSFORM) {
        TransformOperation transform = (TransformOperation) currentOperation;
        Set<Operation> operations = operationOutgoingConnections.get(transform.getName());
        for (Operation operation : operations) {
            computeOutgoing(operation, visitedOperations);
        }
    }
}
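
For orientation, a simplified, self-contained sketch of the same traversal idea. This is an assumption about shape, not the project's code: the method name visitOutgoing and the explicit adjacency-map parameter are made up here, whereas the real method walks the private operationOutgoingConnections map of FieldLineageInfo.

// Hypothetical standalone sketch: starting from an operation, follow its outgoing
// connections until a write operation (or an already-visited operation) is reached.
private void visitOutgoing(Operation current, Map<String, Set<Operation>> outgoing, Set<Operation> visited) {
    if (!visited.add(current)) {
        // already processed; stop to avoid re-walking shared branches
        return;
    }
    if (current.getType() == OperationType.WRITE) {
        // a write operation has no outgoing connections, so this path ends here
        return;
    }
    for (Operation next : outgoing.getOrDefault(current.getName(), Collections.emptySet())) {
        visitOutgoing(next, outgoing, visited);
    }
}
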
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation)

Example 14 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testSourceToMultipleDestinations.

@Test
public void testSourceToMultipleDestinations() {
    // read: file -> (offset, body)
    // parse: body -> (id, name, address, zip)
    // write1: (parse.id, parse.name) -> info
    // write2: (parse.address, parse.zip) -> location
    EndPoint source = EndPoint.of("ns", "file");
    EndPoint info = EndPoint.of("ns", "info");
    EndPoint location = EndPoint.of("ns", "location");
    ReadOperation read = new ReadOperation("read", "Reading from file", source, "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parsing body", Collections.singletonList(InputField.of("read", "body")), "id", "name", "address", "zip");
    WriteOperation infoWrite = new WriteOperation("infoWrite", "writing info", info, Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name")));
    WriteOperation locationWrite = new WriteOperation("locationWrite", "writing location", location, Arrays.asList(InputField.of("parse", "address"), InputField.of("parse", "zip")));
    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(parse);
    operations.add(infoWrite);
    operations.add(locationWrite);
    FieldLineageInfo fllInfo = new FieldLineageInfo(operations);
    Map<EndPoint, Set<String>> destinationFields = fllInfo.getDestinationFields();
    Assert.assertEquals(2, destinationFields.size());
    Assert.assertEquals(new HashSet<>(Arrays.asList("id", "name")), destinationFields.get(info));
    Assert.assertEquals(new HashSet<>(Arrays.asList("address", "zip")), destinationFields.get(location));
    Map<EndPointField, Set<EndPointField>> incomingSummary = fllInfo.getIncomingSummary();
    Assert.assertEquals(4, incomingSummary.size());
    EndPointField expected = new EndPointField(source, "body");
    Assert.assertEquals(1, incomingSummary.get(new EndPointField(info, "id")).size());
    Assert.assertEquals(expected, incomingSummary.get(new EndPointField(info, "id")).iterator().next());
    Assert.assertEquals(1, incomingSummary.get(new EndPointField(info, "name")).size());
    Assert.assertEquals(expected, incomingSummary.get(new EndPointField(info, "name")).iterator().next());
    Assert.assertEquals(1, incomingSummary.get(new EndPointField(location, "address")).size());
    Assert.assertEquals(expected, incomingSummary.get(new EndPointField(location, "address")).iterator().next());
    Assert.assertEquals(1, incomingSummary.get(new EndPointField(location, "zip")).size());
    Assert.assertEquals(expected, incomingSummary.get(new EndPointField(location, "zip")).iterator().next());
    Map<EndPointField, Set<EndPointField>> outgoingSummary = fllInfo.getOutgoingSummary();
    // Note that the outgoing summary contains just one entry, because the offset field from the source
    // does not contribute to any destination field
    Assert.assertEquals(1, outgoingSummary.size());
    Set<EndPointField> expectedSet = new HashSet<>();
    expectedSet.add(new EndPointField(info, "id"));
    expectedSet.add(new EndPointField(info, "name"));
    expectedSet.add(new EndPointField(location, "address"));
    expectedSet.add(new EndPointField(location, "zip"));
    Assert.assertEquals(4, outgoingSummary.get(new EndPointField(source, "body")).size());
    Assert.assertEquals(expectedSet, outgoingSummary.get(new EndPointField(source, "body")));
    // Test outgoing operations: the offset field is read from the source but never processed by any other operation
    EndPointField endPointField = new EndPointField(source, "offset");
    Set<Operation> operationsForField = fllInfo.getOutgoingOperationsForField(endPointField);
    Set<Operation> expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    Assert.assertEquals(expectedOperations, operationsForField);
    // The body field is used by the other operations, so all of them must appear in its outgoing operations
    endPointField = new EndPointField(source, "body");
    operationsForField = fllInfo.getOutgoingOperationsForField(endPointField);
    expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(parse);
    expectedOperations.add(infoWrite);
    expectedOperations.add(locationWrite);
    Assert.assertEquals(expectedOperations, operationsForField);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Test(org.junit.Test)

Example 15 with TransformOperation

use of io.cdap.cdap.api.lineage.field.TransformOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testBranchTopologicalSort.

@Test
public void testBranchTopologicalSort() {
    //    read-----------------------write
    //       \                        /
    //       ----parse----normalize---
    ReadOperation read = new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse descr", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    TransformOperation normalize = new TransformOperation("normalize", "normalize descr", Collections.singletonList(InputField.of("parse", "address")), "address");
    List<InputField> writeInputs = new ArrayList<>();
    writeInputs.add(InputField.of("read", "offset"));
    writeInputs.add(InputField.of("parse", "name"));
    writeInputs.add(InputField.of("normalize", "address"));
    WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
    Set<Operation> operations = new LinkedHashSet<>();
    operations.add(read);
    operations.add(parse);
    operations.add(normalize);
    operations.add(write);
    List<Operation> topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
    assertBefore(topologicallySortedOperations, read, parse);
    assertBefore(topologicallySortedOperations, parse, normalize);
    assertBefore(topologicallySortedOperations, normalize, write);
    assertBefore(topologicallySortedOperations, read, write);
    // try with different insertion orders
    operations = new LinkedHashSet<>();
    operations.add(parse);
    operations.add(normalize);
    operations.add(write);
    operations.add(read);
    topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
    assertBefore(topologicallySortedOperations, read, parse);
    assertBefore(topologicallySortedOperations, parse, normalize);
    assertBefore(topologicallySortedOperations, normalize, write);
    assertBefore(topologicallySortedOperations, read, write);
    operations = new LinkedHashSet<>();
    operations.add(write);
    operations.add(normalize);
    operations.add(parse);
    operations.add(read);
    topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
    assertBefore(topologicallySortedOperations, read, parse);
    assertBefore(topologicallySortedOperations, parse, normalize);
    assertBefore(topologicallySortedOperations, normalize, write);
    assertBefore(topologicallySortedOperations, read, write);
    // When the field lineage is queried for the offset field, only the read and write operations
    // are returned, since parse and normalize do not affect the offset field in any way. In that
    // case the write operation still has an input with origin "normalize" even though the
    // normalize operation itself is absent from the set; the topological sort should not be
    // affected by such missing origins.
    operations = new LinkedHashSet<>();
    operations.add(write);
    operations.add(read);
    topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
    assertBefore(topologicallySortedOperations, read, write);
}
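
The assertBefore helper is not shown in this listing; a plausible minimal implementation (an assumption, not necessarily the actual CDAP test utility) simply checks the relative positions of the two operations in the sorted list:

// Hypothetical sketch of the helper used above: asserts that "first" occurs
// at a smaller index than "second" in the topologically sorted list.
private void assertBefore(List<Operation> operations, Operation first, Operation second) {
    int firstIndex = operations.indexOf(first);
    int secondIndex = operations.indexOf(second);
    Assert.assertTrue(firstIndex >= 0 && secondIndex >= 0 && firstIndex < secondIndex);
}
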
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) Test(org.junit.Test)

Aggregations

TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)42 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)39 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)38 Operation (io.cdap.cdap.api.lineage.field.Operation)36 ArrayList (java.util.ArrayList)29 HashSet (java.util.HashSet)29 Test (org.junit.Test)29 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)23 HashMap (java.util.HashMap)18 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)14 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)14 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)14 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)14 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)13 Connection (io.cdap.cdap.etl.proto.Connection)13 LinkedHashSet (java.util.LinkedHashSet)12 InputField (io.cdap.cdap.api.lineage.field.InputField)10 Set (java.util.Set)9 ImmutableSet (com.google.common.collect.ImmutableSet)7