Search in sources:

Example 1 with ProgramRunOperations

use of io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations in project cdap by caskdata.

From class FieldLineageAdminTest, method operations().

/**
 * Builds a fixed set of {@link ProgramRunOperations} fixtures: one entry covering two
 * runs of a Spark program with a simple read/write pair, and a second entry covering
 * four runs (Spark and MapReduce) whose write consumes a normalized offset instead.
 */
private Set<ProgramRunOperations> operations() {
    ProgramId sparkProgram = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
    ProgramId mapReduceProgram = new ProgramId("ns", "app", ProgramType.MAPREDUCE, "mrprogram");
    EndPoint source = EndPoint.of("ns", "file");
    EndPoint destination = EndPoint.of("ns", "anotherfile");
    ReadOperation read = new ReadOperation("read", "reading file", source, "offset", "body");
    // NOTE(review): the "parse" input fields reference an operation that is not part of
    // the set; this mirrors the original fixture data — presumably intentional.
    WriteOperation firstWrite = new WriteOperation("write", "writing file", destination, InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"), InputField.of("parse", "zip"));
    Set<ProgramRunOperations> result = new HashSet<>();
    // First entry: two Spark runs sharing the plain read/write pair.
    Set<ProgramRunId> firstRuns = new HashSet<>(Arrays.asList(sparkProgram.run(RunIds.generate(1000)), sparkProgram.run(RunIds.generate(2000))));
    result.add(new ProgramRunOperations(firstRuns, new HashSet<>(Arrays.asList(read, firstWrite))));
    // Second entry: the write now reads the offset through a "normalize" transform.
    TransformOperation normalize = new TransformOperation("normalize", "normalizing offset", Collections.singletonList(InputField.of("read", "offset")), "offset");
    WriteOperation secondWrite = new WriteOperation("write", "writing file", destination, InputField.of("normalize", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"), InputField.of("parse", "zip"));
    Set<ProgramRunId> secondRuns = new HashSet<>(Arrays.asList(sparkProgram.run(RunIds.generate(3000)), sparkProgram.run(RunIds.generate(5000)), mapReduceProgram.run(RunIds.generate(4000)), mapReduceProgram.run(RunIds.generate(6000))));
    result.add(new ProgramRunOperations(secondRuns, new HashSet<>(Arrays.asList(read, normalize, secondWrite))));
    return result;
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ProgramId(io.cdap.cdap.proto.id.ProgramId) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) HashSet(java.util.HashSet)

Example 2 with ProgramRunOperations

use of io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations in project cdap by caskdata.

From class MetadataSubscriberServiceTest, method testSubscriber().

/**
 * Verifies that lineage, field lineage and usage records published through the
 * messaging system become visible through the corresponding reader interfaces
 * ({@link LineageStoreReader}, {@link FieldLineageReader}, {@link UsageRegistry}).
 * Uses {@code Tasks.waitFor} because the subscriber consumes messages asynchronously.
 */
@Test
public void testSubscriber() throws InterruptedException, ExecutionException, TimeoutException {
    LineageStoreReader lineageReader = getInjector().getInstance(LineageStoreReader.class);
    ProgramRunId run1 = service1.run(RunIds.generate());
    // Try to read lineage, which should be empty since we haven't started the MetadataSubscriberService yet.
    Set<NamespacedEntityId> entities = lineageReader.getEntitiesForRun(run1);
    Assert.assertTrue(entities.isEmpty());
    // Write out some lineage information
    LineageWriter lineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    lineageWriter.addAccess(run1, dataset1, AccessType.READ);
    lineageWriter.addAccess(run1, dataset2, AccessType.WRITE);
    // Write the field level lineage
    FieldLineageWriter fieldLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    ProgramRunId spark1Run1 = spark1.run(RunIds.generate(100));
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    operations.add(parse);
    FieldLineageInfo info1 = new FieldLineageInfo(operations);
    fieldLineageWriter.write(spark1Run1, info1);
    // A second run with the same FieldLineageInfo — both runs should later be grouped together.
    ProgramRunId spark1Run2 = spark1.run(RunIds.generate(200));
    fieldLineageWriter.write(spark1Run2, info1);
    // A third run with different lineage: an extra "normalize" transform feeding a different write.
    List<Operation> operations2 = new ArrayList<>();
    operations2.add(read);
    operations2.add(parse);
    TransformOperation normalize = new TransformOperation("normalize", "normalize address", Collections.singletonList(InputField.of("parse", "address")), "address");
    operations2.add(normalize);
    WriteOperation anotherWrite = new WriteOperation("anotherwrite", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address")));
    operations2.add(anotherWrite);
    FieldLineageInfo info2 = new FieldLineageInfo(operations2);
    ProgramRunId spark1Run3 = spark1.run(RunIds.generate(300));
    fieldLineageWriter.write(spark1Run3, info2);
    // Emit some usages
    UsageWriter usageWriter = getInjector().getInstance(MessagingUsageWriter.class);
    usageWriter.register(spark1, dataset1);
    usageWriter.registerAll(Collections.singleton(spark1), dataset3);
    // Verifies lineage has been written
    Set<NamespacedEntityId> expectedLineage = new HashSet<>(Arrays.asList(run1.getParent(), dataset1, dataset2));
    Tasks.waitFor(true, () -> expectedLineage.equals(lineageReader.getEntitiesForRun(run1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // There shouldn't be any lineage for the "spark1" program, as only usage has been emitted.
    Assert.assertTrue(lineageReader.getRelations(spark1, 0L, Long.MAX_VALUE, x -> true).isEmpty());
    FieldLineageReader fieldLineageReader = getInjector().getInstance(FieldLineageReader.class);
    // Expected field-level operations for the "offset" field of endpoint2, per run group.
    Set<Operation> expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(anotherWrite);
    List<ProgramRunOperations> expected = new ArrayList<>();
    // Descending order of program execution
    expected.add(new ProgramRunOperations(Collections.singleton(spark1Run3), expectedOperations));
    expectedOperations = new HashSet<>();
    expectedOperations.add(read);
    expectedOperations.add(write);
    expected.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(spark1Run1, spark1Run2)), expectedOperations));
    EndPointField endPointField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
    Tasks.waitFor(expected, () -> fieldLineageReader.getIncomingOperations(endPointField, 1L, Long.MAX_VALUE - 1), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // Verifies usage has been written
    Set<EntityId> expectedUsage = new HashSet<>(Arrays.asList(dataset1, dataset3));
    UsageRegistry usageRegistry = getInjector().getInstance(UsageRegistry.class);
    Tasks.waitFor(true, () -> expectedUsage.equals(usageRegistry.getDatasets(spark1)), 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) UsageWriter(io.cdap.cdap.data2.registry.UsageWriter) MessagingUsageWriter(io.cdap.cdap.data2.registry.MessagingUsageWriter) FieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) UsageRegistry(io.cdap.cdap.data2.registry.UsageRegistry) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) EntityId(io.cdap.cdap.proto.id.EntityId) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MessagingLineageWriter(io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 3 with ProgramRunOperations

use of io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations in project cdap by caskdata.

From class FieldLineageTable, method getOperations().

/**
 * Reads the stored operations for every checksum that has program runs within the given
 * time range and pairs them back up into {@link ProgramRunOperations}.
 *
 * @param direction lineage direction selector passed through to the range lookup
 * @param endPoint the endpoint whose operations are being looked up
 * @param start start of the time range
 * @param end end of the time range
 * @return operations grouped by the program runs that recorded them, in checksum
 *         iteration order (a {@link LinkedHashSet} keeps that order stable)
 * @throws IOException if reading from the underlying table fails
 */
private Set<ProgramRunOperations> getOperations(String direction, EndPoint endPoint, long start, long end) throws IOException {
    Map<Long, Set<ProgramRunId>> checksumsWithProgramRunsInRange = getChecksumsWithProgramRunsInRange(direction, endPoint, start, end);
    Set<ProgramRunOperations> result = new LinkedHashSet<>();
    for (Map.Entry<Long, Set<ProgramRunId>> entry : checksumsWithProgramRunsInRange.entrySet()) {
        long checksum = entry.getKey();
        List<Field<?>> keys = getOperationsKey(checksum);
        Optional<StructuredRow> row = getOperationsTable().read(keys);
        // No stored operations for this checksum; skip it.
        if (!row.isPresent()) {
            continue;
        }
        String value = row.get().getString(StoreDefinition.FieldLineageStore.OPERATIONS_FIELD);
        Set<Operation> operations;
        try {
            operations = GSON.fromJson(value, SET_OPERATION_TYPE);
        } catch (JsonSyntaxException e) {
            // A corrupted entry is skipped rather than failing the whole lookup.
            // Use parameterized logging (no eager String.format) and pass the exception
            // so the cause is recorded; also fixes the stray quote in the old message.
            LOG.warn("Failed to parse json from checksum {}. Ignoring operations.", checksum, e);
            continue;
        }
        // GSON.fromJson returns null for a JSON literal "null"; guard against it.
        if (operations != null) {
            result.add(new ProgramRunOperations(entry.getValue(), operations));
        }
    }
    return result;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) StructuredRow(io.cdap.cdap.spi.data.StructuredRow) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Field(io.cdap.cdap.spi.data.table.field.Field) JsonSyntaxException(com.google.gson.JsonSyntaxException) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 4 with ProgramRunOperations

use of io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations in project cdap by caskdata.

From class DefaultFieldLineageReader, method computeFieldOperations().

/**
 * Computes the operations relevant to a single field of an endpoint, in either the
 * incoming or outgoing direction, by filtering each run group's full operation set
 * down to the operations that touch that field.
 *
 * @param incoming {@code true} for incoming (towards the field) operations,
 *                 {@code false} for outgoing
 * @param endPointField the endpoint field to compute operations for
 * @param start start of the time range
 * @param end end of the time range
 * @return field-filtered operations per program run group; run groups whose operation
 *         set fails {@link FieldLineageInfo} validation are skipped (best effort)
 */
private List<ProgramRunOperations> computeFieldOperations(boolean incoming, EndPointField endPointField, long start, long end) {
    Set<ProgramRunOperations> endPointOperations = TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        return incoming ? fieldLineageTable.getIncomingOperations(endPointField.getEndPoint(), start, end) : fieldLineageTable.getOutgoingOperations(endPointField.getEndPoint(), start, end);
    });
    List<ProgramRunOperations> endPointFieldOperations = new ArrayList<>();
    for (ProgramRunOperations programRunOperation : endPointOperations) {
        try {
            // No need to compute summaries here.
            FieldLineageInfo info = new FieldLineageInfo(programRunOperation.getOperations(), false);
            Set<Operation> fieldOperations = incoming ? info.getIncomingOperationsForField(endPointField) : info.getOutgoingOperationsForField(endPointField);
            ProgramRunOperations result = new ProgramRunOperations(programRunOperation.getProgramRunIds(), fieldOperations);
            endPointFieldOperations.add(result);
        } catch (Exception e) {
            // Catch Exception, not Throwable: JVM Errors (OutOfMemoryError, etc.) must
            // propagate instead of being silently swallowed. Invalid operation sets are
            // skipped on a best-effort basis.
            // TODO: possibly relax validation logic when info object created from here
        }
    }
    return endPointFieldOperations;
}
Also used : ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) ArrayList(java.util.ArrayList) Operation(io.cdap.cdap.api.lineage.field.Operation)

Example 5 with ProgramRunOperations

use of io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations in project cdap by caskdata.

From class FieldLineageTableTest, method testSimpleOperations().

/**
 * Tests writing field lineage for multiple program runs and reading it back: the field
 * sets per endpoint, incoming/outgoing summaries, and incoming/outgoing operations.
 * The assertions depend on the exact run timestamps (10000, 11000, 12000) and on the
 * end time of every range being exclusive.
 */
@Test
public void testSimpleOperations() {
    RunId runId = RunIds.generate(10000);
    ProgramId program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
    final ProgramRunId programRun1 = program.run(runId.getId());
    // Second run of the same workflow, 1 second later, with a different operation set.
    runId = RunIds.generate(11000);
    program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
    final ProgramRunId programRun2 = program.run(runId.getId());
    final FieldLineageInfo info1 = new FieldLineageInfo(generateOperations(false));
    final FieldLineageInfo info2 = new FieldLineageInfo(generateOperations(true));
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        fieldLineageTable.addFieldLineageInfo(programRun1, info1);
        fieldLineageTable.addFieldLineageInfo(programRun2, info2);
    });
    // A run of a different workflow reusing info2 — should be grouped with programRun2.
    runId = RunIds.generate(12000);
    program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow3");
    final ProgramRunId programRun3 = program.run(runId.getId());
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        fieldLineageTable.addFieldLineageInfo(programRun3, info2);
    });
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        EndPoint source = EndPoint.of("ns1", "endpoint1");
        EndPoint destination = EndPoint.of("myns", "another_file");
        // end time 10000 should return empty set since its exclusive and run was added at time 10000
        Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getFields(source, 0, 10000));
        Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getFields(destination, 0, 10000));
        Set<String> expectedDestinationFields = new HashSet<>(Arrays.asList("offset", "name"));
        Set<String> expectedSourceFields = new HashSet<>(Arrays.asList("offset", "body"));
        // end time 10001 should return the data for the run which was added at time 10000
        Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 0, 10001));
        Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 0, 10001));
        // providing start time as 10000 and endtime as 11000 should still return the same set of fields
        Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 10000, 11000));
        Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 10000, 10001));
        // setting endtime to 11001 should include the information for from programRun2 as well, which added additional
        // field to the dataset.
        expectedDestinationFields.add("file_name");
        expectedSourceFields.add("file_name");
        Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 10000, 11001));
        Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 10000, 11001));
        // end time 10000 should return empty set since its exclusive and run was added at time 10000
        Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getIncomingSummary(new EndPointField(destination, "offset"), 0, 10000));
        EndPointField expectedEndPointField = new EndPointField(source, "offset");
        Set<EndPointField> actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "offset"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        expectedEndPointField = new EndPointField(source, "body");
        actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "name"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        // end time is 10001, file_name is not written yet
        actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "file_name"), 0, 10001);
        Assert.assertEquals(Collections.EMPTY_SET, actualEndPointFields);
        // end time 10000 should return empty set since its exclusive and run was added at time 10000
        Assert.assertEquals(Collections.EMPTY_SET, fieldLineageTable.getOutgoingSummary(new EndPointField(destination, "offset"), 0, 10000));
        expectedEndPointField = new EndPointField(destination, "offset");
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "offset"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        expectedEndPointField = new EndPointField(destination, "name");
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "body"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        // no outgoing summary should exist for the field file_name at time 10001
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 10001);
        Assert.assertEquals(Collections.EMPTY_SET, actualEndPointFields);
        // no outgoing summary should exist for the field file_name at end time time 11000 since end time is exclusive
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 11000);
        Assert.assertEquals(Collections.EMPTY_SET, actualEndPointFields);
        // outgoing summary should exist for file_name at 11001, since the corresponding run executed at 11000
        expectedEndPointField = new EndPointField(destination, "file_name");
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 11001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        // A narrow range should only include programRun1, and incoming/outgoing views must agree.
        Set<ProgramRunOperations> incomingOperations = fieldLineageTable.getIncomingOperations(destination, 0, 10001);
        Set<ProgramRunOperations> outgoingOperations = fieldLineageTable.getOutgoingOperations(source, 0, 10001);
        Assert.assertEquals(1, incomingOperations.size());
        Assert.assertEquals(incomingOperations, outgoingOperations);
        ProgramRunOperations programRunOperations = incomingOperations.iterator().next();
        Assert.assertEquals(Collections.singleton(programRun1), programRunOperations.getProgramRunIds());
        // test with bigger time range for incoming and outgoing operations
        incomingOperations = fieldLineageTable.getIncomingOperations(destination, 10000, 12001);
        outgoingOperations = fieldLineageTable.getOutgoingOperations(source, 10000, 12001);
        Assert.assertEquals(2, incomingOperations.size());
        Assert.assertEquals(incomingOperations, outgoingOperations);
        // programRun2 and programRun3 shared info2, so they are grouped into one entry.
        Set<ProgramRunOperations> expectedSet = new HashSet<>();
        expectedSet.add(new ProgramRunOperations(Collections.singleton(programRun1), info1.getOperations()));
        expectedSet.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(programRun2, programRun3)), info2.getOperations()));
        Assert.assertEquals(expectedSet, incomingOperations);
        Assert.assertEquals(expectedSet, outgoingOperations);
    });
}
Also used : ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ProgramId(io.cdap.cdap.proto.id.ProgramId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

ProgramRunOperations (io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations)7 Operation (io.cdap.cdap.api.lineage.field.Operation)5 HashSet (java.util.HashSet)5 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)4 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)4 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)4 ArrayList (java.util.ArrayList)4 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)3 Test (org.junit.Test)3 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)2 LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader)2 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)2 FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)2 FieldLineageReader (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader)2 FieldLineageWriter (io.cdap.cdap.data2.metadata.writer.FieldLineageWriter)2 LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter)2 MessagingLineageWriter (io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter)2 NamespacedEntityId (io.cdap.cdap.proto.id.NamespacedEntityId)2 ProgramId (io.cdap.cdap.proto.id.ProgramId)2 JsonSyntaxException (com.google.gson.JsonSyntaxException)1