Search in sources :

Example 11 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class EndpointFieldDeserializer method deserialize.

/**
 * Builds an {@link EndPointField} from its JSON object form.
 *
 * <p>The {@code "endPoint"} member is handed back to the Gson context to be decoded as an
 * {@link EndPoint}, and the {@code "field"} member is read as a string. The resulting value is then
 * looked up in {@code endpointFields} (declared outside this method): if an equal key is already
 * present the stored instance is returned, otherwise the freshly built one is stored and returned.
 *
 * @param json the JSON element to decode; read as a JSON object
 * @param typeOfT the requested target type (not consulted here)
 * @param context the Gson context used to decode the nested end point
 * @return the {@link EndPointField} held in {@code endpointFields} for the decoded value
 * @throws JsonParseException declared by the deserializer contract
 */
@Override
public EndPointField deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException {
    JsonObject root = json.getAsJsonObject();
    EndPoint decodedEndPoint = context.deserialize(root.getAsJsonObject("endPoint"), EndPoint.class);
    String fieldName = root.getAsJsonPrimitive("field").getAsString();
    EndPointField candidate = new EndPointField(decodedEndPoint, fieldName);
    // The mapping function receives the very key passed in, so returning it stores the candidate itself.
    return endpointFields.computeIfAbsent(candidate, key -> key);
}
Also used : JsonObject(com.google.gson.JsonObject) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint)

Example 12 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageInfoTest method testSourceToMultipleDestinations.

/**
 * Checks the lineage summaries produced when the fields parsed from a single source
 * are written to two different destinations.
 */
@Test
public void testSourceToMultipleDestinations() {
    // Pipeline under test:
    //   read:   file -> (offset, body)
    //   parse:  body -> (id, name, address, zip)
    //   write1: (parse.id, parse.name) -> info
    //   write2: (parse.address, parse.zip) -> location
    EndPoint source = EndPoint.of("ns", "file");
    EndPoint info = EndPoint.of("ns", "info");
    EndPoint location = EndPoint.of("ns", "location");

    ReadOperation read = new ReadOperation("read", "Reading from file", source, "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parsing body",
        Collections.singletonList(InputField.of("read", "body")), "id", "name", "address", "zip");
    WriteOperation infoWrite = new WriteOperation("infoWrite", "writing info", info,
        Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name")));
    WriteOperation locationWrite = new WriteOperation("locationWrite", "writing location", location,
        Arrays.asList(InputField.of("parse", "address"), InputField.of("parse", "zip")));

    List<Operation> pipeline = new ArrayList<>(Arrays.<Operation>asList(read, parse, infoWrite, locationWrite));
    FieldLineageInfo lineage = new FieldLineageInfo(pipeline);

    // destination fields, grouped by the end point they are written to
    Map<EndPoint, Set<String>> fieldsByDestination = lineage.getDestinationFields();
    Assert.assertEquals(2, fieldsByDestination.size());
    Assert.assertEquals(new HashSet<>(Arrays.asList("id", "name")), fieldsByDestination.get(info));
    Assert.assertEquals(new HashSet<>(Arrays.asList("address", "zip")), fieldsByDestination.get(location));

    // incoming summary: every written field originates from the single source field "body"
    Map<EndPointField, Set<EndPointField>> incoming = lineage.getIncomingSummary();
    Assert.assertEquals(4, incoming.size());
    EndPointField sourceBody = new EndPointField(source, "body");
    List<EndPointField> writtenFields = Arrays.asList(
        new EndPointField(info, "id"),
        new EndPointField(info, "name"),
        new EndPointField(location, "address"),
        new EndPointField(location, "zip"));
    for (EndPointField written : writtenFields) {
        Set<EndPointField> origins = incoming.get(written);
        Assert.assertEquals(1, origins.size());
        Assert.assertEquals(sourceBody, origins.iterator().next());
    }

    // Note that outgoing summary just contains 1 entry, because offset field from source
    // is not contributing to any destination field
    Map<EndPointField, Set<EndPointField>> outgoing = lineage.getOutgoingSummary();
    Assert.assertEquals(1, outgoing.size());
    Set<EndPointField> reachedFromBody = new HashSet<>(writtenFields);
    Assert.assertEquals(4, outgoing.get(sourceBody).size());
    Assert.assertEquals(reachedFromBody, outgoing.get(sourceBody));

    // test outgoing operations: offset field is read by the source but never processed by any operation
    Set<Operation> offsetOperations = lineage.getOutgoingOperationsForField(new EndPointField(source, "offset"));
    Set<Operation> onlyRead = new HashSet<>();
    onlyRead.add(read);
    Assert.assertEquals(onlyRead, offsetOperations);

    // body is used by other operations hence they must be in outgoing operations
    Set<Operation> bodyOperations = lineage.getOutgoingOperationsForField(new EndPointField(source, "body"));
    Set<Operation> wholePipeline = new HashSet<>(pipeline);
    Assert.assertEquals(wholePipeline, bodyOperations);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 13 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiPathFieldLineage.

/**
 * Checks the lineage summaries when two sources are merged, partially parsed, and written
 * to a single destination, with the operations supplied out of execution order.
 */
@Test
public void testMultiPathFieldLineage() {
    // Pipeline under test:
    //   read1: file1 -> (offset, body)
    //   read2: file2 -> (offset, body)
    //   merge: (read1.offset, read1.body, read2.offset, read2.body) -> (offset, body)
    //   parse: (merge.body) -> (name,address)
    //   write: (parse.name, parse.address, merge.offset) -> file
    EndPoint read1EndPoint = EndPoint.of("ns1", "file1");
    EndPoint read2EndPoint = EndPoint.of("ns2", "file2");
    EndPoint fileEndPoint = EndPoint.of("ns3", "file");

    ReadOperation read1 = new ReadOperation("read1", "Reading from file1", read1EndPoint, "offset", "body");
    ReadOperation read2 = new ReadOperation("read2", "Reading from file2", read2EndPoint, "offset", "body");
    TransformOperation merge = new TransformOperation("merge", "merging fields",
        Arrays.asList(InputField.of("read1", "offset"), InputField.of("read2", "offset"),
                      InputField.of("read1", "body"), InputField.of("read2", "body")),
        "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parsing body",
        Collections.singletonList(InputField.of("merge", "body")), "name", "address");
    WriteOperation write = new WriteOperation("write", "writing to another file", fileEndPoint,
        Arrays.asList(InputField.of("merge", "offset"), InputField.of("parse", "name"),
                      InputField.of("parse", "address")));

    // the operations are handed over in this (non-execution) order
    List<Operation> shuffled = new ArrayList<>(Arrays.<Operation>asList(parse, merge, read1, read2, write));
    FieldLineageInfo lineage = new FieldLineageInfo(shuffled);

    Map<EndPoint, Set<String>> fieldsByDestination = lineage.getDestinationFields();
    Assert.assertEquals(1, fieldsByDestination.size());
    Assert.assertEquals(new HashSet<>(Arrays.asList("name", "address", "offset")),
                        fieldsByDestination.get(fileEndPoint));

    // incoming summary: each destination field is expected to map to all four source fields
    List<EndPointField> sourceFields = Arrays.asList(
        new EndPointField(read1EndPoint, "offset"),
        new EndPointField(read1EndPoint, "body"),
        new EndPointField(read2EndPoint, "offset"),
        new EndPointField(read2EndPoint, "body"));
    Set<EndPointField> allSources = new HashSet<>(sourceFields);
    Map<EndPointField, Set<EndPointField>> incoming = lineage.getIncomingSummary();
    Assert.assertEquals(3, incoming.size());
    for (String destinationField : Arrays.asList("name", "address", "offset")) {
        Assert.assertEquals(allSources, incoming.get(new EndPointField(fileEndPoint, destinationField)));
    }

    // outgoing summary: each source field is expected to map to all three destination fields
    Set<EndPointField> allDestinations = new HashSet<>(Arrays.asList(
        new EndPointField(fileEndPoint, "offset"),
        new EndPointField(fileEndPoint, "name"),
        new EndPointField(fileEndPoint, "address")));
    Map<EndPointField, Set<EndPointField>> outgoing = lineage.getOutgoingSummary();
    Assert.assertEquals(4, outgoing.size());
    for (EndPointField sourceField : sourceFields) {
        Assert.assertEquals(allDestinations, outgoing.get(sourceField));
    }

    // test outgoing operations of all source endpoints
    Set<Operation> expectedFromRead1 = new HashSet<>(Arrays.<Operation>asList(read1, merge, parse, write));
    for (String readField : Arrays.asList("offset", "body")) {
        Assert.assertEquals(expectedFromRead1,
                            lineage.getOutgoingOperationsForField(new EndPointField(read1EndPoint, readField)));
    }
    Set<Operation> expectedFromRead2 = new HashSet<>(Arrays.<Operation>asList(read2, merge, parse, write));
    for (String readField : Arrays.asList("offset", "body")) {
        Assert.assertEquals(expectedFromRead2,
                            lineage.getOutgoingOperationsForField(new EndPointField(read2EndPoint, readField)));
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 14 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageTableTest method testMergeSummaries.

/**
 * Stores lineage for two runs that write the same destination field from different sources,
 * then checks that the incoming summary over a range covering both runs contains both sources.
 */
@Test
public void testMergeSummaries() {
    RunId firstRunId = RunIds.generate(10000);
    final ProgramRunId programRun1 =
        new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1").run(firstRunId.getId());
    RunId secondRunId = RunIds.generate(11000);
    final ProgramRunId programRun2 =
        new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1").run(secondRunId.getId());

    // first run: endpoint1.body -> endpoint3
    List<Operation> ops = new ArrayList<>();
    ops.add(new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), "offset", "body"));
    ops.add(new WriteOperation("write", "some write", EndPoint.of("ns", "endpoint3"), InputField.of("read", "body")));
    final FieldLineageInfo info1 = new FieldLineageInfo(ops);

    // second run: the same list is extended with endpoint2.body -> endpoint3
    ops.add(new ReadOperation("anotherRead", "another read", EndPoint.of("ns1", "endpoint2"), "offset", "body"));
    ops.add(new WriteOperation("anotherWrite", "another write", EndPoint.of("ns", "endpoint3"),
                               InputField.of("anotherRead", "body")));
    final FieldLineageInfo info2 = new FieldLineageInfo(ops);

    TransactionRunners.run(transactionRunner, ctx -> {
        FieldLineageTable table = FieldLineageTable.create(ctx);
        table.addFieldLineageInfo(programRun1, info1);
        table.addFieldLineageInfo(programRun2, info2);
    });

    TransactionRunners.run(transactionRunner, ctx -> {
        FieldLineageTable table = FieldLineageTable.create(ctx);
        EndPoint firstSource = EndPoint.of("ns1", "endpoint1");
        EndPoint secondSource = EndPoint.of("ns1", "endpoint2");
        EndPoint sink = EndPoint.of("ns", "endpoint3");

        Set<EndPointField> bothSources = new HashSet<>();
        bothSources.add(new EndPointField(firstSource, "body"));
        bothSources.add(new EndPointField(secondSource, "body"));

        Set<EndPointField> merged = table.getIncomingSummary(new EndPointField(sink, "body"), 0, 11001);
        Assert.assertEquals(bothSources, merged);
    });
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ProgramId(io.cdap.cdap.proto.id.ProgramId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 15 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageTableTest method testSimpleOperations.

/**
 * Stores field lineage for three program runs (generated for times 10000, 11000 and 12000) and
 * checks the fields, incoming/outgoing summaries and incoming/outgoing operations returned by
 * {@code FieldLineageTable} for several time ranges. As the assertions below rely on, the end
 * time of a queried range is exclusive.
 */
@Test
public void testSimpleOperations() {
    RunId runId = RunIds.generate(10000);
    ProgramId program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
    final ProgramRunId programRun1 = program.run(runId.getId());
    // second run of the same program
    runId = RunIds.generate(11000);
    final ProgramRunId programRun2 = program.run(runId.getId());
    final FieldLineageInfo info1 = new FieldLineageInfo(generateOperations(false));
    final FieldLineageInfo info2 = new FieldLineageInfo(generateOperations(true));
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        fieldLineageTable.addFieldLineageInfo(programRun1, info1);
        fieldLineageTable.addFieldLineageInfo(programRun2, info2);
    });
    // third run belongs to a different workflow but records the same lineage info as the second run
    runId = RunIds.generate(12000);
    program = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow3");
    final ProgramRunId programRun3 = program.run(runId.getId());
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        fieldLineageTable.addFieldLineageInfo(programRun3, info2);
    });
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable fieldLineageTable = FieldLineageTable.create(context);
        EndPoint source = EndPoint.of("ns1", "endpoint1");
        EndPoint destination = EndPoint.of("myns", "another_file");
        // end time 10000 should return empty set since it's exclusive and run was added at time 10000
        Assert.assertEquals(Collections.emptySet(), fieldLineageTable.getFields(source, 0, 10000));
        Assert.assertEquals(Collections.emptySet(), fieldLineageTable.getFields(destination, 0, 10000));
        Set<String> expectedDestinationFields = new HashSet<>(Arrays.asList("offset", "name"));
        Set<String> expectedSourceFields = new HashSet<>(Arrays.asList("offset", "body"));
        // end time 10001 should return the data for the run which was added at time 10000
        Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 0, 10001));
        Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 0, 10001));
        // providing start time as 10000 and end time as 11000 should still return the same set of fields;
        // both the destination and the source are queried with that same range
        Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 10000, 11000));
        Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 10000, 11000));
        // setting end time to 11001 should include the information from programRun2 as well, which added
        // an additional field to the dataset.
        expectedDestinationFields.add("file_name");
        expectedSourceFields.add("file_name");
        Assert.assertEquals(expectedDestinationFields, fieldLineageTable.getFields(destination, 10000, 11001));
        Assert.assertEquals(expectedSourceFields, fieldLineageTable.getFields(source, 10000, 11001));
        // end time 10000 should return empty set since it's exclusive and run was added at time 10000
        Assert.assertEquals(Collections.emptySet(), fieldLineageTable.getIncomingSummary(new EndPointField(destination, "offset"), 0, 10000));
        EndPointField expectedEndPointField = new EndPointField(source, "offset");
        Set<EndPointField> actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "offset"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        expectedEndPointField = new EndPointField(source, "body");
        actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "name"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        // end time is 10001, file_name is not written yet
        actualEndPointFields = fieldLineageTable.getIncomingSummary(new EndPointField(destination, "file_name"), 0, 10001);
        Assert.assertEquals(Collections.emptySet(), actualEndPointFields);
        // end time 10000 should return empty set since it's exclusive and run was added at time 10000
        Assert.assertEquals(Collections.emptySet(), fieldLineageTable.getOutgoingSummary(new EndPointField(destination, "offset"), 0, 10000));
        expectedEndPointField = new EndPointField(destination, "offset");
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "offset"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        expectedEndPointField = new EndPointField(destination, "name");
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "body"), 0, 10001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        // no outgoing summary should exist for the field file_name at time 10001
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 10001);
        Assert.assertEquals(Collections.emptySet(), actualEndPointFields);
        // no outgoing summary should exist for the field file_name at end time 11000 since end time is exclusive
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 11000);
        Assert.assertEquals(Collections.emptySet(), actualEndPointFields);
        // outgoing summary should exist for file_name at 11001, since the corresponding run executed at 11000
        expectedEndPointField = new EndPointField(destination, "file_name");
        actualEndPointFields = fieldLineageTable.getOutgoingSummary(new EndPointField(source, "file_name"), 0, 11001);
        Assert.assertEquals(expectedEndPointField, actualEndPointFields.iterator().next());
        // operations for a range that only covers the first run
        Set<ProgramRunOperations> incomingOperations = fieldLineageTable.getIncomingOperations(destination, 0, 10001);
        Set<ProgramRunOperations> outgoingOperations = fieldLineageTable.getOutgoingOperations(source, 0, 10001);
        Assert.assertEquals(1, incomingOperations.size());
        Assert.assertEquals(incomingOperations, outgoingOperations);
        ProgramRunOperations programRunOperations = incomingOperations.iterator().next();
        Assert.assertEquals(Collections.singleton(programRun1), programRunOperations.getProgramRunIds());
        // test with bigger time range for incoming and outgoing operations
        incomingOperations = fieldLineageTable.getIncomingOperations(destination, 10000, 12001);
        outgoingOperations = fieldLineageTable.getOutgoingOperations(source, 10000, 12001);
        Assert.assertEquals(2, incomingOperations.size());
        Assert.assertEquals(incomingOperations, outgoingOperations);
        // the second and third runs recorded the same operations, so they are expected in one entry
        Set<ProgramRunOperations> expectedSet = new HashSet<>();
        expectedSet.add(new ProgramRunOperations(Collections.singleton(programRun1), info1.getOperations()));
        expectedSet.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(programRun2, programRun3)), info2.getOperations()));
        Assert.assertEquals(expectedSet, incomingOperations);
        Assert.assertEquals(expectedSet, outgoingOperations);
    });
}
Also used : ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ProgramId(io.cdap.cdap.proto.id.ProgramId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)33 HashSet (java.util.HashSet)28 Test (org.junit.Test)26 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)24 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)24 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)24 Operation (io.cdap.cdap.api.lineage.field.Operation)23 ArrayList (java.util.ArrayList)19 HashMap (java.util.HashMap)14 List (java.util.List)11 ImmutableList (com.google.common.collect.ImmutableList)10 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)10 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)10 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)10 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)10 Connection (io.cdap.cdap.etl.proto.Connection)10 ImmutableSet (com.google.common.collect.ImmutableSet)9 Set (java.util.Set)9 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)8 LinkedHashSet (java.util.LinkedHashSet)7