Search in sources :

Example 16 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiPathFieldLineage.

/**
 * Checks destination fields, incoming/outgoing summaries and per-field outgoing operations
 * for a lineage in which two reads are merged, partially parsed, and written to one endpoint.
 */
@Test
public void testMultiPathFieldLineage() {
    // Pipeline under test:
    //   read1: file1 -> (offset, body)
    //   read2: file2 -> (offset, body)
    //   merge: (read1.offset, read1.body, read2.offset, read2.body) -> (offset, body)
    //   parse: (merge.body) -> (name, address)
    //   write: (parse.name, parse.address, merge.offset) -> file
    EndPoint firstSource = EndPoint.of("ns1", "file1");
    EndPoint secondSource = EndPoint.of("ns2", "file2");
    EndPoint sink = EndPoint.of("ns3", "file");

    ReadOperation read1 = new ReadOperation("read1", "Reading from file1", firstSource, "offset", "body");
    ReadOperation read2 = new ReadOperation("read2", "Reading from file2", secondSource, "offset", "body");

    List<InputField> mergeInputs = Arrays.asList(
        InputField.of("read1", "offset"),
        InputField.of("read2", "offset"),
        InputField.of("read1", "body"),
        InputField.of("read2", "body"));
    TransformOperation merge = new TransformOperation("merge", "merging fields", mergeInputs, "offset", "body");

    List<InputField> parseInputs = Collections.singletonList(InputField.of("merge", "body"));
    TransformOperation parse = new TransformOperation("parse", "parsing body", parseInputs, "name", "address");

    List<InputField> writeInputs = Arrays.asList(
        InputField.of("merge", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "address"));
    WriteOperation write = new WriteOperation("write", "writing to another file", sink, writeInputs);

    // The operations are deliberately supplied out of pipeline order.
    List<Operation> operations = new ArrayList<>(Arrays.<Operation>asList(parse, merge, read1, read2, write));
    FieldLineageInfo fllInfo = new FieldLineageInfo(operations);

    // Only the single write endpoint shows up as a destination, carrying all three written fields.
    Map<EndPoint, Set<String>> destinationFields = fllInfo.getDestinationFields();
    Assert.assertEquals(1, destinationFields.size());
    Assert.assertEquals(new HashSet<>(Arrays.asList("name", "address", "offset")), destinationFields.get(sink));

    // Every destination field is expected to trace back to all four source fields.
    Map<EndPointField, Set<EndPointField>> incomingSummary = fllInfo.getIncomingSummary();
    Assert.assertEquals(3, incomingSummary.size());
    Set<EndPointField> allSourceFields = new HashSet<>(Arrays.asList(
        new EndPointField(firstSource, "body"),
        new EndPointField(firstSource, "offset"),
        new EndPointField(secondSource, "body"),
        new EndPointField(secondSource, "offset")));
    for (String destinationField : Arrays.asList("name", "address", "offset")) {
        Assert.assertEquals(allSourceFields, incomingSummary.get(new EndPointField(sink, destinationField)));
    }

    // Conversely, every source field is expected to reach all three destination fields.
    Map<EndPointField, Set<EndPointField>> outgoingSummary = fllInfo.getOutgoingSummary();
    Assert.assertEquals(4, outgoingSummary.size());
    Set<EndPointField> allDestinationFields = new HashSet<>(Arrays.asList(
        new EndPointField(sink, "offset"),
        new EndPointField(sink, "name"),
        new EndPointField(sink, "address")));
    for (EndPoint source : Arrays.asList(firstSource, secondSource)) {
        for (String sourceField : Arrays.asList("offset", "body")) {
            Assert.assertEquals(allDestinationFields, outgoingSummary.get(new EndPointField(source, sourceField)));
        }
    }

    // Outgoing operations of all source endpoints: fields of file1 go through read1 ...
    Set<Operation> operationsFromFirstSource = new HashSet<>(Arrays.<Operation>asList(read1, merge, parse, write));
    for (String sourceField : Arrays.asList("offset", "body")) {
        Assert.assertEquals(operationsFromFirstSource,
                            fllInfo.getOutgoingOperationsForField(new EndPointField(firstSource, sourceField)));
    }

    // ... while fields of file2 go through read2; the downstream operations are shared.
    Set<Operation> operationsFromSecondSource = new HashSet<>(Arrays.<Operation>asList(read2, merge, parse, write));
    for (String sourceField : Arrays.asList("offset", "body")) {
        Assert.assertEquals(operationsFromSecondSource,
                            fllInfo.getOutgoingOperationsForField(new EndPointField(secondSource, sourceField)));
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 17 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testDirectReadWrite.

/**
 * Checks the incoming and outgoing summaries when a write consumes fields straight from
 * two reads, with no transform in between.
 */
@Test
public void testDirectReadWrite() {
    EndPoint firstSource = EndPoint.of("ns1", "endpoint1");
    EndPoint secondSource = EndPoint.of("ns1", "endpoint2");
    EndPoint sink = EndPoint.of("ns1", "endpoint3");

    ReadOperation read = new ReadOperation("read", "", firstSource, "id", "name");
    ReadOperation anotherRead = new ReadOperation("anotherRead", "", secondSource, "id1", "name1");
    WriteOperation write = new WriteOperation("write", "", sink,
                                              InputField.of("read", "id"),
                                              InputField.of("read", "name"),
                                              InputField.of("anotherRead", "id1"),
                                              InputField.of("anotherRead", "name1"));

    // The write is registered before the second read it depends on.
    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    operations.add(anotherRead);
    FieldLineageInfo info = new FieldLineageInfo(operations);

    // Source-side fields.
    EndPointField sourceId = new EndPointField(firstSource, "id");
    EndPointField sourceName = new EndPointField(firstSource, "name");
    EndPointField sourceId1 = new EndPointField(secondSource, "id1");
    EndPointField sourceName1 = new EndPointField(secondSource, "name1");

    // Destination-side fields; each keeps the name of the source field it was written from.
    EndPointField sinkId = new EndPointField(sink, "id");
    EndPointField sinkName = new EndPointField(sink, "name");
    EndPointField sinkId1 = new EndPointField(sink, "id1");
    EndPointField sinkName1 = new EndPointField(sink, "name1");

    // Incoming: every destination field maps to exactly one source field.
    Map<EndPointField, Set<EndPointField>> expectedIncoming = ImmutableMap.of(
        sinkId, Collections.singleton(sourceId),
        sinkId1, Collections.singleton(sourceId1),
        sinkName, Collections.singleton(sourceName),
        sinkName1, Collections.singleton(sourceName1));
    Map<EndPointField, Set<EndPointField>> incoming = info.getIncomingSummary();
    Assert.assertEquals(expectedIncoming, incoming);

    // Outgoing: the same one-to-one relation seen from the source side.
    Map<EndPointField, Set<EndPointField>> expectedOutgoing = ImmutableMap.of(
        sourceId, Collections.singleton(sinkId),
        sourceId1, Collections.singleton(sinkId1),
        sourceName, Collections.singleton(sinkName),
        sourceName1, Collections.singleton(sinkName1));
    Map<EndPointField, Set<EndPointField>> outgoing = info.getOutgoingSummary();
    Assert.assertEquals(expectedOutgoing, outgoing);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Test(org.junit.Test)

Example 18 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testValidOperations.

/**
 * Checks that FieldLineageInfo instances compare equal after a JSON round trip of their
 * operations and after reordering the operations, and unequal once the write endpoint's
 * namespace differs.
 */
@Test
public void testValidOperations() {
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");

    List<InputField> parseInputs = Collections.singletonList(InputField.of("read", "body"));
    TransformOperation parse = new TransformOperation("parse", "parse body", parseInputs, "name", "address");

    List<InputField> writeInputs = Arrays.asList(
        InputField.of("read", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "body"));
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), writeInputs);

    List<Operation> operations = new ArrayList<>();
    operations.addAll(Arrays.<Operation>asList(read, write, parse));
    FieldLineageInfo baseline = new FieldLineageInfo(operations);

    // Serializing and deserializing the operations should yield an equal info
    // (i.e. the same checksum, per the intent of this test).
    Type operationSetType = new TypeToken<Set<Operation>>() {
    }.getType();
    String serialized = GSON.toJson(baseline.getOperations());
    Set<Operation> deserialized = GSON.fromJson(serialized, operationSetType);
    FieldLineageInfo roundTripped = new FieldLineageInfo(deserialized);
    Assert.assertEquals(baseline, roundTripped);

    // The same operations supplied in a different order must still compare equal.
    operations.clear();
    operations.addAll(Arrays.<Operation>asList(write, parse, read));
    FieldLineageInfo reordered = new FieldLineageInfo(operations);
    Assert.assertEquals(baseline, reordered);

    // Moving the write endpoint from namespace "ns" to "myns" must break equality.
    List<InputField> anotherWriteInputs = Arrays.asList(
        InputField.of("read", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "body"));
    WriteOperation anotherWrite = new WriteOperation("write", "write data", EndPoint.of("myns", "endpoint2"),
                                                     anotherWriteInputs);
    operations.clear();
    operations.addAll(Arrays.<Operation>asList(anotherWrite, parse, read));
    FieldLineageInfo differentNamespace = new FieldLineageInfo(operations);
    Assert.assertNotEquals(baseline, differentNamespace);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Type(java.lang.reflect.Type) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 19 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageTableTest method testMergeSummaries.

/**
 * Stores field lineage for two runs of the same workflow and checks that the incoming
 * summary for the shared destination field combines the source fields of both runs.
 */
@Test
public void testMergeSummaries() {
    // Two runs of the same workflow, generated from the values 10000 and 11000.
    RunId firstRunId = RunIds.generate(10000);
    final ProgramRunId programRun1 =
        new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1").run(firstRunId.getId());
    RunId secondRunId = RunIds.generate(11000);
    final ProgramRunId programRun2 =
        new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1").run(secondRunId.getId());

    // First lineage: endpoint1.body -> endpoint3.body.
    List<Operation> operations = new ArrayList<>();
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), "offset", "body");
    WriteOperation write = new WriteOperation("write", "some write", EndPoint.of("ns", "endpoint3"),
                                              InputField.of("read", "body"));
    operations.add(read);
    operations.add(write);
    final FieldLineageInfo info1 = new FieldLineageInfo(operations);

    // Second lineage: built from the same list after adding endpoint2.body -> endpoint3.body.
    ReadOperation anotherRead = new ReadOperation("anotherRead", "another read", EndPoint.of("ns1", "endpoint2"),
                                                  "offset", "body");
    WriteOperation anotherWrite = new WriteOperation("anotherWrite", "another write", EndPoint.of("ns", "endpoint3"),
                                                     InputField.of("anotherRead", "body"));
    operations.add(anotherRead);
    operations.add(anotherWrite);
    final FieldLineageInfo info2 = new FieldLineageInfo(operations);

    // Persist both lineage infos in one transaction.
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable table = FieldLineageTable.create(context);
        table.addFieldLineageInfo(programRun1, info1);
        table.addFieldLineageInfo(programRun2, info2);
    });

    // Read back in a separate transaction, querying with the bounds 0 and 11001.
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable table = FieldLineageTable.create(context);
        EndPoint firstSource = EndPoint.of("ns1", "endpoint1");
        EndPoint secondSource = EndPoint.of("ns1", "endpoint2");
        EndPointField destinationBody = new EndPointField(EndPoint.of("ns", "endpoint3"), "body");

        Set<EndPointField> expected = new HashSet<>();
        expected.add(new EndPointField(firstSource, "body"));
        expected.add(new EndPointField(secondSource, "body"));

        Set<EndPointField> actualEndPointFields = table.getIncomingSummary(destinationBody, 0, 11001);
        Assert.assertEquals(expected, actualEndPointFields);
    });
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ProgramId(io.cdap.cdap.proto.id.ProgramId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 20 with ReadOperation

use of io.cdap.cdap.api.lineage.field.ReadOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testInvalidOperations.

/**
 * Checks that constructing a FieldLineageInfo throws IllegalArgumentException with the
 * expected message when the operations lack a read, lack a write, repeat an operation name,
 * or reference origins for which no operation was supplied.
 */
@Test
public void testInvalidOperations() {
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body")));
    List<Operation> operations = new ArrayList<>();

    // Case 1: no read operation.
    operations.add(parse);
    operations.add(write);
    try {
        // Only the constructor's validation matters, so the instance is not kept.
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since no read operation is specified.");
    } catch (IllegalArgumentException e) {
        String msg = "Field level lineage requires at least one operation of type 'READ'.";
        Assert.assertEquals(msg, e.getMessage());
    }

    // Case 2: no write operation.
    operations.clear();
    operations.add(read);
    operations.add(parse);
    try {
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since no write operation is specified.");
    } catch (IllegalArgumentException e) {
        String msg = "Field level lineage requires at least one operation of type 'WRITE'.";
        Assert.assertEquals(msg, e.getMessage());
    }

    // Case 3: two write operations sharing the name 'write' (the list still holds read and parse).
    WriteOperation duplicateWrite = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint3"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body")));
    operations.add(write);
    operations.add(duplicateWrite);
    try {
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since operation name 'write' is repeated.");
    } catch (IllegalArgumentException e) {
        // Only a substring is checked here; the rest of the message is not pinned by this test.
        String msg = "Operation name 'write' is repeated";
        Assert.assertTrue(e.getMessage().contains(msg));
    }

    // Case 4: a transform whose inputs name origins ('invalid', 'anotherinvalid') that no operation provides.
    operations.clear();
    TransformOperation invalidOrigin = new TransformOperation("anotherparse", "parse body", Arrays.asList(InputField.of("invalid", "body"), InputField.of("anotherinvalid", "body")), "name", "address");
    operations.add(read);
    operations.add(parse);
    operations.add(write);
    operations.add(invalidOrigin);
    try {
        // Create info with invalid origins
        new FieldLineageInfo(operations);
        Assert.fail("Field lineage info creation should fail since operation with name 'invalid' " + "and 'anotherinvalid' do not exist.");
    } catch (IllegalArgumentException e) {
        String msg = "No operation is associated with the origins '[invalid, anotherinvalid]'.";
        Assert.assertEquals(msg, e.getMessage());
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Aggregations

ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)42 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)42 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)42 Operation (io.cdap.cdap.api.lineage.field.Operation)40 Test (org.junit.Test)33 ArrayList (java.util.ArrayList)30 HashSet (java.util.HashSet)30 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)24 HashMap (java.util.HashMap)17 LinkedHashSet (java.util.LinkedHashSet)15 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)14 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)14 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)14 ImmutableList (com.google.common.collect.ImmutableList)13 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)13 Connection (io.cdap.cdap.etl.proto.Connection)13 List (java.util.List)13 Set (java.util.Set)11 ImmutableSet (com.google.common.collect.ImmutableSet)10 InputField (io.cdap.cdap.api.lineage.field.InputField)8